# Collaborative Filtering Techniques

In [None]:
import random
import pandas as pd

## Introduce Data Sparsity

To simulate real-world data, where users rate only a subset of items, each user rates a randomly selected subset of N programs.

### Each User randomly rates range(M,N) items ---> use for LOW density & HIGH sparsity

In [None]:
# Load the full ratings table; the CSV's first (unnamed) column becomes the index.
user_ratings = pd.read_csv('../university_user_ratings.csv').set_index('Unnamed: 0')
user_ratings.index.names = ['index']

# Same row/column labels as user_ratings, but with no values yet; only the
# sampled (item, user) cells are copied over below.
data_sparse_user_ratings = pd.DataFrame().reindex_like(user_ratings)

# Rows are items (labels 'uni_id_<i>'), columns are users (labels 'userid_<j>').
n_items, n_users = user_ratings.shape

for user_index in range(n_users):
    # Each user rates K random items, with K drawn uniformly from [500, 2000].
    # K is capped at the number of items so random.sample cannot raise
    # ValueError on a matrix with fewer rows than the drawn K.
    k = min(random.randint(500, 2000), n_items)
    uni_id_to_rate = random.sample(range(n_items), k=k)  # item positions this user rates

    rated_rows = [f'uni_id_{uni_id}' for uni_id in uni_id_to_rate]
    user_col = f'userid_{user_index}'

    # Copy all sampled ratings of this user in a single label-based assignment
    # instead of one scalar .loc write per cell.
    data_sparse_user_ratings.loc[rated_rows, user_col] = user_ratings.loc[rated_rows, user_col]

### Each user randomly leaves range(M,N) items unrated ---> use for HIGH density & LOW sparsity

In [None]:
# user_ratings = pd.read_csv('../university_user_ratings.csv').set_index('Unnamed: 0')
# user_ratings.index.names = ['index']
# data_sparse_user_ratings = user_ratings.copy()

# for user_index in range(user_ratings.shape[1]):
#     # Each user randomly leaves K (100-200) items unrated
#     k = random.randint(100,200)
#     uni_id_to_rate = random.sample(list(range(user_ratings.shape[0])), k=k) # choose which uni_ids will be evaluated and put in list
#     for uni_id in uni_id_to_rate:
#         data_sparse_user_ratings.loc[f'uni_id_{uni_id}', f'userid_{user_index}'] \
#             = None


### Calculate sparsity and density of User-Item Matrix

In [None]:
# Reshape the wide matrix into long form: one row per (uni_id, user_id) pair
# with its rating, then persist it for the model-building cells.
melted_df_ratings = (
    data_sparse_user_ratings
    .reset_index()
    .melt(id_vars=['index'], var_name='user_id', value_name='ratings')
    .rename(columns={'index': 'uni_id'})
)
melted_df_ratings.to_csv('../user_item_df.csv', index=False)

# Density = share of pairs that carry a rating; sparsity is its complement.
# count() returns the number of non-null entries per column.
tmp = melted_df_ratings.count()
actual_ratings, total_possible_entries = tmp['ratings'], tmp['user_id']
density = actual_ratings / total_possible_entries
sparsity = 1 - density
print(f"Sparsity: {sparsity:.4f}")
print(f"Density: {density:.4f}")

# Pivot back to the item x user layout and preview the first rows.
ratings_matrix = melted_df_ratings.pivot(
    index='uni_id',
    columns='user_id',
    values='ratings',
)
print('User-Item Matrix:')
print(ratings_matrix.head(5))

## Techniques

In [None]:
# Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.
# $ conda install -c conda-forge scikit-surprise
from surprise import KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD, NMF

### Build Dataset

In [None]:
# Load the long-format ratings file and drop every row that contains a
# missing value (presumably the unrated user/item pairs).
data = pd.read_csv('../user_item_df.csv')
data = data.dropna()

# load_from_df needs exactly three columns in this order:
# raw user ids, raw item ids, ratings.
rating_columns = ['user_id', 'uni_id', 'ratings']
reader = Reader()  # no arguments: relies on Reader's default rating scale (1-5)
dataset = Dataset.load_from_df(data[rating_columns], reader)

# Notebook display: non-null entry count per column.
data.count()

### User-based CF using kNN Algorithm

In [None]:
# Similarity configuration for the user-based neighbourhood model.
sim_options = dict(
    name='cosine',     # cosine similarity
    user_based=True,   # similarities are computed between users
    min_support=3,     # minimum number of common items for a non-zero similarity
    # NOTE(review): Surprise documents 'shrinkage' as relevant only for the
    # pearson_baseline similarity - confirm whether it has any effect here.
    shrinkage=100,
)

# kNN model: up to 20 neighbours, at least 1 neighbour per prediction.
user_cf = KNNBasic(
    k=20,
    min_k=1,
    sim_options=sim_options,
    verbose=True,
)

# Evaluate with 3-fold cross-validation (cv=3) on RMSE and MAE.
user_cf_cv_results = cross_validate(
    user_cf,
    dataset,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

### Item-based CF using kNN Algorithm

In [None]:
# Define Item-Based CF algorithm
# Similarity configuration for the item-based neighbourhood model.
sim_options = dict(
    name='cosine',      # cosine similarity
    user_based=False,   # similarities are computed between items
    min_support=5,      # minimum number of common users for a non-zero similarity
    # NOTE(review): Surprise documents 'shrinkage' as relevant only for the
    # pearson_baseline similarity - confirm whether it has any effect here.
    shrinkage=100,
)

# kNN model: up to 20 neighbours, at least 1 neighbour per prediction.
item_cf = KNNBasic(
    k=20,
    min_k=1,
    sim_options=sim_options,
    verbose=True,
)

# Evaluate with 3-fold cross-validation (cv=3) on RMSE and MAE.
item_cf_cv_results = cross_validate(
    item_cf,
    dataset,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)


### Singular Value Decomposition (SVD)

In [None]:
# Define SVD algorithm
# SVD matrix-factorization model: 10 latent factors, 20 training epochs.
svd_algo = SVD(
    n_factors=10,
    n_epochs=20,
    verbose=True,
)

# Evaluate with 3-fold cross-validation (cv=3) on RMSE and MAE.
svd_cv_results = cross_validate(
    svd_algo,
    dataset,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)


### Non-Negative Matrix Factorization

In [None]:
# Define NMF algorithm
# NMF model without bias terms: 10 latent factors, 20 training epochs.
nmf_algo = NMF(
    n_factors=10,
    n_epochs=20,
    biased=False,
)

# Evaluate with 3-fold cross-validation (cv=3) on RMSE and MAE.
nmf_cv_results = cross_validate(
    nmf_algo,
    dataset,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

## Comparison

In [None]:
def print_cv_results(algo_name, cv_results):
    """Print one summary line with the mean and std of the RMSE and MAE scores.

    Args:
        algo_name: Label printed at the start of the line.
        cv_results: Mapping with 'test_rmse' and 'test_mae' entries; each
            entry must provide ``.mean()`` and ``.std()`` (e.g. the arrays
            returned by Surprise's ``cross_validate``).
    """
    rmse_scores = cv_results['test_rmse']
    mae_scores = cv_results['test_mae']
    rmse_part = f"RMSE: {rmse_scores.mean():.4f} (± {rmse_scores.std():.4f})"
    mae_part = f"MAE: {mae_scores.mean():.4f} (± {mae_scores.std():.4f})"
    print(f"{algo_name} - {rmse_part}, {mae_part}")

# Print one summary line per algorithm, in the order they were evaluated.
for label, results in (
    ("User-Based CF", user_cf_cv_results),
    ("Item-Based CF", item_cf_cv_results),
    ("SVD", svd_cv_results),
    ("NMF", nmf_cv_results),
):
    print_cv_results(label, results)