# Collaborative Filtering Techniques

In [None]:
import random
import pandas as pd


## Introduce Data Sparsity

To simulate real-world data where users rate only a subset of items, have each user rate a random N selection of programs.

In [None]:
user_ratings = pd.read_csv('../university_user_ratings.csv').set_index('Unnamed: 0')
user_ratings.index.names = ['index']

### Adjust sparsity and density of User-Item Matrix

In [13]:
def create_sparse_dataset(df_ratings, sparsity_level):
    """
    Create a sparse dataset by selecting a subset of ratings.

    Parameters:
    - ratings_df: DataFrame matrix containing the full ratings 
    - sparsity_level: Percentage of ratings to keep (e.g., 0.1 for 10%)

    Returns:
    - Sparse ratings DataFrame
    """

    ### Calculate number of ratings, each user make (sparsity percent * total university)
    rating_per_user = sparsity_level * df_ratings.shape[0]
    
    sparse_ratings_df = pd.DataFrame().reindex_like(df_ratings)
    for user_index in range(df_ratings.shape[1]):
        # Each user randomly rate 0.9 - 1.1 of ratings avg
        rating_per_user_threshold = random.randint(round(0.9 * rating_per_user), round(1.1 * rating_per_user))
        uni_id_to_rate = random.sample(list(range(df_ratings.shape[0])), k=rating_per_user_threshold) 
        for uni_id in uni_id_to_rate:
            sparse_ratings_df.loc[f'uni_id_{uni_id}', f'userid_{user_index}'] \
             = df_ratings.loc[f'uni_id_{uni_id}', f'userid_{user_index}']
    return sparse_ratings_df


In [None]:
sparsity_level = float(input("Choose sparsity level (0-1): "))


sparse_ratings_df = create_sparse_dataset (user_ratings, sparsity_level)
print(f"Sparsity: {sparsity_level:.2f}")
print(f"Density: {1-sparsity_level:.2f}")

Sparsity: 0.12
Density: 0.88


### Adjust sparsity and density of User-Item Matrix

## Techniques

In [17]:
# Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.
# $ conda install -c conda-forge scikit-surprise
from surprise import KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD, NMF

### Build Dataset

In [None]:
# reshape DataFrame
melted_df_ratings = sparse_ratings_df.reset_index().melt(id_vars=['index'], var_name='user_id', value_name='ratings')
melted_df_ratings.rename(columns={'index': 'uni_id'}, inplace=True)

# remove NaN value
melted_df_ratings.dropna(inplace=True)

reader = Reader() #default is already 1-5
dataset = Dataset.load_from_df(melted_df_ratings[['user_id','uni_id','ratings']], reader) #It must have three columns, corresponding to the user (raw) ids, the item (raw) ids, and the ratings, in this order.

# melted_df_ratings.count()

uni_id     690533
user_id    690533
ratings    690533
dtype: int64

### User-based CF using kNN Algorithm

In [None]:
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': True,  # User-based collaborative filtering
    'min_support': 3,   # Minimum number of common items for similarity
    'shrinkage': 100    # Shrinkage parameter in case of sparse data
}

# Define the algorithm
user_cf = KNNBasic(k=20, min_k=1,sim_options=sim_options,verbose=True)

# Perform 5-fold cross-validation
user_cf_cv_results = cross_validate(user_cf, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


### Item-based CF using kNN Algorithm

In [None]:
# Define Item-Based CF algorithm
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': False,  # Item-based collaborative filtering
    'min_support': 5,   # Minimum number of common items for similarity
    'shrinkage': 100    # Shrinkage parameter in case of sparse data
}
item_cf = KNNBasic(k=20, min_k=1,sim_options=sim_options, verbose=True)

# Perform 5-fold cross-validation
item_cf_cv_results = cross_validate(item_cf, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)


### Singular Vector Decomposition (SVD)

In [None]:
# Define SVD algorithm
svd_algo = SVD(n_factors=10, n_epochs=20,verbose=True)

# Perform 5-fold cross-validation
svd_cv_results = cross_validate(svd_algo, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)


### Non-Negative Matrix Factorization

In [None]:
# Define NMF algorithm
nmf_algo = NMF(n_factors=10, n_epochs=20,biased=False)

# Perform 5-fold cross-validation
nmf_cv_results = cross_validate(nmf_algo, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)

## Comparison

In [None]:
def print_cv_results(algo_name, cv_results):
    mean_rmse = cv_results['test_rmse'].mean()
    mean_mae = cv_results['test_mae'].mean()
    std_rmse = cv_results['test_rmse'].std()
    std_mae = cv_results['test_mae'].std()
    print(f"{algo_name} - RMSE: {mean_rmse:.4f} (± {std_rmse:.4f}), MAE: {mean_mae:.4f} (± {std_mae:.4f})")

print_cv_results("User-Based CF", user_cf_cv_results)
print_cv_results("Item-Based CF", item_cf_cv_results)
print_cv_results("SVD", svd_cv_results)
print_cv_results("NMF", nmf_cv_results)