# Collaborative Filtering Techniques

In [1]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm

## Introduce Data Sparsity

To simulate real-world data where users rate only a subset of items, have each user rate a random N selection of programs.

In [2]:
user_ratings = pd.read_csv('../university_user_ratings.csv').set_index('Unnamed: 0')
user_ratings.index.names = ['index']

### Adjust sparsity and density of User-Item Matrix

In [3]:
def create_sparse_dataset(df_ratings_matrix, density_level):
    """
    Create a sparse dataset by selecting a subset of ratings.

    Parameters:
    - ratings_df: DataFrame matrix containing the full ratings
    - density_level: Percentage of ratings to keep (e.g., 0.1 for 10%)

    Returns:
    - Sparse ratings DataFrame
    """

    ### Calculate number of ratings, each user make (sparsity percent * total university)
    total_ratings = density_level * df_ratings_matrix.size
    # rating_per_user = total_ratings / df_ratings_matrix.shape[1]

    # Randomly decide the number of ratings per uni_id
    uni_id_weights = np.random.dirichlet(alpha=np.ones(df_ratings_matrix.shape[0]))
    uni_ratings_distribution = np.random.multinomial(total_ratings, uni_id_weights)

    sparse_ratings_df = pd.DataFrame().reindex_like(df_ratings_matrix)
    for uni_id, num_ratings_for_uni in enumerate(tqdm(uni_ratings_distribution)):
        # Select random users to rate this uni_id
        users_to_rate = random.sample(
            list(df_ratings_matrix.columns), k=min(num_ratings_for_uni, len(df_ratings_matrix.columns))
        )
        for user_id in users_to_rate:
            # Copy the rating from the original matrix
            sparse_ratings_df.loc[f'uni_id_{uni_id}', user_id] = df_ratings_matrix.loc[f'uni_id_{uni_id}', user_id]

    return sparse_ratings_df


In [4]:
sparsity_level = float(input("Choose sparsity level (0-1): "))


sparse_ratings_df = create_sparse_dataset (user_ratings, 1-sparsity_level)
print(f"Sparsity: {sparsity_level:.2f}")
print(f"Density: {1-sparsity_level:.2f}")

100%|██████████| 10404/10404 [06:31<00:00, 26.58it/s]

Sparsity: 0.00
Density: 1.00





## Techniques

In [5]:
# Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.
# $ conda install -c conda-forge scikit-surprise
from surprise import Dataset, Reader, KNNBasic, SVD, NMF, accuracy

### Build Dataset

In [6]:
avg_rating_user = np.round((1 - sparsity_level) * user_ratings.shape[0])
avg_rating_uni = np.round((1 - sparsity_level) * user_ratings.shape[1])
print(f"Average rating per user: {avg_rating_user:.2f}")
print(f"Average rating per uni: {avg_rating_uni:.2f}")

Average rating per user: 10404.00
Average rating per uni: 554.00


In [7]:
# reshape DataFrame
melted_sparse_ratings = sparse_ratings_df.reset_index().melt(id_vars=['index'], var_name='user_id', value_name='ratings')
melted_sparse_ratings.rename(columns={'index': 'uni_id'}, inplace=True)
train_df = melted_sparse_ratings.dropna(inplace=False)

# reshape DataFrame
melted_user_ratings = user_ratings.reset_index().melt(id_vars=['index'], var_name='user_id', value_name='ratings')
melted_user_ratings.rename(columns={'index': 'uni_id'}, inplace=True)
test_df = melted_user_ratings.copy()

In [8]:
reader = Reader() #default is already 1-5

sparse_rating_dataset = Dataset.load_from_df(train_df[['user_id', 'uni_id', 'ratings']], reader)
trainset = sparse_rating_dataset.build_full_trainset()

full_rating_dataset = Dataset.load_from_df(test_df[['user_id', 'uni_id', 'ratings']], reader)
testset = full_rating_dataset.build_full_trainset().build_testset()

In [9]:
# Filter the train_df based on the valid_uni_ids
valid_uni_ids = train_df['uni_id'].value_counts()[train_df['uni_id'].value_counts() >= avg_rating_uni].index
df = train_df[train_df['uni_id'].isin(valid_uni_ids)]

dataset = Dataset.load_from_df(df[['user_id', 'uni_id', 'ratings']], reader)
memory_based_train = dataset.build_full_trainset()

In [10]:
# # Define SVD algorithm
# svd_algo = SVD(n_factors=15, n_epochs=20,verbose=True)

# svd_algo.fit(trainset)
# svd_predictions = svd_algo.test(testset)
# accuracy.rmse((svd_predictions), verbose=True)

In [11]:
# # Define NMF algorithm
# nmf_algo = NMF(n_factors=15, n_epochs=20,biased=False)

# nmf_algo.fit(trainset)
# nmf_predictions = nmf_algo.test(testset)

In [12]:
# sim_options = {
#     'name': 'cosine',  # Use cosine similarity
#     'user_based': True,  # User-based collaborative filtering
#     'min_support': 5,   # Minimum number of common items for similarity
# }

# # Define the algorithm
# user_cf_algo = KNNBasic(k=20, min_k=1,sim_options=sim_options,verbose=True)

# user_cf_algo.fit(memory_based_train)
# ub_predictions = user_cf_algo.test(testset)

In [13]:
# Define Item-Based CF algorithm
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': False,  # Item-based collaborative filtering
    'min_support': 5,   # Minimum number of common items for similarity
}
item_cf_algo = KNNBasic(k=20, min_k=1,sim_options=sim_options, verbose=True)

item_cf_algo.fit(memory_based_train)
ib_predictions = item_cf_algo.test(testset)
print("Item-Based CF")
accuracy.rmse(ib_predictions, verbose=True)
accuracy.mae(ib_predictions, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Item-Based CF
RMSE: 0.7050
MAE:  0.5613


0.5613336043936576

In [14]:
# print("User-Based CF")
# accuracy.rmse(ub_predictions, verbose=True)
# accuracy.mae(ub_predictions, verbose=True)
# print("Item-Based CF")
# accuracy.rmse(ib_predictions, verbose=True)
# accuracy.mae(ib_predictions, verbose=True)
# print("SVD")
# accuracy.rmse(svd_predictions, verbose=True)
# accuracy.mae(svd_predictions, verbose=True)
# print("NMF")
# accuracy.rmse(nmf_predictions, verbose=True)
# accuracy.mae(nmf_predictions, verbose=True)

In [15]:
# from surprise import accuracy, Dataset, SVD
# from surprise.model_selection import KFold

# sparse_rating_dataset = Dataset.load_from_df(sparse_ratings_df[['user_id', 'uni_id', 'ratings']], reader)
# trainset = sparse_rating_dataset.build_full_trainset()

# svd_algo = SVD(n_factors=15, n_epochs=20,verbose=False)

# full_rating_dataset = Dataset.load_from_df(full_ratings_df[['user_id', 'uni_id', 'ratings']], reader)
# testset = full_rating_dataset.build_full_trainset().build_testset()

# svd_algo.fit(trainset)
# predictions = svd_algo.test(testset)
# # # RMSE should be low as we are biased
# # accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)