# Collaborative Filtering Techniques

In [51]:
import random
import pandas as pd


## Introduce Data Sparsity

To simulate real-world data, where users rate only a subset of items, each user rates a random selection of N programs.

In [52]:
# Load the full rating matrix; the CSV's unnamed first column becomes the
# row index, which is then given the name 'index'.
user_ratings = (
    pd.read_csv('../university_user_ratings.csv')
    .set_index('Unnamed: 0')
    .rename_axis('index')
)

### Adjust sparsity and density of User-Item Matrix

In [53]:
def create_sparse_dataset(df_ratings_matrix, density_level, seed=None):
    """
    Create a sparse copy of a ratings matrix by keeping a random subset of ratings.

    Rows of the matrix are treated as items and columns as users. For every
    user (column) the number of ratings kept is drawn uniformly between 0.9x
    and 1.1x of ``density_level * number_of_rows`` (clamped so it never
    exceeds the number of rows); every other cell is left as NaN.

    Parameters:
    - df_ratings_matrix: DataFrame matrix containing the full ratings
      (items as rows, users as columns). Cells are addressed by position,
      so any index/column labels are accepted.
    - density_level: Fraction of ratings to keep per user, between 0 and 1
      (e.g., 0.1 keeps roughly 10%).
    - seed: Optional seed for a private random generator. When None
      (the default) the module-level ``random`` state is used.

    Returns:
    - Sparse ratings DataFrame with the same index and columns as the input.

    Raises:
    - ValueError: if density_level is not within [0, 1].
    """
    if not 0 <= density_level <= 1:
        raise ValueError(f"density_level must be between 0 and 1, got {density_level}")

    # The random module exposes the same randint/sample API as a Random instance.
    rng = random if seed is None else random.Random(seed)

    n_items, n_users = df_ratings_matrix.shape

    ### Average number of ratings each user makes (density * total items)
    rating_per_user = density_level * n_items

    # Each user rates between 0.9x and 1.1x of the average. The bounds are
    # clamped to n_items because random.sample raises ValueError when asked
    # for more elements than the population holds (e.g. density_level = 1.0).
    min_ratings = min(n_items, round(0.9 * rating_per_user))
    max_ratings = min(n_items, round(1.1 * rating_per_user))

    # Same index/columns as the input, every cell starting out as NaN.
    sparse_ratings_df = pd.DataFrame().reindex_like(df_ratings_matrix)
    for user_pos in range(n_users):
        n_to_rate = rng.randint(min_ratings, max_ratings)
        item_positions = rng.sample(range(n_items), k=n_to_rate)
        if not item_positions:
            continue
        # Copy all selected ratings of this user in one positional assignment.
        sparse_ratings_df.iloc[item_positions, user_pos] = (
            df_ratings_matrix.iloc[item_positions, user_pos].to_numpy()
        )
    return sparse_ratings_df


In [54]:
# Fraction of ratings to hide from the training matrix, entered by the user.
sparsity_level = float(input("Choose sparsity level (0-1): "))
# Reject values outside the advertised range (this also catches NaN).
if not 0 <= sparsity_level <= 1:
    raise ValueError(f"Sparsity level must be between 0 and 1, got {sparsity_level}")

# Density is the complement of sparsity: the fraction of ratings that is kept.
density_level = 1 - sparsity_level

# Seed the module-level generator used for the sampling so that the same
# ratings are hidden on every run.
random.seed(42)
sparse_ratings_df = create_sparse_dataset(user_ratings, density_level)
print(f"Sparsity: {sparsity_level:.2f}")
print(f"Density: {density_level:.2f}")

Sparsity: 0.12
Density: 0.88


## Techniques

In [55]:
# Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.
# $ conda install -c conda-forge scikit-surprise
from surprise.model_selection import cross_validate
from surprise import Dataset, Reader, KNNBasic, SVD, NMF, accuracy


### Build Dataset

In [59]:
# Train set: reshape the sparse matrix into long (uni_id, user_id, ratings)
# rows and drop the cells that have no rating.
melted_sparse_ratings = (
    sparse_ratings_df
    .reset_index()
    .melt(id_vars=['index'], var_name='user_id', value_name='ratings')
    .rename(columns={'index': 'uni_id'})
)
train_df = melted_sparse_ratings.dropna()

# Test set: the full rating matrix in the same long format, nothing dropped.
# NOTE(review): this also contains the ratings present in train_df, so the
# evaluation is partly on data seen during training - confirm this is intended.
melted_user_ratings = (
    user_ratings
    .reset_index()
    .melt(id_vars=['index'], var_name='user_id', value_name='ratings')
    .rename(columns={'index': 'uni_id'})
)
test_df = melted_user_ratings.copy()


In [57]:
# Rating scale spelled out; (1, 5) is also the Reader default.
reader = Reader(rating_scale=(1, 5))

# Training data: only the ratings that survived the sparsification.
sparse_rating_dataset = Dataset.load_from_df(
    train_df[['user_id', 'uni_id', 'ratings']],
    reader,
)
trainset = sparse_rating_dataset.build_full_trainset()

# Evaluation data: every rating of the full matrix, converted to a testset.
full_rating_dataset = Dataset.load_from_df(
    test_df[['user_id', 'uni_id', 'ratings']],
    reader,
)
testset = (
    full_rating_dataset
    .build_full_trainset()
    .build_testset()
)

### User-based CF using kNN Algorithm

In [None]:
# Similarity settings for user-based collaborative filtering.
sim_options = dict(
    name='cosine',      # cosine similarity between users
    user_based=True,    # compare users rather than items
    min_support=5,      # minimum number of common items for a similarity to be computed
)

# kNN model using up to 20 neighbours (at least 1) per prediction.
user_cf_algo = KNNBasic(
    k=20,
    min_k=1,
    sim_options=sim_options,
    verbose=True,
)

# Train on the sparse ratings, then predict every rating in the test set.
user_cf_algo.fit(trainset)
ub_predictions = user_cf_algo.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


### Item-based CF using kNN Algorithm


In [None]:
# Similarity settings for item-based collaborative filtering.
sim_options = dict(
    name='cosine',      # cosine similarity between items
    user_based=False,   # compare items rather than users
    min_support=5,      # minimum number of common users for a similarity to be computed
)

# kNN model using up to 20 neighbours (at least 1) per prediction.
item_cf_algo = KNNBasic(
    k=20,
    min_k=1,
    sim_options=sim_options,
    verbose=True,
)

# Train on the sparse ratings, then predict every rating in the test set.
item_cf_algo.fit(trainset)
ib_predictions = item_cf_algo.test(testset)

### Singular Value Decomposition (SVD)

In [None]:
# Define SVD algorithm: 15 latent factors trained for 20 epochs.
# random_state seeds the RNG used to initialise the factors, so the
# reported RMSE is the same on every run.
svd_algo = SVD(n_factors=15, n_epochs=20, random_state=42, verbose=True)

# Train on the sparse ratings, then predict every rating in the test set.
svd_algo.fit(trainset)
svd_predictions = svd_algo.test(testset)

### Non-Negative Matrix Factorization


In [None]:
# Define NMF algorithm: 15 latent factors, 20 epochs, no bias terms.
# random_state seeds the RNG used to initialise the factors, so the
# reported RMSE is the same on every run.
nmf_algo = NMF(n_factors=15, n_epochs=20, biased=False, random_state=42)

# Train on the sparse ratings, then predict every rating in the test set.
nmf_algo.fit(trainset)
nmf_predictions = nmf_algo.test(testset)

### Comparison

In [None]:
# Report the RMSE of every model. accuracy.rmse(..., verbose=True) prints the
# score itself; running it inside a loop also keeps the last score from being
# echoed a second time as the cell's output value.
for model_label, model_predictions in [
    ("User-Based CF", ub_predictions),
    ("Item-Based CF", ib_predictions),
    ("SVD", svd_predictions),
    ("NMF", nmf_predictions),
]:
    print(model_label)
    accuracy.rmse(model_predictions, verbose=True)