In [2]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# import os
from surprise import BaselineOnly, Dataset, Reader, SVD, NMF, SVDpp, accuracy, PredictionImpossible, KNNWithMeans, KNNBasic, NormalPredictor, KNNWithZScore, KNNBaseline, SlopeOne, CoClustering
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV, PredefinedKFold
from surprise.model_selection.split import LeaveOneOut
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
import pandas as pd
import tempfile

In [19]:

np.random.seed(42)

genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
              'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

column_names = ["item","title","genres","movie_name","movie_year","(no genres listed)","Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","IMAX","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western","user","rating","rating_timestamp","rating_year","rating_month","rating_season,tag","tag_timestamp","cleaned_tag","tag_length","tag_year"]

data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)

reader = Reader(rating_scale=(1, 5))

ratings = data[['user', 'item', 'rating']]
ratings = ratings.iloc[1:]
ratings["rating"] = ratings["rating"].astype(float)

# custom_data = Dataset.load_from_df(ratings, reader)

# print(custom_data)

  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)
  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)


For initial testing creating a dataset of 10k users

In [21]:
ratings = ratings.sample(n=10000, random_state=42)

In [22]:
custom_data = Dataset.load_from_df(ratings, reader)

print(custom_data)

<surprise.dataset.DatasetAutoFolds object at 0x33a9e3400>


In [37]:
# Train-test split
trainset, testset = train_test_split(custom_data, test_size=0.2)

# Convert trainset to dataframe (for content-based model)
train_df = pd.DataFrame(trainset.build_testset(), columns=['user', 'item', 'rating'])
test_df = pd.DataFrame(testset, columns=['user', 'item', 'rating'])


# Step 1: Filter users with >= 5 test ratings
test_user_counts = test_df['user'].value_counts()
eligible_users = test_user_counts[test_user_counts >= 5].index.tolist()

# Different number of known ratings to test
# known_ratings_list = [5]
known_ratings_list = [5, 10, 20, 30, 40]

In [24]:
rmse_results = []

# Step 2: Keep only eligible users in test set
filtered_test_df = test_df[test_df['user'].isin(eligible_users)]

# Custom SVD to raise prediction errors when needed (optional)
class ColdStartSVD(SVD):
    def estimate(self, u, i):
        if not self.trainset.knows_user(u) or not self.trainset.knows_item(i):
            raise PredictionImpossible("Cold start issue: user or item not in training set.")
        return super().estimate(u, i)

Function to get the best params for a given model

In [41]:
# measures=["rmse"]
measures=["rmse", "mae", "mse"]

def perform_grid_search(algo, params, data):
    
    gs = GridSearchCV(algo, params, measures=measures, cv=3)

    gs.fit(data)

    for measure in measures:
        # best RMSE score
        print(gs.best_score[measure])
        print(f"best {measure} score {gs.best_score}")

        # combination of parameters that gave the best measure score
        print(gs.best_params[measure])
    return gs.best_estimator["rmse"]

Function to run the cold start train test split 

In [35]:

def run_cold_start_model(algo):
    # Iterate over each number of known ratings
    for known_ratings in known_ratings_list:
        print(f"Processing for {known_ratings} known ratings per user...")
        
        # Step 3: Reduce training data to 'known_ratings' ratings per user for these test users (simulate cold start)
        limited_train_rows = []
        for user in eligible_users:
            user_ratings = train_df[train_df['user'] == user]
            if len(user_ratings) > known_ratings:
                sampled = user_ratings.sample(known_ratings, random_state=42)
            else:
                sampled = user_ratings
            limited_train_rows.append(sampled)

        # Step 4: Add all training data from non-eligible users (normal users)
        non_eligible_users_df = train_df[~train_df['user'].isin(eligible_users)]
        cold_start_train_df = pd.concat(limited_train_rows + [non_eligible_users_df], ignore_index=True)

        # Build training set for Surprise
        reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
        cold_start_data = Dataset.load_from_df(cold_start_train_df[['user', 'item', 'rating']], reader)
        cold_start_trainset = cold_start_data.build_full_trainset()

        # Train the model
        # algo = ColdStartSVD()
        # algo = algo()
        algo.fit(cold_start_trainset)

        # Build the final test set for Surprise
        final_testset = [tuple(x) for x in filtered_test_df.to_numpy()]
        predictions = algo.test(final_testset)

        # Evaluate
        rmse = accuracy.rmse(predictions, verbose=False)
        rmse_results.append((known_ratings, rmse))
        print(f"RMSE on filtered cold-start test users (with {known_ratings} training ratings each): {rmse}")

    # Print all RMSE results
    for known_ratings, rmse in rmse_results:
        print(f"RMSE for {known_ratings} known ratings: {rmse}")

In [42]:
param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}

best_svd = perform_grid_search(SVD, param_grid, custom_data)

1.0303712425750293
best rmse score {'rmse': 1.0303712425750293, 'mae': 0.8435369064795406, 'mse': 1.0617148153875902}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
0.8435369064795406
best mae score {'rmse': 1.0303712425750293, 'mae': 0.8435369064795406, 'mse': 1.0617148153875902}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
1.0617148153875902
best mse score {'rmse': 1.0303712425750293, 'mae': 0.8435369064795406, 'mse': 1.0617148153875902}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [43]:
run_cold_start_model(best_svd)

Processing for 5 known ratings per user...
RMSE on filtered cold-start test users (with 5 training ratings each): 0.9579924066377772
Processing for 10 known ratings per user...
RMSE on filtered cold-start test users (with 10 training ratings each): 0.9772117992917813
Processing for 20 known ratings per user...
RMSE on filtered cold-start test users (with 20 training ratings each): 0.9663555320219779
Processing for 30 known ratings per user...
RMSE on filtered cold-start test users (with 30 training ratings each): 0.9502920502105732
Processing for 40 known ratings per user...
RMSE on filtered cold-start test users (with 40 training ratings each): 0.9832672069186676
RMSE for 5 known ratings: 1.0996772608383736
RMSE for 5 known ratings: 1.1581671794221506
RMSE for 5 known ratings: 0.9743168733829947
RMSE for 10 known ratings: 0.9664987786841481
RMSE for 20 known ratings: 0.9541863029376978
RMSE for 30 known ratings: 0.9894655011102981
RMSE for 40 known ratings: 0.982622313329475
RMSE for 

In [None]:
# Algorithms initialisation

baseline_algo = BaselineOnly()
svd_algo = SVD()
svdpp_algo = SVDpp()
nmf_algo = NMF()
normalPredictor_algo = NormalPredictor()
knnbasic_algo = KNNBasic()
knnwithMeans_algo = KNNWithMeans()
knnwithZScore_algo = KNNWithZScore()
knnbaseline_algo = KNNBaseline()
slopeone_algo = SlopeOne()
coClustering_algo = CoClustering()

# originally i planned to iterate over an entire list of all algorythms but this took too long, 
# so I just did it manually for each algo instance above
algos = [
    baseline_algo, 
    svd_algo, 
    svdpp_algo, 
    nmf_algo, 
    normalPredictor_algo, 
    knnbasic_algo,
    knnwithMeans_algo, 
    knnwithZScore_algo,
    slopeone_algo,  
    coClustering_algo, 
    knnbaseline_algo
    ]


In [5]:
# Iterators
from surprise.model_selection import KFold, RepeatedKFold, ShuffleSplit, LeaveOneOut
# capture each CV approach in a dictionary - iterators
# there are a variety of fold sizes per CV approach
iterators = {}
folds = [3, 5, 10, 20]

for fold in folds:
    iterators.update({
        f"KFold_{fold}":KFold(n_splits=fold),
        f"RepeatedKFold_{fold}":RepeatedKFold(n_splits=fold),
        f"ShuffleSplit_{fold}":ShuffleSplit(n_splits=fold),
        f"LeaveOneOut_{fold}":LeaveOneOut(n_splits=fold)
    })

# kfold_iter = KFold(n_splits=3)
# repeated_kfold_iter = RepeatedKFold(n_splits=3)
# shuffleSplit_iter = ShuffleSplit(n_splits=3)
# leaveOneOut_iter = LeaveOneOut(n_splits=3)

# iterators_ = [
#     kfold_iter,
#     repeated_kfold_iter,
#     shuffleSplit_iter,
#     leaveOneOut_iter
# ]

In [None]:
# Cross validation function defination 
#  assigns cv name and metrics to dictionary
def perform_cross_validation(algo, data, cv_name, dataset_name, cv=5, measures=["RMSE", "MSE", "MAE"]):
    # surprise cross_validate function
    results = cross_validate(algo, data, measures=measures, cv=cv, verbose=True)
    # print cv technique name
    print("cv", cv.__class__.__name__)
    # Assign to dictionary
    return {
        "Algorithm":f"{algo.__class__.__name__} - {cv_name}",
        "Dataset": dataset_name,
        "RMSE": results['test_rmse'].mean(),
        "MSE": results['test_mse'].mean(),
        "MAE": results['test_mae'].mean(),
        "FitTime":sum(results["fit_time"]) / len(results["fit_time"]),
        "TestTime":sum(results["test_time"]) / len(results["test_time"])
    }



In [7]:
# empty list to cpature results
results_list = []

In [8]:
# CV fold size variation function definition 
# run cross validation with a variety of fold sizes for each algorithm 
# appends the metrics to the results list
def runFoldSizeVariations(algo):
    for i_key, i_val in iterators.items():
        print("iterator", i_key)
        results_list.append(perform_cross_validation(algo, custom_data, i_key, "Custom", i_val))

In [9]:
# BaselineOnly - TEST
runFoldSizeVariations(baseline_algo)

iterator KFold_3
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MSE, MAE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8631  0.8637  0.8645  0.8638  0.0005  
MSE (testset)     0.7450  0.7460  0.7473  0.7461  0.0009  
MAE (testset)     0.6712  0.6716  0.6718  0.6715  0.0003  
Fit time          1.07    1.33    1.23    1.21    0.11    
Test time         1.47    1.43    1.61    1.50    0.08    
cv KFold
iterator RepeatedKFold_3
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [10]:
# SVD
runFoldSizeVariations(svd_algo)

iterator KFold_3
Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8428  0.8414  0.8416  0.8419  0.0006  
MSE (testset)     0.7104  0.7079  0.7083  0.7089  0.0011  
MAE (testset)     0.6498  0.6482  0.6479  0.6486  0.0008  
Fit time          8.15    8.07    7.51    7.91    0.29    
Test time         2.29    2.02    2.03    2.11    0.12    
cv KFold
iterator RepeatedKFold_3
Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8427  0.8405  0.8428  0.8429  0.8419  
MSE (testset)     0.7101  0.7065  0.7103  0.7105  0.7088  
MAE (testset)     0.6487  0.6476  0.6491  0.6493  0.6490  
Fit time          7.66    8.20    7.66    8.44    7.97    
Test time         2.24    2.38    2.03    2.05    1.98    
cv RepeatedKFold
iterator ShuffleSplit_3
Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 

In [11]:
# SVDpp
runFoldSizeVariations(svdpp_algo)

iterator KFold_3
Evaluating RMSE, MSE, MAE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8267  0.8263  0.8266  0.8265  0.0002  
MSE (testset)     0.6834  0.6827  0.6832  0.6831  0.0003  
MAE (testset)     0.6337  0.6339  0.6340  0.6339  0.0001  
Fit time          108.22  108.42  109.25  108.63  0.44    
Test time         61.23   56.59   56.19   58.00   2.29    
cv KFold
iterator RepeatedKFold_3
Evaluating RMSE, MSE, MAE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8269  0.8271  0.8268  0.8259  0.8272  
MSE (testset)     0.6838  0.6841  0.6835  0.6821  0.6843  
MAE (testset)     0.6341  0.6337  0.6346  0.6331  0.6342  
Fit time          107.98  109.20  110.82  112.10  107.42  
Test time         55.98   61.86   67.76   64.35   60.10   
cv RepeatedKFold
iterator ShuffleSplit_3


KeyboardInterrupt: 

In [None]:
# NMF
runFoldSizeVariations(nmf_algo)

In [None]:
# NormalPredictor
runFoldSizeVariations(normalPredictor_algo)

In [None]:
# KNNBasic
runFoldSizeVariations(knnbasic_algo)

In [None]:
# KNNWithMeans
runFoldSizeVariations(knnwithMeans_algo)

In [None]:
# KNNWithZScore
runFoldSizeVariations(knnwithZScore_algo)

In [None]:
# KNNBaseline
runFoldSizeVariations(knnbaseline_algo)

In [None]:
# SlopeOne
runFoldSizeVariations(slopeone_algo)

In [None]:
# CoClustering
runFoldSizeVariations(coClustering_algo)

In [None]:
# results_list = results_list[~results_list['Algorithm'].str.contains('SVDpp')]
results_list = [item for item in results_list if 'SVDpp' not in item['Algorithm']]


In [None]:

# for algo in algos:
#     for i_key, i_val in iterators.items():
#         print("iterator", i_key)
#         results_list_2.append(perform_cross_validation(algo, _100k_data, i_key, "100k", i_val))

# print(results_list_2)

In [None]:
# Ratings sample

# chunksize=100000
# rating_cols=['userId', 'movieId', 'user_rating']
# date_parser = lambda x: pd.to_datetime(x, unit='s')

# ratings_data = pd.DataFrame()  # initialize an empty dataframe
# for index, chunk in enumerate(pd.read_csv('../samples/combined_movies_ratings_tags.csv',
#                          chunksize=chunksize,
#                          usecols=rating_cols,
#                          )):
    
#     ratings_data = pd.concat([ratings_data, chunk])
#     print(f"Chunk {index} loaded")
# print("ratings dataset loaded")

# ratings_data = ratings_data.reindex(columns=rating_cols)

In [None]:
# Some inital testing

# A reader is still needed but only the rating_scale param is required.
# reader = Reader(rating_scale=(1, 5))

# # The columns must correspond to user id, item id and ratings (in that order).
# ratings_cv = Dataset.load_from_df(ratings_data, reader)
# svd_ratings = cross_validate(svd_algo, ratings_cv, cv=5)