In [54]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# import os
from surprise import BaselineOnly, Dataset, Reader, SVD, NMF, SVDpp, accuracy, PredictionImpossible, KNNWithMeans, KNNBasic, NormalPredictor, KNNWithZScore, KNNBaseline, SlopeOne, CoClustering
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV, PredefinedKFold
from surprise.model_selection.split import LeaveOneOut
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
import pandas as pd
import tempfile

In [55]:

np.random.seed(42)

genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
              'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

column_names = ["item","title","genres","movie_name","movie_year","(no genres listed)","Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","IMAX","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western","user","rating","rating_timestamp","rating_year","rating_month","rating_season,tag","tag_timestamp","cleaned_tag","tag_length","tag_year"]

data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)

reader = Reader(rating_scale=(1, 5))

ratings = data[['user', 'item', 'rating']]
ratings = ratings.iloc[1:]
ratings["rating"] = ratings["rating"].astype(float)


  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)
  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)


For initial testing creating a dataset of 10k users

In [56]:
custom_data = Dataset.load_from_df(ratings, reader)

<h4>Train test split</h4>

In [57]:
# Train-test split
trainset, testset = train_test_split(custom_data, test_size=0.2)

# Convert trainset to dataframe (for content-based model)
train_df = pd.DataFrame(trainset.build_testset(), columns=['user', 'item', 'rating'])
test_df = pd.DataFrame(testset, columns=['user', 'item', 'rating'])


# Step 1: Filter users with >= 5 test ratings
test_user_counts = test_df['user'].value_counts()
eligible_users = test_user_counts[test_user_counts >= 5].index.tolist()

# Different number of known ratings to test
# known_ratings_list = [5]
known_ratings_list = [5, 10, 20]

Function to get the best params for a given model

In [58]:

measures=["rmse", "mae", "mse"]

def perform_grid_search(algo, params):
    
    gs = GridSearchCV(algo, params, measures=measures, cv=3,  joblib_verbose=0)

    gs.fit(custom_data)

    
    # best RMSE score
    print(gs.best_score)


    # combination of parameters that gave the best measure score
    print(gs.best_params)
    return gs.best_estimator["rmse"]

Function to run the cold start train test split 

In [59]:
def run_cold_start_models(models, model_names, train_df, test_df, eligible_users, known_ratings_list, k=10):
    all_results = []

    for algo, model_name in zip(models, model_names):
        rmse_results = []
        mae_results = []
        mse_results = []
        ndcg_results = []

        # Iterate over each number of known ratings
        for known_ratings in known_ratings_list:
            print(f"Processing for {known_ratings} known ratings per user with {model_name}...")
            
            # Step 3: Reduce training data to 'known_ratings' ratings per user for these test users (simulate cold start)
            limited_train_rows = []
            for user in eligible_users:
                user_ratings = train_df[train_df['user'] == user]
                if len(user_ratings) > known_ratings:
                    sampled = user_ratings.sample(known_ratings, random_state=42)
                else:
                    sampled = user_ratings
                limited_train_rows.append(sampled)

            # Step 4: Add all training data from non-eligible users (normal users)
            non_eligible_users_df = train_df[~train_df['user'].isin(eligible_users)]
            cold_start_train_df = pd.concat(limited_train_rows + [non_eligible_users_df], ignore_index=True)

            # Build training set for Surprise
            reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
            cold_start_data = Dataset.load_from_df(cold_start_train_df[['user', 'item', 'rating']], reader)
            cold_start_trainset = cold_start_data.build_full_trainset()

            # Train the model
            algo.fit(cold_start_trainset)

            # Build the final test set for Surprise
            final_testset = [tuple(x) for x in test_df.to_numpy()]
            predictions = algo.test(final_testset)

            # Evaluate
            rmse = accuracy.rmse(predictions, verbose=False)
            mae = accuracy.mae(predictions, verbose=False)
            mse = accuracy.mse(predictions, verbose=False)
            
            rmse_results.append((known_ratings, rmse))
            mae_results.append((known_ratings, mae))
            mse_results.append((known_ratings, mse))
            
            print(f"RMSE on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {rmse}")
            print(f"MAE on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {mae}")
            print(f"MSE on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {mse}")

            # Calculate NDCG@K
            # Calculate NDCG@K
            ndcg_scores = []
            for user_id in eligible_users:
                user_ratings = train_df[train_df['user'] == user_id]
                
                # Predict ratings for unseen movies
                all_movies = set(train_df['item'].unique())
                seen_movies = set(user_ratings['item'])
                unseen_movies = list(all_movies - seen_movies)
                
                # Filter test data for the current user
                user_test_ratings = test_df[test_df['user'] == user_id]
                if user_test_ratings.empty:
                    continue  # Skip user if no test data
                
                predictions = [ (movie_id, algo.predict(user_id, movie_id).est) for movie_id in unseen_movies ]
                predictions.sort(key=lambda x: x[1], reverse=True)
                
                # Top-K recommended movies
                top_k_movies = [movie_id for movie_id, _ in predictions[:k]]
                
                # Movies the user actually liked (rating >= 4) in the test set
                true_liked_movies = user_test_ratings[user_test_ratings['rating'] >= 4]['item'].tolist()
                
                if not true_liked_movies:
                    continue  # Skip user if no strong likes in the test set
                
                # --- Calculate NDCG@K ---
                dcg = 0.0
                for idx, movie_id in enumerate(top_k_movies):
                    if movie_id in true_liked_movies:
                        dcg += 1.0 / np.log2(idx + 2)  # +2 because index starts from 0
                
                # Ideal DCG (IDCG)
                ideal_relevant = min(len(true_liked_movies), k)
                idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_relevant))
                
                ndcg = dcg / idcg if idcg > 0 else 0.0
                ndcg_scores.append(ndcg)

            average_ndcg = np.mean(ndcg_scores)
            ndcg_results.append((known_ratings, average_ndcg))
            print(f"NDCG@{k} on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {average_ndcg}")

        # Create a DataFrame to store the results for the current model
        results_df = pd.DataFrame({
            'Model': model_name,
            'Known Ratings': [r[0] for r in rmse_results],
            'RMSE': [r[1] for r in rmse_results],
            'MAE': [r[1] for r in mae_results],
            'MSE': [r[1] for r in mse_results],
            'NDCG@K': [r[1] for r in ndcg_results]
        })

        # Append the results to the all_results list
        all_results.append(results_df)

    # Concatenate all results into a single DataFrame
    final_results_df = pd.concat(all_results, ignore_index=True)
    return final_results_df



In [60]:
models = []
model_names = []

<h3>Grid search for best params<h3>

First create a reusable function to run grid search for any model

<h4>SVD</h4>

In [61]:
param_grid_svd = {
    'n_factors': [20, 50, 100, 150, 200],
    'n_epochs': [20, 30, 40, 50, 60],
    'lr_all': [0.002, 0.005, 0.007, 0.01, 0.02],
    'reg_all': [0.02, 0.05, 0.1, 0.15, 0.2]
}

# best_svd = perform_grid_search(SVD, param_grid_svd)

# models += [best_svd]
models += [SVD()]
model_names += ['SVD']

<h4>SVD++</h4>

In [62]:
param_grid_svdpp = {
    'n_factors': [20, 50, 100, 150, 200],
    'n_epochs': [10, 20, 30, 40, 50],
    'lr_all': [0.002, 0.005, 0.007, 0.01, 0.02],
    'reg_all': [0.02, 0.05, 0.1, 0.15, 0.2]
}

# best_svdpp = perform_grid_search(SVDpp, param_grid_svdpp)

# models += [best_svdpp]
models += [SVDpp()]
model_names += ['SVD++']

<h4>NMF</h4>

In [63]:
param_grid_nmf = {
    'n_factors': [15, 30, 50, 100, 150],
    'n_epochs': [50, 100, 150, 200, 300],
    'reg_pu': [0.06, 0.08, 0.1, 0.2, 0.3],
    'reg_qi': [0.06, 0.08, 0.1, 0.2, 0.3]
}

# best_nmf = perform_grid_search(NMF, param_grid_nmf)

# models += [best_nmf]
models += [NMF()]
model_names += ['NMF']

<h4>KNN basic</h4>

In [64]:
param_grid_knn_basic = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['cosine', 'pearson', 'pearson_baseline'],
        'user_based': [True, False]
    }
}

# best_knn_basic = perform_grid_search(KNNBasic, param_grid_knn_basic)

# models += [best_knn_basic]
models += [KNNBasic()]

model_names += ['KNNBasic']

<h4>KNN with means</h4>

In [65]:
param_grid_knn_means = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['cosine', 'pearson_baseline'],
        'user_based': [True, False]
    }
}

# best_knn_with_means = perform_grid_search(KNNWithMeans, param_grid_knn_means)

# models += [best_knn_with_means]
models += [KNNWithMeans()]
model_names += ['KNNWithMeans']

<h4>KNN with Z Score</h4>

In [66]:
param_grid_knn_zscore = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['cosine', 'pearson'],
        'user_based': [True, False]
    }
}

# best_knn_z_score = perform_grid_search(KNNWithZScore, param_grid_knn_zscore)

# models += [best_knn_z_score]
models += [KNNWithZScore()]
model_names += ['KNNWithZScore']

<h4>KNN with Baseline</h4>

In [67]:
param_grid_knn_baseline = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['pearson_baseline', 'cosine'],
        'user_based': [True, False]
    },
    'bsl_options': {
        'method': ['sgd', 'als'],
        'reg': [0.01, 0.02, 0.05, 0.1],
        'learning_rate': [0.002, 0.005, 0.01]
    }
}

# best_knn_baseline = perform_grid_search(KNNBaseline, param_grid_knn_baseline)

# models += [best_knn_baseline]
models += [KNNBaseline()]
model_names += ['KNNBaseline']

<h4>CoClustering</h4>

In [68]:
param_grid_coclustering = {
    'n_cltr_u': [3, 5, 10, 15, 20],
    'n_cltr_i': [3, 5, 10, 15, 20],
    'n_epochs': [10, 20, 30, 40, 50]
}

# best_coclustering = perform_grid_search(CoClustering, param_grid_coclustering)

# models += [best_coclustering]
models += [CoClustering()]
model_names += ['CoClustering']

<h4>Baseline Only</h4>

In [69]:
param_grid_baseline_only = {
    'bsl_options': {
        'method': ['sgd', 'als'],
        'reg': [0.01, 0.02, 0.05, 0.1],
        'learning_rate': [0.002, 0.005, 0.01],
        'n_epochs': [5, 10, 20, 30, 40]
    }
}

# best_baseline_only = perform_grid_search(BaselineOnly, param_grid_baseline_only)

# models += [best_baseline_only]
models += [BaselineOnly()]
model_names += ['BaselineOnly']

<h4>Models with no parameters</h4>

In [70]:
slopeone_algo = SlopeOne()
normalPredictor_algo = NormalPredictor()


models += [slopeone_algo, normalPredictor_algo]

model_names += ['SlopeOne()', "NormalPredictor()"]

<h4>Run best models with cold start data</h4>

In [71]:
final_results_df = run_cold_start_models(models, model_names, train_df, test_df, eligible_users, known_ratings_list)
print(final_results_df)

Processing for 5 known ratings per user with SVD...
RMSE on filtered cold-start test users (with 5 training ratings each) for SVD: 0.9464617680481648
MAE on filtered cold-start test users (with 5 training ratings each) for SVD: 0.7420066046025349
MSE on filtered cold-start test users (with 5 training ratings each) for SVD: 0.8957898783768582
NDCG@10 on filtered cold-start test users (with 5 training ratings each) for SVD: 0.03455584352712358
Processing for 10 known ratings per user with SVD...
RMSE on filtered cold-start test users (with 10 training ratings each) for SVD: 0.9208013568007083
MAE on filtered cold-start test users (with 10 training ratings each) for SVD: 0.7199915547661484
MSE on filtered cold-start test users (with 10 training ratings each) for SVD: 0.8478751386860253
NDCG@10 on filtered cold-start test users (with 10 training ratings each) for SVD: 0.034934546034699435
Processing for 20 known ratings per user with SVD...
RMSE on filtered cold-start test users (with 20 t

In [73]:
final_results_df.to_csv('final_results_df.csv')