In [57]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from pathlib import Path
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [50]:
#Baseline Collaborative Filtering Model

#Load data
OUTPUT_DIR = Path('../data/processed')
df_train = pd.read_csv(OUTPUT_DIR/'train_data.csv')
df_test = pd.read_csv(OUTPUT_DIR/'test_data.csv')

#Define rating scale
reader = Reader(rating_scale=(0.5, 5))

#Prepare training data in Surprise format
train_data = Dataset.load_from_df(df_train[['userId', 'movieId', 'rating']], reader)
trainset = train_data.build_full_trainset()

#Prepare test data in Surprise format
testset = [(row['userId'], row['movieId'], row['rating']) for index, row in df_test.iterrows()]

#Initialize SVD model
model = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)

#Train model
model.fit(trainset)

#Test and evaluate model
predictions = model.test(testset)

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)



RMSE: 0.8359
MAE:  0.6353


In [54]:
#Evaluating how the baseline model performs on cold start movies. Note there are no cold start users in the test set.

#Interpreting results: 
#There were 100 cold start movies, appearing 257 times in the test set. The model's rmse and mae on the cold start set is essentially identical to the overall baseline model.
#This suggests that the model is predicting based on the global mean rating in the dataset and the user's average rating behavior.
#It is not using movie specific information so it cant distinguish between cold start movies.

#Analyzing cold start movie performance
train_movies = df_train['movieId'].unique()
test_movies = df_test['movieId'].unique()
cold_start_movies = set(test_movies) - set(train_movies)
print(f"Cold start movies: {len(cold_start_movies)}")

#Filter test set to only include cold start movies
df_test_cold = df_test[df_test['movieId'].isin(cold_start_movies)]
print(f"Test set for cold start movies: {len(df_test_cold)}")

#Test set for cold start movies only
testset_cold = [(row['userId'], row['movieId'], row['rating']) for index, row in df_test_cold.iterrows()]

#Test and evaluate model on cold start movies
predictions_cold = model.test(testset_cold)

rmse_cold = accuracy.rmse(predictions)
mae_cold = accuracy.mae(predictions)
print(f"rmse_cold: {rmse_cold:.4f}")
print(f"mae_cold: {mae_cold:.4f}")


Cold start movies: 100
Test set for cold start movies: 257
RMSE: 0.8359
MAE:  0.6353
rmse_cold: 0.8359
mae_cold: 0.6353


In [None]:
# Save baseline results

# Path to results folder 
results_path = '../results/baseline_metrics.json'

# Compile metrics
baseline_results = {
    'model': 'SVD',
    'parameters': {
        'n_factors': 100,
        'n_epochs': 20,
        'lr_all': 0.005,
        'reg_all': 0.02
    },
    'overall_performance': {
        'rmse': float(rmse),
        'mae': float(mae)
    },
    'cold_start_performance': {
        'rmse': float(rmse_cold),
        'mae': float(mae_cold),
        'n_movies': len(cold_start_movies),
        'n_ratings': len(testset_cold)
    }
}

# Save to JSON
with open(results_path, 'w') as f:
    json.dump(baseline_results, f, indent=4)
