In [1]:
import pandas as pd

train_df = pd.read_csv("../data/processed/train.csv")
test_df= pd.read_csv("../data/processed/test.csv")
user_item_matrix = train_df.pivot_table(
        index='userId',
        columns='movieId',
        values='rating'
    ).fillna(0)
user_item_matrix.head()
    

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# 1) Calculate average rating and rating counts per movie in the train set
popularity = train_df.groupby("movieId")["rating"].agg(["mean", "count"])

popularity.columns = ["mean_rating", "rating_count"]

# 2) Filter out movies with fewer than 50 ratings to avoid noise
popularity = popularity[popularity["rating_count"] >= 50]

# 3) Sort by mean rating (descending)
popularity.sort_values("mean_rating", ascending=False, inplace=True)

# Now you have a fallback list for cold-start users
top_movies_for_cold_start = popularity.head(10)
top_movies_for_cold_start
 


Unnamed: 0_level_0,mean_rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,4.455253,257
912,4.313253,83
858,4.292763,152
1213,4.279412,102
904,4.278571,70
4973,4.276596,94
1198,4.273006,163
48516,4.269231,91
1221,4.268868,106
750,4.263514,74


In [6]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split


# 1) Convert your Pandas train DataFrame into Surprise format
reader = Reader(rating_scale=(0.5, 5.0))  # or (1,5) depending on your dataset
train_data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
full_trainset = train_data.build_full_trainset()

# 2) Initialize the model (SVD)
model = SVD(n_factors=50, reg_all=0.02, biased=True, random_state=42)

# 3) Train the model
model.fit(full_trainset)

# 4) Evaluate on the test set
test_data = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader)
testset = test_data.build_full_trainset().build_testset()  # testset is a list of (user, item, rating)
predictions = model.test(testset)

rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse:.4f}")


RMSE: 0.8808
Test RMSE: 0.8808


In [7]:

from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [20, 50, 100],
    'reg_all': [0.02, 0.1, 0.4],
    'lr_all': [0.002, 0.005],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(train_data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])  # best hyperparams


0.877753918571274
{'n_factors': 20, 'reg_all': 0.1, 'lr_all': 0.005}
