In [3]:
import pandas as pd
from surprise import Dataset, Reader, SVD, KNNBasic
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy

# Load ratings
ratings_path = '../data/ml-latest-small/ratings.csv'  
df_ratings = pd.read_csv(ratings_path)

# Surprise Reader: scale is 0.5 to 5.0
reader = Reader(rating_scale=(0.5, 5.0))

# Load into Surprise format
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

# Train-test split (80/20)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

print(f"Trainset size: {trainset.n_ratings}")
print(f"Testset size: {len(testset)}")

Trainset size: 80668
Testset size: 20168


In [4]:
# SVD = Singular Value Decomposition (matrix factorization)
algo = SVD(
    n_factors=100,      # number of latent factors
    n_epochs=20,        # number of training iterations
    lr_all=0.005,       # learning rate
    reg_all=0.02,       # regularization
    random_state=42
)

algo.fit(trainset)


predictions = algo.test(testset)

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")

RMSE: 0.8807
MAE:  0.6766
RMSE: 0.8807
MAE:  0.6766


In [5]:
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [50, 100, 150, 200],
    'n_epochs':  [20, 30, 40],
    'lr_all':    [0.002, 0.005, 0.007, 0.01],
    'reg_all':   [0.01, 0.02, 0.04, 0.08]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)
gs.fit(data)

print("Best RMSE:", gs.best_score['rmse'])
print("Best params:", gs.best_params['rmse'])

# Retrain final model with best params
best_algo = gs.best_estimator['rmse']
best_algo.fit(trainset)

Best RMSE: 0.8502132551307724
Best params: {'n_factors': 150, 'n_epochs': 40, 'lr_all': 0.01, 'reg_all': 0.08}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b9a2017e60>

In [6]:
predictions = best_algo.test(testset)

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")

RMSE: 0.8571
MAE:  0.6554
RMSE: 0.8571
MAE:  0.6554


In [7]:

movies = pd.read_csv('../data/ml-latest-small/processed/movies_fully_enriched_with_content_text.csv')  
ratings = df_ratings 
def get_collaborative_recommendations(user_id, n=10, model=algo):
    """
    Recommend top N movies a user hasn't rated yet, using the trained model.
    """
    # All movie IDs
    all_movie_ids = movies['movieId'].unique()

    # Movies the user has already rated
    user_rated = ratings[ratings['userId'] == user_id]['movieId'].values
    candidates = [mid for mid in all_movie_ids if mid not in user_rated]

    # Predict ratings for all unseen movies
    predictions = []
    for movie_id in candidates:
        est_rating = model.predict(user_id, movie_id).est
        predictions.append((movie_id, est_rating))

    # Sort by predicted rating descending
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_n = predictions[:n]

    # Get movie details
    top_df = pd.DataFrame(top_n, columns=['movieId', 'predicted_rating'])
    top_df = top_df.merge(movies[['movieId', 'title', 'genres']], on='movieId')

    return top_df[['title', 'genres', 'predicted_rating']]

# Test on an active user
print(ratings['userId'].value_counts().head(10))  

# Example
user_id = 599
print(f"\nTop 8 recommendations for user {user_id}:")
print(get_collaborative_recommendations(user_id, n=8))

userId
414    2698
599    2478
474    2108
448    1864
274    1346
610    1302
68     1260
380    1218
606    1115
288    1055
Name: count, dtype: int64

Top 8 recommendations for user 599:
                                              title  \
0  Three Billboards Outside Ebbing, Missouri (2017)   
1              Bridge on the River Kwai, The (1957)   
2               Guess Who's Coming to Dinner (1967)   
3                           Schindler's List (1993)   
4    Cinema Paradiso (Nuovo cinema Paradiso) (1989)   
5                    Outlaw Josey Wales, The (1976)   
6                             Cool Hand Luke (1967)   
7            One Flew Over the Cuckoo's Nest (1975)   

                                    genres  predicted_rating  
0                              Crime|Drama          3.591165  
1                      Adventure|Drama|War          3.563174  
2                                    Drama          3.563031  
3                                Drama|War          3.537565  

In [8]:
import joblib


joblib.dump(best_algo, 'svd_model.joblib')

print("SVD model saved to svd_model.joblib")

SVD model saved to svd_model.joblib


In [9]:
print(ratings['userId'].value_counts().tail(10))  

userId
207    20
257    20
278    20
320    20
406    20
431    20
442    20
569    20
576    20
595    20
Name: count, dtype: int64
