In [2]:
import os
import joblib
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate

In [3]:
# ========= LOAD DATA =========

DATA_DIR = os.path.join('..', '..', 'data', 'cleaned', 'joblib_dataframes')

# Quick-test subset: keep only the first MAX_USERS rows and MAX_MOVIES columns
# of the ratings matrix. Set either to None to keep that whole axis.
MAX_USERS = 500
MAX_MOVIES = 500

# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code. Only load artifacts produced by this project's own pipeline.
df_movies = joblib.load(os.path.join(DATA_DIR, 'df_final.joblib'))

# Load -> subset -> coerce every column to numeric (unparseable -> NaN) ->
# fill NaN with 0 (0 is treated as "no rating" by the cells below).
df_ratings = (
    joblib.load(os.path.join(DATA_DIR, 'df_final_matrix.joblib'))
    .iloc[:MAX_USERS, :MAX_MOVIES]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

In [4]:
# ========= COMPUTE USER-USER SIMILARITY =========

# Pairwise cosine similarity between the rows of df_ratings, then wrapped in a
# DataFrame labelled on both axes with df_ratings' index for label-based lookup.
user_sim = cosine_similarity(df_ratings)
df_user_sim = pd.DataFrame(
    data=user_sim,
    index=df_ratings.index,
    columns=df_ratings.index,
)

def recommend_for_user_weighted(user_id, top_n=5, top_k=5):
    """
    Recommend movies to a user from the similarity-weighted ratings of their
    top_k most similar users.

    Each candidate movie's score is the sum, over the selected neighbours who
    rated it, of (neighbour's rating * neighbour's similarity to the user).

    Parameters:
        user_id: Label of the user in df_user_sim / df_ratings.
        top_n (int): Maximum number of movie recommendations to return.
        top_k (int): Number of similar users to consider.

    Returns:
        list: Titles of recommended movies, best score first, at most one
            title per movie. Movies without a row in df_movies are skipped.
            Returns ["User ID not found."] when user_id is unknown and []
            when no neighbour offers a candidate.
    """
    if user_id not in df_user_sim.index:
        return ["User ID not found."]

    # Top_k most similar users, excluding the user itself.
    sim_scores = df_user_sim.loc[user_id].drop(user_id).nlargest(top_k)
    # A neighbour with non-positive similarity carries no signal; keeping it
    # would surface zero-score "recommendations" (e.g. for a user with no ratings).
    sim_scores = sim_scores[sim_scores > 0]

    # Movies already rated by the user (0 means "not rated").
    user_ratings = df_ratings.loc[user_id]
    user_movies = set(user_ratings[user_ratings > 0].index)

    weighted_scores = pd.Series(dtype=float)

    for sim_user, score in sim_scores.items():
        sim_user_ratings = df_ratings.loc[sim_user]
        # Movies rated by the neighbour but not by the user.
        is_candidate = (sim_user_ratings > 0) & (~sim_user_ratings.index.isin(user_movies))
        # Weight by similarity and accumulate; fill_value=0 lets a movie that
        # only some neighbours rated still sum correctly.
        weighted_scores = weighted_scores.add(sim_user_ratings[is_candidate] * score, fill_value=0)

    if weighted_scores.empty:
        return []

    # Movie ids of the top_n highest weighted scores, best first.
    top_movies = weighted_scores.sort_values(ascending=False).head(top_n).index

    # Look titles up *in ranking order*. A plain isin() filter would return
    # them in df_movies' row order and repeat any duplicated imdb_id.
    matches = df_movies.loc[df_movies['imdb_id'].isin(top_movies), ['imdb_id', 'title']]
    title_by_id = matches.drop_duplicates(subset='imdb_id').set_index('imdb_id')['title']

    return title_by_id.reindex(top_movies).dropna().tolist()


# Demo user for the weighted recommender; defined once so the heading and the
# call cannot drift apart (mirrors the SVD demo further down).
weighted_demo_user_id = 7
print(f"\n=== Weighted User-Based Collaborative Filtering (User {weighted_demo_user_id}) ===")
print(recommend_for_user_weighted(weighted_demo_user_id, top_n=5))


=== Weighted User-Based Collaborative Filtering (User 7) ===
['Vertical Limit', 'High Fidelity', 'The Perfect Storm', 'U-571', 'Hanging Up']


In [5]:
# ========= PREPARE DATA FOR SURPRISE SVD =========

# Name the index so reset_index() produces a 'userId' column.
df_ratings.index.name = 'userId'

# Wide matrix -> long (userId, imdb_id, rating) table, keeping only explicit
# ratings (> 0) and casting both ID columns to str. Done as one chain so no
# column is assigned on a filtered slice (avoids SettingWithCopyWarning).
df_ratings_long = (
    df_ratings
    .reset_index()
    .melt(id_vars='userId', var_name='imdb_id', value_name='rating')
    .loc[lambda long_df: long_df['rating'] > 0]
    .astype({'userId': str, 'imdb_id': str})
)

# Surprise dataset; the rating scale is taken from the observed min/max rating.
reader = Reader(rating_scale=(df_ratings_long['rating'].min(), df_ratings_long['rating'].max()))
data = Dataset.load_from_df(df_ratings_long[['userId', 'imdb_id', 'rating']], reader)

# Train-test split (testset is held out; it is not used in this cell).
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Basic SVD hyperparameters (tune for better performance).
SVD_PARAMS = dict(n_factors=50, n_epochs=20, random_state=42)

# Cross-validation RMSE/MAE (optional). Run on a throwaway estimator:
# cross_validate() refits the estimator it is given on every fold, so passing
# `algo` would leave it trained on the last (unseeded) fold instead of trainset.
cv_results = cross_validate(SVD(**SVD_PARAMS), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# Model used for recommendations below: fitted once, on trainset only.
algo = SVD(**SVD_PARAMS)
algo.fit(trainset)


def get_movie_title(movie_id):
    """
    Return the title of the first df_movies row whose imdb_id matches movie_id.

    IDs are compared by their string form, because svd_recommend passes
    stringified IDs while df_movies['imdb_id'] may hold another dtype.

    Parameters:
        movie_id: Movie identifier (any type; compared as str).

    Returns:
        The matching title, or "Unknown Title" when no row matches.
    """
    id_matches = df_movies['imdb_id'].astype(str) == str(movie_id)
    titles = df_movies.loc[id_matches, 'title']
    return titles.iloc[0] if not titles.empty else "Unknown Title"


def svd_recommend(user_id, top_n=5):
    """
    Recommend movies for a user with the trained SVD model (`algo`).

    Every movie in df_ratings_long that the user has no rating for is scored
    with algo.predict, and the top_n highest estimates are returned.

    Parameters:
        user_id (int or str): User ID to recommend movies for (converted to str).
        top_n (int): Number of recommendations.

    Returns:
        list: Titles of recommended movies, highest estimated rating first.
            Ties are broken by ascending imdb_id so results are reproducible.
    """
    user_id = str(user_id)

    # Movies the user has already rated.
    seen_movies = set(df_ratings_long.loc[df_ratings_long['userId'] == user_id, 'imdb_id'])

    # Candidates: every rated movie the user has not rated. Sorted because set
    # iteration order varies between interpreter runs, which would otherwise
    # make the ranking of equal estimates change from run to run.
    unseen_movies = sorted(set(df_ratings_long['imdb_id']) - seen_movies)

    # Predict ratings for the candidates.
    predictions = [algo.predict(user_id, movie_id) for movie_id in unseen_movies]

    # Highest estimate first; sorted() is stable, so ties keep imdb_id order.
    top_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:top_n]

    return [get_movie_title(pred.iid) for pred in top_predictions]


# Demo: take the userId from the first row of the long-format ratings table.
test_user_id = df_ratings_long['userId'].iat[0]
print(f"\n=== SVD Recommendations for User {test_user_id} ===")
print(svd_recommend(test_user_id, top_n=5))


Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9648  0.9644  0.9651  0.9648  0.0003  
MAE (testset)     0.7414  0.7307  0.7391  0.7371  0.0046  
Fit time          0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    

=== SVD Recommendations for User 707 ===
["Le fabuleux destin d'Amélie Poulain", 'The Lord of the Rings: The Fellowship of the Ring', 'The Lord of the Rings: The Two Towers', 'Mulholland Dr.', 'Amores perros']
