# Movie Recommendation Models Notebook

## Imports

In [95]:
# Core libraries
import pandas as pd
import numpy as np

# Similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Matrix factorization
from sklearn.decomposition import TruncatedSVD

# Metrics
from sklearn.metrics import precision_score, recall_score


## Load Processed Data

In [96]:
full_data = pd.read_csv("processed_data/movielens_processed.csv")

print("Data Loaded:")
print(full_data.head())


Data Loaded:
   user_id  item_id  rating                       title  age  gender  \
0      196      242       3                Kolya (1996)   49       0   
1      186      302       3    L.A. Confidential (1997)   39       1   
2       22      377       1         Heavyweights (1994)   25       0   
3      244       51       2  Legends of the Fall (1994)   28       0   
4      166      346       1         Jackie Brown (1997)   47       0   

   occupation  
0          20  
1           6  
2          20  
3          19  
4           3  


## Train-Test Split

In [97]:
train_df, test_df = train_test_split(full_data, test_size=0.2, random_state=42)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")


Train size: 80000, Test size: 20000


## Evaluation Helpers

In [98]:
def precision_recall_at_k(model_func, ratings_matrix, movies_df, sim_matrix=None, 
                          user_factors=None, item_factors=None, k=5):
    precisions, recalls = [], []
    
    for user_id in ratings_matrix.index:
        # get recommendations
        if model_func.__name__ == "recommend_user_cf":
            recs = model_func(user_id, ratings_matrix, sim_matrix, movies_df, k=k)
        elif model_func.__name__ == "recommend_item_cf":
            recs = model_func(user_id, ratings_matrix, sim_matrix, movies_df, k=k)
        elif model_func.__name__ == "recommend_svd_truncated":
            recs = model_func(user_id, user_factors, item_factors, ratings_matrix, movies_df, k=k)
        else:
            continue
        
        # ground truth (items the user actually rated in test set)
        relevant_items = test_df[test_df["user_id"] == user_id]["item_id"].values
        
        if len(relevant_items) == 0 or recs.empty:
            continue
        
        recommended_items = recs["item_id"].values
        
        # binary relevance vectors
        y_true = [1 if item in relevant_items else 0 for item in recommended_items]
        y_pred = [1] * len(recommended_items)
        
        precisions.append(precision_score(y_true, y_pred, zero_division=0))
        recalls.append(recall_score(y_true, y_pred, zero_division=0))
    
    return np.mean(precisions), np.mean(recalls)


## Model 1 - User-Based Collaborative Filtering

In [99]:
# Create user-item rating matrix
user_item_matrix = train_df.pivot(index="user_id", columns="item_id", values="rating").fillna(0)

# Compute cosine similarity between users
user_sim_matrix = cosine_similarity(user_item_matrix)
user_sim_df = pd.DataFrame(user_sim_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)

def recommend_user_cf(user_id, ratings_matrix, sim_matrix, movies_df, k=5):
    # Get similarity scores
    sim_scores = sim_matrix[user_id - 1]  # adjust for 0-index
    weighted_ratings = sim_scores @ ratings_matrix.values
    norm = np.abs(sim_scores).sum()
    preds = weighted_ratings / norm if norm > 0 else weighted_ratings
    
    # Exclude already rated movies
    rated_items = ratings_matrix.loc[user_id][ratings_matrix.loc[user_id] > 0].index
    preds = pd.Series(preds, index=ratings_matrix.columns)
    preds = preds.drop(rated_items, errors="ignore")
    
    # Top-k
    top_k = preds.sort_values(ascending=False).head(k)
    
    # Map to movie titles (deduplicate)
    top_movies = movies_df.drop_duplicates("item_id")
    result = top_movies[top_movies["item_id"].isin(top_k.index)][["item_id", "title"]]
    result = result.set_index("item_id").loc[top_k.index]  # preserve order
    result = result.reset_index().assign(predicted_rating=top_k.values)
    
    return result

# Example recommendation
target_user_id = 13
print(f"Top 5 recommendations for User {target_user_id} (User-Based CF):")
print(recommend_user_cf(target_user_id, user_item_matrix, user_sim_matrix, full_data, k=5))



Top 5 recommendations for User 13 (User-Based CF):
   item_id                                      title  predicted_rating
0       50                           Star Wars (1977)          2.461095
1       79                       Fugitive, The (1993)          1.591377
2      210  Indiana Jones and the Last Crusade (1989)          1.590639
3       69                        Forrest Gump (1994)          1.461275
4      183                               Alien (1979)          1.423157


## Evaluate User-Based CF

In [100]:
p_user, r_user = precision_recall_at_k(
    recommend_user_cf, user_item_matrix, full_data, sim_matrix=user_sim_matrix, k=5
)
print(f"Precision@5 (User-CF): {p_user:.3f}")
print(f"Recall@5    (User-CF): {r_user:.3f}")


Precision@5 (User-CF): 0.316
Recall@5    (User-CF): 0.738


## Model 2 – Item-Based Collaborative Filtering

In [101]:
# Compute cosine similarity between items
item_sim_matrix = cosine_similarity(user_item_matrix.T)
item_sim_df = pd.DataFrame(item_sim_matrix, index=user_item_matrix.columns, columns=user_item_matrix.columns)

def recommend_item_cf(user_id, ratings_matrix, sim_matrix, movies_df, k=5):
    # Get user's ratings
    user_ratings = ratings_matrix.loc[user_id]
    
    # Weighted sum of similarities
    preds = sim_matrix @ user_ratings.values
    preds = pd.Series(preds, index=ratings_matrix.columns)
    
    # Exclude already rated
    preds = preds.drop(user_ratings[user_ratings > 0].index, errors="ignore")
    
    # Top-k predictions
    top_k = preds.sort_values(ascending=False).head(k)
    
    # Map to movie titles (deduplicate + preserve order)
    top_movies = movies_df.drop_duplicates("item_id")
    result = top_movies[top_movies["item_id"].isin(top_k.index)][["item_id", "title"]]
    result = result.set_index("item_id").loc[top_k.index]
    result = result.reset_index().assign(predicted_rating=top_k.values)
    
    return result

# Example recommendation
print(f"Top 5 recommendations for User {target_user_id} (Item-Based CF):")
print(recommend_item_cf(target_user_id, user_item_matrix, item_sim_matrix, full_data, k=5))


Top 5 recommendations for User 13 (Item-Based CF):
   item_id                                      title  predicted_rating
0      210  Indiana Jones and the Last Crusade (1989)        427.071102
1       79                       Fugitive, The (1993)        412.232767
2       50                           Star Wars (1977)        408.059391
3       69                        Forrest Gump (1994)        401.745955
4      196                  Dead Poets Society (1989)        396.253245


## Evaluate Item-Based CF

In [102]:
p_item, r_item = precision_recall_at_k(
    recommend_item_cf, user_item_matrix, full_data, sim_matrix=item_sim_matrix, k=5
)
print(f"Precision@5 (Item-CF): {p_item:.3f}")
print(f"Recall@5    (Item-CF): {r_item:.3f}")


Precision@5 (Item-CF): 0.356
Recall@5    (Item-CF): 0.785


## Model 3 – Matrix Factorization (SVD)

In [103]:
# Apply Truncated SVD on user-item matrix
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_.T

def recommend_svd_truncated(user_id, user_factors, item_factors, ratings_matrix, movies_df, k=5):
    # Get user vector
    user_vector = user_factors[user_id - 1]  # adjust for 0-index
    
    # Predict scores for all items
    scores = user_vector @ item_factors.T
    preds = pd.Series(scores, index=ratings_matrix.columns)
    
    # Exclude already rated
    rated_items = ratings_matrix.loc[user_id][ratings_matrix.loc[user_id] > 0].index
    preds = preds.drop(rated_items, errors="ignore")
    
    # Top-k
    top_k = preds.sort_values(ascending=False).head(k)
    
    # Map to movie titles
    top_movies = movies_df.drop_duplicates("item_id")
    result = top_movies[top_movies["item_id"].isin(top_k.index)][["item_id", "title"]]
    result = result.set_index("item_id").loc[top_k.index]
    result = result.reset_index().assign(predicted_rating=top_k.values)
    
    return result

# Example recommendation
print(f"Top 5 recommendations for User {target_user_id} (SVD with TruncatedSVD):")
print(recommend_svd_truncated(target_user_id, user_factors, item_factors, user_item_matrix, full_data, k=5))


Top 5 recommendations for User 13 (SVD with TruncatedSVD):
   item_id                              title  predicted_rating
0      510      Magnificent Seven, The (1954)          3.045277
1       15          Mr. Holland's Opus (1995)          2.732076
2      467               Bronx Tale, A (1993)          2.286077
3      530  Man Who Would Be King, The (1975)          2.261765
4       63           Santa Clause, The (1994)          2.141904


## Evaluate SVD

In [104]:
p_svd, r_svd = precision_recall_at_k(
    recommend_svd_truncated, user_item_matrix, full_data, 
    user_factors=user_factors, item_factors=item_factors, k=5
)
print(f"Precision@5 (SVD): {p_svd:.3f}")
print(f"Recall@5    (SVD): {r_svd:.3f}")


Precision@5 (SVD): 0.361
Recall@5    (SVD): 0.804
