# Movie Recommendation Models Notebook

## Imports

In [35]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse.linalg import svds


## Load Processed Data

In [18]:
# --- Directories ---
processed_data_dir = Path("processed_data")
processed_file = processed_data_dir / "movielens_processed.csv"

# --- Load processed dataset ---
if not processed_file.exists():
    raise FileNotFoundError("⚠️ movielens_processed.csv not found in processed_data. Run PreProcessing notebook first.")

data = pd.read_csv(processed_file)

print("✔️ Data loaded:", processed_file)
data.head()

✔️ Data loaded: processed_data\movielens_processed.csv


Unnamed: 0,user_id,item_id,rating,title,age,gender,occupation
0,196,242,3,Kolya (1996),49,0,20
1,186,302,3,L.A. Confidential (1997),39,1,6
2,22,377,1,Heavyweights (1994),25,0,20
3,244,51,2,Legends of the Fall (1994),28,0,19
4,166,346,1,Jackie Brown (1997),47,0,3


## Quick Data OverView

In [19]:
print("Number of ratings:", data.shape[0])
print("Number of users:", data['user_id'].nunique())
print("Number of movies:", data['item_id'].nunique())
print(data.info())


Number of ratings: 100000
Number of users: 943
Number of movies: 1682
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   user_id     100000 non-null  int64 
 1   item_id     100000 non-null  int64 
 2   rating      100000 non-null  int64 
 3   title       100000 non-null  object
 4   age         100000 non-null  int64 
 5   gender      100000 non-null  int64 
 6   occupation  100000 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 5.3+ MB
None


## Train-Test Split

In [20]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

print("Train size:", train.shape)
print("Test size :", test.shape)


Train size: (80000, 7)
Test size : (20000, 7)


## Build User-Item Matrix

In [21]:
user_item_matrix = train.pivot(index="user_id", columns="item_id", values="rating")
user_item_matrix = user_item_matrix.fillna(0)

print("User-Item Matrix shape:", user_item_matrix.shape)


User-Item Matrix shape: (943, 1653)


## Helper Functions (Evaluation)

In [23]:
# RMSE & MAE
def get_rmse_mae(true, pred):
    rmse = np.sqrt(mean_squared_error(true, pred))
    mae = mean_absolute_error(true, pred)
    return rmse, mae

# Precision / Recall / F1 @ K
def precision_recall_at_k(pred_matrix, test_df, k=10):
    precisions, recalls = [], []

    for user in test_df['user_id'].unique():
        true_items = test_df[(test_df.user_id == user) & (test_df.rating >= 4)]['item_id'].tolist()
        if not true_items:
            continue

        if user not in pred_matrix.index:
            continue

        preds = pred_matrix.loc[user].sort_values(ascending=False).head(k).index.tolist()

        hits = len(set(preds) & set(true_items))
        precisions.append(hits / k)
        recalls.append(hits / len(true_items))

    p, r = np.mean(precisions), np.mean(recalls)
    f1 = (2 * p * r) / (p + r + 1e-8)
    return p, r, f1


## Model 1 – User-Based Collaborative Filtering

In [24]:
# User similarity
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Prediction function
def predict_user_based(user_item, similarity):
    mean_user_rating = user_item.mean(axis=1).values.reshape(-1, 1)
    ratings_diff = (user_item - mean_user_rating)
    pred = mean_user_rating + similarity.dot(ratings_diff.fillna(0)) / np.array([np.abs(similarity).sum(axis=1)]).T
    return pd.DataFrame(pred, index=user_item.index, columns=user_item.columns)

# Predictions
user_based_preds = predict_user_based(user_item_matrix, user_similarity)
print("User-Based CF done")


User-Based CF done


## Model 2 – Item-Based Collaborative Filtering

In [25]:
# Item similarity
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Prediction function
def predict_item_based(user_item, similarity):
    pred = user_item.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pd.DataFrame(pred, index=user_item.index, columns=user_item.columns)

# Predictions
item_based_preds = predict_item_based(user_item_matrix, item_similarity)
print("Item-Based CF done")


Item-Based CF done


## Model 3 – Matrix Factorization (SVD)

In [26]:
R = user_item_matrix.values
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# SVD decomposition
U, sigma, Vt = svds(R_demeaned, k=50)
sigma = np.diag(sigma)

# Reconstruct predictions
svd_preds = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
svd_preds_df = pd.DataFrame(svd_preds, index=user_item_matrix.index, columns=user_item_matrix.columns)

print("SVD done")


SVD done


## Evaluation Function

In [29]:
def evaluate_model(pred_matrix, test_df, model_name, k=10):
    preds, trues = [], []
    for row in test_df.itertuples():
        if row.user_id in pred_matrix.index and row.item_id in pred_matrix.columns:
            val = pred_matrix.loc[row.user_id, row.item_id]
            if not np.isnan(val):   # skip NaNs
                preds.append(val)
                trues.append(row.rating)

    # Safety check: if no predictions collected
    if len(preds) == 0:
        return [model_name, np.nan, np.nan, np.nan, np.nan, np.nan]

    rmse, mae = get_rmse_mae(trues, preds)
    p, r, f1 = precision_recall_at_k(pred_matrix, test_df, k)
    return [model_name, rmse, mae, p, r, f1]


## Compare All Models

In [30]:
results = []
results.append(evaluate_model(user_based_preds, test, "User-Based CF"))
results.append(evaluate_model(item_based_preds, test, "Item-Based CF"))
results.append(evaluate_model(svd_preds_df, test, "Matrix Factorization (SVD)"))

results_df = pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "Precision@10", "Recall@10", "F1@10"])
print("\n=== Evaluation Results ===")
print(results_df)



=== Evaluation Results ===
                        Model      RMSE       MAE  Precision@10  Recall@10  \
0               User-Based CF  2.907783  2.698149      0.091413   0.117625   
1               Item-Based CF  3.160830  2.936643      0.004674   0.005297   
2  Matrix Factorization (SVD)  2.878510  2.610553      0.021848   0.051414   

      F1@10  
0  0.102876  
1  0.004966  
2  0.030665  
