In [10]:
# STEP 6: Generate recommendations for each user

def get_top_n_recommendations(algo, trainset, user_id, n=10):
    """Get top N recommendations for a user"""
    # Get all items
    all_items = set([trainset.to_raw_iid(iid) for iid in range(trainset.n_items)])
    
    # Get items the user has already rated
    try:
        user_inner_id = trainset.to_inner_uid(user_id)
        user_rated = set([trainset.to_raw_iid(iid) for (iid, _) in trainset.ur[user_inner_id]])
    except ValueError:
        # User not in trainset
        user_rated = set()
    
    # Items to recommend = all items minus rated items
    items_to_recommend = list(all_items - user_rated)
    
    # Predict ratings for all these items
    predictions = []
    for item_id in items_to_recommend[:100]:  # Limit for speed, but in practice you'd do all
        pred = algo.predict(user_id, item_id)
        predictions.append((item_id, pred.est))
    
    # Sort by predicted rating and return top N
    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:n]

# Generate recommendations for a few test users
print("\nSample recommendations for first 5 test users:")

# Get unique test users
test_users = test_df['user_id'].unique()[:5]  # First 5 users

for user_id in test_users:
    # Get recommendations from each model
    knn_recs = get_top_n_recommendations(best_knn, trainset, user_id, n=10)
    svd_recs = get_top_n_recommendations(best_svd, trainset, user_id, n=10)
    top_pop_recs = get_top_n_recommendations(top_pop_algo, trainset, user_id, n=10)
    
    print(f"\nUser: {user_id}")
    print(f"  KNN Top-3: {[item for item, _ in knn_recs[:3]]}")
    print(f"  SVD Top-3: {[item for item, _ in svd_recs[:3]]}")
    print(f"  TopPop Top-3: {[item for item, _ in top_pop_recs[:3]]}")



Sample recommendations for first 5 test users:

User: AFKKKKYLVYHA2FELEWZGVEW3TXHA
  KNN Top-3: ['B004XV6ST4', 'B003O6E620', 'B08JHX17ZZ']
  SVD Top-3: ['B003ZSN600', 'B004HILZUU', 'B000G6SPHI']
  TopPop Top-3: ['B007CM0K86', 'B00GM5T8PK', 'B002I0IVC4']

User: AEPERIK4W4CBNSELKZTBOQ6F63RA
  KNN Top-3: ['B003INEQ0G', 'B07WS6ZFTG', 'B004QEV0MI']
  SVD Top-3: ['B003ZSN600', 'B004HILZUU', 'B08JHX17ZZ']
  TopPop Top-3: ['B007CM0K86', 'B00GM5T8PK', 'B002I0IVC4']

User: AEK56VNXEFRLSKYEAOBMLNITCBJA
  KNN Top-3: ['B003INEQ0G', 'B004HILZUU', 'B0C5K4M7WJ']
  SVD Top-3: ['B000G6SPHI', 'B08JHX17ZZ', 'B00OBZNI0O']
  TopPop Top-3: ['B007CM0K86', 'B00GM5T8PK', 'B002I0IVC4']

User: AHXO5KWH7Q2A2UTTLGQWIIR2ONPQ
  KNN Top-3: ['B003INEQ0G', 'B004HILZUU', 'B004LVO4M4']
  SVD Top-3: ['B004HILZUU', 'B000G6SPHI', 'B07X13HNK3']
  TopPop Top-3: ['B007CM0K86', 'B00GM5T8PK', 'B002I0IVC4']

User: AF6NX4FJWA5FWR22BE6NGVLXE3HA
  KNN Top-3: ['B0023B14WC', 'B006HZA9XU', 'B000F3AADE']
  SVD Top-3: ['B004HILZUU', 'B00

In [7]:
# STEP 5: Latent Factor Model 


# Define hyperparameters to tune
svd_param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

print("Tuning SVD hyperparameters (this may take several minutes)...")

# Grid search for best SVD parameters
svd_grid = GridSearchCV(SVD, svd_param_grid, measures=['rmse'], cv=5)
svd_grid.fit(data)

# Best parameters and score
print(f"\nBest SVD parameters: {svd_grid.best_params['rmse']}")
print(f"Best SVD RMSE: {svd_grid.best_score['rmse']:.4f}")

# Train final SVD model with best parameters
best_svd = svd_grid.best_estimator['rmse']
best_svd.fit(trainset)

print("SVD Model trained successfully!")


Tuning SVD hyperparameters (this may take several minutes)...

Best SVD parameters: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}
Best SVD RMSE: 0.9543
SVD Model trained successfully!


In [6]:
# STEP 4: Neighborhood-based Model

# Define hyperparameters to tune
knn_param_grid = {
    'k': [20, 30, 40, 50],
    'sim_options': {
        'name': ['cosine', 'pearson'],
        'user_based': [True, False]  # True = user-user, False = item-item
    }
}


print("Tuning hyperparameters with GridSearchCV (this may take a few minutes)...")

# Grid search for best KNN parameters
knn_grid = GridSearchCV(KNNBasic, knn_param_grid, measures=['rmse'], cv=5)
knn_grid.fit(data)

# Best parameters and score
print(f"\nBest KNN parameters: {knn_grid.best_params['rmse']}")
print(f"Best KNN RMSE: {knn_grid.best_score['rmse']:.4f}")

# Train final KNN model with best parameters
best_knn = knn_grid.best_estimator['rmse']
trainset = data.build_full_trainset()
best_knn.fit(trainset)

print("\nKNN Model trained successfully!")

Tuning hyperparameters with GridSearchCV (this may take a few minutes)...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing th

In [5]:
# STEP 3: Implement TopPop Recommender

# TopPop = recommend items with most high ratings (â‰¥3)
top_pop_items = top_items.head(10)['item_id'].tolist()
print(f"Top-10 popular items: {top_pop_items}")

# For Surprise we need to create a custom algorithm
from surprise import AlgoBase

class TopPop(AlgoBase):
    def __init__(self, top_items_list):
        AlgoBase.__init__(self)
        self.top_items = top_items_list
    
    def fit(self, trainset):
        self.trainset = trainset
        return self
    
    def estimate(self, u, i):
        # If item is in top popular list, predict high rating (5)
        # Otherwise predict low rating (1)
        if self.trainset.to_raw_iid(i) in self.top_items:
            return 5.0
        else:
            return 1.0

# Evaluate TopPop with cross-validation
print("\nEvaluating TopPop with 5-fold CV:")
top_pop_algo = TopPop(top_pop_items)
cv_results = cross_validate(top_pop_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

print(f"\nTopPop - Average RMSE: {cv_results['test_rmse'].mean():.4f}")
print(f"TopPop - Average MAE: {cv_results['test_mae'].mean():.4f}")

Top-10 popular items: ['B0086VPUHI', 'B00BN5T30E', 'B07YBXFDYN', 'B00BGA9WK2', 'B007CM0K86', 'B00KIWEMIG', 'B07YBWT3PK', 'B07YBXFF99', 'B004HD55VK', 'B014R4KYMS']

Evaluating TopPop with 5-fold CV:
Evaluating RMSE, MAE of algorithm TopPop on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.3475  3.3470  3.3477  3.3435  3.3437  3.3459  0.0019  
MAE (testset)     3.0909  3.0848  3.0974  3.0869  3.0807  3.0881  0.0057  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.02    0.00    0.00    0.00    0.00    0.01    0.01    

TopPop - Average RMSE: 3.3459
TopPop - Average MAE: 3.0881


In [3]:
# STEP 2: Prepare data for Surprise library

# put data in correct format
reader = Reader(rating_scale=(1, 5))

# Convert to Surprise dataset
data = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)

In [2]:
# STEP 1: Import libraries and load data

import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, KNNBasic, NormalPredictor
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import accuracy
import warnings
warnings.filterwarnings('ignore')

print(" Loading cleaned data")

# Load the cleaned data from Week 6
train_df = pd.read_parquet('cleaned_train.parquet')
test_df = pd.read_parquet('cleaned_test.parquet')
top_items = pd.read_csv('top_items.csv')

print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Top items loaded: {len(top_items)}")

 Loading cleaned data
Training set size: 26580
Test set size: 6645
Top items loaded: 932
