In [5]:
# STEP 3: Implement TopPop Recommender

# TopPop = recommend the items with the most high ratings (>= 3).
# The first 10 rows of `top_items` are used as the popular set; this presumes
# the table is already ordered by popularity -- TODO confirm where it is built.
top_pop_items = top_items['item_id'].head(10).tolist()
print(f"Top-10 popular items: {top_pop_items}")

# For Surprise we need to create a custom algorithm
from surprise import AlgoBase

class TopPop(AlgoBase):
    """Non-personalised popularity baseline for Surprise.

    Every (user, item) pair is scored with one of two constants: ``5.0`` when
    the item belongs to the supplied popularity list and ``1.0`` otherwise.
    The user is ignored.

    Args:
        top_items_list: Iterable of raw item ids regarded as "popular".

    Attributes:
        top_items: The raw item ids exactly as passed to the constructor.
    """

    def __init__(self, top_items_list):
        AlgoBase.__init__(self)
        self.top_items = top_items_list
        # Inner ids of the popular items; rebuilt on every fit() call so that
        # each cross-validation fold uses its own raw-id -> inner-id mapping.
        self._top_inner_ids = frozenset()

    def fit(self, trainset):
        """Bind the algorithm to ``trainset`` and resolve the popular items.

        Args:
            trainset: The Surprise ``Trainset`` to fit on.

        Returns:
            ``self``, as Surprise expects from ``fit``.
        """
        # Surprise requires custom algorithms to run the base-class fit first;
        # it stores the trainset on the instance.
        AlgoBase.fit(self, trainset)

        resolved = set()
        for raw_iid in self.top_items:
            try:
                resolved.add(trainset.to_inner_iid(raw_iid))
            except ValueError:
                # This popular item has no rating in the current training
                # fold, so it has no inner id to match against.
                continue
        self._top_inner_ids = frozenset(resolved)
        return self

    def estimate(self, u, i):
        """Return the constant score for inner item id ``i``.

        Args:
            u: Inner user id (unused; the baseline is not personalised).
            i: Inner item id. Ids that are not valid inner ids of the trainset
                simply fail the membership test and receive the low score,
                instead of raising ``ValueError`` from a reverse id lookup.

        Returns:
            ``5.0`` if the item is in the popular list, else ``1.0``.
        """
        # If item is in top popular list, predict high rating (5)
        # Otherwise predict low rating (1)
        return 5.0 if i in self._top_inner_ids else 1.0

# Score TopPop with 5-fold cross-validation on the Surprise dataset.
print("\nEvaluating TopPop with 5-fold CV:")
top_pop_algo = TopPop(top_pop_items)
cv_results = cross_validate(
    top_pop_algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

# Summarise: average each per-fold error array over the folds.
print(f"\nTopPop - Average RMSE: {np.mean(cv_results['test_rmse']):.4f}")
print(f"TopPop - Average MAE: {np.mean(cv_results['test_mae']):.4f}")

Top-10 popular items: ['B0086VPUHI', 'B00BN5T30E', 'B07YBXFDYN', 'B00BGA9WK2', 'B007CM0K86', 'B00KIWEMIG', 'B07YBWT3PK', 'B07YBXFF99', 'B004HD55VK', 'B014R4KYMS']

Evaluating TopPop with 5-fold CV:
Evaluating RMSE, MAE of algorithm TopPop on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.3475  3.3470  3.3477  3.3435  3.3437  3.3459  0.0019  
MAE (testset)     3.0909  3.0848  3.0974  3.0869  3.0807  3.0881  0.0057  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.02    0.00    0.00    0.00    0.00    0.01    0.01    

TopPop - Average RMSE: 3.3459
TopPop - Average MAE: 3.0881


In [3]:
# STEP 2: Prepare data for Surprise library

# Declare the rating scale (1 to 5) that Surprise should assume.
reader = Reader(rating_scale=(1, 5))

# Build the Surprise dataset from the training frame, passing the columns
# in user / item / rating order.
data = Dataset.load_from_df(
    train_df.loc[:, ['user_id', 'item_id', 'rating']],
    reader,
)

In [2]:
# STEP 1: Import libraries and load data

import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, KNNBasic, NormalPredictor
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import accuracy
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

print(" Loading cleaned data")

# Week 6 artifacts: the two cleaned rating splits (parquet), read in
# train-then-test order, followed by the item table (CSV).
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('cleaned_train.parquet', 'cleaned_test.parquet')
)
top_items = pd.read_csv('top_items.csv')

# Row counts of each loaded table.
print(f"Training set size: {train_df.shape[0]}")
print(f"Test set size: {test_df.shape[0]}")
print(f"Top items loaded: {top_items.shape[0]}")

 Loading cleaned data
Training set size: 26580
Test set size: 6645
Top items loaded: 932
