In [None]:
# STEP 4: Generate top-k recommendations for test users

# Section banner: a blank line, then the title framed by two 50-character rules.
print(
    "\n" + "="*50,
    "STEP 4: Generating top-10 recommendations for test users",
    "="*50,
    sep="\n",
)

def get_top_n_recommendations(algo, trainset, user_id, n=10, max_candidates=500):
    """Return up to ``n`` raw item ids ranked by predicted rating for one user.

    Candidates are the trainset items the user has not rated, enumerated in
    inner-id order. That order is fixed for a given trainset, so both the
    candidate cap and the ordering of equally-scored items are reproducible
    from run to run (a ``set``-based pool would not be).

    Args:
        algo: Object exposing ``predict(user_id, item_id)`` whose result has an
            ``est`` attribute (the predicted rating).
        trainset: Object exposing ``n_items``, ``ur``, ``to_inner_uid`` and
            ``to_raw_iid``.
        user_id: Raw user id. A user for whom ``to_inner_uid`` raises
            ``ValueError`` is treated as having rated nothing.
        n: Maximum number of items to return. Zero or negative yields ``[]``;
            ``None`` returns every scored candidate.
        max_candidates: Upper bound on how many candidates are scored (a speed
            limit). ``None`` scores every unrated item.

    Returns:
        List of raw item ids, best predicted rating first. Returns ``[]`` if
        any unexpected error occurs; the error is printed, not raised.
    """
    if n is not None and n <= 0:
        return []

    try:
        # Inner ids of items the user already rated (empty for unknown users)
        try:
            inner_uid = trainset.to_inner_uid(user_id)
            rated_inner_ids = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}
        except ValueError:
            rated_inner_ids = set()

        # Walk inner ids in ascending order so the candidate list is deterministic
        candidates = [
            trainset.to_raw_iid(inner_iid)
            for inner_iid in range(trainset.n_items)
            if inner_iid not in rated_inner_ids
        ]
        if max_candidates is not None:
            candidates = candidates[:max_candidates]

        scored = [
            (item_id, algo.predict(user_id, item_id).est)
            for item_id in candidates
        ]

        # list.sort is stable, so tied scores keep their candidate (inner-id) order
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [item_id for item_id, _ in scored[:n]]
    except Exception as e:
        # Best-effort by design: one failing user must not abort a whole batch
        print(f"Error for user {user_id}: {e}")
        return []

# Get unique test users
test_users = test_df['user_id'].unique()
print(f"Generating recommendations for {len(test_users)} test users...")

# Maps each test user to {model label: top-10 item list}.
# Models are queried in the order knn, svd, top_pop.
user_recommendations = {}

for i, user_id in enumerate(test_users):
    # Progress line on every 100th user, starting with the first
    if not i % 100:
        print(f"Processed {i}/{len(test_users)} users...")

    user_recommendations[user_id] = {
        label: get_top_n_recommendations(model, trainset, user_id, n=10)
        for label, model in (
            ('knn', knn_model),
            ('svd', svd_model),
            ('top_pop', top_pop_model),
        )
    }

print(f"Recommendations generated for all {len(test_users)} users!")


STEP 4: Generating top-10 recommendations for test users
Generating recommendations for 1389 test users...
Processed 0/1389 users...
Processed 100/1389 users...
Processed 200/1389 users...
Processed 300/1389 users...
Processed 400/1389 users...
Processed 500/1389 users...
Processed 600/1389 users...
Processed 700/1389 users...
Processed 800/1389 users...
Processed 900/1389 users...
Processed 1000/1389 users...
Processed 1100/1389 users...
Processed 1200/1389 users...
Processed 1300/1389 users...
Recommendations generated for all 1389 users!


In [32]:
# STEP 3: RMSE Evaluation

def calculate_rmse_fixed(model, test_tuples, model_name=""):
    """Compute RMSE of ``model`` over ``(user, item, true_rating)`` triples.

    A triple is counted as an error, and left out of the RMSE, when
    ``model.predict`` raises or when the returned estimate is ``None`` or NaN.
    A one-line summary of valid/error counts is printed. Returns NaN when no
    triple produced a usable estimate or when the arithmetic itself fails.
    """
    scored_pairs = []  # (true rating, estimate as float)
    failed = 0

    for uid, iid, actual in test_tuples:
        try:
            estimate = model.predict(uid, iid).est
            if estimate is None or np.isnan(estimate):
                failed += 1
                continue
            scored_pairs.append((actual, float(estimate)))
        except Exception:
            failed += 1

    print(f"  {model_name}: {len(scored_pairs)} valid predictions, {failed} errors")

    if not scored_pairs:
        return float('nan')

    # Root of the mean squared difference, computed by hand
    try:
        sq_diffs = [(actual - estimate) ** 2 for actual, estimate in scored_pairs]
        return np.sqrt(np.mean(sq_diffs))
    except Exception as e:
        print(f"  Error in manual RMSE calculation: {e}")
        return float('nan')

# Score each model on the held-out interactions (TopPop, then KNN, then SVD)
print("\nCalculating RMSE with fixed function...")
top_pop_rmse = calculate_rmse_fixed(top_pop_model, test_tuples, model_name="TopPop")
knn_rmse = calculate_rmse_fixed(knn_model, test_tuples, model_name="KNN")
svd_rmse = calculate_rmse_fixed(svd_model, test_tuples, model_name="SVD")

# Results table: header framed by 30-character rules, then one line per model
print("\n" + "-"*30, "RMSE RESULTS", "-"*30, sep="\n")
print(
    f"TopPop RMSE: {top_pop_rmse:.4f}",
    f"KNN RMSE: {knn_rmse:.4f}",
    f"SVD RMSE: {svd_rmse:.4f}",
    sep="\n",
)


Calculating RMSE with fixed function...
  TopPop: 6645 valid predictions, 0 errors
  KNN: 6645 valid predictions, 0 errors
  SVD: 6645 valid predictions, 0 errors

------------------------------
RMSE RESULTS
------------------------------
TopPop RMSE: 3.3354
KNN RMSE: 0.9789
SVD RMSE: 0.9333


In [31]:
# STEP 2: Prepare test data

# Build (user_id, item_id, rating) tuples row by row.
# itertuples(index=False, name=None) yields plain tuples and takes each value
# from its own column. Going through to_numpy() on the three-column frame
# first would coerce every column to one common dtype (e.g. numeric ids
# turned into floats), which would then no longer match the ids the models
# were trained on.
test_tuples = list(
    test_df[['user_id', 'item_id', 'rating']].itertuples(index=False, name=None)
)
print(f"Prepared {len(test_tuples)} test interactions")

# Get unique test users
test_users = test_df['user_id'].unique()
print(f"Found {len(test_users)} unique test users")


Prepared 6645 test interactions
Found 1389 unique test users


In [30]:
# STEP 1: Load saved models and data

def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    # pickle.load can execute code embedded in the file; only point this at
    # artifacts written by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Cleaned train/test splits
train_df = pd.read_parquet('cleaned_train.parquet')
test_df = pd.read_parquet('cleaned_test.parquet')

# Saved models and the trainset they were fitted on
knn_model = _load_pickle('best_knn_model.pkl')
svd_model = _load_pickle('best_svd_model.pkl')
trainset = _load_pickle('trainset.pkl')

# TopPop is rebuilt from the saved popularity ranking instead of being unpickled
print("\nRecreating TopPop model...")
top_items = pd.read_csv('top_items.csv')
top_pop_items = top_items['item_id'].head(10).tolist()
top_pop_model = TopPop(top_pop_items)
top_pop_model.fit(trainset)

print(f"TopPop model recreated with {len(top_pop_items)} popular items")
print(f"Top items: {top_pop_items[:5]}...")

print(
    "\nModels and data loaded successfully!",
    f"Train set size: {len(train_df)}",
    f"Test set size: {len(test_df)}",
    sep="\n",
)


Recreating TopPop model...
TopPop model recreated with 10 popular items
Top items: ['B0086VPUHI', 'B00BN5T30E', 'B07YBXFDYN', 'B00BGA9WK2', 'B007CM0K86']...

Models and data loaded successfully!
Train set size: 26580
Test set size: 6645


In [29]:
# DEFINE THE TOPPOP CLASS

class TopPop(AlgoBase):
    """Popularity baseline built on a fixed list of "top" raw item ids.

    ``estimate`` returns 5.0 for an item whose raw id is in ``top_items`` and
    1.0 for every other item, including items the trainset does not know.
    The user argument is ignored.
    """

    def __init__(self, top_items_list=None):
        """Store the raw ids of the popular items.

        Args:
            top_items_list: Iterable of raw item ids, or ``None`` for an empty
                list. It is copied into a list, so array-like inputs (whose
                truth value is ambiguous) are accepted too.
        """
        AlgoBase.__init__(self)
        self.top_items = list(top_items_list) if top_items_list is not None else []

    def fit(self, trainset):
        """Attach ``trainset`` to the model and return ``self``."""
        # Delegate to the base class so its own fit-time setup runs; it is
        # expected to set self.trainset, which estimate() reads.
        AlgoBase.fit(self, trainset)
        return self

    def estimate(self, u, i):
        """Return 5.0 if inner item id ``i`` maps to a top item, else 1.0."""
        try:
            raw_iid = self.trainset.to_raw_iid(i)
        except ValueError:
            # Item id is not a valid inner id of the trainset: score it as non-popular
            return 1.0
        return 5.0 if raw_iid in self.top_items else 1.0


In [28]:
import pandas as pd
import numpy as np
import pickle
from surprise import AlgoBase, accuracy
import warnings
warnings.filterwarnings('ignore')