In [8]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
import numpy as np

# Define the file path
data_path = 'musical_instruments_reviews'
df_data = pd.read_csv(data_path, sep='\t')

# Drop the 'timestamp:float' column as it is not needed anymore
df_data = df_data.drop(columns=['timestamp:float'])

# Read data so that it fits the requirements of surprise
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order)
data = Dataset.load_from_df(df_data[['user_id:token', 'item_id:token', 'rating:float']], reader)

# Test set is made of 25% of the ratings
trainset, testset = train_test_split(data, test_size=0.25)

# Create an instance of the SVD algorithm
algo = SVD()

# Train the algorithm on the trainset
algo.fit(trainset)

# Predict ratings for the testset
predictions = algo.test(testset)

# Calculate RMSE and MAE
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Calculate MRR@10, NDCG@10, and HR@10
def calculate_metrics_at_k(predictions, k=10):
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))
    
    mrr = 0
    ndcg = 0
    hr = 0
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated rating in descending order
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        user_ratings = user_ratings[:k]
        
        # Calculate MRR@10
        for rank, (_, true_r) in enumerate(user_ratings, start=1):
            if true_r >= 4:  # Assuming a relevant rating is 4 or higher
                mrr += 1 / rank
                break
        
        # Calculate NDCG@10
        dcg = sum((true_r >= 4) / np.log2(rank + 1) for rank, (_, true_r) in enumerate(user_ratings, start=1))
        idcg = sum(1 / np.log2(rank + 1) for rank in range(1, min(k, len(user_ratings)) + 1))
        ndcg += dcg / idcg if idcg > 0 else 0
        
        # Calculate HR@10
        hr += any(true_r >= 4 for _, true_r in user_ratings)
    
    mrr /= len(user_est_true)
    ndcg /= len(user_est_true)
    hr /= len(user_est_true)
    
    return mrr, ndcg, hr

mrr, ndcg, hr = calculate_metrics_at_k(predictions, k=10)
print(f'MRR@10: {mrr:.4f}')
print(f'NDCG@10: {ndcg:.4f}')
print(f'HR@10: {hr:.4f}')

RMSE: 1.0619
MAE:  0.7493
MRR@10: 0.8407
NDCG@10: 0.8275
HR@10: 0.8592
