In [2]:
from surprise import SVD, accuracy
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

In [8]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
import numpy as np

# Define the file path
data_path = 'musical_instruments_reviews'
df_data = pd.read_csv(data_path, sep='\t')

# Drop the 'timestamp:float' column as it is not needed anymore
df_data = df_data.drop(columns=['timestamp:float'])

# Read data so that it fits the requirements of surprise
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order)
data = Dataset.load_from_df(df_data[['user_id:token', 'item_id:token', 'rating:float']], reader)

# Test set is made of 25% of the ratings
trainset, testset = train_test_split(data, test_size=0.25)

# Create an instance of the SVD algorithm
algo = SVD()

# Train the algorithm on the trainset
algo.fit(trainset)

# Predict ratings for the testset
predictions = algo.test(testset)

# Calculate RMSE and MAE
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Calculate MRR@10, NDCG@10, and HR@10
def calculate_metrics_at_k(predictions, k=10):
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))
    
    mrr = 0
    ndcg = 0
    hr = 0
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated rating in descending order
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        user_ratings = user_ratings[:k]
        
        # Calculate MRR@10
        for rank, (_, true_r) in enumerate(user_ratings, start=1):
            if true_r >= 4:  # Assuming a relevant rating is 4 or higher
                mrr += 1 / rank
                break
        
        # Calculate NDCG@10
        dcg = sum((true_r >= 4) / np.log2(rank + 1) for rank, (_, true_r) in enumerate(user_ratings, start=1))
        idcg = sum(1 / np.log2(rank + 1) for rank in range(1, min(k, len(user_ratings)) + 1))
        ndcg += dcg / idcg if idcg > 0 else 0
        
        # Calculate HR@10
        hr += any(true_r >= 4 for _, true_r in user_ratings)
    
    mrr /= len(user_est_true)
    ndcg /= len(user_est_true)
    hr /= len(user_est_true)
    
    return mrr, ndcg, hr

mrr, ndcg, hr = calculate_metrics_at_k(predictions, k=10)
print(f'MRR@10: {mrr:.4f}')
print(f'NDCG@10: {ndcg:.4f}')
print(f'HR@10: {hr:.4f}')

RMSE: 1.0619
MAE:  0.7493
MRR@10: 0.8407
NDCG@10: 0.8275
HR@10: 0.8592


In [6]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate

# Define the file path
data_path = 'musical_instruments_reviews'
df_data = pd.read_csv(data_path, sep='\t')

# Drop the 'timestamp:float' column as it is not needed anymore
df_data = df_data.drop(columns=['timestamp:float'])

# Read data so that it fits the requirements of surprise
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order)
data = Dataset.load_from_df(df_data[['user_id:token', 'item_id:token', 'rating:float']], reader)

# Test set is made of 25% of the ratings
trainset, testset = train_test_split(data, test_size=0.25)

# Create an instance of the SVD algorithm
algo = SVD()

# Perform cross-validation
results = cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

# Print the results
print(results)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0199  1.0588  1.0823  1.1232  1.0555  1.0679  0.0341  
MAE (testset)     0.7163  0.7507  0.7600  0.7728  0.7356  0.7471  0.0196  
Fit time          0.06    0.05    0.05    0.05    0.05    0.05    0.01    
Test time         0.01    0.00    0.00    0.00    0.00    0.00    0.00    
{'test_rmse': array([1.0199267 , 1.05875004, 1.08229234, 1.12317493, 1.05549585]), 'test_mae': array([0.71633223, 0.75072519, 0.75996365, 0.77282427, 0.7355886 ]), 'fit_time': (0.06410908699035645, 0.05234360694885254, 0.05190873146057129, 0.050002098083496094, 0.04785585403442383), 'test_time': (0.005278110504150391, 0.00433802604675293, 0.004034996032714844, 0.004123687744140625, 0.003971099853515625)}


In [3]:
data_path = 'musical_instruments_reviews'
df_data = pd.read_csv(data_path, sep='\t')

# drop timestamp, rating_age, and rating_age_days columns as it is not needed anymore
df_data = df_data.drop(columns=['timestamp:float'])


# test set is made of 25% of the ratings.
trainset, testset = train_test_split(df_data, test_size=0.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# reader = Reader(line_format="user_id:token item_id:token timestamp:float rating:float", sep="\t")

# data = Dataset.load_from_file(file_path, reader=reader)
# data = Dataset.load_from_file(data_path, reader=reader)

# trainset, testset = train_test_split(df_training_svd, test_size=0.25)

# read data so that it fits the requirements of suprise
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
trainset = Dataset.load_from_df(trainset[['user_id:token', 'item_id:token', 'rating:float']], reader).build_full_trainset()

AttributeError: 'DataFrame' object has no attribute 'raw_ratings'

In [23]:
# fit model
algo = SVD()
# algo.fit(trainset)

In [24]:
cross_validate(algo, trainset, measures=["RMSE", "MAE"], cv=5, verbose=True)

AttributeError: 'Trainset' object has no attribute 'raw_ratings'

In [26]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

# Define the file path
data_path = 'musical_instruments_reviews'

# Read the CSV file into a DataFrame
df_training_svd = pd.read_csv(data_path, sep='\t')

# Drop the 'timestamp:float' column as it is not needed anymore
df_training_svd = df_training_svd.drop(columns=['timestamp:float'])

# Define the reader with the rating scale
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order)
trainset = Dataset.load_from_df(df_training_svd[['user_id:token', 'item_id:token', 'rating:float']], reader)

# Create an instance of the SVD algorithm
algo = SVD()

# Perform cross-validation
results = cross_validate(algo, trainset, measures=["RMSE", "MAE"], cv=5, verbose=True)

# Print the results
print(results)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0745  1.0872  1.0795  1.0464  1.0319  1.0639  0.0211  
MAE (testset)     0.7501  0.7602  0.7527  0.7383  0.7204  0.7444  0.0139  
Fit time          0.07    0.05    0.05    0.05    0.05    0.06    0.01    
Test time         0.00    0.00    0.00    0.01    0.00    0.00    0.00    
{'test_rmse': array([1.07445898, 1.08719934, 1.07952739, 1.04639543, 1.03187788]), 'test_mae': array([0.75008002, 0.76023609, 0.75274104, 0.73834708, 0.72041545]), 'fit_time': (0.06657624244689941, 0.051113128662109375, 0.05460309982299805, 0.05214500427246094, 0.052848100662231445), 'test_time': (0.004228830337524414, 0.00386810302734375, 0.003763914108276367, 0.0051081180572509766, 0.004650115966796875)}


In [None]:
accuracy