In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from surprise import Dataset, Reader, SVD, SVDpp
from surprise.model_selection import cross_validate as surprise_cv, GridSearchCV
from auto_surprise.engine import Engine

In [None]:

def load_data(path_train_tbr, path_train_ratings, path_submission=None):
    """Read the TBR, ratings and (optionally) submission CSV files.

    If the ratings file carries a combined ``sid_pid`` column, it is split on
    ``'_'`` into integer ``sid`` and ``pid`` columns and the frame is reduced
    to exactly ``['sid', 'pid', 'rating']``. Otherwise the ratings frame is
    returned as read.

    Parameters
    ----------
    path_train_tbr : path to the TBR CSV, read unchanged.
    path_train_ratings : path to the ratings CSV.
    path_submission : optional path to a submission CSV; skipped when falsy.

    Returns
    -------
    tuple
        ``(tbr, ratings, submission)`` where ``submission`` is ``None`` when
        no submission path was given.
    """
    wishlist_df = pd.read_csv(path_train_tbr)
    ratings_df = pd.read_csv(path_train_ratings)

    has_combined_key = 'sid_pid' in ratings_df.columns
    if has_combined_key:
        # "<sid>_<pid>" -> two integer columns, assigned positionally.
        id_parts = ratings_df['sid_pid'].str.split('_', expand=True)
        ratings_df[['sid', 'pid']] = id_parts.astype(int)
        ratings_df = ratings_df[['sid', 'pid', 'rating']]

    submission_df = None
    if path_submission:
        submission_df = pd.read_csv(path_submission)
    return wishlist_df, ratings_df, submission_df

def integrate_wishlist(ratings, tbr):
    """Append TBR (wishlist) pairs to the ratings with an imputed rating.

    Every row of ``tbr`` whose ``(sid, pid)`` pair does not already occur in
    ``ratings`` is given a rating equal to the mean rating of that ``pid`` in
    ``ratings``; items with no rating at all fall back to the global mean of
    ``ratings['rating']``. The imputed rows are concatenated after the
    original ratings and the index is renumbered.

    Parameters
    ----------
    ratings : DataFrame with columns ``sid``, ``pid`` and ``rating``.
    tbr : DataFrame with at least the columns ``sid`` and ``pid``.

    Returns
    -------
    DataFrame
        ``ratings`` followed by the new rows (``sid``, ``pid``, ``rating``).
    """
    mean_by_item = ratings.groupby('pid')['rating'].mean()
    overall_mean = ratings['rating'].mean()

    # Compare the two frames on the composite (sid, pid) key.
    wishlist_pairs = tbr.set_index(['sid', 'pid']).index
    rated_pairs = ratings.set_index(['sid', 'pid']).index
    not_yet_rated = ~wishlist_pairs.isin(rated_pairs)

    additions = tbr.loc[not_yet_rated].copy()
    imputed = additions['pid'].map(mean_by_item).fillna(overall_mean)
    additions['rating'] = imputed

    combined = [ratings, additions[['sid', 'pid', 'rating']]]
    return pd.concat(combined, ignore_index=True)

def rmse(y_true, y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def svdpp_cv(ratings, n_splits=5, n_factors=50, lr=0.005, reg=0.02, n_epochs=20):
    """Cross-validate an SVD++ model and summarise its per-fold RMSE.

    Parameters
    ----------
    ratings : DataFrame with columns ``sid``, ``pid`` and ``rating``; the
        rating scale is taken from the observed min and max of ``rating``.
    n_splits : forwarded to Surprise's ``cross_validate`` as ``cv``.
    n_factors, n_epochs : forwarded to ``SVDpp`` under the same names.
    lr, reg : forwarded to ``SVDpp`` as ``lr_all`` and ``reg_all``.

    Returns
    -------
    tuple
        ``(mean, std)`` of the ``test_rmse`` values reported per fold.
    """
    scale = (ratings.rating.min(), ratings.rating.max())
    dataset = Dataset.load_from_df(
        ratings[['sid', 'pid', 'rating']],
        Reader(rating_scale=scale),
    )
    model = SVDpp(n_factors=n_factors, lr_all=lr, reg_all=reg, n_epochs=n_epochs)
    cv_out = surprise_cv(model, dataset, measures=['RMSE'], cv=n_splits, verbose=False)
    fold_rmse = cv_out['test_rmse']
    return np.mean(fold_rmse), np.std(fold_rmse)

def tune_svdpp_params(ratings, algorithms=('svd', 'svdpp'),
                      cpu_time_limit=60 * 60 * 11, max_evals=100):
    """Search for the best algorithm and hyperparameters with Auto-Surprise.

    The search budget and the candidate algorithms used to be hard-coded;
    they are now keyword arguments whose defaults reproduce the previous
    behaviour, so a quick smoke run no longer requires editing the function.

    Parameters
    ----------
    ratings : DataFrame with columns ``sid``, ``pid`` and ``rating``; the
        rating scale is taken from the observed min and max of ``rating``.
    algorithms : iterable of algorithm names handed to ``Engine``
        (default: ``'svd'`` and ``'svdpp'``).
    cpu_time_limit : forwarded to ``Engine.train`` as ``cpu_time_limit``
        (default ``60 * 60 * 11``; presumably seconds - confirm against the
        Auto-Surprise documentation).
    max_evals : forwarded to ``Engine.train`` as ``max_evals``.

    Returns
    -------
    tuple
        ``(best_algo, best_params, best_score)`` as returned by
        ``Engine.train``; the fourth returned value is discarded.
    """
    reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
    data = Dataset.load_from_df(ratings[['sid', 'pid', 'rating']], reader)
    # Engine previously received a list, so any iterable is normalised to one.
    engine = Engine(verbose=True, algorithms=list(algorithms))
    best_algo, best_params, best_score, _tasks = engine.train(
        data=data,
        target_metric='test_rmse',
        cpu_time_limit=cpu_time_limit,
        max_evals=max_evals,
    )
    return best_algo, best_params, best_score
    


In [39]:
# Load the raw tables (no submission file is requested, so the third value is
# None and is discarded), then extend the ratings with the TBR pairs.
tbr, ratings, _ = load_data(
    path_train_tbr='data/train_tbr.csv',
    path_train_ratings='data/train_ratings.csv',
)
ext = integrate_wishlist(ratings=ratings, tbr=tbr)

In [None]:
# Tune on a fixed-seed subsample to keep the search tractable. The sample size
# is capped at len(ext): DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling is without replacement by default).
sample_ext = ext.sample(n=min(300000, len(ext)), random_state=42)

In [43]:
# Run the Auto-Surprise search on the subsample. Despite the message, the
# search inside tune_svdpp_params covers both 'svd' and 'svdpp', so the
# returned best_algo may be either. With the function's defaults the CPU-time
# limit is 60 * 60 * 11 and max_evals is 100, so this cell is very long-running.
print("Tuning SVD++ hyperparameters...")
best_algo, best_params, best_score = tune_svdpp_params(sample_ext)

Tuning SVD++ hyperparameters...


Evaluating RMSE, MAE, MSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2262  1.2297  1.2256  1.2247  1.2210  1.2255  0.0028  
MAE (testset)     0.9706  0.9756  0.9712  0.9697  0.9679  0.9710  0.0026  
MSE (testset)     1.5036  1.5123  1.5021  1.4998  1.4909  1.5017  0.0068  
Fit time          0.23    0.30    0.30    0.29    0.30    0.28    0.03    
Test time         0.11    0.11    0.21    0.21    0.21    0.17    0.05    


Starting process with svd algorithm
Starting process with svdpp algorithm
Evaluating RMSE, MAE, MSE of algorithm SVD on 5 split(s).                                                               
                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std                                                   
RMSE (testset)    1.0367  1.0375  1.0454  1.0453  1.0392  1.0408  0.0038  
MAE (testset)     0.7937  0.7938  0.8013  0.8004  0.7966  0.7971  0.0032  
MSE (testset)     1.0748  1.0763  1.0929  1.0927  1.0799  1.0833  0.0079  
Fit time          5.12    5.01    5.14    5.14    5.13    5.11    0.05    
Test time         0.32    0.27    0.17    0.28    0.27    0.26    0.05    
Evaluating RMSE, MAE, MSE of algorithm SVD on 5 split(s).                                                               
                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std                                                   
RMSE (testset)    0.8921  0.9058  0.8997  0.8937  0.8950  0.8972  0

job exception: This job has timed out. The results will still be used



 87%|████████████████████████████████████▌     | 87/100 [10:59:59<1:38:37, 455.17s/trial, best loss: 0.8116688533798818]


In [None]:
# Show the winning algorithm, its hyperparameters and its score as returned
# by tune_svdpp_params.
print(best_algo, best_params, best_score)

In [None]:
print("Running with best SVD++ settings:")
# The previous call, svdpp_cv(ext, best_params), passed the hyperparameter
# dict positionally, so it was bound to `n_splits` and forwarded to Surprise's
# cross_validate as `cv`, which only accepts an int or a splitter object.
# svdpp_cv's own keywords (lr, reg) cannot express the tuned settings either,
# so the winning model is rebuilt here directly from the tuner's result.
# NOTE(review): assumes best_algo is one of the names requested from the
# Engine ('svd' or 'svdpp') and that best_params holds constructor keyword
# arguments for that algorithm - confirm against Auto-Surprise.
best_algo_cls = SVD if str(best_algo).lower() == 'svd' else SVDpp
ext_reader = Reader(rating_scale=(ext.rating.min(), ext.rating.max()))
ext_data = Dataset.load_from_df(ext[['sid', 'pid', 'rating']], ext_reader)
best_cv = surprise_cv(
    best_algo_cls(**best_params),
    ext_data,
    measures=['RMSE'],
    cv=5,
    verbose=False,
)
mean_rmse, std_rmse = np.mean(best_cv['test_rmse']), np.std(best_cv['test_rmse'])

In [None]:
# Report the mean and standard deviation of the cross-validated RMSE computed
# in the previous cell, formatted to four decimal places.
print(f"SVD++ (optimized): RMSE = {mean_rmse:.4f} +/- {std_rmse:.4f}")