# Performance Comparison with Surprise

In [1]:
from resype.preprocessing import *
from resype.content_based import ContentBasedModel
from sklearn.ensemble import RandomForestRegressor
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' resources (a no-op when they are
# already present locally).
# NOTE(review): presumably required by resype's preprocessing helpers — confirm.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/phd/gchua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/phd/gchua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading the Data

In [2]:
# Build the user and item feature tables with Resype's preprocessing helpers.
# The transaction list returned here is for example only; it is replaced by
# the ratings CSV loaded right after.
transaction_list, user_df = create_user_feature()
item_df = create_item_feature(num_features=300)
transaction_list = pd.read_csv("sample_data/ratings_5k.csv")

# Ids used to select (and order) the rows of the feature tables.
unique_users = transaction_list['userId'].unique()
# NOTE(review): these ids are taken from item_df itself, so the item selection
# below keeps every item, whereas the user selection is driven by
# transaction_list — confirm whether transaction_list['movieId'] was intended.
unique_items = item_df['movieId'].unique()

# Index by id, pick the wanted rows, then turn the id back into a column.
item_df = item_df.set_index('movieId').loc[unique_items, :].reset_index()
user_df = user_df.set_index('userId').loc[unique_users, :].reset_index()

## Loading Resype's Content-based Recommender

In [3]:
# Set up Resype's content-based model from the user features, item features
# and ratings, telling it which columns hold the ids, the target and the time.
cb = ContentBasedModel(
    user_df,
    item_df,
    transaction_list,
    item_id_name='movieId',
    user_id_name='userId',
    target_name='rating',
    timestamp_name='timestamp',
)
cb.split_train_test(train_ratio=0.7)

# Fixed random_state so the random forest is repeatable between runs.
model = RandomForestRegressor(random_state=202109)
cb.fit_ml_cb(model)
preds_array = cb.reco_ml_cb_tt()  # To make predictions as an array
resype_mse, resype_mae = cb.evaluate_test_set()

# Write both splits to disk; train_set.csv is read back when the Surprise
# models are fitted, so both libraries learn from the same ratings.
export_cols = ['userId', 'movieId', 'rating', 'timestamp']
for split_df, csv_path in ((cb.df_train, 'train_set.csv'),
                           (cb.df_test, 'test_set.csv')):
    split_df[export_cols].to_csv(csv_path, index=False)

proceed


## Loading Surprise

In [4]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Surprise needs the rating scale to interpret the raw ratings.
reader = Reader(rating_scale=(1, 5))

# Load the training split exported by Resype so both libraries are fitted on
# exactly the same ratings. Surprise expects (user, item, rating) columns only.
train_ratings = pd.read_csv('train_set.csv').drop(columns='timestamp')
data = Dataset.load_from_df(train_ratings, reader=reader)

# Train on every rating in train_set.csv. The previous
# train_test_split(data, test_size=0.000001) call was only a way to obtain a
# trainset, but it silently held out a randomly chosen rating; the evaluation
# set is Resype's cb.df_test, so no Surprise-side test split is needed.
trainset = data.build_full_trainset()

# Same seed as the Resype random forest so the baseline is repeatable.
algo = SVD(random_state=202109)
algo.fit(trainset)

# Score the SVD baseline on Resype's held-out test ratings.
y_true = cb.df_test['rating'].values
predictions = [
    algo.predict(user_id, movie_id).est
    for user_id, movie_id in zip(cb.df_test['userId'], cb.df_test['movieId'])
]
# scikit-learn metrics take (y_true, y_pred).
svd_mae = mean_absolute_error(y_true, predictions)
svd_mse = mean_squared_error(y_true, predictions)

In [5]:
from surprise import (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                      KNNWithZScore, KNNBaseline, NMF, SlopeOne, CoClustering)
import pandas as pd
import numpy as np



# Surprise prediction algorithms to benchmark; algo_names[i] is the label
# reported for algos[i], so the two lists must stay in the same order.
algos = [NormalPredictor(), BaselineOnly(), KNNBasic(), KNNWithMeans(),
         KNNWithZScore(), KNNBaseline(), SVD(), NMF(), SlopeOne(),
         CoClustering()]
algo_names = ["NormalPredictor", "BaselineOnly", "KNNBasic", "KNNWithMeans",
              "KNNWithZScore", "KNNBaseline", "SVD", "NMF", "SlopeOne",
              "CoClustering"]

# One slot per algorithm, filled in by the loop below.
rmses = np.zeros(len(algos))
mses = np.zeros(len(algos))
maes = np.zeros(len(algos))

# The evaluation data is the same for every algorithm, so extract it once.
y_true = cb.df_test['rating'].values
test_pairs = list(zip(cb.df_test['userId'], cb.df_test['movieId']))

for i, algo in enumerate(algos):
    # Fit on the shared training set, then predict Resype's test ratings.
    algo.fit(trainset)
    predictions = [algo.predict(user_id, movie_id).est
                   for user_id, movie_id in test_pairs]
    # scikit-learn metrics take (y_true, y_pred).
    mae = mean_absolute_error(y_true, predictions)
    mse = mean_squared_error(y_true, predictions)
    mses[i] = mse
    maes[i] = mae
    # Previously left at zero: RMSE is the square root of the MSE.
    rmses[i] = np.sqrt(mse)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  algo.fit(trainset)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  algo.fit(trainset)


## Performance Results

In [6]:
# Summary table: one row per Surprise algorithm, followed by Resype's
# random-forest content-based model as the last row.
cols = ['Prediction Algorithm', 'MSE', 'MAE']
results = {
    'Prediction Algorithm': algo_names + ['Resype (Random Forest)'],
    'MSE': list(mses) + [resype_mse],
    'MAE': list(maes) + [resype_mae],
}
df = pd.DataFrame(results, columns=cols)
df

Unnamed: 0,Prediction Algorithm,MSE,MAE
0,NormalPredictor,2.05108,1.135742
1,BaselineOnly,0.858667,0.735921
2,KNNBasic,1.362782,0.906558
3,KNNWithMeans,1.17348,0.85023
4,KNNWithZScore,1.185011,0.842193
5,KNNBaseline,1.057957,0.796983
6,SVD,0.862225,0.730675
7,NMF,1.36095,0.921752
8,SlopeOne,1.222082,0.869121
9,CoClustering,1.29921,0.900984
