In [1]:
import surprise
import os
import pandas as pd
import numpy as np
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.reader import Reader
from surprise import Dataset
from surprise import accuracy
from surprise import BaselineOnly
from surprise.model_selection import train_test_split
from collections import defaultdict
import functions as f

In [2]:
df = pd.read_csv('beer_reviews.csv')

In [3]:
df.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [4]:
breweries = df[['brewery_id','brewery_name']].drop_duplicates()

In [5]:
brewery_dict = breweries.set_index('brewery_id').to_dict()['brewery_name']

In [6]:
beer = df[['beer_beerid','beer_name']].drop_duplicates()

In [7]:
beer_dict = beer.set_index('beer_beerid').to_dict()['beer_name']

In [8]:
surprise_df = df[['review_profilename','beer_beerid','review_overall']]

In [9]:
reader = surprise.Reader(rating_scale=(0,5))

data = Dataset.load_from_df(surprise_df, reader)

In [20]:
baseline = cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6054  0.6066  0.6083  0.6073  0.6072  0.6070  0.0010  
MAE (testset)     0.4533  0.4538  0.4544  0.4544  0.4537  0.4539  0.0004  
Fit time          5.38    5.31    5.59    5.70    5.47    5.49    0.14    
Test time         4.34    4.19    4.26    7.12    4.16    4.81    1.16    


In [21]:
svd = cross_validate(surprise.SVD(), data, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6072  0.6089  0.6069  0.6088  0.6070  0.6077  0.0009  
MAE (testset)     0.4549  0.4553  0.4547  0.4550  0.4542  0.4548  0.0004  
Fit time          97.47   97.57   107.44  97.74   105.92  101.23  4.48    
Test time         5.84    5.72    11.11   5.33    7.24    7.05    2.13    


In [15]:
from surprise import SVD, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.606902,5.221247,4.431237
SVD,0.607842,111.25495,6.599376
KNNBaseline,0.608893,167.119418,223.458283
KNNBasic,0.628321,150.484961,169.329469
KNNWithMeans,0.632695,163.313102,203.128058
KNNWithZScore,0.637511,203.990611,221.000835
CoClustering,0.673719,54.711746,6.077905
NormalPredictor,0.99609,3.298289,5.76675


In [36]:
algo = SVD()
cross_validate(algo, data, measures=['RMSE','MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.6069  0.6071  0.6073  0.6057  0.6028  0.6069  0.6032  0.6070  0.6025  0.6045  0.6054  0.0019  
MAE (testset)     0.4544  0.4535  0.4540  0.4532  0.4510  0.4539  0.4516  0.4533  0.4507  0.4526  0.4528  0.0012  
Fit time          116.20  116.46  113.95  111.29  111.32  114.91  111.95  114.21  110.25  111.46  113.20  2.11    
Test time         2.64    2.21    2.11    3.97    3.42    2.64    2.32    3.62    3.47    2.26    2.87    0.65    


{'test_rmse': array([0.60692667, 0.60711864, 0.60731105, 0.60565722, 0.60279991,
        0.60694482, 0.60316991, 0.60698378, 0.60248231, 0.60454224]),
 'test_mae': array([0.45437835, 0.4535438 , 0.45398415, 0.45315411, 0.45095426,
        0.45394147, 0.45161204, 0.45333785, 0.45069494, 0.45262459]),
 'fit_time': (116.19934582710266,
  116.4635021686554,
  113.9470911026001,
  111.28920388221741,
  111.31957292556763,
  114.91191697120667,
  111.94657325744629,
  114.210618019104,
  110.24592781066895,
  111.45900201797485),
 'test_time': (2.637769937515259,
  2.213810920715332,
  2.108522891998291,
  3.966815948486328,
  3.4205799102783203,
  2.6384470462799072,
  2.323370933532715,
  3.6192991733551025,
  3.4705379009246826,
  2.256937026977539)}

In [11]:
data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = f.precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

NameError: name 'SVD' is not defined

In [44]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = SVD()
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.9384


0.9384491236565119

In [48]:
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [49]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
9795,330,174,5.0,5.0,{'was_impossible': False},115,305,0.0
8139,152,272,5.0,5.0,{'was_impossible': False},84,157,0.0
23176,907,198,5.0,5.0,{'was_impossible': False},110,96,0.0
1138,523,169,5.0,5.0,{'was_impossible': False},82,87,0.0
15583,181,260,1.0,1.0,{'was_impossible': False},312,90,0.0
24151,507,316,5.0,5.0,{'was_impossible': False},42,77,0.0
23983,181,1215,1.0,1.0,{'was_impossible': False},312,19,0.0
10111,181,1165,1.0,1.0,{'was_impossible': False},312,9,0.0
16218,507,313,5.0,5.0,{'was_impossible': False},42,257,0.0
6207,181,758,1.0,1.0,{'was_impossible': False},312,14,0.0


In [50]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
10249,167,169,1.0,4.399153,{'was_impossible': False},48,87,3.399153
2488,887,183,1.0,4.404189,{'was_impossible': False},128,225,3.404189
10295,901,56,1.0,4.425839,{'was_impossible': False},91,295,3.425839
1210,314,56,1.0,4.426705,{'was_impossible': False},175,295,3.426705
6373,253,192,1.0,4.43118,{'was_impossible': False},80,82,3.43118
22160,534,93,1.0,4.472527,{'was_impossible': False},63,78,3.472527
9595,588,98,1.0,4.587681,{'was_impossible': False},167,301,3.587681
743,481,318,1.0,4.647793,{'was_impossible': False},44,234,3.647793
20194,295,183,1.0,4.690554,{'was_impossible': False},151,225,3.690554
2585,405,452,5.0,1.256944,{'was_impossible': False},563,51,3.743056
