In [11]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split as sklearn_tt_split

from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

# Basic Algorithms
from surprise import NormalPredictor
from surprise import BaselineOnly

# KNN algorithms
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import KNNWithMeans
from surprise import KNNBaseline

# Matrix based Algorithms
from surprise import SVD
from surprise import SVDpp #note, pretty slow
from surprise import NMF

# Other
from surprise import SlopeOne
from surprise import CoClustering

## File Details

To start with, this is basically just a copy paste of the surprise normal sample from Week 12, but then rejigged to use the subset of the data. Currently, 200k records. Still just using BeerId and reviewer id as is.

This one also does the inbuilt 5x cross validation which is nice, though for this assignment we have a pre-split train/validation file set that we can just use

Looked at SVD with 5 fold cross validation. takes 20s to fit, but already MAE gone down from 0.73 down to 0.438

# Assignment 3 -- Recommendation Systems

* The final challenge is much like Assignment 2 -- but scoped appropriately for the time and your current abilities.
* It is ratings prediction, just like the movielens recommendations we have seen and many other similar problems.
* The features created are based on Beer Reviews from experts on a website.
* Each beer has been scored between 0 and 5 (on a real scale, so 2.75 or 3.5 is OK).
* The official measure is Mean Average Error (MAE) which is pretty intuitive to work with. Everything supports is and it is easy to interpret.
* A set of features have been created based on the reviewer, the written review, and information about the Beer being reviewed.
* Not all features have to be used, and you can easily create new features using the data if you like.
* The features included are:

![title](Images/A3Features.png)

* Sizes of the files are:
|Size | File|
|---|---|
| 1.9G | features.tsv |
| 88B  | header-features.tsv|
| 48B  | header.tsv |
| 15M  | test.tsv |
| 50M  | train.tsv |
| 16M |  val.tsv |


In [12]:
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"

In [13]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
df_train = pd.read_csv(baseDataDir + 'train_200k.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

# df_vali = pd.read_csv(baseDataDir + 'vali_200k.tsv',sep='\t',
#                          names=['RowID','BeerID','ReviewerID',
#                                   'BeerName','BeerType','rating'])
# df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch �r Bock,Rauchbier,4.0
1,27,12300,5634,Rauch �r Bock,Rauchbier,4.5
2,28,12300,3544,Rauch �r Bock,Rauchbier,4.5
3,40,12300,6521,Rauch �r Bock,Rauchbier,4.0
4,43,12300,10177,Rauch �r Bock,Rauchbier,4.5
5,48,12300,2907,Rauch �r Bock,Rauchbier,3.5
6,49,12300,1532,Rauch �r Bock,Rauchbier,4.0
7,50,12300,3452,Rauch �r Bock,Rauchbier,3.5
8,59,12300,6861,Rauch �r Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


In [14]:
# # RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
# df_features = pd.read_csv(baseDataDir + 'features_200k.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
#                                                                  'DayofMonth','Year','TimeOfDay','Gender',
#                                                                  'Birthday','Text','Lemmatized','POS_Tag'])

# df_features.head(10)

Unnamed: 0,RowID,BrewerID,ABV,DayofWeek,Month,DayofMonth,Year,TimeOfDay,Gender,Birthday,Text,Lemmatized,POS_Tag
0,18,1075,7.4,Mon,Jan,2,2012,15:20:04,Male,unknown,Pours a murky light brown with a 1 inch fizzy ...,pour a murky light brown with a 1 inch fizzy t...,VBZ DT JJ NN JJ IN DT CD NN JJ NN NN WDT VBZ I...
1,19,1075,7.4,Sun,Jan,1,2012,06:46:52,Male,unknown,Faint sudsy head with some with some dissipati...,faint sudsy head with some with some dissipate...,NN JJ NN IN DT IN DT VBG JJ NN . JJ JJ NN . DT...
2,20,1075,7.4,Tue,Nov,29,2011,05:51:44,Male,unknown,A new arrival to the West TN area ... Pours qu...,a new arrival to the West TN area ... pour qui...,"DT JJ NN IN DT NNP NNP NN , VBZ PDT DT NN JJR ..."
3,21,1075,7.4,Sat,Nov,5,2011,22:59:57,Male,unknown,Sampled 10/30/11 - Transferring the notes . A ...,sample 10/30/11 - transfer the note . a ruby p...,VBN CD HYPH VBG DT NNS . DT NN VBP IN DT NN NN...
4,22,1075,7.4,Tue,Nov,1,2011,20:40:21,Male,"Oct 14, 1983",This is my first rauchbier . Pours a burnt amb...,this be my first rauchbier . pour a burn amber...,DT VBZ PRP$ JJ NN . VBZ DT VBN NN . JJ NN . NN...
5,23,1075,7.4,Sat,Oct,29,2011,11:18:34,unknown,unknown,A,a,DT
6,24,1075,7.4,Mon,Oct,17,2011,09:04:02,unknown,unknown,"Pours a mahogany color , rich , with a tan hea...","pour a mahogany color , rich , with a tan head...","VBZ DT NN NN , JJ , IN DT JJ NN . DT NN , VBD ..."
7,25,1075,7.4,Tue,Oct,11,2011,10:41:41,unknown,unknown,Pours light caramel brown with reddish highlig...,pour light caramel brown with reddish highligh...,NNS JJ NN JJ IN JJ NNS . DT JJ JJ NN VBZ RB VB...
8,26,1075,7.4,Tue,Oct,11,2011,10:31:22,Male,"Nov 24, 1974",Poured a slightly cloudy deep amber/red color ...,pour a slightly cloudy deep amber/red color wi...,VBD DT RB JJ JJ VBN NN IN DT JJ NN NN IN NN . ...
9,27,1075,7.4,Mon,Sep,19,2011,14:18:29,Male,"Oct 10, 1988",Big thanks to N2168 for knocking this off my w...,big thanks to n2168 for knock this off my want...,JJ NNS IN NN IN VBG DT RP PRP$ NNS . VBN IN DT...


In [15]:

# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)
# dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dfTrainFeatures.head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,10635,4.0
1,12300,6547,4.5
2,12300,9789,4.5
3,12300,7372,5.0
4,12300,1302,4.5


In [16]:

reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID','rating']],reader)


In [17]:
def runAlgorithmForMae(algorithm, algorithmName):  
  print (algorithmName.upper())
  cv_results = cross_validate(algorithm, dsetTrainFeatures, measures=["MAE"], cv=5, verbose=True)

  print(algorithmName + ": " + str(np.mean(cv_results["test_mae"])))
  print(algorithmName + ": analyse_maes.append(" + str(np.mean(cv_results["test_mae"])) + ")")
  print("--------------------------------------------")




In [18]:

# # Basic Algorithms
# from surprise import NormalPredictor
# from surprise import BaselineOnly

# # KNN algorithms
# from surprise import KNNBasic
# from surprise import KNNWithZScore
# from surprise import KNNWithMeans
# from surprise import KNNBaseline

# # Matrix based Algorithms
# from surprise import SVD
# from surprise import SVDpp #note, pretty slow
# from surprise import NMF

# # Other
# from surprise import SlopeOne
# from surprise import CoClustering

runAlgorithmForMae(NormalPredictor(), "Normal")
runAlgorithmForMae(BaselineOnly(), "BaselineOnly")
runAlgorithmForMae(KNNBasic(), "KNNBasic")
runAlgorithmForMae(KNNWithZScore(), "KNNWithZScore")
runAlgorithmForMae(KNNWithMeans(), "KNNWithMeans")
runAlgorithmForMae(KNNBaseline(), "KNNBaseline")
runAlgorithmForMae(SVD(), "SVD")
runAlgorithmForMae(SVDpp(), "SVDpp")
runAlgorithmForMae(NMF(), "NMF")
runAlgorithmForMae(SlopeOne(), "SlopeOne")
runAlgorithmForMae(CoClustering(), "CoClustering")


NORMAL
Evaluating MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7288  0.7333  0.7298  0.7329  0.7314  0.7313  0.0017  
Fit time          0.16    0.17    0.17    0.17    0.18    0.17    0.01    
Test time         0.22    0.22    0.15    0.15    0.23    0.19    0.04    
Normal: 0.7312632200992769
Normal: analyse_maes.append(0.7312632200992769)
--------------------------------------------
BASELINEONLY
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4345  0.4364  0.4306  0.4331  0.4334  0.4336  0.0019  
Fit time          0.27    0.31    0.30    0.28    0.30    0.29    0.02    
Test time         0.20    0.20    0.10    0.10    0.17    0.15 

# Results Run

NORMAL
Evaluating MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7288  0.7333  0.7298  0.7329  0.7314  0.7313  0.0017  
Fit time          0.16    0.17    0.17    0.17    0.18    0.17    0.01    
Test time         0.22    0.22    0.15    0.15    0.23    0.19    0.04    
Normal: 0.7312632200992769
Normal: analyse_maes.append(0.7312632200992769)
--------------------------------------------
BASELINEONLY
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4345  0.4364  0.4306  0.4331  0.4334  0.4336  0.0019  
Fit time          0.27    0.31    0.30    0.28    0.30    0.29    0.02    
Test time         0.20    0.20    0.10    0.10    0.17    0.15    0.05    
BaselineOnly: 0.43361098912889
BaselineOnly: analyse_maes.append(0.43361098912889)
--------------------------------------------
KNNBASIC
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4856  0.4854  0.4825  0.4867  0.4820  0.4844  0.0018  
Fit time          0.38    0.38    0.42    0.43    0.41    0.40    0.02    
Test time         1.96    1.68    1.83    1.68    1.69    1.77    0.11    
KNNBasic: 0.48443828595202554
KNNBasic: analyse_maes.append(0.48443828595202554)
--------------------------------------------
KNNWITHZSCORE
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4479  0.4494  0.4502  0.4500  0.4468  0.4489  0.0013  
Fit time          0.53    0.53    0.54    0.54    0.54    0.54    0.00    
Test time         2.22    2.14    2.17    2.09    2.09    2.14    0.05    
KNNWithZScore: 0.4488646242836255
KNNWithZScore: analyse_maes.append(0.4488646242836255)
--------------------------------------------
KNNWITHMEANS
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4435  0.4495  0.4474  0.4502  0.4471  0.4475  0.0024  
Fit time          0.42    0.44    0.44    0.45    0.44    0.44    0.01    
Test time         1.96    1.96    1.96    1.87    1.91    1.93    0.04    
KNNWithMeans: 0.44753721529575313
KNNWithMeans: analyse_maes.append(0.44753721529575313)
--------------------------------------------
KNNBASELINE
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4487  0.4490  0.4497  0.4453  0.4465  0.4478  0.0017  
Fit time          0.62    0.65    0.67    0.66    0.66    0.65    0.01    
Test time         2.34    2.32    2.23    2.25    2.25    2.28    0.04    
KNNBaseline: 0.44783256748888844
KNNBaseline: analyse_maes.append(0.44783256748888844)
--------------------------------------------
SVD
Evaluating MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4417  0.4380  0.4341  0.4409  0.4382  0.4386  0.0027  
Fit time          5.09    5.06    5.04    5.05    5.09    5.07    0.02    
Test time         0.22    0.23    0.23    0.23    0.16    0.21    0.03    
SVD: 0.4385932209731469
SVD: analyse_maes.append(0.4385932209731469)
--------------------------------------------
SVDPP
Evaluating MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4359  0.4349  0.4406  0.4359  0.4316  0.4358  0.0029  
Fit time          352.47  355.33  357.96  373.65  360.04  359.89  7.33    
Test time         7.69    7.73    8.27    7.97    7.90    7.91    0.21    
SVDpp: 0.43578014325728337
SVDpp: analyse_maes.append(0.43578014325728337)
--------------------------------------------
NMF
Evaluating MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4805  0.4773  0.4824  0.4905  0.4842  0.4830  0.0044  
Fit time          7.21    7.00    7.10    6.94    6.91    7.03    0.11    
Test time         0.13    0.13    0.13    0.20    0.21    0.16    0.04    
NMF: 0.4829590801112749
NMF: analyse_maes.append(0.4829590801112749)
--------------------------------------------
SLOPEONE
Evaluating MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4616  0.4549  0.4608  0.4602  0.4574  0.4590  0.0025  
Fit time          4.10    3.92    4.07    3.99    4.25    4.07    0.11    
Test time         5.84    5.87    5.92    5.95    6.01    5.92    0.06    
SlopeOne: 0.458970179254116
SlopeOne: analyse_maes.append(0.458970179254116)
--------------------------------------------
COCLUSTERING
Evaluating MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.5187  0.5190  0.5164  0.5169  0.5173  0.5176  0.0010  
Fit time          3.39    3.40    3.40    3.35    3.40    3.39    0.02    
Test time         0.12    0.12    0.12    0.19    0.19    0.15    0.03    
CoClustering: 0.5176436380464999
CoClustering: analyse_maes.append(0.5176436380464999)
--------------------------------------------
