In [77]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split as sklearn_tt_split

from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

from surprise import NormalPredictor
from surprise import SVD
from surprise import KNNBasic
from surprise import KNNWithZScore

## File Details

To start with, this is basically just a copy paste of the surprise normal sample from Week 12, but then rejigged to use the subset of the data. Currently, 200k records. Still just using BeerId and reviewer id as is.

This one also does the inbuilt 5x cross validation which is nice, though for this assignment we have a pre-split train/validation file set that we can just use

Looked at SVD with 5 fold cross validation. takes 20s to fit, but already MAE gone down from 0.73 down to 0.438

# Assignment 3 -- Recommendation Systems

* The final challenge is much like Assignment 2 -- but scoped appropriately for the time and your current abilities.
* It is ratings prediction, just like the movielens recommendations we have seen and many other similar problems.
* The features created are based on Beer Reviews from experts on a website.
* Each beer has been scored between 0 and 5 (on a real scale, so 2.75 or 3.5 is OK).
* The official measure is Mean Average Error (MAE) which is pretty intuitive to work with. Everything supports is and it is easy to interpret.
* A set of features have been created based on the reviewer, the written review, and information about the Beer being reviewed.
* Not all features have to be used, and you can easily create new features using the data if you like.
* The features included are:

![title](Images/A3Features.png)

* Sizes of the files are:
|Size | File|
|---|---|
| 1.9G | features.tsv |
| 88B  | header-features.tsv|
| 48B  | header.tsv |
| 15M  | test.tsv |
| 50M  | train.tsv |
| 16M |  val.tsv |


In [78]:
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"

In [79]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
df_train = pd.read_csv(baseDataDir + 'train_200k.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

df_vali = pd.read_csv(baseDataDir + 'vali_200k.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch �r Bock,Rauchbier,4.0
1,27,12300,5634,Rauch �r Bock,Rauchbier,4.5
2,28,12300,3544,Rauch �r Bock,Rauchbier,4.5
3,40,12300,6521,Rauch �r Bock,Rauchbier,4.0
4,43,12300,10177,Rauch �r Bock,Rauchbier,4.5
5,48,12300,2907,Rauch �r Bock,Rauchbier,3.5
6,49,12300,1532,Rauch �r Bock,Rauchbier,4.0
7,50,12300,3452,Rauch �r Bock,Rauchbier,3.5
8,59,12300,6861,Rauch �r Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


In [80]:
# RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
df_features = pd.read_csv(baseDataDir + 'features_200k.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
                                                                 'DayofMonth','Year','TimeOfDay','Gender',
                                                                 'Birthday','Text','Lemmatized','POS_Tag'])

df_features.head(10)

Unnamed: 0,RowID,BrewerID,ABV,DayofWeek,Month,DayofMonth,Year,TimeOfDay,Gender,Birthday,Text,Lemmatized,POS_Tag
0,18,1075,7.4,Mon,Jan,2,2012,15:20:04,Male,unknown,Pours a murky light brown with a 1 inch fizzy ...,pour a murky light brown with a 1 inch fizzy t...,VBZ DT JJ NN JJ IN DT CD NN JJ NN NN WDT VBZ I...
1,19,1075,7.4,Sun,Jan,1,2012,06:46:52,Male,unknown,Faint sudsy head with some with some dissipati...,faint sudsy head with some with some dissipate...,NN JJ NN IN DT IN DT VBG JJ NN . JJ JJ NN . DT...
2,20,1075,7.4,Tue,Nov,29,2011,05:51:44,Male,unknown,A new arrival to the West TN area ... Pours qu...,a new arrival to the West TN area ... pour qui...,"DT JJ NN IN DT NNP NNP NN , VBZ PDT DT NN JJR ..."
3,21,1075,7.4,Sat,Nov,5,2011,22:59:57,Male,unknown,Sampled 10/30/11 - Transferring the notes . A ...,sample 10/30/11 - transfer the note . a ruby p...,VBN CD HYPH VBG DT NNS . DT NN VBP IN DT NN NN...
4,22,1075,7.4,Tue,Nov,1,2011,20:40:21,Male,"Oct 14, 1983",This is my first rauchbier . Pours a burnt amb...,this be my first rauchbier . pour a burn amber...,DT VBZ PRP$ JJ NN . VBZ DT VBN NN . JJ NN . NN...
5,23,1075,7.4,Sat,Oct,29,2011,11:18:34,unknown,unknown,A,a,DT
6,24,1075,7.4,Mon,Oct,17,2011,09:04:02,unknown,unknown,"Pours a mahogany color , rich , with a tan hea...","pour a mahogany color , rich , with a tan head...","VBZ DT NN NN , JJ , IN DT JJ NN . DT NN , VBD ..."
7,25,1075,7.4,Tue,Oct,11,2011,10:41:41,unknown,unknown,Pours light caramel brown with reddish highlig...,pour light caramel brown with reddish highligh...,NNS JJ NN JJ IN JJ NNS . DT JJ JJ NN VBZ RB VB...
8,26,1075,7.4,Tue,Oct,11,2011,10:31:22,Male,"Nov 24, 1974",Poured a slightly cloudy deep amber/red color ...,pour a slightly cloudy deep amber/red color wi...,VBD DT RB JJ JJ VBN NN IN DT JJ NN NN IN NN . ...
9,27,1075,7.4,Mon,Sep,19,2011,14:18:29,Male,"Oct 10, 1988",Big thanks to N2168 for knocking this off my w...,big thanks to n2168 for knock this off my want...,JJ NNS IN NN IN VBG DT RP PRP$ NNS . VBN IN DT...


In [81]:

# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dfTrainFeatures.head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,10635,4.0
1,12300,6547,4.5
2,12300,9789,4.5
3,12300,7372,5.0
4,12300,1302,4.5


In [82]:

reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID',
                                    'rating']],reader)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID',
                                     'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

print(type(dsetTrainFeatures))
print(type(trainsetTrainFeatures))
trainsetTrainFeatures

<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>


<surprise.trainset.Trainset at 0x18ebc1b8310>

In [83]:

NA,valset = train_test_split(dsetValiFeatures, test_size=1.0)


In [84]:

# The "Random" Prediction Algorithm from Surprise. See the API for a complete description.
# The Algorithm predicts a random rating based on the distribution of the training set, 
# which is assumed to be normal.
random = NormalPredictor()
model = random.fit(trainsetTrainFeatures)
predictions = random.test(valset)

# Score our predictions with MAE
# It is around 0.77, which means the a random guess based on the distribution of the data
# is on average within 0.77 (plus or minus) the true label.
# Not bad! You can beat it though, I'm sure :).
# Smaller MAE is the better. Good luck!
accuracy.mae(predictions,verbose=True)

MAE:  0.7330


0.7330419414094556

In [85]:
dsetTrainFeatures2 = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID','rating']],reader)

In [86]:
print(type(dsetTrainFeatures2.raw_ratings))
print(str(len((dsetTrainFeatures2.raw_ratings))))
print(dsetTrainFeatures2.raw_ratings[0:50])

<class 'list'>
120564
[(12300, 10635, 4.0, None), (12300, 6547, 4.5, None), (12300, 9789, 4.5, None), (12300, 7372, 5.0, None), (12300, 1302, 4.5, None), (12300, 704, 4.5, None), (12300, 1747, 5.0, None), (12300, 9368, 4.5, None), (12300, 2568, 4.0, None), (12300, 6838, 4.0, None), (12300, 850, 3.0, None), (12300, 9705, 4.0, None), (12300, 3264, 3.5, None), (12300, 2962, 4.5, None), (12300, 2748, 5.0, None), (12300, 757, 3.5, None), (12300, 7207, 4.0, None), (12300, 2849, 4.5, None), (12300, 4737, 4.5, None), (12300, 7826, 4.0, None), (12300, 8720, 4.0, None), (12300, 10680, 4.0, None), (12300, 4094, 4.0, None), (6699, 7162, 4.0, None), (6699, 10135, 2.5, None), (6699, 2465, 4.0, None), (6699, 592, 4.5, None), (6699, 447, 3.5, None), (6699, 4527, 4.0, None), (6699, 10650, 4.0, None), (6699, 10305, 4.0, None), (6699, 837, 3.5, None), (6699, 1082, 4.0, None), (6699, 3158, 4.0, None), (6699, 6718, 3.5, None), (6699, 2849, 4.5, None), (6699, 6872, 5.0, None), (6699, 6330, 5.0, None), (6699

In [87]:

cv_results = cross_validate(algorithm_random, dsetTrainFeatures2, measures=["MAE"], cv=5, verbose=True)

print("Average MAE across 5x KFolds: " + str(np.mean(cv_results["test_mae"])))
print("analyse_maes.append(" + str(np.mean(cv_results["test_mae"])) + ")")

Evaluating MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7255  0.7300  0.7321  0.7281  0.7294  0.7290  0.0022  
Fit time          0.15    0.15    0.15    0.15    0.15    0.15    0.00    
Test time         0.27    0.12    0.27    0.12    0.29    0.21    0.08    
Average MAE across 5x KFolds: 0.7290349930838393
analyse_maes.append(0.7290349930838393)


In [61]:

print("Run 1 - Normal with CV")
# Log of Results
analyse_maes = []

analyse_maes.append(0.7290257148745465)
analyse_maes.append(0.7285347965810015)
analyse_maes.append(0.7313037173081197)

print("Average MAE over all tests: " + str(np.mean(analyse_maes)))

Run 1 - Normal with CV
Average MAE over all tests: 0.728780255727774


In [62]:
algorithm_svd = SVD()

svd_results = cross_validate(algorithm_svd, dsetTrainFeatures2, measures=["MAE"], cv=5, verbose=True)

print("Average MAE across 5x KFolds: " + str(np.mean(svd_results["test_mae"])))
print("analyse_maes.append(" + str(np.mean(svd_results["test_mae"])) + ")")


Evaluating MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4395  0.4370  0.4398  0.4394  0.4413  0.4394  0.0014  
Fit time          3.92    3.95    3.99    3.95    3.95    3.95    0.02    
Test time         0.13    0.13    0.13    0.13    0.26    0.16    0.05    
Average MAE across 5x KFolds: 0.43937855284245797
analyse_maes.append(0.43937855284245797)


In [63]:

print("Run 2")
# Log of Results
analyse_maes = []

analyse_maes.append(0.4384146426377059)
analyse_maes.append(0.4387265105896204)
analyse_maes.append(0.43937855284245797)

print("Average MAE over all tests: " + str(np.mean(analyse_maes)))

Run 2
Average MAE over all tests: 0.43857057661366317


In [68]:
sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between items
               }

algorithm_knn = KNNBasic(sim_options=sim_options)


svd_results = cross_validate(algorithm_knn, dsetTrainFeatures2, measures=["MAE"], cv=5, verbose=True)

print("Average MAE across 5x KFolds: " + str(np.mean(svd_results["test_mae"])))
print("analyse_maes.append(" + str(np.mean(svd_results["test_mae"])) + ")")


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4516  0.4493  0.4466  0.4491  0.4483  0.4490  0.0016  
Fit time          14.03   13.75   13.68   13.51   13.49   13.69   0.20    
Test time         5.32    5.35    5.52    5.31    5.49    5.40    0.09    
Average MAE across 5x KFolds: 0.44898425294985894
analyse_maes.append(0.44898425294985894)


In [71]:
sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between items
               }

algorithm_knnz = KNNWithZScore(sim_options=sim_options)


svd_results = cross_validate(algorithm_knnz, dsetTrainFeatures2, measures=["MAE"], cv=5, verbose=True)

print("Average MAE across 5x KFolds: " + str(np.mean(svd_results["test_mae"])))
print("analyse_maes.append(" + str(np.mean(svd_results["test_mae"])) + ")")

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.4688  0.4668  0.4629  0.4627  0.4674  0.4657  0.0025  
Fit time          13.83   14.56   13.58   14.25   13.68   13.98   0.37    
Test time         5.62    5.78    5.74    5.78    5.69    5.72    0.06    
Average MAE across 5x KFolds: 0.4657190522257462
analyse_maes.append(0.4657190522257462)
