In [1]:
# Import libraries
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic

## File Details

Basic run with SVD without cross validation. This can be the basis for parameter tuning and other stuff later


# Assignment 3 -- Recommendation Systems

* The final challenge is much like Assignment 2 -- but scoped appropriately for the time and your current abilities.
* It is ratings prediction, just like the movielens recommendations we have seen and many other similar problems.
* The features created are based on Beer Reviews from experts on a website.
* Each beer has been scored between 0 and 5 (on a real scale, so 2.75 or 3.5 is OK).
* The official measure is Mean Average Error (MAE) which is pretty intuitive to work with. Everything supports is and it is easy to interpret.
* A set of features have been created based on the reviewer, the written review, and information about the Beer being reviewed.
* Not all features have to be used, and you can easily create new features using the data if you like.
* The features included are:

![title](Images/A3Features.png)

* Sizes of the files are:
|Size | File|
|---|---|
| 1.9G | features.tsv |
| 88B  | header-features.tsv|
| 48B  | header.tsv |
| 15M  | test.tsv |
| 50M  | train.tsv |
| 16M |  val.tsv |


In [2]:
filePrefix = "A3_076_surprise_baselineonly"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
writeSubRunFile = True
seed = databasic.get_random_seed()

In [3]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
#df_train = pd.read_csv(baseDataDir + 'train_wk12.tsv',sep='\t',
df_train = pd.read_csv(baseDataDir + 'train_200k.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

#df_vali = pd.read_csv(baseDataDir + 'val_wk12.tsv',sep='\t',
df_vali = pd.read_csv(baseDataDir + 'vali_200k.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch �r Bock,Rauchbier,4.0
1,27,12300,5634,Rauch �r Bock,Rauchbier,4.5
2,28,12300,3544,Rauch �r Bock,Rauchbier,4.5
3,40,12300,6521,Rauch �r Bock,Rauchbier,4.0
4,43,12300,10177,Rauch �r Bock,Rauchbier,4.5
5,48,12300,2907,Rauch �r Bock,Rauchbier,3.5
6,49,12300,1532,Rauch �r Bock,Rauchbier,4.0
7,50,12300,3452,Rauch �r Bock,Rauchbier,3.5
8,59,12300,6861,Rauch �r Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


Column List: 
RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag

Collab Filtering with Surprise, doesn't use the features at all

In [4]:
# df_features = pd.read_csv(baseDataDir + 'features-top500.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
# df_features = pd.read_csv(baseDataDir + 'features_200k.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
#                                                                  'DayofMonth','Year','TimeOfDay','Gender',
#                                                                  'Birthday','Text','Lemmatized','POS_Tag'])

# df_features.head(10)

In [5]:
idCols = ['RowID','BeerID','ReviewerID']

# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dfTrainFeatures.head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,10635,4.0
1,12300,6547,4.5
2,12300,9789,4.5
3,12300,7372,5.0
4,12300,1302,4.5


In [6]:

reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID',
                                    'rating']],reader)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID',
                                     'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

print(type(dsetTrainFeatures))
print(type(trainsetTrainFeatures))
trainsetTrainFeatures

<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>


<surprise.trainset.Trainset at 0x1288edac6d0>

In [7]:

NA,valset = train_test_split(dsetValiFeatures, test_size=1.0)

# simple Tuning best params: {'bsl_options': }

#algorithm = BaselineOnly()
algorithm = BaselineOnly(bsl_options = {'n_epochs': 8, 'reg_u': 4, 'reg_i': 15})

model = algorithm.fit(trainsetTrainFeatures)
predictions = algorithm.test(valset)


# Score our predictions with MAE
# It is around 0.77, which means the a random guess based on the distribution of the data
# is on average within 0.77 (plus or minus) the true label.
# Not bad! You can beat it though, I'm sure :).
# Smaller MAE is the better. Good luck!
mae = accuracy.mae(predictions,verbose=True)

print("Average MAE: " + str(mae))

Estimating biases using als...
MAE:  0.4262
Average MAE: 0.4262243789863012


In [8]:
print(type(valset))
print(valset[0:10])


<class 'list'>
[(13095, 8313, 5.0), (968, 10638, 5.0), (4374, 3173, 4.0), (6304, 1947, 5.0), (9779, 6322, 4.0), (7909, 10194, 4.5), (196, 2013, 4.0), (172, 9842, 4.5), (3976, 6861, 4.0), (1889, 1042, 4.5)]


In [9]:
print(type(predictions))
print(str(len(predictions)))
print(predictions[0:10])
print(predictions[0])

<class 'list'>
39509
[Prediction(uid=13095, iid=8313, r_ui=5.0, est=3.9478036285927223, details={'was_impossible': False}), Prediction(uid=968, iid=10638, r_ui=5.0, est=3.9144583899124434, details={'was_impossible': False}), Prediction(uid=4374, iid=3173, r_ui=4.0, est=3.8791189850139425, details={'was_impossible': False}), Prediction(uid=6304, iid=1947, r_ui=5.0, est=4.043556715740135, details={'was_impossible': False}), Prediction(uid=9779, iid=6322, r_ui=4.0, est=3.9909264633840564, details={'was_impossible': False}), Prediction(uid=7909, iid=10194, r_ui=4.5, est=3.3884840290625386, details={'was_impossible': False}), Prediction(uid=196, iid=2013, r_ui=4.0, est=3.65936914084648, details={'was_impossible': False}), Prediction(uid=172, iid=9842, r_ui=4.5, est=3.062846541280187, details={'was_impossible': False}), Prediction(uid=3976, iid=6861, r_ui=4.0, est=4.128468202498596, details={'was_impossible': False}), Prediction(uid=1889, iid=1042, r_ui=4.5, est=4.070448570489779, details={'

In [10]:
# Convert the Predictions to a dataframe so we can lookup predictions easy
lstUIds = list(map(lambda x: x.uid, predictions))
lstIIds = list(map(lambda x: x.iid, predictions))
lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
lstRatingEst = list(map(lambda x: x.est, predictions))


# uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })

dfPredictions.head()
# dfPredictions[dfPredictions.uid == 3519]

Unnamed: 0,uid,iid,r_ui,Predict
0,13095,8313,5.0,3.947804
1,968,10638,5.0,3.914458
2,4374,3173,4.0,3.879119
3,6304,1947,5.0,4.043557
4,9779,6322,4.0,3.990926


In [11]:
print(dfValiIds.shape)
print(dfPredictions.shape)

(39509, 3)
(39509, 4)


In [12]:
# join the predictions to the ids, sort by rowid and write to file
dfPredictions = pd.merge(dfValiIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
dfPredictions.head()


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,4.265737
1,27,12300,5634,12300,5634,4.5,4.202083
2,28,12300,3544,12300,3544,4.5,4.371105
3,40,12300,6521,12300,6521,4.0,4.231804
4,43,12300,10177,12300,10177,4.5,4.17668


This basic normal run doesn't give good MAE, so not worth writing out and considering

In [13]:
if writeSubRunFile:
  dfPredictions.sort_values("RowID")[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(subrunDir + filePrefix + "_subrun.csv", index=False)

print("Average MAE: " + str(mae))
print("analyse_maes.append(" + str(mae) + ")")
print(dfPredictions.shape)
dfPredictions.sort_values("RowID").head(8)


Average MAE: 0.4262243789863012
analyse_maes.append(0.4262243789863012)
(39703, 7)


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,4.265737
1,27,12300,5634,12300,5634,4.5,4.202083
2,28,12300,3544,12300,3544,4.5,4.371105
3,40,12300,6521,12300,6521,4.0,4.231804
4,43,12300,10177,12300,10177,4.5,4.17668
5,48,12300,2907,12300,2907,3.5,3.993235
6,49,12300,1532,12300,1532,4.0,4.236586
7,50,12300,3452,12300,3452,3.5,4.116234


In [14]:

print("Run - " + filePrefix)
# Log of Results
analyse_maes = []

print("Average MAE over all tests: " + str(np.mean(analyse_maes)))

# Make sure it's predicting floats
# dfPredicted["Predict"].drop_duplicates()

Run - A3_076_surprise_baselineonly
Average MAE over all tests: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## Results

* Average MAE with simple Baseline Only: 0.42877945168111453
* bsl_options = {'n_epochs': 5, 'reg_u': 8, 'reg_i': 7} MAE : 0.4273068071188259
* bsl_options = {'n_epochs': 8, 'reg_u': 4, 'reg_i': 15} MAE : 0.4262243789863012

