In [1]:
# Import libraries
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic

## File Details

For Ensemble Version 1
Run with BaselineOnly on full data set, with basic param tuning


In [2]:
filePrefix = "A3_102_surprise_baselineonly"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
writeSubRunFile = True
seed = databasic.get_random_seed()

In [3]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
#df_train = pd.read_csv(baseDataDir + 'train_wk12.tsv',sep='\t',
df_train = pd.read_csv(baseDataDir + 'train.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

#df_vali = pd.read_csv(baseDataDir + 'val_wk12.tsv',sep='\t',
df_vali = pd.read_csv(baseDataDir + 'val.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch Ür Bock,Rauchbier,4.0
1,27,12300,5634,Rauch Ür Bock,Rauchbier,4.5
2,28,12300,3544,Rauch Ür Bock,Rauchbier,4.5
3,40,12300,6521,Rauch Ür Bock,Rauchbier,4.0
4,43,12300,10177,Rauch Ür Bock,Rauchbier,4.5
5,48,12300,2907,Rauch Ür Bock,Rauchbier,3.5
6,49,12300,1532,Rauch Ür Bock,Rauchbier,4.0
7,50,12300,3452,Rauch Ür Bock,Rauchbier,3.5
8,59,12300,6861,Rauch Ür Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


In [4]:
idCols = ['RowID','BeerID','ReviewerID']

# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dfTrainFeatures.head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,10635,4.0
1,12300,6547,4.5
2,12300,9789,4.5
3,12300,7372,5.0
4,12300,1302,4.5


In [5]:

reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID',
                                    'rating']],reader)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID',
                                     'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

print(type(dsetTrainFeatures))
print(type(trainsetTrainFeatures))
trainsetTrainFeatures

<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>


<surprise.trainset.Trainset at 0x28e87ebd3d0>

In [6]:

NA,valset = train_test_split(dsetValiFeatures, test_size=1.0)

# simple Tuning best params: {'bsl_options': }

#algorithm = BaselineOnly()
algorithm = BaselineOnly(bsl_options = {'n_epochs': 8, 'reg_u': 4, 'reg_i': 15})

model = algorithm.fit(trainsetTrainFeatures)
predictions = algorithm.test(valset)


# Score our predictions with MAE
# It is around 0.77, which means the a random guess based on the distribution of the data
# is on average within 0.77 (plus or minus) the true label.
# Not bad! You can beat it though, I'm sure :).
# Smaller MAE is the better. Good luck!
mae = accuracy.mae(predictions,verbose=True)

print("Average MAE: " + str(mae))

Estimating biases using als...
MAE:  0.4399
Average MAE: 0.439900482465001


In [7]:
print(type(valset))
print(valset[0:10])


<class 'list'>
[(7264, 229, 3.5), (10244, 2430, 4.0), (10280, 33, 3.5), (1148, 9682, 4.0), (1206, 8021, 3.5), (80, 5659, 3.0), (9195, 4496, 4.0), (7641, 8109, 3.5), (4022, 1704, 4.0), (9323, 6560, 2.5)]


In [8]:
print(type(predictions))
print(str(len(predictions)))
print(predictions[0:10])
print(predictions[0])

<class 'list'>
275876
[Prediction(uid=7264, iid=229, r_ui=3.5, est=3.3911483765043413, details={'was_impossible': False}), Prediction(uid=10244, iid=2430, r_ui=4.0, est=3.8113907328608705, details={'was_impossible': False}), Prediction(uid=10280, iid=33, r_ui=3.5, est=4.156424797538301, details={'was_impossible': False}), Prediction(uid=1148, iid=9682, r_ui=4.0, est=3.992169543742586, details={'was_impossible': False}), Prediction(uid=1206, iid=8021, r_ui=3.5, est=4.0403981672292275, details={'was_impossible': False}), Prediction(uid=80, iid=5659, r_ui=3.0, est=3.2894239864995836, details={'was_impossible': False}), Prediction(uid=9195, iid=4496, r_ui=4.0, est=4.04481501642584, details={'was_impossible': False}), Prediction(uid=7641, iid=8109, r_ui=3.5, est=3.715397018015087, details={'was_impossible': False}), Prediction(uid=4022, iid=1704, r_ui=4.0, est=3.9217396421183657, details={'was_impossible': False}), Prediction(uid=9323, iid=6560, r_ui=2.5, est=3.6192053958146926, details={'w

In [9]:
# Convert the Predictions to a dataframe so we can lookup predictions easy
lstUIds = list(map(lambda x: x.uid, predictions))
lstIIds = list(map(lambda x: x.iid, predictions))
lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
lstRatingEst = list(map(lambda x: x.est, predictions))


# uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })

dfPredictions.head()
# dfPredictions[dfPredictions.uid == 3519]

Unnamed: 0,uid,iid,r_ui,Predict
0,7264,229,3.5,3.391148
1,10244,2430,4.0,3.811391
2,10280,33,3.5,4.156425
3,1148,9682,4.0,3.99217
4,1206,8021,3.5,4.040398


In [10]:
print(dfValiIds.shape)
print(dfPredictions.shape)

(275876, 3)
(275876, 4)


In [11]:
# join the predictions to the ids, sort by rowid and write to file
dfPredictions = pd.merge(dfValiIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
dfPredictions.head()


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,4.244866
1,27,12300,5634,12300,5634,4.5,4.188136
2,28,12300,3544,12300,3544,4.5,4.301863
3,40,12300,6521,12300,6521,4.0,4.311279
4,43,12300,10177,12300,10177,4.5,4.284111


This basic normal run doesn't give good MAE, so not worth writing out and considering

In [12]:
if writeSubRunFile:
  dfPredictions.sort_values("RowID")[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(subrunDir + filePrefix + "_subrun.csv", index=False)

print("Average MAE: " + str(mae))
print("analyse_maes.append(" + str(mae) + ")")
print(dfPredictions.shape)
dfPredictions.sort_values("RowID").head(8)


Average MAE: 0.439900482465001
analyse_maes.append(0.439900482465001)
(276976, 7)


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,4.244866
1,27,12300,5634,12300,5634,4.5,4.188136
2,28,12300,3544,12300,3544,4.5,4.301863
3,40,12300,6521,12300,6521,4.0,4.311279
4,43,12300,10177,12300,10177,4.5,4.284111
5,48,12300,2907,12300,2907,3.5,3.83706
6,49,12300,1532,12300,1532,4.0,4.266661
7,50,12300,3452,12300,3452,3.5,4.070988


## Results

MAE over Full data for Baseline Only: 0.439900482465001

