In [1]:
# Import libraries
import pandas as pd
import numpy as np

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic

## File Details

For Ensemble Version 1
Run with KNNMeans on full data set, with basic param tuning


In [2]:
filePrefix = "A3_101_surprise_knnmeans_full"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
writeSubRunFile = True
seed = databasic.get_random_seed()

In [3]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
#df_train = pd.read_csv(baseDataDir + 'train_wk12.tsv',sep='\t',
df_train = pd.read_csv(baseDataDir + 'train.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

#df_vali = pd.read_csv(baseDataDir + 'val_wk12.tsv',sep='\t',
df_vali = pd.read_csv(baseDataDir + 'val.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch Ür Bock,Rauchbier,4.0
1,27,12300,5634,Rauch Ür Bock,Rauchbier,4.5
2,28,12300,3544,Rauch Ür Bock,Rauchbier,4.5
3,40,12300,6521,Rauch Ür Bock,Rauchbier,4.0
4,43,12300,10177,Rauch Ür Bock,Rauchbier,4.5
5,48,12300,2907,Rauch Ür Bock,Rauchbier,3.5
6,49,12300,1532,Rauch Ür Bock,Rauchbier,4.0
7,50,12300,3452,Rauch Ür Bock,Rauchbier,3.5
8,59,12300,6861,Rauch Ür Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


In [4]:
idCols = ['RowID','BeerID','ReviewerID']

# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dfTrainFeatures.head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,10635,4.0
1,12300,6547,4.5
2,12300,9789,4.5
3,12300,7372,5.0
4,12300,1302,4.5


In [5]:

reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID',
                                    'rating']],reader)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID',
                                     'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

print(type(dsetTrainFeatures))
print(type(trainsetTrainFeatures))
trainsetTrainFeatures

<class 'surprise.dataset.DatasetAutoFolds'>
<class 'surprise.trainset.Trainset'>


<surprise.trainset.Trainset at 0x16686db3ac0>

In [6]:

NA,valset = train_test_split(dsetValiFeatures, test_size=1.0)

# Medium Tuning best params: {'k': 60, 'min_k': 2, 'sim_options': {'name': 'msd', 'min_support': 1, 'user_based': False}}

# algorithm = KNNWithMeans()
algorithm = KNNWithMeans(k=80)
# algorithm = KNNWithMeans(k=60, min_k = 2, sim_options= {'name': 'msd', 'min_support': 1, 'user_based': False})

model = algorithm.fit(trainsetTrainFeatures)
predictions = algorithm.test(valset)


# Score our predictions with MAE
# It is around 0.77, which means the a random guess based on the distribution of the data
# is on average within 0.77 (plus or minus) the true label.
# Not bad! You can beat it though, I'm sure :).
# Smaller MAE is the better. Good luck!
mae = accuracy.mae(predictions,verbose=True)

print("Average MAE: " + str(mae))

Computing the msd similarity matrix...
Done computing similarity matrix.
MAE:  0.4401
Average MAE: 0.4401281860792271


In [7]:
print(type(valset))
print(valset[0:10])


<class 'list'>
[(79, 3634, 3.5), (2015, 4689, 4.0), (1047, 6197, 4.0), (692, 10195, 3.5), (553, 7982, 2.0), (6755, 3887, 4.0), (6399, 10461, 4.0), (474, 7399, 4.0), (7790, 8034, 4.0), (4551, 7912, 4.0)]


In [8]:
print(type(predictions))
print(str(len(predictions)))
print(predictions[0:10])
print(predictions[0])

<class 'list'>
275876
[Prediction(uid=79, iid=3634, r_ui=3.5, est=3.738683159064228, details={'actual_k': 80, 'was_impossible': False}), Prediction(uid=2015, iid=4689, r_ui=4.0, est=4.144444099599183, details={'actual_k': 69, 'was_impossible': False}), Prediction(uid=1047, iid=6197, r_ui=4.0, est=4.303068864877547, details={'actual_k': 80, 'was_impossible': False}), Prediction(uid=692, iid=10195, r_ui=3.5, est=3.856147491130205, details={'actual_k': 80, 'was_impossible': False}), Prediction(uid=553, iid=7982, r_ui=2.0, est=2.4528263894607494, details={'actual_k': 80, 'was_impossible': False}), Prediction(uid=6755, iid=3887, r_ui=4.0, est=4.317178648763694, details={'actual_k': 80, 'was_impossible': False}), Prediction(uid=6399, iid=10461, r_ui=4.0, est=3.676984348557091, details={'actual_k': 80, 'was_impossible': False}), Prediction(uid=474, iid=7399, r_ui=4.0, est=4.599027782414849, details={'actual_k': 80, 'was_impossible': False}), Prediction(uid=7790, iid=8034, r_ui=4.0, est=3.5180

In [9]:
# Convert the Predictions to a dataframe so we can lookup predictions easy
lstUIds = list(map(lambda x: x.uid, predictions))
lstIIds = list(map(lambda x: x.iid, predictions))
lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
lstRatingEst = list(map(lambda x: x.est, predictions))


# uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })

dfPredictions.head()
# dfPredictions[dfPredictions.uid == 3519]

Unnamed: 0,uid,iid,r_ui,Predict
0,79,3634,3.5,3.738683
1,2015,4689,4.0,4.144444
2,1047,6197,4.0,4.303069
3,692,10195,3.5,3.856147
4,553,7982,2.0,2.452826


In [10]:
print(dfValiIds.shape)
print(dfPredictions.shape)

(275876, 3)
(275876, 4)


In [11]:
# join the predictions to the ids, sort by rowid and write to file
dfPredictions = pd.merge(dfValiIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
dfPredictions.head()


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,4.307391
1,27,12300,5634,12300,5634,4.5,4.161029
2,28,12300,3544,12300,3544,4.5,4.302595
3,40,12300,6521,12300,6521,4.0,4.384419
4,43,12300,10177,12300,10177,4.5,4.249009


This basic normal run doesn't give good MAE, so not worth writing out and considering

In [12]:
if writeSubRunFile:
  dfPredictions.sort_values("RowID")[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(subrunDir + filePrefix + "_subrun.csv", index=False)

print("Average MAE: " + str(mae))
print("analyse_maes.append(" + str(mae) + ")")
print(dfPredictions.shape)
dfPredictions.sort_values("RowID").head(8)


Average MAE: 0.4401281860792271
analyse_maes.append(0.4401281860792271)
(276976, 7)


Unnamed: 0,RowID,BeerID,ReviewerID,uid,iid,r_ui,Predict
0,22,12300,2634,12300,2634,4.0,4.307391
1,27,12300,5634,12300,5634,4.5,4.161029
2,28,12300,3544,12300,3544,4.5,4.302595
3,40,12300,6521,12300,6521,4.0,4.384419
4,43,12300,10177,12300,10177,4.5,4.249009
5,48,12300,2907,12300,2907,3.5,3.945549
6,49,12300,1532,12300,1532,4.0,4.30818
7,50,12300,3452,12300,3452,3.5,4.098133


In [13]:

print("Run - " + filePrefix)
# Log of Results
analyse_maes = []

print("Average MAE over all tests: " + str(np.mean(analyse_maes)))

# Make sure it's predicting floats
# dfPredicted["Predict"].drop_duplicates()

Run - A3_101_surprise_knnmeans_full
Average MAE over all tests: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## Results

MAE on full validation data: 0.4401281860792271
