In [None]:
# Import libraries
import os

# import pandas as pd
import dask.dataframe as pd
import numpy as np
import pandas  # real pandas; `pd` above is aliased to dask.dataframe

from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

from contentknn import ContentKNNFullCosSimilarityAlgorithm

## File Details - Light GBM Regression on Beer Context columns inc review counts

This is a Candidate for being used in an Ensemble. 
Characteristics:
* Light GBM Regression Algorithm
* Using Beer Context columns inc ABV, Year and Review Counts
* Todo: use optimised parameters for Light GBM Regression


In [None]:
# Run configuration: output naming, data locations and the shared random seed.
filePrefix = "A3_161_contentknn_tinkering2"

# The data directory defaults to the original local path but can be overridden
# with the A3_DATA_DIR environment variable so the notebook runs on other
# machines. os.path.join(..., "") guarantees a trailing separator because the
# file paths below are built by plain string concatenation; the default value
# already ends with "/" and is therefore left unchanged.
baseDataDir = os.path.join(
    os.environ.get("A3_DATA_DIR") or "C:/Development/Data/COSC2670/Assignment3/A3data/",
    "",
)

# Directory (relative to the working directory) that receives the subrun file.
subrunDir = "subruns/"
writeSubRunFile = True

# Seed supplied by the project's utility module.
seed = databasic.get_random_seed()

In [None]:

# Locations of the preprocessed train / validation / test feature files,
# all resolved against baseDataDir (which is expected to end with a separator).
trainFullProcessedPath = f"{baseDataDir}train_features_preprocessed.csv"
valiFullProcessedPath = f"{baseDataDir}vali_features_preprocessed.csv"
testFullProcessedPath = f"{baseDataDir}test_features_preprocessed.csv"

In [None]:
# Load the preprocessed training features. `pd` is whichever dataframe
# library the import cell aliased (dask.dataframe as the notebook stands).
df_train = pd.read_csv(trainFullProcessedPath)

# Optional quick-run subset and inspection, left disabled:
# df_train = df_train.iloc[0:10000, :]
# print(df_train.shape)
# df_train.head()

In [None]:
# Load the preprocessed validation features with the same `pd` alias used
# for the training set.
df_vali = pd.read_csv(valiFullProcessedPath)

# Optional quick-run subset and inspection, left disabled:
# df_vali = df_vali.iloc[0:10000, :]
# print(df_vali.shape)
# df_vali.head()

In [None]:
# df_train = dd.from_pandas(df_train, npartitions=10)
# df_vali = dd.from_pandas(df_vali, npartitions=10)

Add the Review Count columns for Reviewers and Beers to both the Train and Validation sets

In [None]:
# Column bookkeeping: every column name, the identifier columns and the target.
col_names = df_train.columns
idCols = ['RowID', 'BeerID', 'ReviewerID']
target_col = 'rating'

# Identifier and target slices for the training and validation sets.
dfTrainIds, dfTrainTarget = df_train[idCols], df_train[target_col]
dfValiIds, dfValiTarget = df_vali[idCols], df_vali[target_col]

# Feature frame: RowID is kept (it is needed for the cosine similarity lookup)
# followed by every one-hot style column whose name starts with "BeerType_".
# Earlier variant, disabled: all columns except BeerID / ReviewerID / rating.
feature_cols = ["RowID"] + [name for name in col_names if name.startswith("BeerType_")]

# Only training rows are used; appending the validation features is disabled.
dfFullFeatures = df_train[feature_cols]



In [None]:
# Display the first rows of the selected feature frame as a sanity check.
dfFullFeatures.head()

In [None]:

# Build the Surprise datasets from the three rating columns. Surprise treats
# the columns positionally as (user, item, rating), so BeerID occupies the
# "user" slot and ReviewerID the "item" slot here.
ratingCols = ['BeerID', 'ReviewerID', 'rating']
reader = Reader(rating_scale=(0, 5))

dsetTrainFeatures = Dataset.load_from_df(df_train[ratingCols], reader)
dsetValiFeatures = Dataset.load_from_df(df_vali[ratingCols], reader)

# The model is fitted on every training rating (no hold-out inside train).
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

for builtObject in (dsetTrainFeatures, trainsetTrainFeatures):
    print(type(builtObject))
trainsetTrainFeatures

In [None]:

# Confirm the type of the validation dataset object built above.
print(type(dsetValiFeatures))

In [None]:

# Split with test_size=1.0 to obtain the validation ratings as a test set;
# the first value returned (bound to NA) is not used in the visible cells.
# NOTE(review): no random_state is passed, so the order of valset is presumably
# not reproducible between runs; `seed` from the config cell is not used here.
NA,valset = train_test_split(dsetValiFeatures, test_size=1.0)

# simple Tuning best params: {'bsl_options': }

# Create the content-based KNN algorithm and hand it the feature frame.
algorithm = ContentKNNFullCosSimilarityAlgorithm()
algorithm.setFeatures(dfFullFeatures)

# algorithm.fit_simulation(trainsetTrainFeatures)

In [None]:


# Fit on the full training set, then predict every rating in the validation set.
model = algorithm.fit(trainsetTrainFeatures)
predictions = algorithm.test(valset)


# Score the predictions with Mean Absolute Error: the average absolute
# difference between the predicted and the true rating (smaller is better).
# accuracy.mae is called with verbose=True and the value is printed again
# below under the "Average MAE" label.
mae = accuracy.mae(predictions,verbose=True)

print("Average MAE: " + str(mae))

In [None]:
# Inspect the validation test set: its type and its first ten entries.
print(type(valset))
print(valset[0:10])


In [None]:
# Inspect the predictions: container type, number of predictions,
# the first ten entries and the first entry on its own.
print(type(predictions))
print(str(len(predictions)))
print(predictions[0:10])
print(predictions[0])

In [None]:
# Convert the Predictions to a dataframe so we can lookup predictions easy
lstUIds = list(map(lambda x: x.uid, predictions))
lstIIds = list(map(lambda x: x.iid, predictions))
lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
lstRatingEst = list(map(lambda x: x.est, predictions))


# uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })

dfPredictions.head()
# dfPredictions[dfPredictions.uid == 3519]

In [None]:
# Compare the shapes of the validation ids and the predictions before joining.
print(dfValiIds.shape)
print(dfPredictions.shape)

In [None]:
# join the predictions to the ids, sort by rowid and write to file
dfPredictions = pd.merge(dfValiIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
dfPredictions.head()


Write to a subrun file

In [None]:
# Sort once by RowID and reuse the result for both the file and the preview
# (the sort was previously computed twice).
dfPredictionsByRow = dfPredictions.sort_values("RowID")

if writeSubRunFile:
    # Make sure the output directory exists first: a missing directory would
    # otherwise make the write fail only after the long model fit.
    if subrunDir:
        os.makedirs(subrunDir, exist_ok=True)
    dfPredictionsByRow[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(
        subrunDir + filePrefix + "_subrun.csv", index=False
    )

# Report the score; the second line is formatted for pasting into an
# `analyse_maes` list.
print("Average MAE: " + str(mae))
print("analyse_maes.append(" + str(mae) + ")")
print(dfPredictions.shape)
dfPredictionsByRow.head(8)


# Summary

MAE on just 10k records:
Average MAE: 0.4534419870925659