In [1]:
# Import libraries
import pandas as pd
import dask.dataframe as dd
import numpy as np
import lightgbm as lgb

# Possibly use this if we have memory issues
#import dask.dataframe as dd

from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import SlopeOne
from surprise import accuracy
from surprise import dump
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

## File Details

Working notebook of a Full Ensemble Complete Run, this time including a NLP Regression with LightGBM. Found that NLP runs on Beer Name and Beer Type each separately scored better than the all columns, so going to use these as separate runs

Collaborative Filter models to use:

* KNNWithMeans
* BaselineOnly
* SVDpp

Content Filter models

* LightGBM on all non text columns
* LightGBM with Beer Context columns (non text)
* SKLearn Linear Regression on all non text columns
* LightGBM All columns including NLP


In [2]:
filePrefix = "A3_152_ensemble_v4_complete_run"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
runDir = "runs/"
modelsDir = "models/"
forceRetrainModels = False

seed = databasic.get_random_seed()

In [3]:
trainFilePath = baseDataDir + 'train.tsv'
valiFilePath = baseDataDir + 'val.tsv'
featuresFilePath = baseDataDir + 'features.tsv'
testFilePath = baseDataDir + 'test.tsv'
trainFullProcessedPath = baseDataDir + 'train_features_preprocessed.csv'
valiFullProcessedPath = baseDataDir + 'vali_features_preprocessed.csv'
testFullProcessedPath = baseDataDir + 'test_features_preprocessed.csv'

# trainFilePath = baseDataDir + 'train_200k.tsv'
# valiFilePath = baseDataDir + 'vali_200k.tsv'
# featuresFilePath = baseDataDir + 'features_200k.tsv'
# testFilePath = baseDataDir + 'test_200k.tsv'
# trainFullProcessedPath = baseDataDir + 'train_features_preprocessed_200k.csv'
# valiFullProcessedPath = baseDataDir + 'vali_features_preprocessed_200k.csv'
# testFullProcessedPath = baseDataDir + 'test_features_preprocessed_200k.csv'

In [4]:
useModelSurpriseSlopeOne = True

useModelLbgmBeerContext = True
useModelLbgmAllCols = True
useModelSkLinReg = True

useModelLgbmNlpAll = True
useModelLgbmNlpBeerName = True
useModelLgbmNlpText = True

Load the Files one by one then delete them after your done, for memory management

In [5]:
df_train = pd.read_csv(trainFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])
df_train.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,19,12300,10635,Rauch Ür Bock,Rauchbier,4.0
1,21,12300,6547,Rauch Ür Bock,Rauchbier,4.5
2,23,12300,9789,Rauch Ür Bock,Rauchbier,4.5
3,24,12300,7372,Rauch Ür Bock,Rauchbier,5.0
4,25,12300,1302,Rauch Ür Bock,Rauchbier,4.5
5,26,12300,704,Rauch Ür Bock,Rauchbier,4.5
6,29,12300,1747,Rauch Ür Bock,Rauchbier,5.0
7,31,12300,9368,Rauch Ür Bock,Rauchbier,4.5
8,32,12300,2568,Rauch Ür Bock,Rauchbier,4.0
9,33,12300,6838,Rauch Ür Bock,Rauchbier,4.0


In [6]:
# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)

## Collaborative Filter Models: Train

For the Collaborative Filtering Models, we only need the Training set. Train the models, then save them to file for later use

In [7]:
# Load into a Surprise dataset
reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID', 'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

In [8]:
# Create each algorithm, train the model, save it to file for later, then delete the model

predictorKNN = KNNWithMeans(k=160)
featutil.trainSurpriseModel(predictorKNN, trainsetTrainFeatures, modelsDir, filePrefix, "knnwithmeans", forceRetrainModels)
del predictorKNN

predictorBaselineOnly = BaselineOnly(bsl_options = {'n_epochs': 5, 'reg_u': 3, 'reg_i': 16})
featutil.trainSurpriseModel(predictorBaselineOnly, trainsetTrainFeatures, modelsDir, filePrefix, "baselineonly", forceRetrainModels)
del predictorBaselineOnly

predictorSVDpp = SVDpp(n_factors = 10, n_epochs=20, lr_all=0.005, reg_all=0.2)
featutil.trainSurpriseModel(predictorSVDpp, trainsetTrainFeatures, modelsDir, filePrefix, "svdpp", forceRetrainModels)
del predictorSVDpp

if useModelSurpriseSlopeOne:
  predictorSlopeOne = SlopeOne()
  featutil.trainSurpriseModel(predictorSlopeOne, trainsetTrainFeatures, modelsDir, filePrefix, "slopeone", forceRetrainModels)
  del predictorSlopeOne

In [9]:
# Clean up the training data used for the collaborate filters
del trainsetTrainFeatures
del reader
del dsetTrainFeatures
del dfTrainFeatures
del df_train

## Collaborative Filter Models: Predict On Validation Data 

Now we want to load the Validation set to we can predict against it and write out the subrun files, which will be used later for the Ensemble.

First, do the Predictions for the Collaborative Filter models (surprise)

In [10]:
# Read the validation data (in full)
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID', 'rating']],reader)

In [11]:
predictValiMae_KnnWithMeans = featutil.predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "val", dsetValiFeatures, dfValiIds, subrunDir)
predictValiMae_BaselineOnly = featutil.predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "val", dsetValiFeatures, dfValiIds, subrunDir)
predictValiMae_SVDpp = featutil.predictSurpriseModel(modelsDir, filePrefix, "svdpp", "val", dsetValiFeatures, dfValiIds, subrunDir)

if useModelSurpriseSlopeOne:
  predictValiMae_SlopeOne = featutil.predictSurpriseModel(modelsDir, filePrefix, "slopeone", "val", dsetValiFeatures, dfValiIds, subrunDir)

MAE:  0.4395
MAE for knnwithmeans: 0.4395334732274164
MAE:  0.4397
MAE for baselineonly: 0.43974731321337585
MAE:  0.4432
MAE for svdpp: 0.44321489275755915
MAE:  0.4418
MAE for slopeone: 0.44176542058237306


In [12]:
# Clean up variables from the Predict Stage
del df_vali
del reader
del dfValiIds
del dfValiFeatures
del dsetValiFeatures

## Content Filter Models, train and predict

First we want to load the features and do all the data preprocessing, then we can train the different models

In [13]:
# Load the training data
df_train = pd.read_csv(trainFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

# Load the validation data. When we want to do one hot encoding, we have to do it over both datasets to ensure consistency
df_vali = pd.read_csv(valiFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID', 'BeerName','BeerType','rating'])

# Load the validation data. When we want to do one hot encoding, we have to do it over both datasets to ensure consistency
df_test = pd.read_csv(testFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID', 'BeerName','BeerType','rating'])                         

# Load the features
df_features = pd.read_csv(featuresFilePath,sep='\t',
    names=['RowID','BrewerID','ABV','DayofWeek','Month',
          'DayofMonth','Year','TimeOfDay','Gender',
          'Birthday','Text','Lemmatized','POS_Tag'])    

In [14]:
# Add the reviewer counts to each of the data sets
df_train = featutil.addReviewerReviewCount(df_train)
df_train = featutil.addBeerReviewCount(df_train)

df_vali = featutil.addReviewerReviewCount(df_vali)
df_vali = featutil.addBeerReviewCount(df_vali)

df_test = featutil.addReviewerReviewCount(df_test)
df_test = featutil.addBeerReviewCount(df_test)

In [15]:
colsToUse = ["RowID", "BrewerID", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender", "TimeOfDay", "Birthday"]

df_train_data = df_train.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
df_vali_data = df_vali.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
df_test_data = df_test.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")

# Remove the duplicated Row ID, also remove Beer Name at this point, we're nt using it
df_train_data = df_train_data.drop(['RowIDFeat', "BeerName"],axis=1)
df_vali_data = df_vali_data.drop(['RowIDFeat', "BeerName"],axis=1)
df_test_data = df_test_data.drop(['RowIDFeat', "BeerName"],axis=1)

In [16]:
# Clean up these dataframes now that they have been joined
del df_train
del df_vali
del df_features
del df_test

In [17]:
# do the feature transformations
df_train_data = featutil.fixNullABV(df_train_data)
df_vali_data = featutil.fixNullABV(df_vali_data)
df_test_data = featutil.fixNullABV(df_test_data)

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "BrewerID")

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "BeerType")

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "Gender")

df_train_data = featutil.formatDayOfWeek(df_train_data)
df_vali_data = featutil.formatDayOfWeek(df_vali_data)
df_test_data = featutil.formatDayOfWeek(df_test_data)

df_train_data = featutil.formatMonth(df_train_data)
df_vali_data = featutil.formatMonth(df_vali_data)
df_test_data = featutil.formatMonth(df_test_data)

df_train_data = featutil.formatTimeToSec(df_train_data)
df_vali_data = featutil.formatTimeToSec(df_vali_data)
df_test_data = featutil.formatTimeToSec(df_test_data)

df_train_data = featutil.convertBirthdayToAge(df_train_data)
df_vali_data = featutil.convertBirthdayToAge(df_vali_data)
df_test_data = featutil.convertBirthdayToAge(df_test_data)

(1237434, 17)


  df_combined.columns = df_combined.columns.str.replace(" ", "").str.replace("/", "").str.replace("-", "") \


(1237434, 2092)
(1237434, 2195)


In [18]:
print(df_train_data.shape)
print(df_vali_data.shape)
print(df_test_data.shape)
df_test_data.head()

(746207, 2196)
(243834, 2196)
(247393, 2196)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,18,12300,10059,,165,8,7.4,7,12,6,...,0,0,0,0,0,0,0,0,1,0
1,20,12300,9761,,156,8,7.4,6,21,5,...,0,0,0,0,0,0,0,0,1,0
2,30,12300,7279,,553,8,7.4,2,12,10,...,0,0,0,0,0,0,0,0,0,1
3,46,12300,2367,,283,8,5.5,3,22,7,...,0,0,0,0,0,0,0,0,1,0
4,47,12300,2230,,35,8,5.5,2,21,7,...,0,0,0,0,0,0,0,0,1,0


In [19]:
# Write the test data file out so we can load it back in later so as not to have to redo this step
df_test_data.to_csv(baseDataDir + filePrefix + "_test_cleaned.csv", index=False)

del df_test_data

In [20]:
# Get all the columns
col_names = df_train_data.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainFeatures = df_train_data[feature_cols]
dfTrainTarget = df_train_data[target_col]

dfValiIds = df_vali_data[idCols]
dfValiFeatures = df_vali_data[feature_cols]
dfValiTarget = df_vali_data[target_col]


In [21]:
dfTrainFeatures.head()

Unnamed: 0,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,Year,TimeOfDay,Birthday,BrewerID_1,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,200,23,7.4,1,23,5,2011,56188,0,0,...,0,0,0,0,0,0,0,0,1,0
1,10,23,7.4,1,16,5,2011,1906,0,0,...,0,0,0,0,0,0,0,0,1,0
2,164,23,7.4,7,10,4,2011,44246,0,0,...,0,0,0,0,0,0,0,0,0,1
3,432,23,7.4,3,30,3,2011,50880,0,0,...,0,0,0,0,0,0,0,0,1,0
4,500,23,7.4,4,24,3,2011,50820,37,0,...,0,0,0,0,0,0,0,0,1,0


In [22]:
def getFeaturesBeerContext(df1):
  consumerCols = ["DayofWeek", "DayofMonth", "Month", "TimeOfDay", "Birthday", "Gender_Male", "Gender_Female", "Gender_unknown"]
  return dfutil.getFeaturesWithoutCols(df1, consumerCols)

In [23]:
# Train the models, save them to file and then clear the model from memory
if useModelLbgmBeerContext:
    modelBeerContext = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
        ,learning_rate=0.010443500090385492, num_leaves = 68, max_depth = 14, n_estimators = 608
    )  
    dfTrainFeatures_BeerContext = getFeaturesBeerContext(dfTrainFeatures)
    dfValiFeatures_BeerContext = getFeaturesBeerContext(dfValiFeatures)
    featutil.trainLightGbmModel(modelBeerContext, dfTrainFeatures_BeerContext, dfTrainTarget, 
        modelsDir, filePrefix, "lgbm_beercontext", forceRetrainModels)
    predictValiMae_LgbmBeerContext = featutil.predictLightGbmModel(dfValiIds, dfValiFeatures_BeerContext, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_beercontext")    
    del dfTrainFeatures_BeerContext
    del dfValiFeatures_BeerContext
    del modelBeerContext

if useModelLbgmAllCols:
    modelLgbm = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
        ,learning_rate=0.16142127923810723, num_leaves = 127, max_depth = 18, n_estimators = 811
    ) 
    featutil.trainLightGbmModel(modelLgbm, dfTrainFeatures, dfTrainTarget, 
        modelsDir, filePrefix, "lgbm_allcols", forceRetrainModels)
    predictValiMae_LgbmAllCols = featutil.predictLightGbmModel(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols")    
    del modelLgbm

if useModelSkLinReg:
    modelLinReg = LinearRegression()
    featutil.trainSkLinearRegModel(modelLinReg, dfTrainFeatures, dfTrainTarget, 
        modelsDir, filePrefix, "sklinearreg", forceRetrainModels)
    predictValiMae_SkLinearReg = featutil.predictSkLinearRegModel(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "sklinearreg")
    del modelLinReg

MAE for lgbm_beercontext: 0.4783112983280762
MAE for lgbm_allcols: 0.47739847463670526
MAE for sklinearreg: 0.49873941880294115


In [24]:
# Clean up the variables from memory
del df_train_data
del df_vali_data
del dfTrainFeatures
del dfTrainTarget
del dfValiIds
del dfValiFeatures
del dfValiTarget

## Content Filter Models with NLP

In this version, well train the Content Filter models using NLP. Assuming that the data has been preprocessed already and saved to file via the A3_130_create_full_features_processed notebook, although later, possibly add the logic into here as part of the process

In [25]:
# Load the training data
df_train = dd.read_csv(trainFullProcessedPath)

df_train.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,BeerType_Altbier,BeerType_AmericanAdjunctLager,BeerType_AmericanAmberRedAle,BeerType_AmericanAmberRedLager,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,19,12300,10635,4.0,200,23,0,0,0,0,...,-0.04602,-0.085923,-0.020191,0.028098,0.009554,-0.00107,0.058253,-0.020965,-0.060335,-0.025998
1,21,12300,6547,4.5,10,23,0,0,0,0,...,-0.035202,-0.071197,-0.009042,0.013252,-0.038783,-0.011112,0.017132,0.007957,-0.019318,-0.020068
2,23,12300,9789,4.5,164,23,0,0,0,0,...,-0.008459,-0.038928,-0.028027,0.011146,-0.041214,-0.004981,0.029954,0.013247,-0.027631,-0.020938
3,24,12300,7372,5.0,432,23,0,0,0,0,...,-0.043639,-0.069309,-0.031036,0.009859,-0.03439,-0.008547,0.048554,0.007532,-0.044557,-0.019336
4,25,12300,1302,4.5,500,23,0,0,0,0,...,-0.021395,-0.060889,-0.019478,0.016087,-0.05127,-0.01291,0.02522,-0.00026,-0.032785,-0.020871
5,26,12300,704,4.5,605,23,0,0,0,0,...,-0.025142,-0.064408,-0.026324,0.024486,-0.047075,-0.005287,0.025225,0.001942,-0.038939,-0.025383
6,29,12300,1747,5.0,463,23,0,0,0,0,...,-0.024191,-0.071046,-0.025827,0.004,-0.050098,-0.008236,-0.010039,-0.001957,-0.046674,-0.017337
7,31,12300,9368,4.5,49,23,0,0,0,0,...,-0.035988,-0.069957,-0.02605,0.023177,-0.032557,-0.010953,0.026482,-0.00533,-0.052173,-0.013001
8,32,12300,2568,4.0,221,23,0,0,0,0,...,-0.03143,-0.074136,-0.017969,0.024147,-0.043018,-0.01219,0.002802,0.004149,-0.046826,-0.021015
9,33,12300,6838,4.0,110,23,0,0,0,0,...,-0.03856,-0.071023,-0.026504,0.015529,-0.042741,0.001625,0.001332,0.009657,-0.038897,-0.0234


In [26]:
# "BrewerID_", "BeerType_", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender_", "TimeOfDay", "Birthday", "BeerName_", "Lemmatized_"
# Get all the columns
col_names = df_train.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_train[idCols]
dfTrainFeatures = df_train[feature_cols]
dfTrainTarget = df_train[target_col]

if useModelLgbmNlpAll:
  # train the model
  model_lgbm_nlp = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
      ,learning_rate=0.09075359977364383, num_leaves = 120, max_depth = 40, n_estimators = 248, min_split_gain = 0.6310082232017945, 
      min_child_samples = 35, subsample = 0.9466694477903548, 
      subsample_freq = 0, colsample_bytree = 0.29392263338193186, reg_alpha = 0.891904482598078, reg_lambda = 0.4521335679885054
    )
  featutil.trainLightGbmModel(model_lgbm_nlp, dfTrainFeatures, dfTrainTarget, 
      modelsDir, filePrefix, "lgbm_allcols_inc_nlp", forceRetrainModels)


In [27]:
del dfTrainFeatures

# Just get the Beer Name columns
feature_cols = list(filter(lambda x: x.startswith("BeerName_"), col_names))
dfTrainFeatures = df_train[feature_cols]

if useModelLgbmNlpBeerName:
  # train the model
  model_lgbm_nlp = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
      ,learning_rate=0.09075359977364383, num_leaves = 120, max_depth = 40, n_estimators = 248, min_split_gain = 0.6310082232017945, 
      min_child_samples = 35, subsample = 0.9466694477903548, 
      subsample_freq = 0, colsample_bytree = 0.29392263338193186, reg_alpha = 0.891904482598078, reg_lambda = 0.4521335679885054
    )
  featutil.trainLightGbmModel(model_lgbm_nlp, dfTrainFeatures, dfTrainTarget, 
      modelsDir, filePrefix, "lgbm_allcols_nlp_beer_name", forceRetrainModels)


In [28]:
del dfTrainFeatures

# Just get the Beer Text Lemmatized columns
feature_cols = list(filter(lambda x: x.startswith("Lemmatized_"), col_names))
dfTrainFeatures = df_train[feature_cols]

if useModelLgbmNlpText:
  # train the model
  model_lgbm_nlp = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
      ,learning_rate=0.09075359977364383, num_leaves = 120, max_depth = 40, n_estimators = 248, min_split_gain = 0.6310082232017945, 
      min_child_samples = 35, subsample = 0.9466694477903548, 
      subsample_freq = 0, colsample_bytree = 0.29392263338193186, reg_alpha = 0.891904482598078, reg_lambda = 0.4521335679885054
    )
  featutil.trainLightGbmModel(model_lgbm_nlp, dfTrainFeatures, dfTrainTarget, 
      modelsDir, filePrefix, "lgbm_allcols_nlp_text", forceRetrainModels)


In [29]:
# clean up training data from memory
del df_train
del dfTrainIds
del dfTrainFeatures
del dfTrainTarget

del model_lgbm_nlp

In [30]:
df_vali = dd.read_csv(valiFullProcessedPath)

df_vali.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,BeerType_Altbier,BeerType_AmericanAdjunctLager,BeerType_AmericanAmberRedAle,BeerType_AmericanAmberRedLager,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,22,12300,2634,4.0,19,9,0,0,0,0,...,-0.074742,-0.168165,-0.0109,0.028198,0.006687,0.014946,-0.005977,-0.011162,-0.020651,-0.055318
1,27,12300,5634,4.5,48,9,0,0,0,0,...,-0.039903,-0.081178,-0.026782,0.0284,-0.016471,-0.01523,0.036627,-0.007582,-0.064121,-0.0241
2,28,12300,3544,4.5,227,9,0,0,0,0,...,-0.025519,-0.059343,-0.011759,0.019818,-0.025224,-0.014813,0.010308,-0.004253,-0.056163,-0.02306
3,40,12300,6521,4.0,81,9,0,0,0,0,...,-0.009961,-0.064353,-0.026132,0.024919,-0.032892,-0.024914,0.003314,0.015528,-0.035941,0.005979
4,43,12300,10177,4.5,58,9,0,0,0,0,...,-0.041421,-0.0774,-0.034871,0.025505,-0.007491,-0.014076,0.050549,-0.000238,-0.050784,-0.01935
5,48,12300,2907,3.5,230,9,0,0,0,0,...,-0.0095,-0.068618,-0.037802,0.037846,-0.036413,-0.014508,0.010346,0.011012,-0.065818,-0.00271
6,49,12300,1532,4.0,185,9,0,0,0,0,...,-0.047784,-0.066319,-0.007743,0.041015,-0.026885,-0.021267,0.010073,-0.002857,-0.035936,-0.031098
7,50,12300,3452,3.5,37,9,0,0,0,0,...,0.008967,-0.063812,-0.033023,0.033007,-0.047081,-0.00422,0.012645,0.013312,-0.040105,-0.013771
8,59,12300,6861,4.0,230,9,0,0,0,0,...,-0.031515,-0.084435,-0.039401,0.041123,-0.044032,-0.007342,0.002211,0.014602,-0.036788,-0.007242
9,85539,1198,2634,4.5,19,531,0,0,0,0,...,-0.051095,-0.167019,0.018095,0.029359,0.02371,0.046872,0.023815,-0.026127,0.0013,-0.057575


In [31]:
col_names = df_vali.columns

dfValiIds = df_vali[idCols]
dfValiTarget = df_vali[target_col]

if useModelLgbmNlpAll:
    feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
    dfValiFeatures = df_vali[feature_cols]
    predictValiMae_LgbmAllColsIncNlp = featutil.predictLightGbmModelDask(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols_inc_nlp")  
     
    del dfValiFeatures




MAE for lgbm_allcols_inc_nlp: 0.463112342301003


In [32]:
if useModelLgbmNlpBeerName:

    # Just get the Beer Name columns
    feature_cols = list(filter(lambda x: x.startswith("BeerName_"), col_names))
    dfValiFeatures = df_vali[feature_cols]

    predictValiMae_LgbmNlpBeerName = featutil.predictLightGbmModelDask(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols_nlp_beer_name")  

    del dfValiFeatures



MAE for lgbm_allcols_nlp_beer_name: 0.4618885539614043


In [33]:
if useModelLgbmNlpText:
    # Just get the Beer Name columns
    feature_cols = list(filter(lambda x: x.startswith("Lemmatized"), col_names))
    dfValiFeatures = df_vali[feature_cols]
    predictValiMae_LgbmNlpText = featutil.predictLightGbmModelDask(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols_nlp_text")  
        
    del dfValiFeatures



MAE for lgbm_allcols_nlp_text: 0.4969013926605755


In [34]:
del df_vali
del dfValiIds
del dfValiTarget

## Train the Ensemble Model

Now that all the sub run files have been generated, combine all the predictions into one dataset, train a new final, ensemble model, predict on the validation data and get an MAE and save the model for use later on the Test data.

In [35]:
# Read the validation data (in full) again. But this time, we just want the Row and the rating
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

df_ensemble_full = df_vali[["RowID", "rating"]]      

del df_vali

In [36]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "svdpp" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

if useModelSurpriseSlopeOne:
  fileName = filePrefix + "_" + "slopeone" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

# # Content Filter Runs
if useModelLbgmBeerContext:
  fileName = filePrefix + "_" + "lgbm_beercontext" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

if useModelLbgmAllCols:
  fileName = filePrefix + "_" + "lgbm_allcols" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

if useModelSkLinReg:
  fileName = filePrefix + "_" + "sklinearreg" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)


# # Content Filter with NLP Runs
if useModelLgbmNlpAll:
  fileName = filePrefix + "_" + "lgbm_allcols_inc_nlp" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)
  
if useModelLgbmNlpBeerName:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_beer_name" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)
  
if useModelLgbmNlpText:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_text" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

In [37]:
# Get all the columns
col_names = df_ensemble_full.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID','rating'])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_ensemble_full[idCols]
dfTrainFeatures = df_ensemble_full[feature_cols]
dfTrainTarget = df_ensemble_full[target_col]


In [38]:
# Doing the final Ensemble prediction using Light GBM Regression, params tuned

# Create the model and predict
model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed,
  learning_rate=0.298864877137463, num_leaves=127, max_depth=26, n_estimators=974
)
model.fit(X=dfTrainFeatures, y=dfTrainTarget)

# use the model to predict
test_predicted = model.predict(dfTrainFeatures)
dfPredicted = pd.DataFrame({"Predict": test_predicted})

# Calc the MAE and display
predictValiMae_Ensemble = mean_absolute_error(dfTrainTarget, test_predicted)
print("Ensemble Final Average MAE (from validation data): " + str(predictValiMae_Ensemble))

# Save the model to file
model.booster_.save_model(modelsDir + filePrefix + "_ensemble_predictor.model")

print(model.feature_importances_)

Ensemble Final Average MAE (from validation data): 0.4099487846844963
[22407 21312  4957 19467  1958  5838 10947 13563 13565  8710]


In [39]:
# Clean up all the variables
del df_ensemble_full
del dfTrainIds
del dfTrainFeatures
del dfTrainTarget
del model
del test_predicted
del dfPredicted

## Predict on the Test data with Models for Subruns

Now that we have the final Ensemble model, we can process the Test data. We need to load the test data, and create all the sub runs by using all the base level models to predict.

First, predict using the Collaborative Filter Models

In [40]:
# Read the validation data (in full)
df_test = pd.read_csv(testFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType'])

# The test set is unlabeled, so we don't know the true ratings. Populate a rating col with zeros, as we are going
# to predict these values
df_test["rating"] = 0

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfTestIds = df_test[idCols]
dfTestFeatures = df_test.drop(['RowID','BeerName','BeerType'],axis=1)
dsetTestFeatures = Dataset.load_from_df(dfTestFeatures[['BeerID','ReviewerID','rating']],reader)

In [41]:
# Predict using the Collaborative Filter Models
featutil.predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "test", dsetTestFeatures, dfTestIds, subrunDir)
featutil.predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "test", dsetTestFeatures, dfTestIds, subrunDir)
featutil.predictSurpriseModel(modelsDir, filePrefix, "svdpp", "test", dsetTestFeatures, dfTestIds, subrunDir)

if useModelSurpriseSlopeOne:
  featutil.predictSurpriseModel(modelsDir, filePrefix, "slopeone", "test", dsetTestFeatures, dfTestIds, subrunDir)

# Ignore the displaed MAEs, since all the targets are 0

MAE:  3.8233
MAE for knnwithmeans: 3.8232692268407495
MAE:  3.8282
MAE for baselineonly: 3.828185583765752
MAE:  3.8266
MAE for svdpp: 3.8266065706323
MAE:  3.8257
MAE for slopeone: 3.825660997611865


In [42]:
# Clean up variables from the Predict Stage
del reader
del dfTestIds
del dfTestFeatures
del dsetTestFeatures

# Keep this, as we will use this in the next stage
# del df_test 

Now Predict using the Content Filter Models. 

In [43]:
# Reload that test data that was cleaned and processed previously
df_test_data = pd.read_csv(baseDataDir + filePrefix + "_test_cleaned.csv")

In [44]:
# Get all the columns
col_names = df_test_data.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID', 'rating' ])

# Create the sub data sets of the features and the target
dfTestIds = df_test_data[idCols]
dfTestFeatures = df_test_data[feature_cols]

In [45]:
print(df_test_data.columns)
df_test_data.head()

Index(['RowID', 'BeerID', 'ReviewerID', 'rating', 'ReviewerReviewCount',
       'BeerReviewCount', 'ABV', 'DayofWeek', 'DayofMonth', 'Month',
       ...
       'BeerType_SmokedBeer', 'BeerType_Tripel', 'BeerType_ViennaLager',
       'BeerType_Weizenbock', 'BeerType_Wheatwine', 'BeerType_WinterWarmer',
       'BeerType_Witbier', 'Gender_Female', 'Gender_Male', 'Gender_unknown'],
      dtype='object', length=2196)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,18,12300,10059,,165,8,7.4,7,12,6,...,0,0,0,0,0,0,0,0,1,0
1,20,12300,9761,,156,8,7.4,6,21,5,...,0,0,0,0,0,0,0,0,1,0
2,30,12300,7279,,553,8,7.4,2,12,10,...,0,0,0,0,0,0,0,0,0,1
3,46,12300,2367,,283,8,5.5,3,22,7,...,0,0,0,0,0,0,0,0,1,0
4,47,12300,2230,,35,8,5.5,2,21,7,...,0,0,0,0,0,0,0,0,1,0


Problem is with one hot encoding, different sets of brewers or beer types between the training data (train+vali) and what is in test

In [46]:
# Now we can make predictions according to each of our Content Filter Models. Pass None for the target set, the function
# will just skip the evaluation (calculating the MAE)
if useModelLbgmBeerContext:
    dfTestFeatures_BeerContext =  getFeaturesBeerContext(dfTestFeatures)
    featutil.predictLightGbmModel(dfTestIds, dfTestFeatures_BeerContext, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_beercontext")    
    del dfTestFeatures_BeerContext

if useModelLbgmAllCols:
    featutil.predictLightGbmModel(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols") 

if useModelSkLinReg:
    featutil.predictSkLinearRegModel(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "sklinearreg")


In [47]:
del df_test_data
del dfTestIds
del dfTestFeatures

Finally, Predict using the Content Filter with NLP models

In [48]:
df_test = dd.read_csv(testFullProcessedPath)

df_test.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,BeerType_Altbier,BeerType_AmericanAdjunctLager,BeerType_AmericanAmberRedAle,BeerType_AmericanAmberRedLager,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,18,12300,10059,,165,8,0,0,0,0,...,-0.047357,-0.073542,-0.023027,0.025345,-0.030229,-0.007293,0.040549,0.005911,-0.05855,-0.022677
1,20,12300,9761,,156,8,0,0,0,0,...,-0.027229,-0.069021,-0.01867,0.021589,-0.035218,-0.000501,0.013069,0.017086,-0.039631,-0.022168
2,30,12300,7279,,553,8,0,0,0,0,...,-0.056197,-0.079049,-0.01005,0.016265,-0.033021,-0.007931,0.035068,-0.018398,-0.054272,-0.022589
3,46,12300,2367,,283,8,0,0,0,0,...,-0.056593,-0.067475,-0.033789,0.029176,-0.015081,-0.00761,0.066042,0.016889,-0.057355,-0.021957
4,47,12300,2230,,35,8,0,0,0,0,...,-0.026703,-0.084011,-0.074139,0.015535,0.01414,0.022282,0.008867,-0.008055,-0.017667,-0.064827
5,51,12300,4346,,46,8,0,0,0,0,...,-0.024198,-0.076104,-0.020363,0.027268,-0.034891,-0.014437,0.002004,0.008169,-0.043079,-0.010329
6,52,12300,532,,154,8,0,0,0,0,...,-0.023447,-0.062591,-0.038872,0.027154,-0.040924,-0.011036,0.015005,0.022344,-0.049522,-0.003017
7,53,12300,8883,,323,8,0,0,0,0,...,-0.024459,-0.086282,-0.030375,0.034249,-0.03669,-0.023567,-0.006577,0.008867,-0.043228,-0.009612
8,4799,10553,10059,,165,90,0,0,0,0,...,-0.030926,-0.081421,-0.01783,0.017344,-0.018943,-0.007471,0.018448,0.010695,-0.040801,-0.024204
9,4764,10553,1376,,83,90,0,0,0,0,...,-0.027462,-0.071645,-0.005458,0.038459,-0.01849,0.002905,0.014639,0.004184,-0.060298,-0.056569


In [49]:
# Get all the columns
col_names = df_test.columns

idCols = ['RowID','BeerID','ReviewerID']
dfTestIds = df_test[idCols]

if useModelLgbmNlpAll:
    feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID', 'rating' ])
    dfTestFeatures = df_test[feature_cols]
    featutil.predictLightGbmModelDask(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols_inc_nlp")       

    del dfTestFeatures



In [50]:
if useModelLgbmNlpBeerName:
    # Just get the Text columns
    feature_cols = list(filter(lambda x: x.startswith("BeerName_"), col_names))
    dfTestFeatures = df_test[feature_cols]
    featutil.predictLightGbmModelDask(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols_nlp_beer_name")  
        
    del dfTestFeatures



In [51]:
if useModelLgbmNlpText:
    # Just get the Text columns
    feature_cols = list(filter(lambda x: x.startswith("Lemmatized_"), col_names))
    dfTestFeatures = df_test[feature_cols]
    featutil.predictLightGbmModelDask(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols_nlp_text")  
        
    del dfTestFeatures



In [52]:
del dfTestIds

### Load the Ensemble Model and predict on the Test data

Load the test data

In [53]:
df_ensemble_test = df_test[["RowID"]]     

# Convert the Dask Dataset back to a pandas dataset
df_ensemble_test = df_ensemble_test.compute()

del df_test

In [54]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "svdpp" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

if useModelSurpriseSlopeOne:
  fileName = filePrefix + "_" + "slopeone" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

# # Content Filter Runs
if useModelLbgmBeerContext:
  fileName = filePrefix + "_" + "lgbm_beercontext" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

if useModelLbgmAllCols:
  fileName = filePrefix + "_" + "lgbm_allcols" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

if useModelSkLinReg:
  fileName = filePrefix + "_" + "sklinearreg" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)


# Content Filter Runs inc NLP doc vector cols
if useModelLgbmNlpAll:
  fileName = filePrefix + "_" + "lgbm_allcols_inc_nlp" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)
  
if useModelLgbmNlpBeerName:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_beer_name" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)
  
if useModelLgbmNlpText:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_text" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

In [55]:
df_ensemble_test.head()

Unnamed: 0,RowID,A3_152_ensemble_v4_complete_run_knnwithmeans_test_subrun,A3_152_ensemble_v4_complete_run_baselineonly_test_subrun,A3_152_ensemble_v4_complete_run_svdpp_test_subrun,A3_152_ensemble_v4_complete_run_slopeone_test_subrun,A3_152_ensemble_v4_complete_run_lgbm_beercontext_test_subrun,A3_152_ensemble_v4_complete_run_lgbm_allcols_test_subrun,A3_152_ensemble_v4_complete_run_sklinearreg_test_subrun,A3_152_ensemble_v4_complete_run_lgbm_allcols_inc_nlp_test_subrun,A3_152_ensemble_v4_complete_run_lgbm_allcols_nlp_beer_name_test_subrun,A3_152_ensemble_v4_complete_run_lgbm_allcols_nlp_text_test_subrun
0,18,4.125042,4.083293,4.060041,4.140064,4.0,4.0,3.876806,3.924936,3.931508,4.0
1,20,4.116027,4.108744,4.048259,4.246457,4.0,4.0,3.877441,3.924936,3.931508,4.0
2,30,4.313215,4.30251,4.229747,4.27467,4.0,4.0,3.881769,3.924936,3.931508,4.007501
3,46,4.13573,4.129984,4.07646,4.067069,4.0,4.0,3.879409,3.924936,3.931508,3.964135
4,47,4.367684,4.38283,4.301268,4.201221,4.0,4.0,3.878602,3.924315,3.931508,4.003343


In [56]:
# Get all the columns
col_names = df_ensemble_test.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID'])

# Create the sub data sets of the features and the target
dfTestFeatures = df_ensemble_test[feature_cols]

In [57]:
# load the ensemble model  and predict
model = lgb.Booster(model_file=modelsDir + filePrefix + "_ensemble_predictor.model")
predicted = model.predict(dfTestFeatures)

dfPredictions = df_ensemble_test[idCols]
dfPredictions["Score"] = predicted

# join the predictions to the ids, sort by rowid and write to out the subrun file
finalRunFilePath = runDir + filePrefix + "_run.tsv"
dfPredictions.to_csv(finalRunFilePath, sep="\t", index=False, header=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfPredictions["Score"] = predicted


In [58]:
print("Final Report on Validation Set MAEs")
print(" ")
print("* KNN With Means: " + str(predictValiMae_KnnWithMeans))
print("* Baseline Only: " + str(predictValiMae_BaselineOnly))
print("* SVDpp: " + str(predictValiMae_SVDpp))

if useModelSurpriseSlopeOne:
  print("* SlopeOne: " + str(predictValiMae_SlopeOne))

if useModelLbgmBeerContext:
  print("* Lgbm Beer Context columns: " + str(predictValiMae_LgbmBeerContext))

if useModelLbgmAllCols:
  print("* Lgbm All cols: " + str(predictValiMae_LgbmAllCols))

if useModelSkLinReg:
  print("* Sklearn Linear Regression: " + str(predictValiMae_SkLinearReg))

if useModelLgbmNlpAll:
  print("* Lgbm NLP All columns: " + str(predictValiMae_LgbmAllColsIncNlp))

if useModelLgbmNlpBeerName:
  print("* Lgbm NLP on Beer Name: " + str(predictValiMae_LgbmNlpBeerName))

if useModelLgbmNlpText:
  print("* Lgbm NLP on Lemmatized Text: " + str(predictValiMae_LgbmNlpText))  
  
print(" ")
print("Final Ensemble MAE: " + str(predictValiMae_Ensemble))

Final Report on Validation Set MAEs
 
* KNN With Means: 0.4395334732274164
* Baseline Only: 0.43974731321337585
* SVDpp: 0.44321489275755915
* SlopeOne: 0.44176542058237306
* Lgbm Beer Context columns: 0.4783112983280762
* Lgbm All cols: 0.47739847463670526
* Sklearn Linear Regression: 0.49873941880294115
* Lgbm NLP All columns: 0.463112342301003
* Lgbm NLP on Beer Name: 0.4618885539614043
* Lgbm NLP on Lemmatized Text: 0.4969013926605755
 
Final Ensemble MAE: 0.4099487846844963


In [59]:
# Clean up variables
del df_ensemble_test
del dfTestFeatures
del model
del predicted
del dfPredictions

## Run with all models, inc NLL models and SlopeOne:

Final Report on Validation Set MAEs
 
* KNN With Means: 0.4395334732274164
* Baseline Only: 0.43974731321337585
* SVDpp: 0.44321489275755915
* SlopeOne: 0.44176542058237306
* Lgbm Beer Context columns: 0.4783112983280762
* Lgbm All cols: 0.47739847463670526
* Sklearn Linear Regression: 0.49873941880294115
* Lgbm NLP All columns: 0.463112342301003
* Lgbm NLP on Beer Name: 0.4618885539614043
* Lgbm NLP on Lemmatized Text: 0.4969013926605755
 
Final Ensemble MAE: 0.4099487846844963


## Run with all data, no NLP All and No SlopeOne:

Final Report on Validation Set MAEs
 
* KNN With Means: 0.4395334732274163
* Baseline Only: 0.4397473132133757
* SVDpp: 0.443214892757559
* Lgbm Beer Context columns: 0.4783112983280762
* Lgbm All cols: 0.47739847463670526
* Sklearn Linear Regression: 0.49873941880294115
* Lgbm NLP on Beer Name: 0.4618885539614043
* Lgbm NLP on Lemmatized Text: ( Lost Value )
 
Final Ensemble MAE: 0.41426354435001556
