In [1]:
# Import libraries
import pandas as pd
import dask.dataframe as dd
import numpy as np
import lightgbm as lgb

# Possibly use this if we have memory issues
#import dask.dataframe as dd

from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import SlopeOne
from surprise import accuracy
from surprise import dump
from surprise.model_selection import train_test_split

from contentknn import ContentKNNAlgorithm

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

## File Details

Working notebook of a Full Ensemble Complete Run, this time including a NLP Regression with LightGBM. Found that NLP runs on Beer Name and Beer Type each separately scored better than the all columns, so going to use these as separate runs

Collaborative Filter models to use:

* KNNWithMeans
* BaselineOnly
* SVDpp
* SlopeOne

Content Filter models 

* LightGBM on all non text columns
* LightGBM with Beer Context columns (non text)
* SKLearn Linear Regression on all non text columns
* LightGBM All columns including NLP
* LightGBM NLP on Beer Name
* LightGBM NLP on Text Lemmatized

Hybrid Models
* ContentKNNAlgorithm - uses just the number columns for Cosine Similarity Matrix


In [2]:
filePrefix = "A3_153_ensemble_v5_complete_run"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
runDir = "runs/"
modelsDir = "models/"
forceRetrainModels = True
forceRetrainModelSurprise = forceRetrainModels
forceRetrainModelContentFilters = forceRetrainModels
# forceRetrainModelSurprise = False
# forceRetrainModelContentFilters = True

# seed = databasic.get_random_seed()
seed = 2046

In [3]:
trainFilePath = baseDataDir + 'train.tsv'
valiFilePath = baseDataDir + 'val.tsv'
featuresFilePath = baseDataDir + 'features.tsv'
testFilePath = baseDataDir + 'test.tsv'
trainFullProcessedPath = baseDataDir + 'train_features_preprocessed.csv'
valiFullProcessedPath = baseDataDir + 'vali_features_preprocessed.csv'
testFullProcessedPath = baseDataDir + 'test_features_preprocessed.csv'

# trainFilePath = baseDataDir + 'train_200k.tsv'
# valiFilePath = baseDataDir + 'vali_200k.tsv'
# featuresFilePath = baseDataDir + 'features_200k.tsv'
# testFilePath = baseDataDir + 'test_200k.tsv'
# trainFullProcessedPath = baseDataDir + 'train_features_preprocessed_200k.csv'
# valiFullProcessedPath = baseDataDir + 'vali_features_preprocessed_200k.csv'
# testFullProcessedPath = baseDataDir + 'test_features_preprocessed_200k.csv'

In [4]:
useModelSurpriseSlopeOne = True

useModelLbgmBeerContext = True
useModelLbgmAllCols = True
useModelSkLinReg = True

useModelLgbmNlpAll = True
useModelLgbmNlpBeerName = True
useModelLgbmNlpText = True

useModelContentKNN = True

Load the Files one by one then delete them after your done, for memory management

In [5]:
df_train = pd.read_csv(trainFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])
df_train.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,19,12300,10635,Rauch Ür Bock,Rauchbier,4.0
1,21,12300,6547,Rauch Ür Bock,Rauchbier,4.5
2,23,12300,9789,Rauch Ür Bock,Rauchbier,4.5
3,24,12300,7372,Rauch Ür Bock,Rauchbier,5.0
4,25,12300,1302,Rauch Ür Bock,Rauchbier,4.5
5,26,12300,704,Rauch Ür Bock,Rauchbier,4.5
6,29,12300,1747,Rauch Ür Bock,Rauchbier,5.0
7,31,12300,9368,Rauch Ür Bock,Rauchbier,4.5
8,32,12300,2568,Rauch Ür Bock,Rauchbier,4.0
9,33,12300,6838,Rauch Ür Bock,Rauchbier,4.0


In [6]:
# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)

## Collaborative Filter Models: Train

For the Collaborative Filtering Models, we only need the Training set. Train the models, then save them to file for later use

In [7]:
# Load into a Surprise dataset
reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID', 'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

In [8]:
# Create each algorithm, train the model, save it to file for later, then delete the model

predictorKNN = KNNWithMeans(k=160)
featutil.trainSurpriseModel(predictorKNN, trainsetTrainFeatures, modelsDir, filePrefix, "knnwithmeans", forceRetrainModelSurprise)
del predictorKNN

predictorBaselineOnly = BaselineOnly(bsl_options = {'n_epochs': 5, 'reg_u': 3, 'reg_i': 16})
featutil.trainSurpriseModel(predictorBaselineOnly, trainsetTrainFeatures, modelsDir, filePrefix, "baselineonly", forceRetrainModelSurprise)
del predictorBaselineOnly

predictorSVDpp = SVDpp(n_factors = 10, n_epochs=20, lr_all=0.005, reg_all=0.2)
featutil.trainSurpriseModel(predictorSVDpp, trainsetTrainFeatures, modelsDir, filePrefix, "svdpp", forceRetrainModelSurprise)
del predictorSVDpp

if useModelSurpriseSlopeOne:
  predictorSlopeOne = SlopeOne()
  featutil.trainSurpriseModel(predictorSlopeOne, trainsetTrainFeatures, modelsDir, filePrefix, "slopeone", forceRetrainModelSurprise)
  del predictorSlopeOne


Computing the msd similarity matrix...
Done computing similarity matrix.
The dump has been saved as file models/A3_153_ensemble_v5_complete_run_knnwithmeans_predictor.model
Estimating biases using als...
The dump has been saved as file models/A3_153_ensemble_v5_complete_run_baselineonly_predictor.model
The dump has been saved as file models/A3_153_ensemble_v5_complete_run_svdpp_predictor.model
The dump has been saved as file models/A3_153_ensemble_v5_complete_run_slopeone_predictor.model


In [9]:
# Clean up the training data used for the collaborate filters
del trainsetTrainFeatures
del reader
del dsetTrainFeatures
del dfTrainFeatures
del df_train

## Collaborative Filter Models: Predict On Validation Data 

Now we want to load the Validation set to we can predict against it and write out the subrun files, which will be used later for the Ensemble.

First, do the Predictions for the Collaborative Filter models (surprise)

In [10]:
# Read the validation data (in full)
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID', 'rating']],reader)

In [11]:
predictValiMae_KnnWithMeans = featutil.predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "val", dsetValiFeatures, dfValiIds, subrunDir)
predictValiMae_BaselineOnly = featutil.predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "val", dsetValiFeatures, dfValiIds, subrunDir)
predictValiMae_SVDpp = featutil.predictSurpriseModel(modelsDir, filePrefix, "svdpp", "val", dsetValiFeatures, dfValiIds, subrunDir)

if useModelSurpriseSlopeOne:
  predictValiMae_SlopeOne = featutil.predictSurpriseModel(modelsDir, filePrefix, "slopeone", "val", dsetValiFeatures, dfValiIds, subrunDir)  

MAE:  0.4395
MAE for knnwithmeans: 0.43953347322741626
MAE:  0.4397
MAE for baselineonly: 0.4397473132133758
MAE:  0.4432
MAE for svdpp: 0.4432245115467577
MAE:  0.4418
MAE for slopeone: 0.44176542058237306


In [12]:
# Clean up variables from the Predict Stage
del df_vali
del reader
del dfValiIds
del dfValiFeatures
del dsetValiFeatures

## Content Filter Models, train and predict

First we want to load the features and do all the data preprocessing, then we can train the different models

In [13]:
# Load the training data
df_train = pd.read_csv(trainFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

# Load the validation data. When we want to do one hot encoding, we have to do it over both datasets to ensure consistency
df_vali = pd.read_csv(valiFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID', 'BeerName','BeerType','rating'])

# Load the validation data. When we want to do one hot encoding, we have to do it over both datasets to ensure consistency
df_test = pd.read_csv(testFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID', 'BeerName','BeerType','rating'])                         

# Load the features
df_features = pd.read_csv(featuresFilePath,sep='\t',
    names=['RowID','BrewerID','ABV','DayofWeek','Month',
          'DayofMonth','Year','TimeOfDay','Gender',
          'Birthday','Text','Lemmatized','POS_Tag'])    

In [14]:
# Add the reviewer counts to each of the data sets
df_train = featutil.addReviewerReviewCount(df_train)
df_train = featutil.addBeerReviewCount(df_train)

df_vali = featutil.addReviewerReviewCount(df_vali)
df_vali = featutil.addBeerReviewCount(df_vali)

df_test = featutil.addReviewerReviewCount(df_test)
df_test = featutil.addBeerReviewCount(df_test)

In [15]:
colsToUse = ["RowID", "BrewerID", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender", "TimeOfDay", "Birthday"]

df_train_data = df_train.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
df_vali_data = df_vali.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
df_test_data = df_test.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")

# Remove the duplicated Row ID, also remove Beer Name at this point, we're nt using it
df_train_data = df_train_data.drop(['RowIDFeat', "BeerName"],axis=1)
df_vali_data = df_vali_data.drop(['RowIDFeat', "BeerName"],axis=1)
df_test_data = df_test_data.drop(['RowIDFeat', "BeerName"],axis=1)

In [16]:
# Clean up these dataframes now that they have been joined
del df_train
del df_vali
del df_features
del df_test

In [17]:
# do the feature transformations
df_train_data = featutil.fixNullABV(df_train_data)
df_vali_data = featutil.fixNullABV(df_vali_data)
df_test_data = featutil.fixNullABV(df_test_data)

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "BrewerID")

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "BeerType")

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "Gender")

df_train_data = featutil.formatDayOfWeek(df_train_data)
df_vali_data = featutil.formatDayOfWeek(df_vali_data)
df_test_data = featutil.formatDayOfWeek(df_test_data)

df_train_data = featutil.formatMonth(df_train_data)
df_vali_data = featutil.formatMonth(df_vali_data)
df_test_data = featutil.formatMonth(df_test_data)

df_train_data = featutil.formatTimeToSec(df_train_data)
df_vali_data = featutil.formatTimeToSec(df_vali_data)
df_test_data = featutil.formatTimeToSec(df_test_data)

df_train_data = featutil.convertBirthdayToAge(df_train_data)
df_vali_data = featutil.convertBirthdayToAge(df_vali_data)
df_test_data = featutil.convertBirthdayToAge(df_test_data)

(1237434, 17)


  df_combined.columns = df_combined.columns.str.replace(" ", "").str.replace("/", "").str.replace("-", "") \


(1237434, 2092)
(1237434, 2195)


In [18]:
# df_train_data = featutil.scaleFeatureDataFrame(df_train_data)
# df_vali_data = featutil.scaleFeatureDataFrame(df_vali_data)
# df_test_data = featutil.scaleFeatureDataFrame(df_test_data)

In [19]:
print(df_train_data.shape)
print(df_vali_data.shape)
print(df_test_data.shape)
df_test_data.head()

(746207, 2196)
(243834, 2196)
(247393, 2196)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,18,12300,10059,,165,8,7.4,7,12,6,...,0,0,0,0,0,0,0,0,1,0
1,20,12300,9761,,156,8,7.4,6,21,5,...,0,0,0,0,0,0,0,0,1,0
2,30,12300,7279,,553,8,7.4,2,12,10,...,0,0,0,0,0,0,0,0,0,1
3,46,12300,2367,,283,8,5.5,3,22,7,...,0,0,0,0,0,0,0,0,1,0
4,47,12300,2230,,35,8,5.5,2,21,7,...,0,0,0,0,0,0,0,0,1,0


In [20]:
# Write the test data file out so we can load it back in later so as not to have to redo this step
df_test_data.to_csv(baseDataDir + filePrefix + "_test_cleaned.csv", index=False)

del df_test_data

In [21]:
# Get all the columns
col_names = df_train_data.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainFeatures = df_train_data[feature_cols]
dfTrainTarget = df_train_data[target_col]

dfValiIds = df_vali_data[idCols]
dfValiFeatures = df_vali_data[feature_cols]
dfValiTarget = df_vali_data[target_col]


In [22]:
dfTrainFeatures.head()

Unnamed: 0,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,Year,TimeOfDay,Birthday,BrewerID_1,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,200,23,7.4,1,23,5,2011,56188,0,0,...,0,0,0,0,0,0,0,0,1,0
1,10,23,7.4,1,16,5,2011,1906,0,0,...,0,0,0,0,0,0,0,0,1,0
2,164,23,7.4,7,10,4,2011,44246,0,0,...,0,0,0,0,0,0,0,0,0,1
3,432,23,7.4,3,30,3,2011,50880,0,0,...,0,0,0,0,0,0,0,0,1,0
4,500,23,7.4,4,24,3,2011,50820,37,0,...,0,0,0,0,0,0,0,0,1,0


In [23]:
def getFeaturesBeerContext(df1):
  consumerCols = ["DayofWeek", "DayofMonth", "Month", "TimeOfDay", "Birthday", "Gender_Male", "Gender_Female", "Gender_unknown"]
  return dfutil.getFeaturesWithoutCols(df1, consumerCols)

In [25]:
# Train the models, save them to file and then clear the model from memory
if useModelLbgmBeerContext:
    modelBeerContext = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
        ,learning_rate=0.010443500090385492, num_leaves = 68, max_depth = 14, n_estimators = 608
    )  
    dfTrainFeatures_BeerContext = getFeaturesBeerContext(dfTrainFeatures)
    dfValiFeatures_BeerContext = getFeaturesBeerContext(dfValiFeatures)
    featutil.trainLightGbmModel(modelBeerContext, dfTrainFeatures_BeerContext, dfTrainTarget, 
        modelsDir, filePrefix, "lgbm_beercontext", forceRetrainModelContentFilters)
    predictValiMae_LgbmBeerContext = featutil.predictLightGbmModel(dfValiIds, dfValiFeatures_BeerContext, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_beercontext")    
    del dfTrainFeatures_BeerContext
    del dfValiFeatures_BeerContext
    del modelBeerContext

if useModelLbgmAllCols:
    modelLgbm = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
        ,learning_rate=0.16142127923810723, num_leaves = 127, max_depth = 18, n_estimators = 811
    ) 
    featutil.trainLightGbmModel(modelLgbm, dfTrainFeatures, dfTrainTarget, 
        modelsDir, filePrefix, "lgbm_allcols", forceRetrainModelContentFilters)
    predictValiMae_LgbmAllCols = featutil.predictLightGbmModel(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols")    
    del modelLgbm

if useModelSkLinReg:
    modelLinReg = LinearRegression()
    featutil.trainSkLinearRegModel(modelLinReg, dfTrainFeatures, dfTrainTarget, 
        modelsDir, filePrefix, "sklinearreg", forceRetrainModelContentFilters)
    predictValiMae_SkLinearReg = featutil.predictSkLinearRegModel(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "sklinearreg")
    del modelLinReg    


MAE for lgbm_beercontext: 0.47949462210333754
MAE for lgbm_allcols: 0.4770979101350705
MAE for sklinearreg: 0.49873941880294115


Do the Content KNN here to use the text preprocessed data without the full NLP data. Train then Predict

In [26]:
if useModelContentKNN:

  # Load into a Surprise dataset
  reader = Reader(rating_scale=(0, 5))
  dsetTrainFeatures = Dataset.load_from_df(df_train_data[['BeerID','ReviewerID', 'rating']],reader)
  trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

  colsToUse = df_train_data.columns.drop(['BeerID','ReviewerID','rating' ])

  # Train the KNN Model
  predictorContentKNN = ContentKNNAlgorithm()
  features_for_cos = df_train_data[colsToUse]
  predictorContentKNN.setFeatures(features_for_cos)
  featutil.trainSurpriseModel(predictorContentKNN, trainsetTrainFeatures, modelsDir, filePrefix, "contentknn", forceRetrainModelContentFilters)
  del predictorContentKNN

  # Now Predict with the model
  dsetValiFeatures = Dataset.load_from_df(df_vali_data[['BeerID','ReviewerID', 'rating']],reader)
  features_for_cos = df_vali_data[colsToUse]
  predictValiMae_ContentKNN = featutil.predictSurpriseModel(modelsDir, filePrefix, "contentknn", "val", dsetValiFeatures, dfValiIds, subrunDir, features_for_cos)


  del reader
  del dsetTrainFeatures
  del trainsetTrainFeatures
  del features_for_cos
  del dsetValiFeatures

Computing content-based similarity matrix...
0  of  10694
  Processing thisMovieID: 10635 otherMovieID: 6547
  Processing thisMovieID: 10635 otherMovieID: 9789
  Processing thisMovieID: 10635 otherMovieID: 7372
  Processing thisMovieID: 6547 otherMovieID: 9789
  Processing thisMovieID: 6547 otherMovieID: 7372
  Processing thisMovieID: 6547 otherMovieID: 1302
  Processing thisMovieID: 9789 otherMovieID: 7372
  Processing thisMovieID: 9789 otherMovieID: 1302
  Processing thisMovieID: 9789 otherMovieID: 704
  Processing thisMovieID: 7372 otherMovieID: 1302
  Processing thisMovieID: 7372 otherMovieID: 704
  Processing thisMovieID: 7372 otherMovieID: 1747
  Processing thisMovieID: 1302 otherMovieID: 704
  Processing thisMovieID: 1302 otherMovieID: 1747
  Processing thisMovieID: 1302 otherMovieID: 9368
1000  of  10694
2000  of  10694
3000  of  10694
4000  of  10694
5000  of  10694
6000  of  10694
7000  of  10694
8000  of  10694
9000  of  10694
10000  of  10694
...done.
The dump has been save

In [27]:
# Clean up the variables from memory
del df_train_data
del df_vali_data
del dfTrainFeatures
del dfTrainTarget
del dfValiIds
del dfValiFeatures
del dfValiTarget

## Content Filter Models with NLP

In this version, well train the Content Filter models using NLP. Assuming that the data has been preprocessed already and saved to file via the A3_130_create_full_features_processed notebook, although later, possibly add the logic into here as part of the process

In [28]:
# Load the training data
df_train = dd.read_csv(trainFullProcessedPath)

df_train.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,BeerType_Altbier,BeerType_AmericanAdjunctLager,BeerType_AmericanAmberRedAle,BeerType_AmericanAmberRedLager,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,19,12300,10635,4.0,200,23,0,0,0,0,...,-0.015221,-0.037351,-0.027596,0.042438,0.032531,-0.02837,0.035328,0.010716,-0.016952,-0.022973
1,21,12300,6547,4.5,10,23,0,0,0,0,...,-0.023733,-0.01519,-0.001721,0.023392,-0.019479,-0.005462,0.021144,0.027892,-0.010553,-0.038961
2,23,12300,9789,4.5,164,23,0,0,0,0,...,-0.018624,-0.013615,-0.011761,0.032562,0.005066,-0.030226,0.019855,0.056897,-0.034139,-0.028913
3,24,12300,7372,5.0,432,23,0,0,0,0,...,-0.03295,-0.008993,-0.009092,0.014841,-0.012791,-0.010687,0.020269,0.035415,-0.010456,-0.034556
4,25,12300,1302,4.5,500,23,0,0,0,0,...,-0.005594,-0.021759,-0.006643,0.022896,-0.017797,-0.012587,0.01783,0.021566,-0.017301,-0.024277
5,26,12300,704,4.5,605,23,0,0,0,0,...,-0.012204,-0.018415,-0.01822,0.029805,-0.018995,-0.017093,0.026047,0.023464,-0.024087,-0.023485
6,29,12300,1747,5.0,463,23,0,0,0,0,...,-0.016405,-0.007826,0.006608,0.03471,-0.001579,-0.020236,0.017857,0.023612,-0.017856,-0.011979
7,31,12300,9368,4.5,49,23,0,0,0,0,...,-0.017761,-0.020269,-0.028371,0.017902,-0.007823,-0.021233,0.021767,0.02661,-0.032275,-0.021046
8,32,12300,2568,4.0,221,23,0,0,0,0,...,-0.007007,-0.015566,-0.008591,0.037669,-0.016438,-0.011946,0.018999,0.02183,-0.031791,-0.001757
9,33,12300,6838,4.0,110,23,0,0,0,0,...,-0.009198,-0.012072,-0.014309,0.04513,-0.011133,-0.013445,0.014829,0.037455,-0.026073,-0.029366


In [29]:
# "BrewerID_", "BeerType_", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender_", "TimeOfDay", "Birthday", "BeerName_", "Lemmatized_"
# Get all the columns
col_names = df_train.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_train[idCols]
dfTrainFeatures = df_train[feature_cols]
dfTrainTarget = df_train[target_col]

if useModelLgbmNlpAll:
  # train the model
  model_lgbm_nlp = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
      ,learning_rate=0.09075359977364383, num_leaves = 120, max_depth = 40, n_estimators = 248, min_split_gain = 0.6310082232017945, 
      min_child_samples = 35, subsample = 0.9466694477903548, 
      subsample_freq = 0, colsample_bytree = 0.29392263338193186, reg_alpha = 0.891904482598078, reg_lambda = 0.4521335679885054
    )
  featutil.trainLightGbmModel(model_lgbm_nlp, dfTrainFeatures, dfTrainTarget, 
      modelsDir, filePrefix, "lgbm_allcols_inc_nlp", forceRetrainModelContentFilters)


In [30]:
del dfTrainFeatures

# Just get the Beer Name columns
feature_cols = list(filter(lambda x: x.startswith("BeerName_"), col_names))
dfTrainFeatures = df_train[feature_cols]

if useModelLgbmNlpBeerName:
  # train the model
  model_lgbm_nlp = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
      ,learning_rate=0.09075359977364383, num_leaves = 120, max_depth = 40, n_estimators = 248, min_split_gain = 0.6310082232017945, 
      min_child_samples = 35, subsample = 0.9466694477903548, 
      subsample_freq = 0, colsample_bytree = 0.29392263338193186, reg_alpha = 0.891904482598078, reg_lambda = 0.4521335679885054
    )
  featutil.trainLightGbmModel(model_lgbm_nlp, dfTrainFeatures, dfTrainTarget, 
      modelsDir, filePrefix, "lgbm_allcols_nlp_beer_name", forceRetrainModelContentFilters)


In [31]:
del dfTrainFeatures

# Just get the Text Lemmatized columns
feature_cols = list(filter(lambda x: x.startswith("Lemmatized_"), col_names))
dfTrainFeatures = df_train[feature_cols]

if useModelLgbmNlpText:
  # train the model
  model_lgbm_nlp = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
      ,learning_rate=0.09075359977364383, num_leaves = 120, max_depth = 40, n_estimators = 248, min_split_gain = 0.6310082232017945, 
      min_child_samples = 35, subsample = 0.9466694477903548, 
      subsample_freq = 0, colsample_bytree = 0.29392263338193186, reg_alpha = 0.891904482598078, reg_lambda = 0.4521335679885054
    )
  featutil.trainLightGbmModel(model_lgbm_nlp, dfTrainFeatures, dfTrainTarget, 
      modelsDir, filePrefix, "lgbm_allcols_nlp_text", forceRetrainModelContentFilters)


In [32]:
# clean up training data from memory
del df_train
del dfTrainIds
del dfTrainFeatures
del dfTrainTarget

del model_lgbm_nlp

In [33]:
df_vali = dd.read_csv(valiFullProcessedPath)

df_vali.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,BeerType_Altbier,BeerType_AmericanAdjunctLager,BeerType_AmericanAmberRedAle,BeerType_AmericanAmberRedLager,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,22,12300,2634,4.0,19,9,0,0,0,0,...,0.037856,-0.025707,0.018765,0.017947,0.032514,-0.005172,0.078611,0.037843,-0.055529,-0.062678
1,27,12300,5634,4.5,48,9,0,0,0,0,...,-0.008134,-0.024224,-0.016466,0.0284,0.006461,-0.019549,0.03387,0.035065,-0.03409,-0.023707
2,28,12300,3544,4.5,227,9,0,0,0,0,...,-0.017464,-0.020363,-0.029821,0.012509,-0.004991,-0.028156,0.004233,0.016622,-0.023521,-0.011652
3,40,12300,6521,4.0,81,9,0,0,0,0,...,0.009325,-0.009763,0.011405,0.036466,-0.000207,-0.027346,0.002969,0.049804,0.005456,-0.030397
4,43,12300,10177,4.5,58,9,0,0,0,0,...,0.005636,-0.022704,-0.017054,0.030922,0.023351,-0.029383,0.038637,0.034193,-0.033337,-0.031086
5,48,12300,2907,3.5,230,9,0,0,0,0,...,-0.006462,-0.019776,-0.02931,0.032069,-0.021865,-0.02649,0.021921,0.053789,-0.030939,-0.007063
6,49,12300,1532,4.0,185,9,0,0,0,0,...,-0.002694,-0.027112,-0.015027,0.028278,0.012752,-0.00664,0.022978,0.026543,-0.033684,-0.022682
7,50,12300,3452,3.5,37,9,0,0,0,0,...,0.001617,-0.002316,-0.004816,0.029866,-0.014934,-0.029023,0.019102,0.039361,-0.033655,-0.013928
8,59,12300,6861,4.0,230,9,0,0,0,0,...,-0.013137,-0.016453,-0.005179,0.027188,-0.005314,-0.021139,0.020758,0.047926,-0.019481,-0.014745
9,85539,1198,2634,4.5,19,531,0,0,0,0,...,0.049964,-0.036805,0.024819,0.027689,0.058805,-0.020324,0.068448,0.071592,-0.07276,-0.04718


In [34]:
col_names = df_vali.columns

dfValiIds = df_vali[idCols]
dfValiTarget = df_vali[target_col]

if useModelLgbmNlpAll:
    feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
    dfValiFeatures = df_vali[feature_cols]
    predictValiMae_LgbmAllColsIncNlp = featutil.predictLightGbmModelDask(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols_inc_nlp")  
     
    del dfValiFeatures




MAE for lgbm_allcols_inc_nlp: 0.4619192065868069


In [35]:
if useModelLgbmNlpBeerName:

    # Just get the Beer Name columns
    feature_cols = list(filter(lambda x: x.startswith("BeerName_"), col_names))
    dfValiFeatures = df_vali[feature_cols]

    predictValiMae_LgbmNlpBeerName = featutil.predictLightGbmModelDask(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols_nlp_beer_name")  

    del dfValiFeatures



MAE for lgbm_allcols_nlp_beer_name: 0.4618644555954773


In [36]:
if useModelLgbmNlpText:
    # Just get the Beer Name columns
    feature_cols = list(filter(lambda x: x.startswith("Lemmatized"), col_names))
    dfValiFeatures = df_vali[feature_cols]
    predictValiMae_LgbmNlpText = featutil.predictLightGbmModelDask(dfValiIds, dfValiFeatures, dfValiTarget,
        subrunDir, modelsDir, filePrefix, "val", "lgbm_allcols_nlp_text")  
        
    del dfValiFeatures



MAE for lgbm_allcols_nlp_text: 0.4968858179209336


In [37]:
del df_vali
del dfValiIds
del dfValiTarget

## Train the Ensemble Model

Now that all the sub run files have been generated, combine all the predictions into one dataset, train a new final, ensemble model, predict on the validation data and get an MAE and save the model for use later on the Test data.

In [38]:
# Read the validation data (in full) again. But this time, we just want the Row and the rating
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

df_ensemble_full = df_vali[["RowID", "rating"]]      

del df_vali

In [39]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "svdpp" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

if useModelSurpriseSlopeOne:
  fileName = filePrefix + "_" + "slopeone" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

# # Content Filter Runs
if useModelLbgmBeerContext:
  fileName = filePrefix + "_" + "lgbm_beercontext" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

if useModelLbgmAllCols:
  fileName = filePrefix + "_" + "lgbm_allcols" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

if useModelSkLinReg:
  fileName = filePrefix + "_" + "sklinearreg" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)


# # Content Filter with NLP Runs
if useModelLgbmNlpAll:
  fileName = filePrefix + "_" + "lgbm_allcols_inc_nlp" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)
  
if useModelLgbmNlpBeerName:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_beer_name" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)
  
if useModelLgbmNlpText:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_text" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)


#  Hybrid Filter 
  
if useModelContentKNN:
  fileName = filePrefix + "_" + "contentknn" + "_val" + "_subrun"
  df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

In [40]:
# Get all the columns
col_names = df_ensemble_full.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID','rating'])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_ensemble_full[idCols]
dfTrainFeatures = df_ensemble_full[feature_cols]
dfTrainTarget = df_ensemble_full[target_col]


In [41]:
# Doing the final Ensemble prediction using Light GBM Regression, params tuned

# Create the model and predict
model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed,
  learning_rate=0.2743431718467076, num_leaves=119, max_depth=24, n_estimators=772
)
model.fit(X=dfTrainFeatures, y=dfTrainTarget)

# use the model to predict
test_predicted = model.predict(dfTrainFeatures)
dfPredicted = pd.DataFrame({"Predict": test_predicted})

# Calc the MAE and display
predictValiMae_Ensemble = mean_absolute_error(dfTrainTarget, test_predicted)
print("Ensemble Final Average MAE (from validation data): " + str(predictValiMae_Ensemble))

# Save the model to file
model.booster_.save_model(modelsDir + filePrefix + "_ensemble_predictor.model")

print(model.feature_importances_)

Ensemble Final Average MAE (from validation data): 0.3976888068139247
[17764 10060  7711 11616  2286  1540 12121  5757 11473 10768     0]


In [42]:
# Clean up all the variables
del df_ensemble_full
del dfTrainIds
del dfTrainFeatures
del dfTrainTarget
del model
del test_predicted
del dfPredicted

## Predict on the Test data with Models for Subruns

Now that we have the final Ensemble model, we can process the Test data. We need to load the test data, and create all the sub runs by using all the base level models to predict.

First, predict using the Collaborative Filter Models

In [43]:
# Read the validation data (in full)
df_test = pd.read_csv(testFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType'])

# The test set is unlabeled, so we don't know the true ratings. Populate a rating col with zeros, as we are going
# to predict these values
df_test["rating"] = 0

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfTestIds = df_test[idCols]
dfTestFeatures = df_test.drop(['RowID','BeerName','BeerType'],axis=1)
dsetTestFeatures = Dataset.load_from_df(dfTestFeatures[['BeerID','ReviewerID','rating']],reader)

In [44]:
# Predict using the Collaborative Filter Models
featutil.predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "test", dsetTestFeatures, dfTestIds, subrunDir)
featutil.predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "test", dsetTestFeatures, dfTestIds, subrunDir)
featutil.predictSurpriseModel(modelsDir, filePrefix, "svdpp", "test", dsetTestFeatures, dfTestIds, subrunDir)

if useModelSurpriseSlopeOne:
  featutil.predictSurpriseModel(modelsDir, filePrefix, "slopeone", "test", dsetTestFeatures, dfTestIds, subrunDir)

# Ignore the displaed MAEs, since all the targets are 0

MAE:  3.8233
MAE for knnwithmeans: 3.8232692268407495
MAE:  3.8282
MAE for baselineonly: 3.828185583765753
MAE:  3.8266
MAE for svdpp: 3.8265949358490663
MAE:  3.8257
MAE for slopeone: 3.825660997611865


In [45]:
# Clean up variables from the Predict Stage
del reader
del dfTestIds
del dfTestFeatures
del dsetTestFeatures

# Keep this, as we will use this in the next stage
# del df_test 

Now Predict using the Content Filter Models. 

In [46]:
# Reload that test data that was cleaned and processed previously
df_test_data = pd.read_csv(baseDataDir + filePrefix + "_test_cleaned.csv")

In [47]:
# Get all the columns
col_names = df_test_data.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID', 'rating' ])

# Create the sub data sets of the features and the target
dfTestIds = df_test_data[idCols]
dfTestFeatures = df_test_data[feature_cols]

In [48]:
print(df_test_data.columns)
df_test_data.head()

Index(['RowID', 'BeerID', 'ReviewerID', 'rating', 'ReviewerReviewCount',
       'BeerReviewCount', 'ABV', 'DayofWeek', 'DayofMonth', 'Month',
       ...
       'BeerType_SmokedBeer', 'BeerType_Tripel', 'BeerType_ViennaLager',
       'BeerType_Weizenbock', 'BeerType_Wheatwine', 'BeerType_WinterWarmer',
       'BeerType_Witbier', 'Gender_Female', 'Gender_Male', 'Gender_unknown'],
      dtype='object', length=2196)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,18,12300,10059,,165,8,7.4,7,12,6,...,0,0,0,0,0,0,0,0,1,0
1,20,12300,9761,,156,8,7.4,6,21,5,...,0,0,0,0,0,0,0,0,1,0
2,30,12300,7279,,553,8,7.4,2,12,10,...,0,0,0,0,0,0,0,0,0,1
3,46,12300,2367,,283,8,5.5,3,22,7,...,0,0,0,0,0,0,0,0,1,0
4,47,12300,2230,,35,8,5.5,2,21,7,...,0,0,0,0,0,0,0,0,1,0


Problem is with one hot encoding, different sets of brewers or beer types between the training data (train+vali) and what is in test

In [49]:
# Now we can make predictions according to each of our Content Filter Models. Pass None for the target set, the function
# will just skip the evaluation (calculating the MAE)
if useModelLbgmBeerContext:
    dfTestFeatures_BeerContext =  getFeaturesBeerContext(dfTestFeatures)
    featutil.predictLightGbmModel(dfTestIds, dfTestFeatures_BeerContext, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_beercontext")    
    del dfTestFeatures_BeerContext

if useModelLbgmAllCols:
    featutil.predictLightGbmModel(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols") 

if useModelSkLinReg:
    featutil.predictSkLinearRegModel(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "sklinearreg")


Also do our Content KNN here because we want to use the saved cleaned features without all the nlp data

In [50]:
if useModelContentKNN:

  # Load into a Surprise dataset
  reader = Reader(rating_scale=(0, 5))
  dsetTestFeatures = Dataset.load_from_df(df_test_data[['BeerID','ReviewerID', 'rating']],reader)

  colsToUse =  df_test_data.columns.drop(['BeerID','ReviewerID', 'rating' ])
  features_for_cos = df_test_data[colsToUse]

  featutil.predictSurpriseModel(modelsDir, filePrefix, "contentknn", "test", dsetTestFeatures, dfTestIds, subrunDir, features_for_cos)

  del reader
  del dsetTestFeatures
  del features_for_cos  

MAE:  nan
MAE for contentknn: nan


In [51]:
del df_test_data
del dfTestIds
del dfTestFeatures

Finally, Predict using the Content Filter with NLP models

In [52]:
df_test = dd.read_csv(testFullProcessedPath)

df_test.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,BeerType_Altbier,BeerType_AmericanAdjunctLager,BeerType_AmericanAmberRedAle,BeerType_AmericanAmberRedLager,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,18,12300,10059,,165,8,0,0,0,0,...,-0.014647,-0.026918,-0.011228,0.03132,0.003554,-0.008815,0.013557,0.02666,0.003221,-0.023906
1,20,12300,9761,,156,8,0,0,0,0,...,-0.015676,-0.016347,0.000929,0.03232,-0.003642,-0.001494,0.011423,0.029472,-0.016628,-0.025493
2,30,12300,7279,,553,8,0,0,0,0,...,-0.025169,-0.026436,-0.007408,0.021257,0.002338,-0.006929,0.013163,0.017373,-0.017656,-0.033172
3,46,12300,2367,,283,8,0,0,0,0,...,-0.019156,-0.042503,-0.032763,0.029283,0.018134,-0.023948,0.039365,0.030793,-0.016637,-0.024441
4,47,12300,2230,,35,8,0,0,0,0,...,0.054461,-0.020031,-0.019424,-0.004812,0.010093,-0.021314,0.009206,0.014198,-0.029098,-0.024061
5,51,12300,4346,,46,8,0,0,0,0,...,0.011447,-0.027746,-0.006993,0.035289,-0.016882,-0.014864,0.002232,0.034966,-0.02604,-0.02424
6,52,12300,532,,154,8,0,0,0,0,...,-0.004506,0.008535,-3.7e-05,0.037093,-0.004332,-0.019775,0.012535,0.043481,-0.030942,-0.010344
7,53,12300,8883,,323,8,0,0,0,0,...,-0.001062,-0.007997,-0.001691,0.046424,-0.008982,-0.024702,0.020907,0.0484,-0.025968,-0.026031
8,4799,10553,10059,,165,90,0,0,0,0,...,-0.017229,-0.022178,0.022393,0.026138,0.023597,-0.013615,0.031667,0.050123,-0.032404,-0.024811
9,4764,10553,1376,,83,90,0,0,0,0,...,-0.038134,0.000195,-0.0264,0.025432,0.01795,-0.006545,0.051419,0.00233,-0.062575,-0.068906


In [53]:
# Get all the columns
col_names = df_test.columns

idCols = ['RowID','BeerID','ReviewerID']
dfTestIds = df_test[idCols]

if useModelLgbmNlpAll:
    feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID', 'rating' ])
    dfTestFeatures = df_test[feature_cols]
    featutil.predictLightGbmModelDask(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols_inc_nlp")       

    del dfTestFeatures



In [54]:
if useModelLgbmNlpBeerName:
    # Just get the Text columns
    feature_cols = list(filter(lambda x: x.startswith("BeerName_"), col_names))
    dfTestFeatures = df_test[feature_cols]
    featutil.predictLightGbmModelDask(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols_nlp_beer_name")  
        
    del dfTestFeatures



In [55]:
if useModelLgbmNlpText:
    # Just get the Text columns
    feature_cols = list(filter(lambda x: x.startswith("Lemmatized_"), col_names))
    dfTestFeatures = df_test[feature_cols]
    featutil.predictLightGbmModelDask(dfTestIds, dfTestFeatures, None,
        subrunDir, modelsDir, filePrefix, "test", "lgbm_allcols_nlp_text")  
        
    del dfTestFeatures



In [56]:
del dfTestIds

### Load the Ensemble Model and predict on the Test data

Load the test data

In [57]:
df_ensemble_test = df_test[["RowID"]]     

# Convert the Dask Dataset back to a pandas dataset
df_ensemble_test = df_ensemble_test.compute()

del df_test

In [58]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "svdpp" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

if useModelSurpriseSlopeOne:
  fileName = filePrefix + "_" + "slopeone" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

# # Content Filter Runs
if useModelLbgmBeerContext:
  fileName = filePrefix + "_" + "lgbm_beercontext" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

if useModelLbgmAllCols:
  fileName = filePrefix + "_" + "lgbm_allcols" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

if useModelSkLinReg:
  fileName = filePrefix + "_" + "sklinearreg" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)


# Content Filter Runs inc NLP doc vector cols
if useModelLgbmNlpAll:
  fileName = filePrefix + "_" + "lgbm_allcols_inc_nlp" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)
  
if useModelLgbmNlpBeerName:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_beer_name" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)
  
if useModelLgbmNlpText:
  fileName = filePrefix + "_" + "lgbm_allcols_nlp_text" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)


if useModelContentKNN:
  fileName = filePrefix + "_" + "contentknn" + "_test" + "_subrun"
  df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)  

In [59]:
df_ensemble_test.head()

Unnamed: 0,RowID,A3_153_ensemble_v5_complete_run_knnwithmeans_test_subrun,A3_153_ensemble_v5_complete_run_baselineonly_test_subrun,A3_153_ensemble_v5_complete_run_svdpp_test_subrun,A3_153_ensemble_v5_complete_run_slopeone_test_subrun,A3_153_ensemble_v5_complete_run_lgbm_beercontext_test_subrun,A3_153_ensemble_v5_complete_run_lgbm_allcols_test_subrun,A3_153_ensemble_v5_complete_run_sklinearreg_test_subrun,A3_153_ensemble_v5_complete_run_lgbm_allcols_inc_nlp_test_subrun,A3_153_ensemble_v5_complete_run_lgbm_allcols_nlp_beer_name_test_subrun,A3_153_ensemble_v5_complete_run_lgbm_allcols_nlp_text_test_subrun,A3_153_ensemble_v5_complete_run_contentknn_test_subrun
0,18,4.125042,4.083293,4.058317,4.140064,4.0,4.0,3.876806,3.92397,3.913344,4.0,3.8192
1,20,4.116027,4.108744,4.050429,4.246457,4.0,4.0,3.877441,3.92397,3.913344,4.0,3.8192
2,30,4.313215,4.30251,4.232854,4.27467,4.0,4.0,3.881769,3.917861,3.913344,4.0,3.8192
3,46,4.13573,4.129984,4.081257,4.067069,4.0,4.0,3.879409,3.92397,3.913344,3.922515,3.8192
4,47,4.367684,4.38283,4.313616,4.201221,4.0,4.0,3.878602,3.925656,3.913344,3.970589,3.8192


In [60]:
# Get all the columns
col_names = df_ensemble_test.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID'])

# Create the sub data sets of the features and the target
dfTestFeatures = df_ensemble_test[feature_cols]

In [61]:
# load the ensemble model  and predict
model = lgb.Booster(model_file=modelsDir + filePrefix + "_ensemble_predictor.model")
predicted = model.predict(dfTestFeatures)

dfPredictions = df_ensemble_test[idCols]
dfPredictions["Score"] = predicted

# join the predictions to the ids, sort by rowid and write to out the subrun file
finalRunFilePath = runDir + filePrefix + "_run.tsv"
dfPredictions.to_csv(finalRunFilePath, sep="\t", index=False, header=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfPredictions["Score"] = predicted


In [62]:
print("Final Report on Validation Set MAEs")
print(" ")
print("* KNN With Means: " + str(predictValiMae_KnnWithMeans))
print("* Baseline Only: " + str(predictValiMae_BaselineOnly))
print("* SVDpp: " + str(predictValiMae_SVDpp))

if useModelSurpriseSlopeOne:
  print("* SlopeOne: " + str(predictValiMae_SlopeOne))

if useModelLbgmBeerContext:
  print("* Lgbm Beer Context columns: " + str(predictValiMae_LgbmBeerContext))

if useModelLbgmAllCols:
  print("* Lgbm All cols: " + str(predictValiMae_LgbmAllCols))

if useModelSkLinReg:
  print("* Sklearn Linear Regression: " + str(predictValiMae_SkLinearReg))

if useModelLgbmNlpAll:
  print("* Lgbm NLP All columns: " + str(predictValiMae_LgbmAllColsIncNlp))

if useModelLgbmNlpBeerName:
  print("* Lgbm NLP on Beer Name: " + str(predictValiMae_LgbmNlpBeerName))

if useModelLgbmNlpText:
  print("* Lgbm NLP on Lemmatized Text: " + str(predictValiMae_LgbmNlpText))  
  
if useModelContentKNN:
  print("* Content KNN Algorithm: " + str(predictValiMae_ContentKNN))  
  
print(" ")
print("Final Ensemble MAE: " + str(predictValiMae_Ensemble))

Final Report on Validation Set MAEs
 
* KNN With Means: 0.43953347322741626
* Baseline Only: 0.4397473132133758
* SVDpp: 0.4432245115467577
* SlopeOne: 0.44176542058237306
* Lgbm Beer Context columns: 0.47949462210333754
* Lgbm All cols: 0.4770979101350705
* Sklearn Linear Regression: 0.49873941880294115
* Lgbm NLP All columns: 0.4619192065868069
* Lgbm NLP on Beer Name: 0.4618644555954773
* Lgbm NLP on Lemmatized Text: 0.4968858179209336
* Content KNN Algorithm: 0.5422772501342061
 
Final Ensemble MAE: 0.3976888068139247


In [63]:
# Clean up variables
del df_ensemble_test
del dfTestFeatures
del model
del predicted
del dfPredictions

## Run with all data, no NLP All and No SlopeOne:

Final Report on Validation Set MAEs
 
* KNN With Means: 0.4395334732274163
* Baseline Only: 0.4397473132133757
* SVDpp: 0.443214892757559
* Lgbm Beer Context columns: 0.4783112983280762
* Lgbm All cols: 0.47739847463670526
* Sklearn Linear Regression: 0.49873941880294115
* Lgbm NLP on Beer Name: 0.4618885539614043
* Lgbm NLP on Lemmatized Text: ( Lost Value )
 
Final Ensemble MAE: 0.41426354435001556

## Run with All Data, All Models inc nlp and Slope, inc ContentKNN, inc Standard Scaling

Final Report on Validation Set MAEs
 
* KNN With Means: 0.43953347322741637
* Baseline Only: 0.4397473132133758
* SVDpp: 0.4432078542168362
* SlopeOne: 0.441765420582373
* Lgbm Beer Context columns: 0.5036487199845824
* Lgbm All cols: 0.5004678230697351
* Sklearn Linear Regression: 1.6641694140752619
* Lgbm NLP All columns: 0.45761532875372146
* Lgbm NLP on Beer Name: 0.46222906787060947
* Lgbm NLP on Lemmatized Text: 0.4968467908471179
* Content KNN Algorithm: 0.542277250134206
 
Final Ensemble MAE: 0.393425426381801


## Run with All Data, No SKLearn, inc ContentKNN, inc Scaling

REmoved SKLearn, because it seemed that after Standard Scaling was added, it scored so badly. But removing it still made MAE worse
 
* KNN With Means: 0.43953347322741626
* Baseline Only: 0.4397473132133759
* SVDpp: 0.4432078542168361
* SlopeOne: 0.4417654205823728
* Lgbm Beer Context columns: 0.5036487199845824
* Lgbm All cols: 0.5004678230697351
* Lgbm NLP All columns: 0.45761532875372146
* Lgbm NLP on Beer Name: 0.46222906787060947
* Lgbm NLP on Lemmatized Text: 0.4968467908471179
* Content KNN Algorithm: 0.542277250134206
 
Final Ensemble MAE: 0.4032176978751534


## Run with All Models, No Scaling on nontext, but still scaling in files

Final Report on Validation Set MAEs
 
* KNN With Means: 0.4395334732274162
* Baseline Only: 0.43974731321337585
* SVDpp: 0.4432246012215977
* SlopeOne: 0.441765420582373
* Lgbm Beer Context columns: 0.4791764047335666
* Lgbm All cols: 0.47742580616126645
* Sklearn Linear Regression: 0.49873941880294115
* Lgbm NLP All columns: 0.45750210628479165
* Lgbm NLP on Beer Name: 0.46205675242142946
* Lgbm NLP on Lemmatized Text: 0.49692320521533206
* Content KNN Algorithm: 0.5422772501342061
 
Final Ensemble MAE: 0.3990488148267674


## Run with all models, no scaling

Final Report on Validation Set MAEs
 
* KNN With Means: 0.4395334732274163
* Baseline Only: 0.43974731321337573
* SVDpp: 0.4432246012215977
* SlopeOne: 0.44176542058237295
* Lgbm Beer Context columns: 0.47949462210333754
* Lgbm All cols: 0.4770979101350705
* Sklearn Linear Regression: 0.49873941880294115
* Lgbm NLP All columns: 0.4619192065868069
* Lgbm NLP on Beer Name: 0.4618644555954773
* Lgbm NLP on Lemmatized Text: 0.4968858179209336
* Content KNN Algorithm: 0.542277250134206
 
Final Ensemble MAE: 0.39641466830606437

## Run with all models and Min Max Scaling:

Final Report on Validation Set MAEs
 
* KNN With Means: 0.4395334732274163
* Baseline Only: 0.4397473132133759
* SVDpp: 0.44322460122159774
* SlopeOne: 0.4417654205823729
* Lgbm Beer Context columns: 0.4734802080656173
* Lgbm All cols: 0.4701917238446327
* Sklearn Linear Regression: 1428.9212716765455
* Lgbm NLP All columns: 0.45685522406630247
* Lgbm NLP on Beer Name: 0.4618644555954773
* Lgbm NLP on Lemmatized Text: 0.4968858179209336
* Content KNN Algorithm: 0.542277250134206
 
Final Ensemble MAE: 0.39784848136548595