In [1]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import pickle

from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import accuracy
from surprise import dump
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

## File Details

Working notebook of a Full Ensemble Run structure, with just KNNWithMeans Model


In [2]:
filePrefix = "A3_111_ensemble_v2_complete_run"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
runDir = "runs/"
modelsDir = "models/"

seed = databasic.get_random_seed()

In [3]:
trainFilePath = baseDataDir + 'train.tsv'
valiFilePath = baseDataDir + 'val.tsv'
featuresFilePath = baseDataDir + 'features.tsv'
testFilePath = baseDataDir + 'test.tsv'

# trainFilePath = baseDataDir + 'train_200k.tsv'
# valiFilePath = baseDataDir + 'vali_200k.tsv'
# featuresFilePath = baseDataDir + 'features_200k.tsv'
# testFilePath = baseDataDir + 'test_200k.tsv'

Load the Files one by one then delete them after your done, for memory management

In [4]:
df_train = pd.read_csv(trainFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])
df_train.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,19,12300,10635,Rauch Ür Bock,Rauchbier,4.0
1,21,12300,6547,Rauch Ür Bock,Rauchbier,4.5
2,23,12300,9789,Rauch Ür Bock,Rauchbier,4.5
3,24,12300,7372,Rauch Ür Bock,Rauchbier,5.0
4,25,12300,1302,Rauch Ür Bock,Rauchbier,4.5
5,26,12300,704,Rauch Ür Bock,Rauchbier,4.5
6,29,12300,1747,Rauch Ür Bock,Rauchbier,5.0
7,31,12300,9368,Rauch Ür Bock,Rauchbier,4.5
8,32,12300,2568,Rauch Ür Bock,Rauchbier,4.0
9,33,12300,6838,Rauch Ür Bock,Rauchbier,4.0


In [5]:
# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)

## Collaborative Filter Models: Train

For the Collaborative Filtering Models, we only need the Training set. Train the models, then save them to file for later use

In [6]:
# Load into a Surprise dataset
reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID', 'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()

In [7]:
def trainSurpriseModel(algorithm, trainset, modelsDir, filePrefix, modelName, forceTrain=False):
  modelSavePath = modelsDir + filePrefix + "_" + modelName + "_predictor.model"
  modelPath = Path(modelSavePath)
  
  if modelPath.exists() == False or forceTrain:
    # Train the model then Save the predictor model to file
    model = algorithm.fit(trainset)  
    dump.dump(modelSavePath, None, model, True)
  else:
    # load existing model from file
    predictions, model = dump.load(modelsDir + filePrefix + "_" + modelName + "_predictor.model")

  return model
    

In [8]:
# Create each algorithm, train the model, save it to file for later, then delete the model

predictorKNN = KNNWithMeans(k=160)
trainSurpriseModel(predictorKNN, trainsetTrainFeatures, modelsDir, filePrefix, "knnwithmeans")
del predictorKNN

predictorBaselineOnly = BaselineOnly(bsl_options = {'n_epochs': 5, 'reg_u': 3, 'reg_i': 16})
trainSurpriseModel(predictorBaselineOnly, trainsetTrainFeatures, modelsDir, filePrefix, "baselineonly")
del predictorBaselineOnly

predictorSVDpp = SVDpp(n_factors = 10, n_epochs=20, lr_all=0.005, reg_all=0.2)
trainSurpriseModel(predictorSVDpp, trainsetTrainFeatures, modelsDir, filePrefix, "svdpp")
del predictorSVDpp

Computing the msd similarity matrix...
Done computing similarity matrix.
The dump has been saved as file models/A3_111_ensemble_v2_complete_run_knnwithmeans_predictor.model
Estimating biases using als...
The dump has been saved as file models/A3_111_ensemble_v2_complete_run_baselineonly_predictor.model
The dump has been saved as file models/A3_111_ensemble_v2_complete_run_svdpp_predictor.model


In [9]:
# Clean up the training data used for the collaborate filters
del trainsetTrainFeatures
del reader
del dsetTrainFeatures
del dfTrainFeatures
del df_train

## Collaborative Filter Models: Predict On Validation Data 

Now we want to load the Validation set to we can predict against it and write out the subrun files, which will be used later for the Ensemble.

First, do the Predictions for the Collaborative Filter models (surprise)

In [10]:
# Read the validation data (in full)
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID', 'rating']],reader)

In [11]:
def predictSurpriseModel(modelsDir, filePrefix, modelName, dsName, dataset, dfIds, subrunDir):
  # Load the algorithm from the file, the predictions aren't used so that variable will be None
  predictions, algorithm = dump.load(modelsDir + filePrefix + "_" + modelName + "_predictor.model")
  
  # Make Predictions using the model
  NA,valset = train_test_split(dataset, test_size=1.0)
  predictions = algorithm.test(valset)
  
  # Display the MAE
  mae = accuracy.mae(predictions,verbose=True)
  print("MAE for " + modelName + ": " + str(mae))

  # Convert the Predictions to a dataframe so we can lookup predictions easy
  # uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
  lstUIds = list(map(lambda x: x.uid, predictions))
  lstIIds = list(map(lambda x: x.iid, predictions))
  lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
  lstRatingEst = list(map(lambda x: x.est, predictions))
  dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })  

  # join the predictions to the ids, sort by rowid and write to out the subrun file
  subRunFilePath = subrunDir + filePrefix + "_" + modelName + "_" + dsName + "_subrun.csv"
  dfPredictions = pd.merge(dfIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
  dfPredictions.sort_values("RowID")[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(subRunFilePath, index=False)

  # Clean up the variables from memory
  del predictions
  del algorithm
  del valset
  del lstUIds
  del lstIIds
  del lstTrueRatings
  del lstRatingEst
  del dfPredictions


In [12]:
predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "val", dsetValiFeatures, dfValiIds, subrunDir)
predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "val", dsetValiFeatures, dfValiIds, subrunDir)
predictSurpriseModel(modelsDir, filePrefix, "svdpp", "val", dsetValiFeatures, dfValiIds, subrunDir)

MAE:  0.4401
MAE for knnwithmeans: 0.4401281860792271
MAE:  0.4399
MAE for baselineonly: 0.43990048246500113
MAE:  0.4432
MAE for svdpp: 0.4432183525372235


In [13]:
# Clean up variables from the Predict Stage
del df_vali
del reader
del dfValiIds
del dfValiFeatures
del dsetValiFeatures

## Content Filter Models, train and predict

First we want to load the features and do all the data preprocessing, then we can train the different models

In [14]:
# Load the training data
df_train = pd.read_csv(trainFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

# Load the validation data. When we want to do one hot encoding, we have to do it over both datasets to ensure consistency
df_vali = pd.read_csv(valiFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID', 'BeerName','BeerType','rating'])

# Load the validation data. When we want to do one hot encoding, we have to do it over both datasets to ensure consistency
df_test = pd.read_csv(testFilePath,sep='\t',
            names=['RowID','BeerID','ReviewerID', 'BeerName','BeerType','rating'])                         

# Load the features
df_features = pd.read_csv(featuresFilePath,sep='\t',
    names=['RowID','BrewerID','ABV','DayofWeek','Month',
          'DayofMonth','Year','TimeOfDay','Gender',
          'Birthday','Text','Lemmatized','POS_Tag'])    

In [15]:
# Add the reviewer counts to each of the data sets
df_train = featutil.addReviewerReviewCount(df_train)
df_train = featutil.addBeerReviewCount(df_train)

df_vali = featutil.addReviewerReviewCount(df_vali)
df_vali = featutil.addBeerReviewCount(df_vali)

df_test = featutil.addReviewerReviewCount(df_test)
df_test = featutil.addBeerReviewCount(df_test)

In [16]:
colsToUse = ["RowID", "BrewerID", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender", "TimeOfDay"]

df_train_data = df_train.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
df_vali_data = df_vali.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
df_test_data = df_test.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")

# Remove the duplicated Row ID, also remove Beer Name at this point, we're nt using it
df_train_data = df_train_data.drop(['RowIDFeat', "BeerName"],axis=1)
df_vali_data = df_vali_data.drop(['RowIDFeat', "BeerName"],axis=1)
df_test_data = df_test_data.drop(['RowIDFeat', "BeerName"],axis=1)

In [17]:
# Clean up these dataframes now that they have been joined
del df_train
del df_vali
del df_features
del df_test

In [18]:
# do the feature transformations
df_train_data = featutil.fixNullABV(df_train_data)
df_vali_data = featutil.fixNullABV(df_vali_data)
df_test_data = featutil.fixNullABV(df_test_data)

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "BrewerID")

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "BeerType")

df_train_data, df_vali_data, df_test_data = dfutil.getDummiesForTripleSets(df_train_data, df_vali_data, df_test_data, "Gender")

df_train_data = featutil.formatDayOfWeek(df_train_data)
df_vali_data = featutil.formatDayOfWeek(df_vali_data)
df_test_data = featutil.formatDayOfWeek(df_test_data)

df_train_data = featutil.formatMonth(df_train_data)
df_vali_data = featutil.formatMonth(df_vali_data)
df_test_data = featutil.formatMonth(df_test_data)

df_train_data = featutil.formatTimeToSec(df_train_data)
df_vali_data = featutil.formatTimeToSec(df_vali_data)
df_test_data = featutil.formatTimeToSec(df_test_data)

df_train_data = featutil.convertBirthdayToAge(df_train_data)
df_vali_data = featutil.convertBirthdayToAge(df_vali_data)
df_test_data = featutil.convertBirthdayToAge(df_test_data)

  df_combined.columns = df_combined.columns.str.replace(" ", "").str.replace("/", "").str.replace("-", "") \


In [19]:
print(df_train_data.shape)
print(df_vali_data.shape)
print(df_test_data.shape)
df_test_data.head()

(746207, 2195)
(243834, 2195)
(247393, 2195)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,18,12300,10059,,165,8,7.4,7,12,6,...,0,0,0,0,0,0,0,0,1,0
1,20,12300,9761,,156,8,7.4,6,21,5,...,0,0,0,0,0,0,0,0,1,0
2,30,12300,7279,,553,8,7.4,2,12,10,...,0,0,0,0,0,0,0,0,0,1
3,46,12300,2367,,283,8,5.5,3,22,7,...,0,0,0,0,0,0,0,0,1,0
4,47,12300,2230,,35,8,5.5,2,21,7,...,0,0,0,0,0,0,0,0,1,0


In [20]:
# Write the test data file out so we can load it back in later so as not to have to redo this step
df_test_data.to_csv(baseDataDir + filePrefix + "_test_cleaned.csv", index=False)

In [21]:
# Get all the columns
col_names = df_train_data.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainFeatures = df_train_data[feature_cols]
dfTrainTarget = df_train_data[target_col]

dfValiIds = df_vali_data[idCols]
dfValiFeatures = df_vali_data[feature_cols]
dfValiTarget = df_vali_data[target_col]


In [22]:
def trainLightGbmModel(model, train_feat, train_target, 
      modelsDir, filePrefix, modelName):

  # Train the model and Save the predictor model to file
  model.fit(X=train_feat, y=train_target) 
  model.booster_.save_model(modelsDir + filePrefix + "_" + modelName + "_predictor.model")


def trainSkLinearRegModel(model, 
      train_feat, train_target,  
      modelsDir, filePrefix, modelName):
  # Train the model then Save the predictor model to file
  model.fit(train_feat, train_target) 
  pickle.dump(model, open(modelsDir + filePrefix + "_" + modelName + "_predictor.model", 'wb'))

In [23]:
def predictLightGbmModel(vali_ids, vali_feat, vali_target, 
      subrunDir, modelsDir, filePrefix, dsName, modelName):

  model = lgb.Booster(model_file=modelsDir + filePrefix + "_" + modelName + "_predictor.model")

  predicted = model.predict(vali_feat)
  dfPredicted = pd.DataFrame({"Predict": predicted})

  # join the predictions to the ids, sort by rowid and write to out the subrun file
  subRunFilePath = subrunDir + filePrefix + "_" + modelName + "_" + dsName + "_subrun.csv"
  dfPredicted = pd.concat([vali_ids.reset_index(), dfPredicted], axis=1).drop(columns="index")
  dfPredicted.to_csv(subRunFilePath, index=False)

  if vali_target is not None:      
    mae = mean_absolute_error(vali_target, predicted)
    print("MAE for " + modelName + ": " + str(mae))

  # clean up variables in memory
  del predicted
  del dfPredicted


def predictSkLinearRegModel(vali_ids, vali_feat, vali_target, 
      subrunDir, modelsDir, filePrefix, dsName, modelName):

  model = pickle.load(open(modelsDir + filePrefix + "_" + modelName + "_predictor.model", 'rb'))

  predicted = model.predict(vali_feat)
  dfPredicted = pd.DataFrame({"Predict": predicted})

  # join the predictions to the ids, sort by rowid and write to out the subrun file
  subRunFilePath = subrunDir + filePrefix + "_" + modelName + "_" + dsName + "_subrun.csv"
  dfPredicted = pd.concat([vali_ids.reset_index(), dfPredicted], axis=1).drop(columns="index")
  dfPredicted.to_csv(subRunFilePath, index=False)

  # When used on the test data, we have not target/label data. This is just used for evaluation, to 
  # calculate our MAE against real labels. If none is passed in, don't do this
  if vali_target is not None:      
    mae = mean_absolute_error(vali_target, predicted)
    print("MAE for " + modelName + ": " + str(mae))

  # clean up variables in memory
  del predicted
  del dfPredicted  


In [24]:
dfTrainFeatures.head()

Unnamed: 0,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,Year,TimeOfDay,BrewerID_1,BrewerID_2,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,200,23,7.4,1,23,5,2011,56188,0,0,...,0,0,0,0,0,0,0,0,1,0
1,10,23,7.4,1,16,5,2011,1906,0,0,...,0,0,0,0,0,0,0,0,1,0
2,164,23,7.4,7,10,4,2011,44246,0,0,...,0,0,0,0,0,0,0,0,0,1
3,432,23,7.4,3,30,3,2011,50880,0,0,...,0,0,0,0,0,0,0,0,1,0
4,500,23,7.4,4,24,3,2011,50820,0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
def getFeaturesBeerContext(df1):
  consumerCols = ["DayofWeek", "DayofMonth", "Month", "TimeOfDay", "Gender_Male", "Gender_Female", "Gender_unknown"]
  return dfutil.getFeaturesWithoutCols(df1, consumerCols)

In [26]:
# Train the models, save them to file and then clear the model from memory
modelBeerContext = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
    ,learning_rate=0.010443500090385492, num_leaves = 68, max_depth = 14, n_estimators = 608
  )  
dfTrainFeatures_BeerContext = getFeaturesBeerContext(dfTrainFeatures)
dfValiFeatures_BeerContext = getFeaturesBeerContext(dfValiFeatures)
trainLightGbmModel(modelBeerContext, dfTrainFeatures_BeerContext, dfTrainTarget, 
    modelsDir, filePrefix, "lgbm_beercontext")
predictLightGbmModel(dfValiIds, dfValiFeatures_BeerContext, dfValiTarget,
    subrunDir, modelsDir, filePrefix, "val", "lgbm_beercontext")    
del dfTrainFeatures_BeerContext
del dfValiFeatures_BeerContext
del modelBeerContext


modelConsumerContext = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
    ,learning_rate=0.26879548049242075, num_leaves = 91, max_depth = 2, n_estimators = 384
 ) 
dfTrainFeatures_ConsumerContext = dfTrainFeatures[["Year", "ReviewerReviewCount", "BeerReviewCount", "DayofWeek", "DayofMonth", "Month", "TimeOfDay", "Gender_Male", "Gender_Female", "Gender_unknown"]]
dfValiFeatures_ConsumerContext = dfValiFeatures[["Year", "ReviewerReviewCount", "BeerReviewCount", "DayofWeek", "DayofMonth", "Month", "TimeOfDay", "Gender_Male", "Gender_Female", "Gender_unknown"]]
trainLightGbmModel(modelConsumerContext, dfTrainFeatures_ConsumerContext, dfTrainTarget, 
    modelsDir, filePrefix, "lgbm_consumercontext")
predictLightGbmModel(dfValiIds, dfValiFeatures_ConsumerContext, dfValiTarget,
    subrunDir, modelsDir, filePrefix, "val", "lgbm_consumercontext")    
del dfTrainFeatures_ConsumerContext
del dfValiFeatures_ConsumerContext
del modelConsumerContext


modelLinReg = LinearRegression()
trainSkLinearRegModel(modelLinReg, dfTrainFeatures, dfTrainTarget, 
    modelsDir, filePrefix, "sklinearreg")
predictSkLinearRegModel(dfValiIds, dfValiFeatures, dfValiTarget,
    subrunDir, modelsDir, filePrefix, "val", "sklinearreg")
del modelLinReg

MAE for lgbm_beercontext: 0.47784540503622086
MAE for lgbm_consumercontext: 0.49471976836700376
MAE for sklinearreg: 0.4987394011520422


In [27]:
# Clean up the variables from memory
del df_train_data
del df_vali_data
del df_test_data
del dfTrainFeatures
del dfTrainTarget
del dfValiIds
del dfValiFeatures
del dfValiTarget

### Train the Ensemble Model

Now that all the sub run files have been generated, combine all the predictions into one dataset, train a new final, ensemble model, predict on the validation data and get an MAE and save the model for use later on the Test data.

In [28]:
# Read the validation data (in full) again. But this time, we just want the Row and the rating
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

df_ensemble_full = df_vali[["RowID", "rating"]]      

del df_vali

In [29]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "svdpp" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

# # Content Filter Runs
fileName = filePrefix + "_" + "lgbm_beercontext" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "lgbm_consumercontext" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "sklinearreg" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

In [30]:
# Get all the columns
col_names = df_ensemble_full.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID','rating'])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_ensemble_full[idCols]
dfTrainFeatures = df_ensemble_full[feature_cols]
dfTrainTarget = df_ensemble_full[target_col]


In [31]:
# Doing the final Ensemble prediction using Light GBM Regression, params tuned

# Create the model and predict
model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed,
  learning_rate=0.298864877137463, num_leaves=127, max_depth=26, n_estimators=974
)
model.fit(X=dfTrainFeatures, y=dfTrainTarget)

# use the model to predict
test_predicted = model.predict(dfTrainFeatures)
dfPredicted = pd.DataFrame({"Predict": test_predicted})

# Calc the MAE and display
mae = mean_absolute_error(dfTrainTarget, test_predicted)
print("Ensemble Final Average MAE (from validation data): " + str(mae))

# Save the model to file
model.booster_.save_model(modelsDir + filePrefix + "_ensemble_predictor.model")

Ensemble Final Average MAE (from validation data): 0.41574639975141314


<lightgbm.basic.Booster at 0x1857c3614f0>

In [32]:
# Clean up all the variables
del df_ensemble_full
del dfTrainIds
del dfTrainFeatures
del dfTrainTarget
del model
del test_predicted
del dfPredicted

## Predict on the Test data with Models for Subruns

Now that we have the final Ensemble model, we can process the Test data. We need to load the test data, and create all the sub runs by using all the base level models to predict.

First, predict using the Collaborative Filter Models

In [33]:
# Read the validation data (in full)
df_test = pd.read_csv(testFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType'])

# The test set is unlabeled, so we don't know the true ratings. Populate a rating col with zeros, as we are going
# to predict these values
df_test["rating"] = 0

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfTestIds = df_test[idCols]
dfTestFeatures = df_test.drop(['RowID','BeerName','BeerType'],axis=1)
dsetTestFeatures = Dataset.load_from_df(dfTestFeatures[['BeerID','ReviewerID','rating']],reader)

In [34]:
# Predict using the Collaborative Filter Models
predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "test", dsetTestFeatures, dfTestIds, subrunDir)
predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "test", dsetTestFeatures, dfTestIds, subrunDir)
predictSurpriseModel(modelsDir, filePrefix, "svdpp", "test", dsetTestFeatures, dfTestIds, subrunDir)

MAE:  3.8224
MAE for knnwithmeans: 3.8224290696665957
MAE:  3.8284
MAE for baselineonly: 3.8284220621788596
MAE:  3.8266
MAE for svdpp: 3.8266004450114988


In [35]:
# Clean up variables from the Predict Stage
del reader
del dfTestIds
del dfTestFeatures
del dsetTestFeatures

# Keep this, as we will use this in the next stage
# del df_test 

Now Predict using the Content Filter Models. 

In [36]:
# Reload that test data that was cleaned and processed previously
df_test_data = pd.read_csv(baseDataDir + filePrefix + "_test_cleaned.csv")

In [37]:
# Get all the columns
col_names = df_test_data.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID', 'rating' ])

# Create the sub data sets of the features and the target
dfTestIds = df_test_data[idCols]
dfTestFeatures = df_test_data[feature_cols]

In [38]:
print(df_test_data.columns)
df_test_data.head()

Index(['RowID', 'BeerID', 'ReviewerID', 'rating', 'ReviewerReviewCount',
       'BeerReviewCount', 'ABV', 'DayofWeek', 'DayofMonth', 'Month',
       ...
       'BeerType_SmokedBeer', 'BeerType_Tripel', 'BeerType_ViennaLager',
       'BeerType_Weizenbock', 'BeerType_Wheatwine', 'BeerType_WinterWarmer',
       'BeerType_Witbier', 'Gender_Female', 'Gender_Male', 'Gender_unknown'],
      dtype='object', length=2195)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,18,12300,10059,,165,8,7.4,7,12,6,...,0,0,0,0,0,0,0,0,1,0
1,20,12300,9761,,156,8,7.4,6,21,5,...,0,0,0,0,0,0,0,0,1,0
2,30,12300,7279,,553,8,7.4,2,12,10,...,0,0,0,0,0,0,0,0,0,1
3,46,12300,2367,,283,8,5.5,3,22,7,...,0,0,0,0,0,0,0,0,1,0
4,47,12300,2230,,35,8,5.5,2,21,7,...,0,0,0,0,0,0,0,0,1,0


Problem is with one hot encoding, different sets of brewers or beer types between the training data (train+vali) and what is in test

In [39]:
# Now we can make predictions according to each of our Content Filter Models. Pass None for the target set, the function
# will just skip the evaluation (calculating the MAE)
dfTestFeatures_BeerContext =  getFeaturesBeerContext(dfTestFeatures)
predictLightGbmModel(dfTestIds, dfTestFeatures_BeerContext, None,
    subrunDir, modelsDir, filePrefix, "test", "lgbm_beercontext")    
del dfTestFeatures_BeerContext


dfTestFeatures_ConsumerContext = dfTestFeatures[["Year", "ReviewerReviewCount", "BeerReviewCount", "DayofWeek", "DayofMonth", "Month", "TimeOfDay", "Gender_Male", "Gender_Female", "Gender_unknown"]]
predictLightGbmModel(dfTestIds, dfTestFeatures_ConsumerContext, None,
    subrunDir, modelsDir, filePrefix, "test", "lgbm_consumercontext") 
del dfTestFeatures_ConsumerContext   


modelLinReg = LinearRegression()
predictSkLinearRegModel(dfTestIds, dfTestFeatures, None,
    subrunDir, modelsDir, filePrefix, "test", "sklinearreg")
del modelLinReg

### Load the Ensemble Model and predict on the Test data

Load the test data

In [40]:
df_ensemble_test = df_test[["RowID"]]      

del df_test

In [41]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "svdpp" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

# # Content Filter Runs
fileName = filePrefix + "_" + "lgbm_beercontext" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "lgbm_consumercontext" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "sklinearreg" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

In [42]:
df_ensemble_test.head()

Unnamed: 0,RowID,A3_111_ensemble_v2_complete_run_knnwithmeans_test_subrun,A3_111_ensemble_v2_complete_run_baselineonly_test_subrun,A3_111_ensemble_v2_complete_run_svdpp_test_subrun,A3_111_ensemble_v2_complete_run_lgbm_beercontext_test_subrun,A3_111_ensemble_v2_complete_run_lgbm_consumercontext_test_subrun,A3_111_ensemble_v2_complete_run_sklinearreg_test_subrun
0,18,4.054958,4.066056,4.057032,4.0,4.0,3.87681
1,20,4.144754,4.092544,4.04767,4.0,4.0,3.877434
2,30,4.328965,4.285232,4.231721,4.0,4.0,3.881769
3,46,4.056218,4.114596,4.077979,4.0,4.0,3.879405
4,47,4.357345,4.367196,4.306754,4.0,4.0,3.878607


In [43]:
# Get all the columns
col_names = df_ensemble_test.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID'])

# Create the sub data sets of the features and the target
dfTestFeatures = df_ensemble_test[feature_cols]

In [44]:
# load the ensemble model  and predict
model = lgb.Booster(model_file=modelsDir + filePrefix + "_ensemble_predictor.model")
predicted = model.predict(dfTestFeatures)

dfPredictions = df_ensemble_test[idCols]
dfPredictions["Score"] = predicted

# join the predictions to the ids, sort by rowid and write to out the subrun file
finalRunFilePath = runDir + filePrefix + "_run.tsv"
dfPredictions.to_csv(finalRunFilePath, sep="\t", index=False, header=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfPredictions["Score"] = predicted


In [45]:
# Clean up variables
del df_ensemble_test
del dfTestFeatures
del model
del predicted
del dfPredictions