In [1]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.metrics import mean_absolute_error

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import accuracy
from surprise import dump
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import train_test_split

from utilities import data_basic_utility as databasic
import features_utility as featutil

## File Details

Working notebook of a Full Ensemble Run structure, with just KNNWithMeans Model


In [2]:
filePrefix = "A3_110_ensemble_v1_complete_run_test"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
runDir = "runs/"
modelsDir = "models/"

seed = databasic.get_random_seed()

In [None]:
trainFilePath = baseDataDir + 'train.tsv'
valiFilePath = baseDataDir + 'val.tsv'
featuresFilePath = baseDataDir + 'features.tsv'
testFilePath = baseDataDir + 'test.tsv'

Load the Files one by one then delete them after your done, for memory management

In [3]:
df_train = pd.read_csv(trainFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])
df_train.head(10)



Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,19,12300,10635,Rauch Ür Bock,Rauchbier,4.0
1,21,12300,6547,Rauch Ür Bock,Rauchbier,4.5
2,23,12300,9789,Rauch Ür Bock,Rauchbier,4.5
3,24,12300,7372,Rauch Ür Bock,Rauchbier,5.0
4,25,12300,1302,Rauch Ür Bock,Rauchbier,4.5
5,26,12300,704,Rauch Ür Bock,Rauchbier,4.5
6,29,12300,1747,Rauch Ür Bock,Rauchbier,5.0
7,31,12300,9368,Rauch Ür Bock,Rauchbier,4.5
8,32,12300,2568,Rauch Ür Bock,Rauchbier,4.0
9,33,12300,6838,Rauch Ür Bock,Rauchbier,4.0


In [4]:
# Setup the data to be just the Reviewer and the Beer(Item) and the Rating Label we want to learn.
dfTrainFeatures = df_train.drop(['RowID','BeerName','BeerType'],axis=1)

## Train Models Stage

For the Collaborative Filtering Models, we only need the Training set. Train the models, then save them to file for later use

In [5]:
# Load into a Surprise dataset
reader = Reader(rating_scale=(0, 5))
dsetTrainFeatures = Dataset.load_from_df(dfTrainFeatures[['BeerID','ReviewerID', 'rating']],reader)
trainsetTrainFeatures = dsetTrainFeatures.build_full_trainset()


In [6]:
def trainSurpriseModel(algorithm, trainset, modelsDir, filePrefix, modelName):
  # Train the model then Save the predictor model to file
  model = algorithm.fit(trainset)  
  dump.dump(modelsDir + filePrefix + "_" + modelName + "_predictor.model", None, model, True)

In [7]:
# Create each algorithm, train the model, save it to file for later, then delete the model

predictorKNN = KNNWithMeans(k=80)
trainSurpriseModel(predictorKNN, trainsetTrainFeatures, modelsDir, filePrefix, "knnwithmeans")
del predictorKNN

predictorBaselineOnly = BaselineOnly(bsl_options = {'n_epochs': 8, 'reg_u': 4, 'reg_i': 15})
trainSurpriseModel(predictorBaselineOnly, trainsetTrainFeatures, modelsDir, filePrefix, "baselineonly")
del predictorBaselineOnly

# predictorSVDpp = SVDpp(n_factors = 10, n_epochs=20, lr_all=0.005, reg_all=0.2)
# trainSurpriseModel(predictorSVDpp, trainsetTrainFeatures, modelsDir, filePrefix, "svdpp")
# del predictorSVDpp

Computing the msd similarity matrix...
Done computing similarity matrix.
The dump has been saved as file models/A3_110_ensemble_v1_fullrun_test_knnwithmeans_predictor.model
Estimating biases using als...
The dump has been saved as file models/A3_110_ensemble_v1_fullrun_test_baselineonly_predictor.model


In [8]:
# Now we will want to work on the Content Filter models, todo

In [9]:
# Clean up the training data
del trainsetTrainFeatures
del reader
del dsetTrainFeatures
del dfTrainFeatures
del df_train

## Predict On Validation Data with Models Stage

Now we want to load the Validation set to we can predict against it and write out the subrun files, which will be used later for the Ensemble

In [10]:
# Read the validation data (in full)
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfValiIds = df_vali[idCols]
dfValiFeatures = df_vali.drop(['RowID','BeerName','BeerType'],axis=1)

dsetValiFeatures = Dataset.load_from_df(dfValiFeatures[['BeerID','ReviewerID', 'rating']],reader)

In [11]:
def predictSurpriseModel(modelsDir, filePrefix, modelName, dsName, dataset, dfIds, subrunDir):
  # Load the algorithm from the file, the predictions aren't used so that variable will be None
  predictions, algorithm = dump.load(modelsDir + filePrefix + "_" + modelName + "_predictor.model")
  
  # Make Predictions using the model
  NA,valset = train_test_split(dataset, test_size=1.0)
  predictions = algorithm.test(valset)
  
  # Display the MAE
  mae = accuracy.mae(predictions,verbose=True)
  print("MAE for " + modelName + ": " + str(mae))

  # Convert the Predictions to a dataframe so we can lookup predictions easy
  # uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
  lstUIds = list(map(lambda x: x.uid, predictions))
  lstIIds = list(map(lambda x: x.iid, predictions))
  lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
  lstRatingEst = list(map(lambda x: x.est, predictions))
  dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })  

  # join the predictions to the ids, sort by rowid and write to out the subrun file
  subRunFilePath = subrunDir + filePrefix + "_" + modelName + "_" + dsName + "_subrun.csv"
  dfPredictions = pd.merge(dfIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
  dfPredictions.sort_values("RowID")[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(subRunFilePath, index=False)

  # Clean up the variables from memory
  del predictions
  del algorithm
  del valset
  del lstUIds
  del lstIIds
  del lstTrueRatings
  del lstRatingEst
  del dfPredictions


In [12]:
predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "val", dsetValiFeatures, dfValiIds, subrunDir)
predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "val", dsetValiFeatures, dfValiIds, subrunDir)
# predictSurpriseModel(modelsDir, filePrefix, "svdpp_val", dsetValiFeatures, dfValiIds, subrunDir)

MAE:  0.4401
MAE for knnwithmeans: 0.4401281860792271
MAE:  0.4399
MAE for baselineonly: 0.4399004824650012


In [13]:
# Clean up variables from the Predict Stage
del df_vali
del reader
del dfValiIds
del dfValiFeatures
del dsetValiFeatures

### Train the Ensemble Model

Now that all the sub run files have been generated, combine all the predictions into one dataset, train a new final, ensemble model, predict on the validation data and get an MAE and save the model for use later on the Test data.

In [14]:
# Read the validation data (in full) again. But this time, we just want the Row and the rating
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

df_ensemble_full = df_vali[["RowID", "rating"]]      

del df_vali

In [15]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

# fileName = filePrefix + "_" + "svdpp" + "_val" + "_subrun"
# df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

# # Content Filter Runs
# df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_062_lgbm_regression_beercontext_subrun")
# df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_063_lgbm_regression_consumercontext_subrun")
# df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_080_sk_linreg1_subrun")

In [16]:
# Get all the columns
col_names = df_ensemble_full.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID','rating'])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_ensemble_full[idCols]
dfTrainFeatures = df_ensemble_full[feature_cols]
dfTrainTarget = df_ensemble_full[target_col]


In [17]:
# Doing the final Ensemble prediction using Light GBM Regression, params tuned

# Create the model and predict
model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed,
  learning_rate=0.298864877137463, num_leaves=127, max_depth=26, n_estimators=974
)
model.fit(X=dfTrainFeatures, y=dfTrainTarget)

# use the model to predict
test_predicted = model.predict(dfTrainFeatures)
dfPredicted = pd.DataFrame({"Predict": test_predicted})

# Calc the MAE and display
mae = mean_absolute_error(dfTrainTarget, test_predicted)
print("Ensemble Final Average MAE (from validation data): " + str(mae))

# Save the model to file
model.booster_.save_model(modelsDir + filePrefix + "_ensemble_predictor.model")

Ensemble Final Average MAE (from validation data): 0.4157887110103682


<lightgbm.basic.Booster at 0x2452e152f70>

In [18]:
# Clean up all the variables
del df_ensemble_full
del dfTrainIds
del dfTrainFeatures
del dfTrainTarget
del model
del test_predicted
del dfPredicted

## Predict on the Test data with Models for Subruns

Now that we have the final Ensemble model, we can process the Test data

First we need to load the test data, and create all the sub runs by using all the base level models to predict

In [19]:
# Read the validation data (in full)
df_test = pd.read_csv(testFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType'])

# The test set is unlabeled, so we don't know the true ratings. Populate a rating col with zeros, as we are going
# to predict these values
df_test["rating"] = 0

reader = Reader(rating_scale=(0, 5))

idCols = ['RowID','BeerID','ReviewerID']
dfTestIds = df_test[idCols]
dfTestFeatures = df_test.drop(['RowID','BeerName','BeerType'],axis=1)

dsetTestFeatures = Dataset.load_from_df(dfTestFeatures[['BeerID','ReviewerID','rating']],reader)

In [20]:
predictSurpriseModel(modelsDir, filePrefix, "knnwithmeans", "test", dsetTestFeatures, dfTestIds, subrunDir)
predictSurpriseModel(modelsDir, filePrefix, "baselineonly", "test", dsetTestFeatures, dfTestIds, subrunDir)
# predictSurpriseModel(modelsDir, filePrefix, "svdpp", "test", dsetTestFeatures, dfTestIds, subrunDir)

MAE:  3.8224
MAE for knnwithmeans: 3.8224290696665966
MAE:  3.8284
MAE for baselineonly: 3.8284220621788614


In [21]:
# Clean up variables from the Predict Stage
del reader
del dfTestIds
del dfTestFeatures
del dsetTestFeatures

# Keep this, as we will use this in the next stage
# del df_test 

### Load the Ensemble Model and predict on the Test data

Load the test data

In [22]:
df_ensemble_test = df_test[["RowID"]]      

del df_test

In [23]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_test" + "_subrun"
df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

# fileName = filePrefix + "_" + "svdpp" + "_test" + "_subrun"
# df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, fileName)

# # Content Filter Runs
# df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, "A3_062_lgbm_regression_beercontext_subrun")
# df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, "A3_063_lgbm_regression_consumercontext_subrun")
# df_ensemble_test = featutil.joinRunToEnsembleFrame(df_ensemble_test, subrunDir, "A3_080_sk_linreg1_subrun")

In [24]:
df_ensemble_test.head()

Unnamed: 0,RowID,A3_110_ensemble_v1_fullrun_test_knnwithmeans_test_subrun,A3_110_ensemble_v1_fullrun_test_baselineonly_test_subrun
0,18,4.054958,4.066056
1,20,4.144754,4.092544
2,30,4.328965,4.285232
3,46,4.056218,4.114596
4,47,4.357345,4.367196


In [25]:
# Get all the columns
col_names = df_ensemble_test.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID'])

# Create the sub data sets of the features and the target
dfTestFeatures = df_ensemble_test[feature_cols]

In [26]:
# load the ensemble model  and predict
model = lgb.Booster(model_file=modelsDir + filePrefix + "_ensemble_predictor.model")
predicted = model.predict(dfTestFeatures)

dfPredictions = df_ensemble_test[idCols]
dfPredictions["Score"] = predicted

# join the predictions to the ids, sort by rowid and write to out the subrun file
finalRunFilePath = runDir + filePrefix + "_run.tsv"
dfPredictions.to_csv(finalRunFilePath, sep="\t", index=False, header=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfPredictions["Score"] = predicted


In [27]:
# Clean up variables
del df_ensemble_test
del dfTestFeatures
del model
del predicted
del dfPredictions