In [16]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.metrics import mean_absolute_error
import optuna

import features_utility as featutil
from utilities import data_basic_utility as databasic

## File Details

Basic run with SVD without cross validation. This can be the basis for parameter tuning and other stuff later


# Assignment 3 -- Recommendation Systems

* The final challenge is much like Assignment 2 -- but scoped appropriately for the time and your current abilities.
* It is ratings prediction, just like the movielens recommendations we have seen and many other similar problems.
* The features created are based on Beer Reviews from experts on a website.
* Each beer has been scored between 0 and 5 (on a real scale, so 2.75 or 3.5 is OK).
* The official measure is Mean Average Error (MAE) which is pretty intuitive to work with. Everything supports is and it is easy to interpret.
* A set of features have been created based on the reviewer, the written review, and information about the Beer being reviewed.
* Not all features have to be used, and you can easily create new features using the data if you like.
* The features included are:

![title](Images/A3Features.png)

* Sizes of the files are:
|Size | File|
|---|---|
| 1.9G | features.tsv |
| 88B  | header-features.tsv|
| 48B  | header.tsv |
| 15M  | test.tsv |
| 50M  | train.tsv |
| 16M |  val.tsv |


In [17]:
filePrefix = "A3_153_ensemble_v5_complete_run"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
seed = databasic.get_random_seed()

In [18]:

valiFilePath = baseDataDir + 'val.tsv'


Column List: 
RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag



In [19]:
# Read the validation data (in full) again. But this time, we just want the Row and the rating
df_vali = pd.read_csv(valiFilePath,sep='\t',
              names=['RowID','BeerID','ReviewerID','BeerName','BeerType','rating'])

df_ensemble_full = df_vali[["RowID", "rating"]]      

del df_vali

In [20]:
# Load all the sub runs and join them together with the ensemble data

# Collaborative Filter Runs
fileName = filePrefix + "_" + "knnwithmeans" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "baselineonly" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "svdpp" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "slopeone" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

# # Content Filter Runs
fileName = filePrefix + "_" + "lgbm_beercontext" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "lgbm_allcols" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "sklinearreg" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)


# # Content Filter with NLP Runs
fileName = filePrefix + "_" + "lgbm_allcols_inc_nlp" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "lgbm_allcols_nlp_beer_name" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

fileName = filePrefix + "_" + "lgbm_allcols_nlp_text" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)


# Hybrid Filter 
  
fileName = filePrefix + "_" + "contentknn" + "_val" + "_subrun"
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, fileName)

In [21]:
# Get all the columns
col_names = df_ensemble_full.columns

feature_cols =  col_names.drop(['RowID','rating' ])
target_col = 'rating'

dfTrainFeatures = df_ensemble_full[feature_cols]
dfTrainTarget = df_ensemble_full[target_col]

In [22]:
def objective(trial):
  # Create the Light GBM Regression model and train
  model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
    ,learning_rate=trial.suggest_float("learning_rate", 0.005, 0.3)
    ,num_leaves=trial.suggest_int("num_leaves", 2, 127)
    ,max_depth=trial.suggest_int("max_depth", 2, 30)
    ,n_estimators=trial.suggest_int("n_estimators ", 50, 1000)
    # ,min_split_gain=trial.suggest_float("min_split_gain", 0.001, 1.0)
    # ,min_child_samples=trial.suggest_int("min_child_samples", 1, 100)  
    # #,min_child_weight =trial.suggest_float("min_child_weight", 0.0001, 0.1) 
    # ,subsample =trial.suggest_float("subsample", 0.1, 1.0) 
    # ,subsample_freq =trial.suggest_int("subsample_freq", 0, 15)
    # ,colsample_bytree =trial.suggest_float("colsample_bytree", 0.1, 1.0) 
    # ,reg_alpha =trial.suggest_float("reg_alpha", 0.1, 1.0) 
    # ,reg_lambda =trial.suggest_float("reg_lambda", 0.1, 1.0)      
  )

  model.fit(X=dfTrainFeatures, y=dfTrainTarget)

  # use the model to predict
  test_predicted = model.predict(dfTrainFeatures)

  # Calc the MAE and display
  mae = mean_absolute_error(dfTrainTarget, test_predicted)
  return mae

In [23]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

print("\n---------")
print("Study Complete")
study.best_params

[32m[I 2021-10-24 20:39:03,743][0m A new study created in memory with name: no-name-a75255d0-fad9-4499-b0f9-56288b61221c[0m
[32m[I 2021-10-24 20:39:10,301][0m Trial 0 finished with value: 0.40235993795049557 and parameters: {'learning_rate': 0.1383603776488615, 'num_leaves': 103, 'max_depth': 9, 'n_estimators ': 553}. Best is trial 0 with value: 0.40235993795049557.[0m
[32m[I 2021-10-24 20:39:17,099][0m Trial 1 finished with value: 0.4045291849066634 and parameters: {'learning_rate': 0.041371240178682975, 'num_leaves': 75, 'max_depth': 17, 'n_estimators ': 584}. Best is trial 0 with value: 0.40235993795049557.[0m
[32m[I 2021-10-24 20:39:19,643][0m Trial 2 finished with value: 0.42436957402436465 and parameters: {'learning_rate': 0.21750222911093045, 'num_leaves': 99, 'max_depth': 3, 'n_estimators ': 438}. Best is trial 0 with value: 0.40235993795049557.[0m
[32m[I 2021-10-24 20:39:20,820][0m Trial 3 finished with value: 0.4053997233344139 and parameters: {'learning_rate': 


---------
Study Complete


{'learning_rate': 0.2743431718467076,
 'num_leaves': 119,
 'max_depth': 24,
 'n_estimators ': 722}

In [24]:
print(study.best_params)
print("Best Rank Score: " + str(study.best_value))
print("-------")
print(study.best_trial)

{'learning_rate': 0.2743431718467076, 'num_leaves': 119, 'max_depth': 24, 'n_estimators ': 722}
Best Rank Score: 0.3845116238056235
-------
FrozenTrial(number=21, values=[0.3845116238056235], datetime_start=datetime.datetime(2021, 10, 24, 20, 41, 42, 403246), datetime_complete=datetime.datetime(2021, 10, 24, 20, 41, 54, 438243), params={'learning_rate': 0.2743431718467076, 'num_leaves': 119, 'max_depth': 24, 'n_estimators ': 722}, distributions={'learning_rate': UniformDistribution(high=0.3, low=0.005), 'num_leaves': IntUniformDistribution(high=127, low=2, step=1), 'max_depth': IntUniformDistribution(high=30, low=2, step=1), 'n_estimators ': IntUniformDistribution(high=1000, low=50, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=21, state=TrialState.COMPLETE, value=None)


## Results 

Full:
* params 1: param_grid = { 'bsl_options': {'n_epochs': [5, 8, 12], 'reg_u': [3, 4, 6, 8], 'reg_i': [10, 15, 18]} }
* Best MAE: 0.44255455726578596
* Best MAE Params: {'bsl_options': {'n_epochs': 5, 'reg_u': 3, 'reg_i': 15}}

* params 2: { 'bsl_options': {'n_epochs': [4, 5, 8, 12, 15], 'reg_u': [3, 8, 12, 16], 'reg_i': [7, 11, 16, 20]} }
* Best MAE: 0.44246073687546666
* Best MAE Params: {'bsl_options': {'n_epochs': 5, 'reg_u': 3, 'reg_i': 16}}


200k
* Best MAE: 0.43026587512679404
* Best MAE Params: {'bsl_options': {'n_epochs': 8, 'reg_u': 4, 'reg_i': 15}}

In [25]:

# print("Best RMSE: " + str(grid_search.best_score['rmse']))
# print("Best RMSE Params: " + str(grid_search.best_params['rmse']))