# File: Ensemble Version 1 (Basic) with 200k data

This will be the first working notebook on an Ensemble Run.

Here we will take the results of 6 runs: 3 Surprise Collab Filters and 3 Regression Content Filters. Their results from predictions (on the vali set) should all be saved to file. Here are the runs/files we'll use from the subrun folder:

Collabrative Filter Runs (from different techniques)
* A3_073_surprise_svdpp_subrun
* A3_074_surprise_knnmeans_subrun
* A3_076_surprise_baseline_subrun
Content Filter Runs
* A3_062_lgbm_regression_beercontext_subrun
* A3_063_lgbm_regression_consumercontext_subrun
* A3_080_sk_linreg1_subrun

All the files will be loaded and all the predicted ratings will be loaded into a single dataframe. 
Then, the dataframe will be fed into a LightGBM Regressor and the model fitted. Then, predict on the validation dataframe, and compare the results to see what our MAE is.


In [11]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna

from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

In [12]:
filePrefix = "A3_091_ensemble_v1_200k_tuning"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
runDir = "runs/"
seed = databasic.get_random_seed()

In [13]:
# Load the Validation file

# df_vali = pd.read_csv(baseDataDir + 'vali_500k.tsv',sep='\t',
df_vali = pd.read_csv(baseDataDir + 'vali_200k.tsv',sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)

Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch �r Bock,Rauchbier,4.0
1,27,12300,5634,Rauch �r Bock,Rauchbier,4.5
2,28,12300,3544,Rauch �r Bock,Rauchbier,4.5
3,40,12300,6521,Rauch �r Bock,Rauchbier,4.0
4,43,12300,10177,Rauch �r Bock,Rauchbier,4.5
5,48,12300,2907,Rauch �r Bock,Rauchbier,3.5
6,49,12300,1532,Rauch �r Bock,Rauchbier,4.0
7,50,12300,3452,Rauch �r Bock,Rauchbier,3.5
8,59,12300,6861,Rauch �r Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


In [14]:
# For the full data to use for training and then validation, just get the Row ID and rating.
df_ensemble_full = df_vali[["RowID", "rating"]]

In [15]:
# Collaborative Filter Runs
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_073_surprise_svdpp_subrun")
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_074_surprise_knnmeans_subrun")
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_076_surprise_baselineonly_subrun")

# Content Filter Runs
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_062_lgbm_regression_beercontext_subrun")
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_063_lgbm_regression_consumercontext_subrun")
df_ensemble_full = featutil.joinRunToEnsembleFrame(df_ensemble_full, subrunDir, "A3_080_sk_linreg1_subrun")

df_ensemble_full.head(10)


Unnamed: 0,RowID,rating,A3_073_surprise_svdpp_subrun,A3_074_surprise_knnmeans_subrun,A3_076_surprise_baselineonly_subrun,A3_062_lgbm_regression_beercontext_subrun,A3_063_lgbm_regression_consumercontext_subrun,A3_080_sk_linreg1_subrun
0,22,4.0,4.307723,4.411133,4.265737,4.0,4.0,3.903601
1,27,4.5,4.200102,4.24606,4.202083,4.0,4.0,3.905488
2,28,4.5,4.335563,4.432695,4.371105,4.0,4.0,3.916742
3,40,4.0,4.217186,4.259541,4.231804,4.0,4.0,3.917982
4,43,4.5,4.162843,4.10038,4.17668,4.0,4.0,3.92235
5,48,3.5,3.940503,4.06485,3.993235,4.0,4.0,3.916427
6,49,4.0,4.225543,4.285378,4.236586,4.0,4.0,3.915981
7,50,3.5,4.080092,4.123385,4.116234,4.0,4.0,3.917165
8,59,4.0,4.258105,4.372299,4.294158,4.0,4.0,3.906602
9,64,4.5,3.99156,4.061589,4.003137,4.0,4.0,3.871152


In [16]:
# Get all the columns
col_names = df_ensemble_full.columns

idCols = ['RowID']
feature_cols =  col_names.drop(['RowID','rating' ])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_ensemble_full[idCols]
dfTrainFeatures = df_ensemble_full[feature_cols]
dfTrainTarget = df_ensemble_full[target_col]


In [17]:
print(dfTrainFeatures.shape)
dfTrainFeatures.head()

(36385, 6)


Unnamed: 0,A3_073_surprise_svdpp_subrun,A3_074_surprise_knnmeans_subrun,A3_076_surprise_baselineonly_subrun,A3_062_lgbm_regression_beercontext_subrun,A3_063_lgbm_regression_consumercontext_subrun,A3_080_sk_linreg1_subrun
0,4.307723,4.411133,4.265737,4.0,4.0,3.903601
1,4.200102,4.24606,4.202083,4.0,4.0,3.905488
2,4.335563,4.432695,4.371105,4.0,4.0,3.916742
3,4.217186,4.259541,4.231804,4.0,4.0,3.917982
4,4.162843,4.10038,4.17668,4.0,4.0,3.92235


In [18]:
def objective(trial):
  # Create the Light GBM Regression model and train
  model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
    ,learning_rate=trial.suggest_float("learning_rate", 0.005, 0.3)
    ,num_leaves=trial.suggest_int("num_leaves", 2, 127)
    ,max_depth=trial.suggest_int("max_depth", 2, 30)
    ,n_estimators=trial.suggest_int("n_estimators ", 50, 1000)
    # ,min_split_gain=trial.suggest_float("min_split_gain", 0.001, 1.0)
    # ,min_child_samples=trial.suggest_int("min_child_samples", 1, 100)  
    # #,min_child_weight =trial.suggest_float("min_child_weight", 0.0001, 0.1) 
    # ,subsample =trial.suggest_float("subsample", 0.1, 1.0) 
    # ,subsample_freq =trial.suggest_int("subsample_freq", 0, 15)
    # ,colsample_bytree =trial.suggest_float("colsample_bytree", 0.1, 1.0) 
    # ,reg_alpha =trial.suggest_float("reg_alpha", 0.1, 1.0) 
    # ,reg_lambda =trial.suggest_float("reg_lambda", 0.1, 1.0)      
  )

  # Doing the final Ensemble prediction using Light GBM Regression
  model.fit(X=dfTrainFeatures, y=dfTrainTarget)

  # Use the model to predict against our validation data
  test_predicted = model.predict(dfTrainFeatures)  
  mae = mean_absolute_error(dfTrainTarget, test_predicted)
  return mae

In [19]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=250)

print("\n---------")
print("Study Complete")
study.best_params

[32m[I 2021-10-19 16:43:25,978][0m A new study created in memory with name: no-name-8042b76c-3359-4773-a866-da4803057fcd[0m
[32m[I 2021-10-19 16:43:27,951][0m Trial 0 finished with value: 0.34590363325118156 and parameters: {'learning_rate': 0.12435052762838233, 'num_leaves': 90, 'max_depth': 19, 'n_estimators ': 543}. Best is trial 0 with value: 0.34590363325118156.[0m
[32m[I 2021-10-19 16:43:28,733][0m Trial 1 finished with value: 0.37549835032991 and parameters: {'learning_rate': 0.0845806106297268, 'num_leaves': 117, 'max_depth': 25, 'n_estimators ': 174}. Best is trial 0 with value: 0.34590363325118156.[0m
[32m[I 2021-10-19 16:43:29,325][0m Trial 2 finished with value: 0.39926014807984433 and parameters: {'learning_rate': 0.24612800571666035, 'num_leaves': 37, 'max_depth': 16, 'n_estimators ': 269}. Best is trial 0 with value: 0.34590363325118156.[0m
[32m[I 2021-10-19 16:43:32,813][0m Trial 3 finished with value: 0.33440004178600463 and parameters: {'learning_rate': 


---------
Study Complete


{'learning_rate': 0.298864877137463,
 'num_leaves': 127,
 'max_depth': 26,
 'n_estimators ': 974}

In [20]:
print(study.best_params)
print("Best Rank Score: " + str(study.best_value))
print("-------")
print(study.best_trial)

{'learning_rate': 0.298864877137463, 'num_leaves': 127, 'max_depth': 26, 'n_estimators ': 974}
Best Rank Score: 0.2810226063295058
-------
FrozenTrial(number=79, values=[0.2810226063295058], datetime_start=datetime.datetime(2021, 10, 19, 16, 48, 23, 322193), datetime_complete=datetime.datetime(2021, 10, 19, 16, 48, 30, 737194), params={'learning_rate': 0.298864877137463, 'num_leaves': 127, 'max_depth': 26, 'n_estimators ': 974}, distributions={'learning_rate': UniformDistribution(high=0.3, low=0.005), 'num_leaves': IntUniformDistribution(high=127, low=2, step=1), 'max_depth': IntUniformDistribution(high=30, low=2, step=1), 'n_estimators ': IntUniformDistribution(high=1000, low=50, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=79, state=TrialState.COMPLETE, value=None)


# Results

### All Cols
* 100 trial of just top 4 params (4 min ish):
* 
* 