In [None]:
# Import libraries
# import pandas as pd
import dask.dataframe as pd
import numpy as np
import lightgbm as lgb
import optuna

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

## File Details - Light GBM Regression All cols inc nlp

This is a candidate for use in Ensemble 2.
Characteristics:
* Light GBM Regression Algorithm
* All columns, including Review Counts and NLP doc vecs
* Uses the full files outputted from A3_130
* Todo: use optimised parameters for Light GBM Regression


In [None]:
# Run configuration for this notebook.
# Prefix that names this experiment (LightGBM regression including NLP columns).
filePrefix = "A3_140_lgbm_regression_inc_nlp"
# NOTE(review): absolute, machine-specific data directory; adjust before running elsewhere.
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
# Sub-run folder name and the flag for writing a sub-run file. Neither is used in
# the cells visible here; presumably consumed further down. TODO confirm.
subrunDir = "subruns/"
writeSubRunFile = True
# Project-wide random seed; passed to LightGBM as random_state in the objective.
seed = databasic.get_random_seed()

In [None]:

# Locations of the preprocessed feature files, all under baseDataDir.
trainFilePath = f"{baseDataDir}train_features_preprocessed.csv"
valiFilePath = f"{baseDataDir}vali_features_preprocessed.csv"
testFilePath = f"{baseDataDir}test_features_preprocessed.csv"

In [None]:
# Load the preprocessed training features and preview the first rows.
# Legacy note: an earlier version read the raw train_500k.tsv, whose columns were
# RowID, BeerID, ReviewerID, BeerName, BeerType, Label.
# `pd` is dask.dataframe in this notebook, so read_csv is presumably lazy and
# head(10) only materialises the rows needed for display. TODO confirm.
df_train = pd.read_csv(trainFilePath)
df_train.head(10)

In [None]:

# Load the preprocessed validation features and preview the first rows.
# Legacy note: an earlier version read the raw vali_500k.tsv instead.
df_vali = pd.read_csv(valiFilePath)
df_vali.head(10)

In [None]:

# Remove the two columns that are not fed to the model from both frames.
# Note: this mutates the frames in place, so re-running the cell without
# reloading raises KeyError because the columns are already gone.
for frame in (df_train, df_vali):
    for text_col in ("BeerName", "Lemmatized"):
        del frame[text_col]

In [None]:
# Column roles: row identifiers, the regression target, and everything else as features.
idCols = ['RowID','BeerID','ReviewerID']
target_col = 'rating'

# Feature columns are whatever remains in the training frame once the
# identifier and target columns are excluded.
col_names = df_train.columns
feature_cols = col_names.drop(idCols + [target_col])

# Slice both frames the same way; the validation frame is indexed with the
# training-derived feature list so the two feature sets line up column for column.
dfTrainIds, dfTrainFeatures, dfTrainTarget = (
    df_train[idCols],
    df_train[feature_cols],
    df_train[target_col],
)
dfValiIds, dfValiFeatures, dfValiTarget = (
    df_vali[idCols],
    df_vali[feature_cols],
    df_vali[target_col],
)


In [None]:
# Sanity-check the training feature frame: its shape, then the first rows.
# NOTE(review): with dask.dataframe the row count in .shape is presumably a lazy
# placeholder rather than a number; call .compute() on it if the count is needed.
print(dfTrainFeatures.shape)
dfTrainFeatures.head()

In [None]:
def objective(trial):
  """Fit one LightGBM regressor with trial-suggested hyperparameters and score it.

  Args:
    trial: Optuna trial used to sample learning_rate, num_leaves, max_depth
      and n_estimators.

  Returns:
    Mean absolute error of the model's predictions for dfValiFeatures against
    dfValiTarget. The study minimises this value.
  """
  # Optuna parameter names become the keys of study.best_params. They must match
  # the LGBMRegressor keyword names exactly (the original "n_estimators " had a
  # trailing space) so the tuned values can be reused as
  # LGBMRegressor(**study.best_params).
  search_params = {
    "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3),
    "num_leaves": trial.suggest_int("num_leaves", 60, 127),
    "max_depth": trial.suggest_int("max_depth", 10, 30),
    "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
  }
  # Not tuned yet (previously sketched as candidates): min_split_gain,
  # min_child_samples, min_child_weight, subsample, subsample_freq,
  # colsample_bytree, reg_alpha, reg_lambda.

  # L1 objective and MAE metric match the score returned to Optuna.
  model = lgb.LGBMRegressor(
    objective="regression_l1", metric="mae", random_state=seed, **search_params
  )
  model.fit(X=dfTrainFeatures, y=dfTrainTarget)

  # Score on the validation split, not on the training data.
  vali_predicted = model.predict(dfValiFeatures)
  return mean_absolute_error(dfValiTarget, vali_predicted)

In [None]:
# Run the hyperparameter search, minimising the validation MAE returned by objective().
# The sampler is seeded with the notebook's shared seed so a re-run proposes the
# same sequence of trials; TPESampler is the sampler create_study uses by default.
# Assumes databasic.get_random_seed() returns an int. TODO confirm.
study = optuna.create_study(
  direction="minimize",
  sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=10)

print("\n---------")
print("Study Complete")
study.best_params

In [None]:
# Report the best trial: its parameters, its objective value (the validation
# MAE returned by objective(), labelled "Rank Score" here), and the trial record.
print(
    study.best_params,
    f"Best Rank Score: {study.best_value}",
    "-------",
    study.best_trial,
    sep="\n",
)

# Summary
