In [None]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import fasttext as ft
import optuna

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
from utilities import regex_utility as reutil
import features_utility as featutil

import nltk

## File Details - Light GBM Regression NLP

First notebook exploring NLP on the beer name and other text features. We investigate fastText because it is fast to train.
This notebook starts with just the beer name; further text features may be added from there.

Characteristics:
* Light GBM Regression Algorithm
* Start working on NLP on the Beer name text columns


In [None]:
# --- Run configuration -------------------------------------------------------
# Prefix identifying this notebook's run; passed to the feature / model helpers below
filePrefix = "A3_122_lgbm_nlp_beername_tuning"

# Root folder holding the assignment data files (absolute, machine-specific path)
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"

# Sub-folder names used for run outputs, generated features and saved models
subrunDir = "subruns/"
featuresDataDir = "features/"
modelsDir = "models/"

# Flag for writing a sub-run file
writeSubRunFile = True

# Shared random seed, taken from the project utility so all notebooks agree
seed = databasic.get_random_seed()

In [None]:
# Full data set paths - swap these in (and comment out the 200k block) for a complete run
# trainFilePath = baseDataDir + 'train.tsv'
# valiFilePath = baseDataDir + 'val.tsv'
# featuresFilePath = baseDataDir + 'features.tsv'
# testFilePath = baseDataDir + 'test.tsv'

# 200k subset paths - currently active
trainFilePath = f"{baseDataDir}train_200k.tsv"
valiFilePath = f"{baseDataDir}vali_200k.tsv"
featuresFilePath = f"{baseDataDir}features_200k.tsv"
testFilePath = f"{baseDataDir}test_200k.tsv"

In [None]:
# The train, validation and test files are all read as tab-separated files with the
# same six columns. Because names= is supplied without header=, pandas treats the
# first row of each file as data rather than as a header.
reviewColumns = ['RowID', 'BeerID', 'ReviewerID', 'BeerName', 'BeerType', 'rating']

df_train = pd.read_csv(trainFilePath, sep='\t', names=reviewColumns)
df_vali = pd.read_csv(valiFilePath, sep='\t', names=reviewColumns)
df_test = pd.read_csv(testFilePath, sep='\t', names=reviewColumns)

print(df_train.shape)

Let's just look at Beer name first. Compile a full list of the beer names, save it to file with one per line. Then we can load it with fasttext and build a language model

In [None]:
# Text column that this notebook builds its NLP features from
colName = "BeerName"

# Prepare the text column in all three frames; the helper also returns the path of
# the document file that is later passed to the fastText model helper
df_train, df_vali, df_test, documentFilePath = featutil.formatTextColForNLP(
    df_train,
    df_vali,
    df_test,
    colName,
    featuresDataDir,
    filePrefix,
    0,
)

Now train a fastText language model. If a saved model file already exists it is used; otherwise a new one is trained.

In [None]:
# To reuse a language model generated by another run, set modelFileToUse to that file.
# Leave it empty to automatically use a language model file specific to this
# notebook's run name.
modelFileToUse = ""

fasttext_model = featutil.getFastTextLangModel(
    colName,
    modelFileToUse,
    modelsDir,
    filePrefix,
    documentFilePath,
    200,
    True,
)

# Show the first 50 entries of the model's vocabulary as a sanity check
print(fasttext_model.words[:50])

# examine some of the word vectors
# print(fasttext_model.get_word_vector("stout"))

In [None]:
# Working names for the three frames (these are references, not copies)
df_train_data, df_vali_data, df_test_data = df_train, df_vali, df_test

# Row / column counts of the training and validation sets
print(df_train_data.shape)
print(df_vali_data.shape)

# Preview the first rows of the training data
df_train_data.head()

In [None]:
# Build new dataframes holding just the IDs, the rating and the document vector,
# with the vector laid out as columns. The same conversion is applied, in order,
# to the training, validation and test frames.
df_train_data, df_vali_data, df_test_data = (
    featutil.convertToDocVectorDataSet(frame, colName, fasttext_model)
    for frame in (df_train_data, df_vali_data, df_test_data)
)

# Preview the converted validation data
df_vali_data.head()

In [None]:
# On a complete run the test data would be written to file here.
# For this tuning run it is not needed, so release both test frames from memory.
del df_test, df_test_data

In [None]:
# Column roles: identifiers, prediction target, and everything else as features
idCols = ['RowID','BeerID','ReviewerID']
target_col = 'rating'

# Every column that is neither an identifier nor the target is a model feature
col_names = df_train_data.columns
feature_cols = col_names.drop(idCols + [target_col])

# Training subsets: ids, features and target
dfTrainIds = df_train_data[idCols]
dfTrainFeatures = df_train_data[feature_cols]
dfTrainTarget = df_train_data[target_col]

# Validation subsets, selected with the same column lists as training
dfValiIds = df_vali_data[idCols]
dfValiFeatures = df_vali_data[feature_cols]
dfValiTarget = df_vali_data[target_col]



In [None]:
# Preview the first five rows of the validation id columns
dfValiIds.head(5)

In [None]:
# Report the size of the training feature matrix, then preview its first five rows
print(dfTrainFeatures.shape)
dfTrainFeatures.head(5)

In [None]:
def objective(trial):
  """Optuna objective: train a LightGBM L1 regressor and score it on the validation set.

  The model is fitted on dfTrainFeatures / dfTrainTarget and evaluated on
  dfValiFeatures / dfValiTarget, all of which are read from the notebook's
  global scope.

  Args:
    trial: Optuna trial used to sample learning_rate, num_leaves, max_depth
      and n_estimators for this run.

  Returns:
    The mean absolute error of the validation predictions (lower is better).
  """
  # Sample this trial's hyperparameters. Each Optuna parameter name matches the
  # LGBMRegressor keyword exactly (no stray whitespace), so the keys reported in
  # study.best_params line up with the model's argument names.
  params = {
    "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3),
    "num_leaves": trial.suggest_int("num_leaves", 2, 127),
    "max_depth": trial.suggest_int("max_depth", 2, 30),
    "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
    # Further candidates, currently excluded from the search space:
    # "min_split_gain": trial.suggest_float("min_split_gain", 0.001, 1.0),
    # "min_child_samples": trial.suggest_int("min_child_samples", 1, 100),
    # "min_child_weight": trial.suggest_float("min_child_weight", 0.0001, 0.1),
    # "subsample": trial.suggest_float("subsample", 0.1, 1.0),
    # "subsample_freq": trial.suggest_int("subsample_freq", 0, 15),
    # "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
    # "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 1.0),
    # "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 1.0),
  }

  # Create the Light GBM Regression model and train
  model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed, **params)
  model.fit(X=dfTrainFeatures, y=dfTrainTarget)

  # Use the model to predict against our validation data
  vali_predicted = model.predict(dfValiFeatures)

  return mean_absolute_error(dfValiTarget, vali_predicted)

In [None]:
# Seed the sampler so that re-running the notebook explores the same sequence of
# trials. TPESampler is Optuna's default sampler, so the search algorithm itself
# is unchanged. NOTE(review): assumes seed is an int - confirm against
# databasic.get_random_seed().
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=25)

print("\n---------")
print("Study Complete")
study.best_params

In [None]:
# Best hyperparameters and the objective value (validation MAE) they achieved
print(study.best_params)
print(f"Best Rank Score: {study.best_value}")
print("-------")
# Full record of the best trial
print(study.best_trial)

# Summary

Run 1 - approximately 50 trials
* Beer Name NLP, Full Data
* parameters: {'learning_rate': 0.2472480229316372, 'num_leaves': 125, 'max_depth': 24, 'n_estimators ': 796}. 
* Best is trial 27 with value: 0.4170567352847134.

