In [None]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import fasttext as ft
import optuna

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
from utilities import regex_utility as reutil
import features_utility as featutil

import nltk

## File Details - Light GBM Regression NLP on Beer Text

A first look at NLP on the review text. We will probably need to focus on the Lemmatized column, possibly filtered by POS tag.
For this first run, though, we simply feed in the Lemmatized column as-is and see what happens.

Characteristics:
* Light GBM Regression Algorithm
* Start working on NLP on the Beer text columns


In [None]:
# --- Run configuration -------------------------------------------------------
# Prefix identifying this run; it is handed to the feature/model helpers below
filePrefix = "A3_124_lgbm_nlp_beertext_tuning"

# Directory locations (base data directory plus the sub-directory names)
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
featuresDataDir = "features/"
modelsDir = "models/"
subrunDir = "subruns/"

# Name of a model file generated by another run that should be reused
modelFileToUse = "FastText_beertext_Lemmatized_full_lang_model.model"

# Run switches and the shared random seed
writeSubRunFile = True
seed = databasic.get_random_seed()

In [None]:
# Locations of the TSV inputs, all relative to the shared base directory
trainFilePath = f"{baseDataDir}train.tsv"
valiFilePath = f"{baseDataDir}val.tsv"
featuresFilePath = f"{baseDataDir}features.tsv"
testFilePath = f"{baseDataDir}test.tsv"

# Alternative "_200k" files (presumably a smaller sample - confirm);
# uncomment to use them instead of the files above
# trainFilePath = baseDataDir + 'train_200k.tsv'
# valiFilePath = baseDataDir + 'vali_200k.tsv'
# featuresFilePath = baseDataDir + 'features_200k.tsv'
# testFilePath = baseDataDir + 'test_200k.tsv'

In [None]:
# The train / validation / test files share one column layout. Because the
# names are supplied explicitly, the first line of each file is read as data.
reviewColNames = ['RowID', 'BeerID', 'ReviewerID', 'BeerName', 'BeerType', 'rating']

df_train = pd.read_csv(trainFilePath, sep='\t', names=reviewColNames)
df_vali = pd.read_csv(valiFilePath, sep='\t', names=reviewColNames)
df_test = pd.read_csv(testFilePath, sep='\t', names=reviewColNames)

# Quick sanity check on the size of the training set
print(df_train.shape)

In [None]:
# Load the per-review feature file. Column order in the file:
# RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
df_features = pd.read_csv(featuresFilePath, sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
                                                              'DayofMonth','Year','TimeOfDay','Gender',
                                                              'Birthday','Text','Lemmatized','POS_Tag'])

colsToUse = ["Text", "Lemmatized", "POS_Tag"]

# DataFrame.join(on="RowID") matches the caller's RowID column against the
# *index* of the other frame. Key the features by their own RowID first, so the
# match is made on the real identifier rather than on the positional index that
# read_csv assigns by default.
# NOTE(review): assumes RowID is unique in the features file - confirm.
dfFeatureText = df_features.set_index("RowID")[colsToUse]

# Find the feature records that match the training, validation and test data and join them together
dfFullData = df_train.join(dfFeatureText, on="RowID", how="inner", rsuffix="Feat")
dfFullDataVali = df_vali.join(dfFeatureText, on="RowID", how="inner", rsuffix="Feat")
dfFullDataTest = df_test.join(dfFeatureText, on="RowID", how="inner", rsuffix="Feat")

# Only the Lemmatized text is carried forward; drop the beer name/type and the
# other text columns, which are not used in this run
colsToDrop = ["BeerName", "BeerType", "Text", "POS_Tag"]
df_train_data = dfFullData.drop(colsToDrop, axis=1)
df_vali_data = dfFullDataVali.drop(colsToDrop, axis=1)
df_test_data = dfFullDataTest.drop(colsToDrop, axis=1)

df_train_data.head()

Let's start with the Lemmatized review text. Compile a full list of the lemmatized documents and save it to a file with one per line. Then we can load it with fastText and build a language model.

In [None]:
# Text column that feeds the language model
colName = "Lemmatized"

# The helper returns the three (re-bound) data frames plus the path of the
# document file that is later handed to the fastText model builder.
(df_train_data,
 df_vali_data,
 df_test_data,
 documentFilePath) = featutil.formatTextColForNLP(
    df_train_data, df_vali_data, df_test_data,
    colName, featuresDataDir, filePrefix,
    50, 50,  # NOTE(review): presumably how many top words / bigrams to report - confirm in features_utility
)

Most Frequent Words and Bigrams:
[('beer', 174641), ('hop', 144586), ('head', 136003), ('taste', 115450), ('malt', 114879), ('pour', 113880), ('nice', 96037), ('flavor', 94335), ('good', 86343), ('like', 75634), ('smell', 75215), ('light', 73257), ('aroma', 72055), ('sweet', 70629), ('one', 70319), ('bit', 64462), ('bottle', 64312), ('finish', 62183), ('dark', 60571), ('glass', 59983), ('carbonation', 58731), ('color', 57863), ('well', 56893), ('little', 52933), ('mouthfeel', 47370), ('would', 45244), ('chocolate', 45155), ('lacing', 44635), ('note', 42172), ('really', 42148), ('brown', 41158), ('alcohol', 39851), ('caramel', 39761), ('body', 39751), ('great', 38695), ('much', 38127), ('get', 37967), ('white', 37964), ('medium', 37656), ('nose', 36845), ('bitter', 36704), ('citrus', 36594), ('bitterness', 36504), ('coffee', 34664), ('drink', 34411), ('leave', 33541), ('smooth', 33516), ('brew', 33311), ('hint', 32242), ('pretty', 30249)]

[(('white', 'head'), 26219), (('pint', 'glass'), 15662), (('roasted', 'malt'), 13588), (('tan', 'head'), 13566), (('sweet', 'malt'), 10066), (('medium', 'bodied'), 9855), (('beer', 'pour'), 9270), (('caramel', 'malt'), 9250), (('dark', 'brown'), 9016), (('bottle', 'pour'), 8550), (('dark', 'fruit'), 8504), (('oz', 'bottle'), 8069), (('medium', 'body'), 8026), (('hop', 'flavor'), 8002), (('pour', 'dark'), 7601), (('hop', 'bitterness'), 7299), (('amber', 'color'), 7210), (('12', 'oz'), 6761), (('dark', 'chocolate'), 6717), (('pale', 'ale'), 6651), (('little', 'bit'), 6595), (('taste', 'like'), 6426), (('offwhite', 'head'), 6300), (('citrus', 'hop'), 6226), (('head', 'leave'), 6019), (('smell', 'like'), 6018), (('lacing', 'smell'), 5995), (('good', 'beer'), 5940), (('pour', 'clear'), 5806), (('12oz', 'bottle'), 5711), (('brown', 'sugar'), 5456), (('pretty', 'good'), 5415), (('head', 'smell'), 5367), (('easy', 'drink'), 5357), (('well', 'balanced'), 5290), (('floral', 'hop'), 5268), (('finger', 'head'), 5252), (('glass', 'pour'), 5189), (('hop', 'aroma'), 5162), (('malt', 'flavor'), 5054), (('sierra', 'nevada'), 4776), (('dry', 'finish'), 4737), (('pour', 'deep'), 4656), (('full', 'bodied'), 4652), (('brown', 'head'), 4525), (('golden', 'color'), 4509), (('brown', 'color'), 4498), (('pour', 'nice'), 4490), (('hop', 'taste'), 4427), (('head', 'aroma'), 4384)]

Now train a Fast Text language model. Check to see if there is a saved model to use, else train a new one

In [None]:
# Obtain the fastText language model through the project helper.
# NOTE(review): the trailing positional arguments (200, True) are defined by
# features_utility.getFastTextLangModel - confirm their meaning there.
fasttext_model = featutil.getFastTextLangModel(
    colName, modelFileToUse, modelsDir, filePrefix, documentFilePath, 200, True
)

# Show the first 50 entries of the model vocabulary as a sanity check
print(fasttext_model.words[:50])

# examine some of the word vectors
# print(fasttext_model.get_word_vector("stout"))

In [None]:
# Shapes of the prepared training and validation frames, one per line
print(df_train_data.shape, df_vali_data.shape, sep="\n")

# Preview of the prepared training frame
df_train_data.head()

In [None]:
# Create new dataframes holding just the IDs, the rating and the document vectors,
# with the vector components laid out as columns
# (NOTE(review): as described for featutil.convertToDocVectorDataSet - confirm in features_utility).
# Each name is rebound to its converted frame, so this cell is not idempotent:
# running it again feeds the already converted frames back into the helper.
df_train_data = featutil.convertToDocVectorDataSet(df_train_data, colName, fasttext_model)
df_vali_data = featutil.convertToDocVectorDataSet(df_vali_data, colName, fasttext_model)
df_test_data = featutil.convertToDocVectorDataSet(df_test_data, colName, fasttext_model)

# Preview the converted validation frame
df_vali_data.head()

In [None]:
# The test data would be written to file on a complete run; this tuning run
# does not need it, so release both test frames from memory.
del df_test, df_test_data

In [None]:
# Column roles: identifiers, the prediction target, and everything else as features
idCols = ['RowID','BeerID','ReviewerID']
target_col = 'rating'

col_names = df_train_data.columns
feature_cols = col_names.drop(idCols + [target_col])

# Training subsets: IDs, feature matrix and target
dfTrainIds, dfTrainFeatures, dfTrainTarget = (
    df_train_data[idCols],
    df_train_data[feature_cols],
    df_train_data[target_col],
)

# Validation subsets, selected with the same column lists
dfValiIds, dfValiFeatures, dfValiTarget = (
    df_vali_data[idCols],
    df_vali_data[feature_cols],
    df_vali_data[target_col],
)

In [None]:
# Preview the first rows of the validation ID columns
dfValiIds.head()

In [None]:
# Print the shape of the training feature frame, then preview its first rows
print(dfTrainFeatures.shape)
dfTrainFeatures.head()

In [None]:
def objective(trial):
    """Optuna objective: fit a LightGBM regressor and score it on the validation set.

    Samples learning_rate, num_leaves, max_depth and n_estimators from the
    trial, fits an L1-objective LGBMRegressor on dfTrainFeatures / dfTrainTarget
    and evaluates it on dfValiFeatures / dfValiTarget.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean absolute error of the validation predictions.
    """
    # Create the Light GBM Regression model with the sampled hyperparameters
    model = lgb.LGBMRegressor(
        objective="regression_l1",
        metric="mae",
        random_state=seed,
        learning_rate=trial.suggest_float("learning_rate", 0.005, 0.3),
        num_leaves=trial.suggest_int("num_leaves", 2, 127),
        max_depth=trial.suggest_int("max_depth", 2, 30),
        # The Optuna parameter name must match the LGBMRegressor keyword exactly
        # (it previously carried a trailing space), so that study.best_params
        # can be fed straight back into the model.
        n_estimators=trial.suggest_int("n_estimators", 50, 1000),
        # Further candidate search dimensions, currently disabled:
        # min_split_gain=trial.suggest_float("min_split_gain", 0.001, 1.0),
        # min_child_samples=trial.suggest_int("min_child_samples", 1, 100),
        # min_child_weight=trial.suggest_float("min_child_weight", 0.0001, 0.1),
        # subsample=trial.suggest_float("subsample", 0.1, 1.0),
        # subsample_freq=trial.suggest_int("subsample_freq", 0, 15),
        # colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0),
        # reg_alpha=trial.suggest_float("reg_alpha", 0.1, 1.0),
        # reg_lambda=trial.suggest_float("reg_lambda", 0.1, 1.0),
    )

    model.fit(X=dfTrainFeatures, y=dfTrainTarget)

    # Use the model to predict against our validation data
    vali_predicted = model.predict(dfValiFeatures)

    return mean_absolute_error(dfValiTarget, vali_predicted)

In [None]:
# Minimise the validation MAE returned by objective().
# TPE is the sampler Optuna uses by default for a single-objective study;
# it is created explicitly here only so that it can be seeded with the
# notebook-wide seed, which makes the sequence of sampled trials repeatable.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=25)

print("\n---------")
print("Study Complete")
study.best_params

In [None]:
# Report the best hyperparameters, the best objective value (validation MAE)
# and the full record of the best trial
print(study.best_params)
print(f"Best Rank Score: {study.best_value}")
print("-------")
print(study.best_trial)

# Summary


