In [41]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import fasttext as ft

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
from utilities import regex_utility as reutil
import features_utility as featutil

import nltk

## File Details - Light GBM Regression NLP

First file looking at NLP on the beer name and text features. Investigate using fastText, because it is fast!
This file might start off with just the beer name, then we will go from there.

Characteristics:
* Light GBM Regression Algorithm
* Start working on NLP on the Beer name text columns


In [42]:
# Run configuration.
# filePrefix names this experiment: it is passed to the feature/model helpers
# and is used to build the sub-run CSV file name.
filePrefix = "A3_126_lgbm_nlp_beertext_supervised"
# NOTE(review): absolute, machine-specific path - anyone else running this
# notebook has to edit it. It must keep the trailing "/" because file names
# are appended to it with plain string concatenation.
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
# Relative working directories (each keeps its trailing "/").
subrunDir = "subruns/"
featuresDataDir = "features/"
modelsDir = "models/"
# When True, the predictions are written to a CSV in subrunDir at the end of the notebook.
writeSubRunFile = True
# Seed supplied by the project helper; in this notebook it is only referenced
# by the commented-out LightGBM model.
seed = databasic.get_random_seed()

In [43]:
# Full path of each input TSV, built from the shared data directory
# (order: train, validation, features, test).
trainFilePath, valiFilePath, featuresFilePath, testFilePath = (
    baseDataDir + tsvName
    for tsvName in ('train.tsv', 'val.tsv', 'features.tsv', 'test.tsv')
)

# trainFilePath = baseDataDir + 'train_200k.tsv'
# valiFilePath = baseDataDir + 'vali_200k.tsv'
# featuresFilePath = baseDataDir + 'features_200k.tsv'
# testFilePath = baseDataDir + 'test_200k.tsv'

In [44]:
# The three review files share one column layout. Column names are supplied
# explicitly, so the first line of each file is read as data.
reviewColumns = ['RowID','BeerID','ReviewerID',
                 'BeerName','BeerType','rating']

# Read the splits in the order train, validation, test
df_train, df_vali, df_test = (
    pd.read_csv(tsvPath, sep='\t', names=reviewColumns)
    for tsvPath in (trainFilePath, valiFilePath, testFilePath)
)

print(df_train.shape)

(845008, 6)


In [45]:
# # RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
# # df_features = pd.read_csv(baseDataDir + 'features_500k.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
# Column layout of the features TSV. Names are supplied explicitly, so the
# first line of the file is read as data.
featureFileColumns = ['RowID','BrewerID','ABV','DayofWeek','Month',
                      'DayofMonth','Year','TimeOfDay','Gender',
                      'Birthday','Text','Lemmatized','POS_Tag']
df_features = pd.read_csv(featuresFilePath, sep='\t', names=featureFileColumns)

df_features.head()

# Only the lemmatized review text is carried forward from the features file
colsToUse = ["Lemmatized"]


In [46]:

# Attach the selected feature columns to each data split by matching on the
# actual RowID values.
#
# DataFrame.join(other, on="RowID") matches the caller's RowID column against
# the *index* of `other`. df_features is read with a default 0..n-1 index, so
# joining df_features[colsToUse] directly pairs each review with the feature
# row at that position - only correct if RowID happens to equal the row
# position in the features file. Indexing the features by RowID first makes
# the match explicit. (colsToUse must not contain "RowID" itself, since that
# column becomes the index here.)
dfFeaturesByRowId = df_features.set_index("RowID")[colsToUse]

# The inner join keeps only the reviews that have a matching feature record;
# the left-hand index of each split is preserved.
dfFullData = df_train.join(dfFeaturesByRowId, on="RowID", how="inner", rsuffix="Feat")
dfFullDataVali = df_vali.join(dfFeaturesByRowId, on="RowID", how="inner", rsuffix="Feat")
dfFullDataTest = df_test.join(dfFeaturesByRowId, on="RowID", how="inner", rsuffix="Feat")

# Beer name and beer type are not used by this model, so drop them here
df_train_data = dfFullData.drop(["BeerName", "BeerType"],axis=1)
df_vali_data = dfFullDataVali.drop(["BeerName", "BeerType"],axis=1)
df_test_data = dfFullDataTest.drop(["BeerName", "BeerType"],axis=1)

df_train_data.head()

Unnamed: 0,RowID,BeerID,ReviewerID,rating,Lemmatized
0,19,12300,10635,4.0,"`` warning , this beer have a intense smoke fl..."
1,21,12300,6547,4.5,Amber brown in color with very little head but...
2,23,12300,9789,4.5,"complex enjoyable smoke . tasty rich malt , ca..."
3,24,12300,7372,5.0,"this beer pour a dark amber color , with a per..."
4,25,12300,1302,4.5,pour a rich burn caramel hue with some deep am...


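Let's look at the lemmatized review text (the earlier version of this notebook looked at just the beer name first). Format the text column for supervised fastText training - one document per line, prefixed with its `__label__<rating>` - and save it to file. Then we can load it with fastText and train a model on it.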

In [47]:
# Sanity check: validation row/column count before the text formatting step
print(df_vali_data.shape)

(243834, 5)


In [48]:
# Name of the text column the NLP model is built from
colName = "Lemmatized"
# Project helper (features_utility). It hands back the three data frames plus
# the path of a document file, which is later passed to the fastText training helper.
# NOTE(review): the printed sample shows a "__label__<rating> <text>" column with
# common stop words gone - presumably the fastText supervised input format.
# The meaning of the trailing 0 argument is not visible in this notebook;
# confirm it in features_utility.
df_train_data, df_vali_data, df_test_data, documentFilePath = featutil.formatTextColForNLPSupervised(
    df_train_data, df_vali_data, df_test_data, colName, featuresDataDir, filePrefix, 0)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nelso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                     LemmatizedLabel
1  __label__4.5 amber brown color little head nic...
2  __label__4.5 complex enjoyable smoke tasty ric...
3  __label__5.0 beer pour dark amber color perfec...
4  __label__4.5 pour rich burn caramel hue deep a...
5  __label__4.5 draught beer stein rauch bock che...
6  __label__5.0 22oz bottle pour clear amber tigh...
7  __label__4.5 deep cherry body faint beige head...
8  __label__4.0 pour 22oz bomber ayinger celebrat...
9  __label__4.0 share bottle tenderbranson69 pick...


Now train a fastText supervised model. Check whether there is a saved model to use; otherwise train a new one.

In [49]:
# Sanity check: validation row/column count after the text formatting step
print(df_vali_data.shape)

(243834, 5)


In [50]:
# Pass in an existing model file (generated by another run) to reuse it;
# it is left empty here.
modelFileToUse = ""

# Project helper (features_utility) that returns the fastText model used below
# for the vocabulary listing, the document vectors and the predictions.
# NOTE(review): 200 presumably is the embedding dimension (the document-vector
# columns produced later run from _DocVec_0 to _DocVec_199); the meaning of the
# final True flag is not visible here - confirm both in features_utility.
fasttext_model = featutil.getFastTextLangModelSupervised(colName, modelFileToUse,  modelsDir, filePrefix, documentFilePath, 200, True)

# Peek at the first 50 entries of the model vocabulary
print(fasttext_model.words[0:50])

# examine some of the word vectors
# print(fasttext_model.get_word_vector("stout"))

['</s>', 'beer', 'head', 'hop', 'taste', 'pour', 'malt', 'flavor', 'nice', 'good', 'light', 'like', 'smell', 'sweet', 'aroma', 'one', 'bit', 'bottle', 'finish', 'dark', 'carbonation', 'color', 'glass', 'well', 'little', 'mouthfeel', 'would', 'lacing', 'brown', 'note', 'alcohol', 'really', 'white', 'body', 'much', 'caramel', 'chocolate', 'get', 'medium', 'drink', 'nose', 'great', 'brew', 'fruit', 'hint', 'smooth', 'leave', 'bitterness', 'bitter', 'citrus']


In [51]:
# Create a new dataframe that has just the ids, the rating and the document
# vector, with each vector component in its own column.
# Only the training split is converted: the validation and test conversions are
# disabled below, and the validation split is scored later by calling the
# fastText model's predict() on the raw text column.
df_train_data = featutil.convertToDocVectorDataSet(df_train_data, colName, fasttext_model)
# df_vali_data = featutil.convertToDocVectorDataSet(df_vali_data, colName, fasttext_model)
# df_test_data = featutil.convertToDocVectorDataSet(df_test_data, colName, fasttext_model)

df_train_data.head()

Unnamed: 0,RowID,BeerID,ReviewerID,rating,Lemmatized_DocVec_0,Lemmatized_DocVec_1,Lemmatized_DocVec_2,Lemmatized_DocVec_3,Lemmatized_DocVec_4,Lemmatized_DocVec_5,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,19,12300,10635,4.0,0.027748,0.017034,-0.033596,-0.036577,0.093753,0.011921,...,0.003158,0.019271,0.038356,-0.011493,0.023297,-0.031086,-0.005403,0.013005,-0.033269,-0.02645
1,21,12300,6547,4.5,-0.009907,0.040332,-0.024713,0.002329,0.063008,0.026923,...,0.013729,0.022427,0.087125,0.004723,0.021917,-0.01153,-0.013467,0.03344,-0.050096,-0.07478
2,23,12300,9789,4.5,0.026733,0.009134,0.011481,-0.012636,0.034114,0.016022,...,0.026691,0.02061,0.117741,0.013414,0.025082,-0.030421,-0.022731,0.044451,-0.098812,-0.101796
3,24,12300,7372,5.0,-0.001222,0.04585,-0.030376,-0.008558,0.098255,0.018821,...,0.010711,0.022204,0.093252,0.007135,0.033894,-0.023813,-0.013041,0.041588,-0.087465,-0.087643
4,25,12300,1302,4.5,-0.004483,0.031152,-0.027568,-0.009516,0.063459,0.01456,...,0.006685,0.014033,0.063868,0.002771,0.029713,-0.022151,-0.021135,0.035744,-0.063698,-0.071717


In [52]:
# # Write test data to file, when we do a complete run. Otherwise, just drop the test data out of memory
# del df_test
# del df_test_data

In [53]:
# Column groups: identifiers, document-vector features and the target
idCols = ['RowID','BeerID','ReviewerID']
target_col = 'rating'
col_names = df_train_data.columns
# The feature columns are the ones generated from the text column (prefix "Lemmatized_")
feature_cols = [name for name in col_names if name.startswith("Lemmatized_")]

# Training subsets
dfTrainIds = df_train_data[idCols]
dfTrainFeatures = df_train_data[feature_cols]
dfTrainTarget = df_train_data[target_col]

# Validation subsets. No feature frame is built for validation because that
# split was not converted to document vectors.
dfValiIds = df_vali_data[idCols]
dfValiTarget = df_vali_data[target_col]





In [54]:

# Plain Python list of the validation texts, in row order, for the fastText predict call
lstValiTextCol = df_vali_data[colName].tolist()
dfValiIds.head()

Unnamed: 0,RowID,BeerID,ReviewerID
0,22,12300,2634
1,27,12300,5634
2,28,12300,3544
3,40,12300,6521
4,43,12300,10177


In [55]:
# Inspect the training feature matrix: one row per review, one column per
# document-vector component
print(dfTrainFeatures.shape)
dfTrainFeatures.head()

(746207, 200)


Unnamed: 0,Lemmatized_DocVec_0,Lemmatized_DocVec_1,Lemmatized_DocVec_2,Lemmatized_DocVec_3,Lemmatized_DocVec_4,Lemmatized_DocVec_5,Lemmatized_DocVec_6,Lemmatized_DocVec_7,Lemmatized_DocVec_8,Lemmatized_DocVec_9,...,Lemmatized_DocVec_190,Lemmatized_DocVec_191,Lemmatized_DocVec_192,Lemmatized_DocVec_193,Lemmatized_DocVec_194,Lemmatized_DocVec_195,Lemmatized_DocVec_196,Lemmatized_DocVec_197,Lemmatized_DocVec_198,Lemmatized_DocVec_199
0,0.027748,0.017034,-0.033596,-0.036577,0.093753,0.011921,0.009011,-0.032714,0.088039,0.14733,...,0.003158,0.019271,0.038356,-0.011493,0.023297,-0.031086,-0.005403,0.013005,-0.033269,-0.02645
1,-0.009907,0.040332,-0.024713,0.002329,0.063008,0.026923,0.00444,-0.017439,0.078392,0.088087,...,0.013729,0.022427,0.087125,0.004723,0.021917,-0.01153,-0.013467,0.03344,-0.050096,-0.07478
2,0.026733,0.009134,0.011481,-0.012636,0.034114,0.016022,-0.011701,-0.026069,0.040446,0.07389,...,0.026691,0.02061,0.117741,0.013414,0.025082,-0.030421,-0.022731,0.044451,-0.098812,-0.101796
3,-0.001222,0.04585,-0.030376,-0.008558,0.098255,0.018821,0.020067,-0.038238,0.088649,0.090147,...,0.010711,0.022204,0.093252,0.007135,0.033894,-0.023813,-0.013041,0.041588,-0.087465,-0.087643
4,-0.004483,0.031152,-0.027568,-0.009516,0.063459,0.01456,0.009373,-0.019233,0.060789,0.080719,...,0.006685,0.014033,0.063868,0.002771,0.029713,-0.022151,-0.021135,0.035744,-0.063698,-0.071717


In [56]:
# Check that the validation target and the validation frame have the same
# number of rows, then preview the validation data (still raw text, no vectors)
print(dfValiTarget.shape)
print(df_vali_data.shape)
df_vali_data.head()

(243834,)
(243834, 5)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,Lemmatized
0,22,12300,2634,4.0,pour bomber shaker pint glass
1,27,12300,5634,4.5,get 22 ounce bomber pour deep bright brown pre...
2,28,12300,3544,4.5,mood something different usual assortment ipa ...
3,40,12300,6521,4.0,caldera pale ale pick grape gourmet vb light g...
4,43,12300,10177,4.5,receive beer powerball really sorry know send ...


In [57]:
#  best params
# {'learning_rate': 0.2472480229316372, 'num_leaves': 125, 'max_depth': 24, 'n_estimators ': 796}. 


# No tuning
# model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed)

# best params  
  
# model = lgb.LGBMRegressor(objective="regression_l1", metric="mae", random_state=seed
#     ,learning_rate=0.2472480229316372, num_leaves = 125, max_depth = 24, n_estimators = 796
#   )

# model.fit(X=dfTrainFeatures, y=dfTrainTarget)

In [58]:
# Score the validation texts with the fastText classifier. The result is a
# pair: the predicted labels and their probabilities, one entry per input text.
test_predicted = fasttext_model.predict(lstValiTextCol)

# Show the container types of the pair and of each of its two members
for piece in (test_predicted, test_predicted[0], test_predicted[1]):
    print(type(piece))

<class 'tuple'>
<class 'list'>
<class 'list'>


In [59]:
# Number of predicted labels (should equal the validation row count) and a sample
print(len(test_predicted[0]))
print(test_predicted[0][:20])

243834
[['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0'], ['__label__4.0']]


In [60]:
# Sample of the probabilities that accompany the predicted labels
print(test_predicted[1][0:20])

[array([0.35361546], dtype=float32), array([0.31673735], dtype=float32), array([0.263824], dtype=float32), array([0.32535633], dtype=float32), array([0.3074722], dtype=float32), array([0.3102886], dtype=float32), array([0.29837528], dtype=float32), array([0.30455717], dtype=float32), array([0.3141752], dtype=float32), array([0.3262775], dtype=float32), array([0.28866246], dtype=float32), array([0.31331453], dtype=float32), array([0.29002008], dtype=float32), array([0.3293735], dtype=float32), array([0.3487898], dtype=float32), array([0.3293735], dtype=float32), array([0.29153654], dtype=float32), array([0.32098266], dtype=float32), array([0.2874152], dtype=float32), array([0.3262775], dtype=float32)]


In [61]:
# Turn each top predicted label ("__label__4.0") into its numeric rating (4.0)
lstPredictedNum = [
    float(labels[0].replace("__label__", ""))
    for labels in test_predicted[0]
]

In [77]:
# Sample of the numeric predictions and of the raw label strings they came from
print(lstPredictedNum[:20])
predList = [labels[0] for labels in test_predicted[0]]
print(predList[:10])

# Keep the raw label next to its parsed value so the distinct values can be inspected
dfPredTemp = pd.DataFrame(dict(Label=predList, Prediction=lstPredictedNum))
dfPredTemp["Label"].drop_duplicates().head(20)


# Distinct predicted ratings, in order of first appearance
dfPredTemp["Prediction"].drop_duplicates().head(20)


[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]
['__label__4.0', '__label__4.0', '__label__4.0', '__label__4.0', '__label__4.0', '__label__4.0', '__label__4.0', '__label__4.0', '__label__4.0', '__label__4.0']


0         4.0
211       NaN
8588      0.0
12869     3.5
41274     1.0
89539     3.0
93169     4.5
225596    1.5
Name: Prediction, dtype: float64

In [78]:
# Mean of the parsed predictions (not displayed: it is not the cell's last expression)
dfPredTemp["Prediction"].mean()
# Some labels parse to NaN; replace those predictions with 4.0
dfPredTemp.loc[dfPredTemp["Prediction"].isna(), "Prediction"] = 4.0

In [79]:
# Only the last expression of a cell is displayed, so this first line shows nothing
dfPredTemp["Label"].head()
# Distinct predicted ratings after the NaN fill (NaN should no longer appear)
dfPredTemp["Prediction"].drop_duplicates().head(20)


0         4.0
8588      0.0
12869     3.5
41274     1.0
89539     3.0
93169     4.5
225596    1.5
Name: Prediction, dtype: float64

In [80]:
# Mean absolute error of the parsed fastText predictions against the validation ratings
mae = mean_absolute_error(dfValiTarget, dfPredTemp["Prediction"])
maeText = str(mae)

# Second line is formatted so it can be pasted into the analysis notebook
print("Average MAE: " + maeText)
print("analyse_maes.append(" + maeText + ")")

Average MAE: 0.49473002124396104
analyse_maes.append(0.49473002124396104)


In [None]:
# Distribution of the numeric validation predictions.
# test_predicted is the raw (labels, probabilities) tuple returned by the
# fastText model, so it cannot be put in a column and plotted directly.
# Use the parsed ratings in dfPredTemp["Prediction"] (NaN already filled)
# instead. to_numpy() gives the new frame a fresh 0..n-1 index.
dfPredicted = pd.DataFrame({"Predict": dfPredTemp["Prediction"].to_numpy()})
dfPredicted['Predict'].hist(bins=10)


Write to a subrun file

In [None]:
# Put the predictions next to the validation ids. The rows are matched by
# position, so the ids' own index is discarded first.
dfPredicted = pd.concat([dfValiIds.reset_index(drop=True), dfPredicted], axis=1)

# Optionally persist the sub-run predictions as CSV
if writeSubRunFile:
    subrunFilePath = subrunDir + filePrefix + "_subrun.csv"
    dfPredicted.to_csv(subrunFilePath, index=False)

print("Average MAE: " + str(mae))
print(dfPredicted.shape)
dfPredicted.sort_values("RowID").head(20)


# Summary

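No Param Tuning
* Beer Name NLP, 200k
* MAE 0.4382028648981029
* Note: the run recorded above in this notebook (lemmatized review text, fastText supervised labels, full data set) gave MAE 0.49473002124396104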

