In [None]:
# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error


import nltk

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil

## File Details - 

This isn't a run file. It loads in all the data, does all the data preprocessing and writes out new full files containing all the data, so that we can just load these in the future.


In [None]:
# --- Run configuration ---
# Prefix passed to the feature utilities together with the features/models directories.
filePrefix = "A3_130_create_full_featres_processed"

# Directory settings: where the input data lives, plus the sub-folder names.
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
featuresDataDir = "features/"
modelsDir = "models/"
subrunDir = "subruns/"

# Switches.
fastTextModelForceRetrain = False
writeSubRunFile = False

# Random seed supplied by the shared utility module.
seed = databasic.get_random_seed()

In [None]:
# Input files, all resolved against baseDataDir.
trainFilePath = f"{baseDataDir}train.tsv"
valiFilePath = f"{baseDataDir}val.tsv"
testFilePath = f"{baseDataDir}test.tsv"
featuresFilePath = f"{baseDataDir}features.tsv"

# Alternative "_200k" input files, currently disabled:
# trainFilePath = baseDataDir + 'train_200k.tsv'
# valiFilePath = baseDataDir + 'vali_200k.tsv'
# featuresFilePath = baseDataDir + 'features_200k.tsv'
# testFilePath = baseDataDir + 'test_200k.tsv'

In [None]:
# File layout: RowID  BeerID  ReviewerID  BeerName  BeerType  Label
# The column names are supplied explicitly, so no line of the file is consumed as
# a header. The label column is loaded under the name 'rating', which is the name
# the later cells use when they drop it from the document-vector frames.
reviewColumns = ['RowID', 'BeerID', 'ReviewerID', 'BeerName', 'BeerType', 'rating']

# The same schema is used for all three files. The mid-cell head(10) calls were
# removed: a notebook only displays the last expression of a cell, so their
# results were computed and thrown away.
df_train = pd.read_csv(trainFilePath, sep='\t', names=reviewColumns)
df_vali = pd.read_csv(valiFilePath, sep='\t', names=reviewColumns)
df_test = pd.read_csv(testFilePath, sep='\t', names=reviewColumns)


In [None]:

# Shape of each set, one per line, followed by a preview of the training rows.
print(df_train.shape, df_vali.shape, df_test.shape, sep="\n")

df_train.head(10)

Add the Review Count columns for Reviewers and Beers to the Train, Validation and Test sets

In [None]:
# For each set in turn: apply addReviewerReviewCount first, then addBeerReviewCount
# on its result.
df_train = featutil.addBeerReviewCount(featutil.addReviewerReviewCount(df_train))

df_vali = featutil.addBeerReviewCount(featutil.addReviewerReviewCount(df_vali))

df_test = featutil.addBeerReviewCount(featutil.addReviewerReviewCount(df_test))


In [None]:

# Shape of each set, then the first training rows when ordered by reviewer.
print(df_train.shape, df_vali.shape, df_test.shape, sep="\n")
df_train.sort_values(by="ReviewerID").head(10)


In [None]:
# One hot encode Beer Type: all three sets go through the helper in a single call
# and come back in the same order.
df_train, df_vali, df_test = dfutil.getDummiesForTripleSets(
    df_train,
    df_vali,
    df_test,
    "BeerType",
)

In [None]:

# Shape of each set, one per line, followed by a preview of the training rows.
print(df_train.shape, df_vali.shape, df_test.shape, sep="\n")

df_train.head(10)

In [None]:
# Convert Beer Name to document vector columns
textColumn = "BeerName"

# Prepare the text column on all three sets; the helper also returns the path of
# the document file that is then passed to the FastText model helper.
df_train, df_vali, df_test, documentFilePath = featutil.formatTextColForNLP(
    df_train, df_vali, df_test, textColumn, featuresDataDir, filePrefix, 0, 50
)
fasttext_model_bn = featutil.getFastTextLangModel(
    textColumn, "", modelsDir, filePrefix, documentFilePath, 200, fastTextModelForceRetrain
)

# One document-vector frame per set, all built with the same model.
df_train_doc_vect = featutil.convertToDocVectorDataSet(df_train, textColumn, fasttext_model_bn)
df_vali_doc_vect = featutil.convertToDocVectorDataSet(df_vali, textColumn, fasttext_model_bn)
df_test_doc_vect = featutil.convertToDocVectorDataSet(df_test, textColumn, fasttext_model_bn)

# Remove the id/label columns from the vector frames. Done in place so the
# existing frame objects are modified rather than copied.
carriedOverColumns = ["BeerID", "ReviewerID", "rating"]
df_train_doc_vect.drop(columns=carriedOverColumns, inplace=True)
df_vali_doc_vect.drop(columns=carriedOverColumns, inplace=True)
df_test_doc_vect.drop(columns=carriedOverColumns, inplace=True)


In [None]:

# Shapes of the three sets followed by the shapes of their document-vector frames.
print(
    df_train.shape,
    df_vali.shape,
    df_test.shape,
    df_train_doc_vect.shape,
    df_vali_doc_vect.shape,
    df_test_doc_vect.shape,
    sep="\n",
)

Join the Beer Name document vectors back onto each set, then load the Features and join

In [None]:
# The vector frames still carry RowID; remove it in place before the column-wise
# concat so it is not duplicated in the combined frames.
df_train_doc_vect.drop(columns="RowID", inplace=True)
df_vali_doc_vect.drop(columns="RowID", inplace=True)
df_test_doc_vect.drop(columns="RowID", inplace=True)

# Put each set side by side with its vectors. reset_index() adds an "index"
# column, which is dropped again together with the original BeerName text column.
df_train = pd.concat(
    [df_train.reset_index(), df_train_doc_vect], axis=1
).drop(columns=["index", "BeerName"])
df_vali = pd.concat(
    [df_vali.reset_index(), df_vali_doc_vect], axis=1
).drop(columns=["index", "BeerName"])
df_test = pd.concat(
    [df_test.reset_index(), df_test_doc_vect], axis=1
).drop(columns=["index", "BeerName"])

# The Beer Name model and the intermediate vector frames are no longer needed.
del fasttext_model_bn, df_train_doc_vect, df_vali_doc_vect, df_test_doc_vect

In [None]:

# Shape of each set, one per line, followed by a preview of the training rows.
print(df_train.shape, df_vali.shape, df_test.shape, sep="\n")

df_train.head(10)

In [None]:
# File layout:
# RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
featureColumns = [
    'RowID', 'BrewerID', 'ABV', 'DayofWeek', 'Month', 'DayofMonth', 'Year',
    'TimeOfDay', 'Gender', 'Birthday', 'Text', 'Lemmatized', 'POS_Tag',
]
df_features = pd.read_csv(featuresFilePath, sep='\t', names=featureColumns)

print(df_features.shape)
df_features.head()

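NLP processing on Beer Name and one hot encoding on Beer Type are done above. Now join the Features onto the Train, Validation and Test sets by RowID, ready for the remaining feature transformations.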

In [None]:
# DataFrame.join(other, on="RowID") matches the caller's RowID column against the
# *index* of `other`, not against a column of the same name. df_features was read
# with the default positional index, so joining it directly pairs each review with
# the feature row located at position RowID rather than the row carrying that RowID.
# Index the features by RowID first so the join is a genuine key match.
# drop=False keeps the features' own RowID column, which still comes through as
# "RowIDFeat" via rsuffix, so the resulting columns are the same as before.
df_features = df_features.set_index("RowID", drop=False).rename_axis(None)

df_train = df_train.join(df_features, on="RowID", how="inner", rsuffix="Feat")
df_vali = df_vali.join(df_features, on="RowID", how="inner", rsuffix="Feat")
df_test = df_test.join(df_features, on="RowID", how="inner", rsuffix="Feat")

# The features frame has been merged into all three sets; release it.
del df_features

In [None]:

# Shape of each set after the features join, one per line.
print(df_train.shape, df_vali.shape, df_test.shape, sep="\n")

In [None]:
# At this stage only the Lemmatized column is used, so the raw Text and POS_Tag
# columns are removed in place from every set.
unusedTextColumns = ["Text", "POS_Tag"]
df_train.drop(columns=unusedTextColumns, inplace=True)
df_vali.drop(columns=unusedTextColumns, inplace=True)
df_test.drop(columns=unusedTextColumns, inplace=True)

In [None]:
# Feature transformations, part one.
# Run the ABV null fix on each set.
df_train = featutil.fixNullABV(df_train)
df_vali = featutil.fixNullABV(df_vali)
df_test = featutil.fixNullABV(df_test)

# One hot encode the categorical columns, BrewerID first and then Gender; each
# pass handles all three sets in a single call.
for categoricalColumn in ("BrewerID", "Gender"):
    df_train, df_vali, df_test = dfutil.getDummiesForTripleSets(
        df_train, df_vali, df_test, categoricalColumn
    )


In [None]:
# Shape of each set after the categorical encoding, one per line.
print(df_train.shape, df_vali.shape, df_test.shape, sep="\n")

In [None]:

# Apply each transform to train, validation and test before moving on to the
# next transform.
columnTransforms = (
    featutil.formatDayOfWeek,
    featutil.formatMonth,
    featutil.formatTimeToSec,
    featutil.convertBirthdayToAge,
)
for columnTransform in columnTransforms:
    df_train = columnTransform(df_train)
    df_vali = columnTransform(df_vali)
    df_test = columnTransform(df_test)

In [None]:
# Shape of each set, one per line, followed by a preview of the training rows.
print(df_train.shape, df_vali.shape, df_test.shape, sep="\n")
df_train.head()

In [None]:
# Convert Lemmatized Review Text to document vector columns
# Step 1: prepare the text column on all three sets; the helper also returns the
# path of the document file.
df_train, df_vali, df_test, documentFilePath = featutil.formatTextColForNLP(
    df_train, df_vali, df_test, "Lemmatized", featuresDataDir, filePrefix, 0, 50
)
# Step 2: obtain the FastText model for this column from that document file.
fasttext_model_lem = featutil.getFastTextLangModel(
    "Lemmatized", "", modelsDir, filePrefix, documentFilePath, 200, fastTextModelForceRetrain
)

In [None]:
# scaler = StandardScaler()

In [None]:
# Build the Lemmatized document vectors for the training set.
df_train_doc_vect = featutil.convertToDocVectorDataSet(df_train, "Lemmatized", fasttext_model_lem)

# Remove the id/label columns from the vector frame in place, so they are not
# duplicated by the column-wise concat below.
df_train_doc_vect.drop(columns=["BeerID", "ReviewerID", "rating", "RowID"], inplace=True)

# reset_index() adds an "index" column; drop it again together with the original
# Lemmatized text column.
df_train = pd.concat(
    [df_train.reset_index(), df_train_doc_vect], axis=1
).drop(columns=["index", "Lemmatized"])
del df_train_doc_vect

print(df_train.shape)

In [None]:
# # Apply Standard Scaling to the set of feature columns we want to target
# df_train = featutil.scaleMinMaxFeatureDataFrame(df_train)

In [None]:

# Write the processed training set next to the input data, then release it.
trainOutputPath = f"{baseDataDir}train_features_preprocessed.csv"
df_train.to_csv(trainOutputPath, index=False)

del df_train

In [None]:

# Build the Lemmatized document vectors for the validation set.
df_vali_doc_vect = featutil.convertToDocVectorDataSet(df_vali, "Lemmatized", fasttext_model_lem)

# Remove the id/label columns from the vector frame in place, so they are not
# duplicated by the column-wise concat below.
df_vali_doc_vect.drop(columns=["BeerID", "ReviewerID", "rating", "RowID"], inplace=True)

# reset_index() adds an "index" column; drop it again together with the original
# Lemmatized text column.
df_vali = pd.concat(
    [df_vali.reset_index(), df_vali_doc_vect], axis=1
).drop(columns=["index", "Lemmatized"])
del df_vali_doc_vect

# # Apply Standard Scaling to the set of feature columns we want to target
# df_vali = featutil.scaleMinMaxFeatureDataFrame(df_vali)

print(df_vali.shape)

# Write the processed validation set next to the input data, then release it.
valiOutputPath = f"{baseDataDir}vali_features_preprocessed.csv"
df_vali.to_csv(valiOutputPath, index=False)

del df_vali



In [None]:

# Build the Lemmatized document vectors for the test set.
df_test_doc_vect = featutil.convertToDocVectorDataSet(df_test, "Lemmatized", fasttext_model_lem)

# Remove the id/label columns from the vector frame in place, so they are not
# duplicated by the column-wise concat below.
df_test_doc_vect.drop(columns=["BeerID", "ReviewerID", "rating", "RowID"], inplace=True)

# reset_index() adds an "index" column; drop it again together with the original
# Lemmatized text column.
df_test = pd.concat(
    [df_test.reset_index(), df_test_doc_vect], axis=1
).drop(columns=["index", "Lemmatized"])
del df_test_doc_vect

# # Apply Standard Scaling to the set of feature columns we want to target
# df_test = featutil.scaleMinMaxFeatureDataFrame(df_test)

print(df_test.shape)

# Write the processed test set next to the input data, then release it.
testOutputPath = f"{baseDataDir}test_features_preprocessed.csv"
df_test.to_csv(testOutputPath, index=False)

del df_test

In [None]:
# Free up the memory: the Lemmatized FastText model is not used again after the
# train, validation and test document vectors have been built above.
del fasttext_model_lem