In [1]:
# Import libraries
import pandas as pd
import numpy as np

from lightfm import LightFM

from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil
import features_utility as featutil



## File Details - Light GBM Regression on Beer Context columns inc review counts

This is a Candidate for being used in an Ensemble. 
Characteristicts:
* Light GBM Regression Algorithm
* Using Beer Context columns inc ABV, Year and Review Counts
* Todo: use optimised parameters for Light GBM Regression


In [2]:
filePrefix = "A3_161_contentknn_tinkering2"
baseDataDir = "C:/Development/Data/COSC2670/Assignment3/A3data/"
subrunDir = "subruns/"
writeSubRunFile = True
seed = databasic.get_random_seed()

In [3]:
# trainFilePath = baseDataDir + 'train.tsv'
# valiFilePath = baseDataDir + 'val.tsv'
# featuresFilePath = baseDataDir + 'features.tsv'
# testFilePath = baseDataDir + 'test.tsv'

trainFilePath = baseDataDir + 'train_200k.tsv'
valiFilePath = baseDataDir + 'vali_200k.tsv'
featuresFilePath = baseDataDir + 'features_200k.tsv'
testFilePath = baseDataDir + 'test_200k.tsv'

In [4]:
# RowID  BeerID  ReviewerID  BeerName  BeerType  Label
df_train = pd.read_csv(trainFilePath,sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_train.head(10)

# df_vali = pd.read_csv(baseDataDir + 'vali_500k.tsv',sep='\t',
df_vali = pd.read_csv(valiFilePath,sep='\t',
                         names=['RowID','BeerID','ReviewerID',
                                  'BeerName','BeerType','rating'])
df_vali.head(10)


Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating
0,22,12300,2634,Rauch �r Bock,Rauchbier,4.0
1,27,12300,5634,Rauch �r Bock,Rauchbier,4.5
2,28,12300,3544,Rauch �r Bock,Rauchbier,4.5
3,40,12300,6521,Rauch �r Bock,Rauchbier,4.0
4,43,12300,10177,Rauch �r Bock,Rauchbier,4.5
5,48,12300,2907,Rauch �r Bock,Rauchbier,3.5
6,49,12300,1532,Rauch �r Bock,Rauchbier,4.0
7,50,12300,3452,Rauch �r Bock,Rauchbier,3.5
8,59,12300,6861,Rauch �r Bock,Rauchbier,4.0
9,64,6699,6401,Caldera Pale Ale,American Pale Ale (APA),4.5


Add the Review Count columns for Reviewers and Beers to both the Train and Validation sets

In [5]:
df_train = featutil.addReviewerReviewCount(df_train)
df_train = featutil.addBeerReviewCount(df_train)

df_vali = featutil.addReviewerReviewCount(df_vali)
df_vali = featutil.addBeerReviewCount(df_vali)


In [6]:
# RowID BrewerID ABV DayofWeek Month DayofMonth Year TimeOfDay Gender Birthday Text Lemmatized POS_Tag
# df_features = pd.read_csv(baseDataDir + 'features_500k.tsv',sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
df_features = pd.read_csv(featuresFilePath,sep='\t', names=['RowID','BrewerID','ABV','DayofWeek','Month',
                                                                 'DayofMonth','Year','TimeOfDay','Gender',
                                                                 'Birthday','Text','Lemmatized','POS_Tag'])

df_features.head()

Unnamed: 0,RowID,BrewerID,ABV,DayofWeek,Month,DayofMonth,Year,TimeOfDay,Gender,Birthday,Text,Lemmatized,POS_Tag
0,18,1075,7.4,Mon,Jan,2,2012,15:20:04,Male,unknown,Pours a murky light brown with a 1 inch fizzy ...,pour a murky light brown with a 1 inch fizzy t...,VBZ DT JJ NN JJ IN DT CD NN JJ NN NN WDT VBZ I...
1,19,1075,7.4,Sun,Jan,1,2012,06:46:52,Male,unknown,Faint sudsy head with some with some dissipati...,faint sudsy head with some with some dissipate...,NN JJ NN IN DT IN DT VBG JJ NN . JJ JJ NN . DT...
2,20,1075,7.4,Tue,Nov,29,2011,05:51:44,Male,unknown,A new arrival to the West TN area ... Pours qu...,a new arrival to the West TN area ... pour qui...,"DT JJ NN IN DT NNP NNP NN , VBZ PDT DT NN JJR ..."
3,21,1075,7.4,Sat,Nov,5,2011,22:59:57,Male,unknown,Sampled 10/30/11 - Transferring the notes . A ...,sample 10/30/11 - transfer the note . a ruby p...,VBN CD HYPH VBG DT NNS . DT NN VBP IN DT NN NN...
4,22,1075,7.4,Tue,Nov,1,2011,20:40:21,Male,"Oct 14, 1983",This is my first rauchbier . Pours a burnt amb...,this be my first rauchbier . pour a burn amber...,DT VBZ PRP$ JJ NN . VBZ DT VBN NN . JJ NN . NN...


In [7]:
colsToUse = ["RowID", "BrewerID", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender", "TimeOfDay", "Birthday"]

# Find the feature records that match the training and validation data and join them together
dfFullData = df_train.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")
dfFullDataVali = df_vali.join(df_features[colsToUse], on="RowID", how="inner", rsuffix="Feat")

dfFullData.head()

Unnamed: 0,RowID,BeerID,ReviewerID,BeerName,BeerType,rating,ReviewerReviewCount,BeerReviewCount,RowIDFeat,BrewerID,ABV,DayofWeek,DayofMonth,Month,Year,Gender,TimeOfDay,Birthday
0,19,12300,10635,Rauch �r Bock,Rauchbier,4.0,36,23,37,1075,7.4,Mon,23,May,2011,Male,15:36:28,unknown
1,21,12300,6547,Rauch �r Bock,Rauchbier,4.5,4,23,40,1075,7.4,Mon,16,May,2011,Male,00:31:46,unknown
2,23,12300,9789,Rauch �r Bock,Rauchbier,4.5,36,23,42,1075,7.4,Sun,10,Apr,2011,unknown,12:17:26,unknown
3,24,12300,7372,Rauch �r Bock,Rauchbier,5.0,69,23,43,1075,7.4,Wed,30,Mar,2011,Male,14:08:00,unknown
4,25,12300,1302,Rauch �r Bock,Rauchbier,4.5,59,23,45,1075,7.4,Thu,24,Mar,2011,Male,14:07:00,"Jul 25, 1984"


In [8]:
# Remove the duplicated Row ID, also remove Beer Name at this point, we're nt using it
df_train_data = dfFullData.drop(['RowIDFeat', "BeerName"],axis=1)
df_vali_data = dfFullDataVali.drop(['RowIDFeat', "BeerName"],axis=1)

df_train_data.head()

Unnamed: 0,RowID,BeerID,ReviewerID,BeerType,rating,ReviewerReviewCount,BeerReviewCount,BrewerID,ABV,DayofWeek,DayofMonth,Month,Year,Gender,TimeOfDay,Birthday
0,19,12300,10635,Rauchbier,4.0,36,23,1075,7.4,Mon,23,May,2011,Male,15:36:28,unknown
1,21,12300,6547,Rauchbier,4.5,4,23,1075,7.4,Mon,16,May,2011,Male,00:31:46,unknown
2,23,12300,9789,Rauchbier,4.5,36,23,1075,7.4,Sun,10,Apr,2011,unknown,12:17:26,unknown
3,24,12300,7372,Rauchbier,5.0,69,23,1075,7.4,Wed,30,Mar,2011,Male,14:08:00,unknown
4,25,12300,1302,Rauchbier,4.5,59,23,1075,7.4,Thu,24,Mar,2011,Male,14:07:00,"Jul 25, 1984"


In [9]:
print(df_train_data.shape)
print(df_vali_data.shape)

(106546, 16)
(35089, 16)


In [10]:
colsToUse.append("BeerType")
colsToUse.append("ReviewerReviewCount")
colsToUse.append("BeerReviewCount")

considerCol = {}

# Initialize them all to True, use all the cols
for colName in colsToUse:
  considerCol[colName] = True

# "RowID", "BrewerID", "BeerType", "ABV", "DayofWeek", "DayofMonth", "Month", "Year", "Gender", "TimeOfDay"
# Disable any column we want

# Usually always use Year and REview Counts
# considerCol["Year"] = False
# considerCol["ReviewerReviewCount"] = False
# considerCol["BeerReviewCount"] = False

# Beer Context Columns
# considerCol["ABV"] = False
# considerCol["BrewerID"] = False
# considerCol["BeerType"] = False

# Consumer Context Columns
# considerCol["DayofWeek"] = False
# considerCol["DayofMonth"] = False
# considerCol["Month"] = False
# considerCol["TimeOfDay"] = False
# considerCol["Birthday"] = False
# considerCol["Gender"] = False


In [11]:
if considerCol["ABV"] == False:
  del df_train_data["ABV"]
  del df_vali_data["ABV"]
else:
  df_train_data = featutil.fixNullABV(df_train_data)
  df_vali_data = featutil.fixNullABV(df_vali_data)

In [12]:
if considerCol["BrewerID"] == False:
  del df_train_data["BrewerID"]
  del df_vali_data["BrewerID"]
else: 
  df_train_data, df_vali_data = dfutil.getDummiesForSplitSets(df_train_data, df_vali_data, "BrewerID")


  df_combined.columns = df_combined.columns.str.replace(" ", "").str.replace("/", "").str.replace("-", "") \


In [13]:
if considerCol["BeerType"] == False:
  del df_train_data["BeerType"]
  del df_vali_data["BeerType"]
else: 
  df_train_data, df_vali_data = dfutil.getDummiesForSplitSets(df_train_data, df_vali_data, "BeerType")

In [14]:
if considerCol["Gender"] == False:
  del df_train_data["Gender"]
  del df_vali_data["Gender"]
else: 
  df_train_data, df_vali_data = dfutil.getDummiesForSplitSets(df_train_data, df_vali_data, "Gender")

In [15]:
if considerCol["DayofWeek"] == False:
  del df_train_data["DayofWeek"]
  del df_vali_data["DayofWeek"]
else: 
  df_train_data = featutil.formatDayOfWeek(df_train_data)
  df_vali_data = featutil.formatDayOfWeek(df_vali_data)

In [16]:
if considerCol["Month"] == False:
  del df_train_data["Month"]
  del df_vali_data["Month"]
else: 
  df_train_data = featutil.formatMonth(df_train_data)
  df_vali_data = featutil.formatMonth(df_vali_data)

In [17]:
if considerCol["DayofMonth"] == False:
  del df_train_data["DayofMonth"]
  del df_vali_data["DayofMonth"]

In [18]:
if considerCol["Year"] == False:
  del df_train_data["Year"]
  del df_vali_data["Year"]

In [19]:
if considerCol["TimeOfDay"] == False:
  del df_train_data["TimeOfDay"]
  del df_vali_data["TimeOfDay"]
else: 
  df_train_data = featutil.formatTimeToSec(df_train_data)
  df_vali_data = featutil.formatTimeToSec(df_vali_data)

In [20]:
if considerCol["Birthday"] == False:
  del df_train_data["Birthday"]
  del df_vali_data["Birthday"]
else: 
  df_train_data = featutil.convertBirthdayToAge(df_train_data)
  df_vali_data = featutil.convertBirthdayToAge(df_vali_data)

In [21]:
if considerCol["ReviewerReviewCount"] == False:
  del df_train_data["ReviewerReviewCount"]
  del df_vali_data["ReviewerReviewCount"]

In [22]:
if considerCol["BeerReviewCount"] == False:
  del df_train_data["BeerReviewCount"]
  del df_vali_data["BeerReviewCount"]

In [23]:
print(df_train_data.shape)
print(df_vali_data.shape)

df_train_data.head()

(106546, 403)
(35089, 403)


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,19,12300,10635,4.0,36,23,7.4,1,23,5,...,0,0,0,0,0,0,0,0,1,0
1,21,12300,6547,4.5,4,23,7.4,1,16,5,...,0,0,0,0,0,0,0,0,1,0
2,23,12300,9789,4.5,36,23,7.4,7,10,4,...,0,0,0,0,0,0,0,0,0,1
3,24,12300,7372,5.0,69,23,7.4,3,30,3,...,0,0,0,0,0,0,0,0,1,0
4,25,12300,1302,4.5,59,23,7.4,4,24,3,...,0,0,0,0,0,0,0,0,1,0


In [24]:
df_vali_data.head()

Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,22,12300,2634,4.0,4,9,7.4,6,14,5,...,0,0,0,0,0,0,0,0,0,1
1,27,12300,5634,4.5,9,9,7.4,4,10,2,...,0,0,0,0,0,0,0,0,0,1
2,28,12300,3544,4.5,40,9,7.4,5,10,12,...,0,0,0,0,0,0,0,0,0,1
3,40,12300,6521,4.0,19,9,5.5,4,27,8,...,0,0,0,0,0,0,0,0,1,0
4,43,12300,10177,4.5,2,9,5.5,1,10,8,...,0,0,0,0,0,0,0,1,0,0


In [25]:
# free up the memory
del dfFullData
del dfFullDataVali

In [26]:
# Get all the columns
col_names = df_train_data.columns

idCols = ['RowID','BeerID','ReviewerID']
feature_cols =  col_names.drop(['RowID','BeerID','ReviewerID','rating' ])
target_col = 'rating'

# Create the sub data sets of the features and the target
dfTrainIds = df_train_data[idCols]
dfTrainFeatures = df_train_data[feature_cols]
dfTrainTarget = df_train_data[target_col]

dfValiIds = df_vali_data[idCols]
dfValiFeatures = df_vali_data[feature_cols]
dfValiTarget = df_vali_data[target_col]


In [27]:
df_vali_data[['BeerID','ReviewerID','rating']].head()

Unnamed: 0,BeerID,ReviewerID,rating
0,12300,2634,4.0
1,12300,5634,4.5
2,12300,3544,4.5
3,12300,6521,4.0
4,12300,10177,4.5


In [28]:
df_train_data.head()


Unnamed: 0,RowID,BeerID,ReviewerID,rating,ReviewerReviewCount,BeerReviewCount,ABV,DayofWeek,DayofMonth,Month,...,BeerType_SmokedBeer,BeerType_Tripel,BeerType_ViennaLager,BeerType_Weizenbock,BeerType_Wheatwine,BeerType_WinterWarmer,BeerType_Witbier,Gender_Female,Gender_Male,Gender_unknown
0,19,12300,10635,4.0,36,23,7.4,1,23,5,...,0,0,0,0,0,0,0,0,1,0
1,21,12300,6547,4.5,4,23,7.4,1,16,5,...,0,0,0,0,0,0,0,0,1,0
2,23,12300,9789,4.5,36,23,7.4,7,10,4,...,0,0,0,0,0,0,0,0,0,1
3,24,12300,7372,5.0,69,23,7.4,3,30,3,...,0,0,0,0,0,0,0,0,1,0
4,25,12300,1302,4.5,59,23,7.4,4,24,3,...,0,0,0,0,0,0,0,0,1,0


In [29]:
from lightfm.data import Dataset

beerIds = dfTrainIds["BeerID"].drop_duplicates()
reviewerIds = dfTrainIds["ReviewerID"].drop_duplicates()



In [30]:

def generate_feature_list(dataframe, features_name):
    """
    Generate features list for mapping 

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    features_name : List
        List of feature columns name avaiable in dataframe. 
        
    Returns
    -------
    List of all features for mapping 
    """
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True)
    return features

In [31]:
beerFeatureCols = ["BeerReviewCount", "ABV"]
reviewerFeatureCols = ["ReviewerReviewCount", "DayofWeek", "DayofMonth", "Month", "Year"]

beerFeatures = generate_feature_list(df_train_data[beerFeatureCols], beerFeatureCols)
reviewerFeatures = generate_feature_list(df_train_data[reviewerFeatureCols], reviewerFeatureCols)

In [44]:

def create_features(dataframe, features_name, id_col_name):
    """
    Generate features that will be ready for feeding into lightfm

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains features
    features_name : List
        List of feature columns name avaiable in dataframe
    id_col_name: String
        Column name which contains id of the question or
        answer that the features will map to.
        There are two possible values for this variable.
        1. questions_id_num
        2. professionals_id_num

    Returns
    -------
    Pandas Series
        A pandas series containing process features
        that are ready for feed into lightfm.
        The format of each value
        will be (user_id, ['feature_1', 'feature_2', 'feature_3'])
        Ex. -> (1, ['military', 'army', '5'])
    """

    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = list(zip(dataframe[id_col_name], features))
    return features

In [34]:
beerFeatures

0         23.0
1          7.4
2         23.0
3          7.4
4         23.0
          ... 
213087     5.0
213088     4.0
213089     5.0
213090     4.0
213091     5.0
Length: 213092, dtype: object

In [38]:
beerReviewerTuples = list(zip(beerIds, reviewerIds))

In [36]:
beerReviewerTuples

[(12300, 10635),
 (7957, 6547),
 (5508, 9789),
 (4522, 7372),
 (6768, 1302),
 (825, 704),
 (7333, 1747),
 (471, 9368),
 (9597, 2568),
 (13705, 6838),
 (14108, 850),
 (13375, 9705),
 (12406, 3264),
 (9194, 2962),
 (7551, 2748),
 (11379, 757),
 (7622, 7207),
 (9185, 2849),
 (8029, 4737),
 (13087, 7826),
 (10274, 8720),
 (10201, 10680),
 (10972, 4094),
 (9753, 7162),
 (191, 10135),
 (10034, 6330),
 (8964, 1292),
 (9443, 2359),
 (1456, 10356),
 (13635, 7279),
 (13944, 1525),
 (10546, 1977),
 (11426, 3544),
 (10308, 4582),
 (5432, 5769),
 (10517, 6488),
 (6736, 10606),
 (6180, 9733),
 (11181, 879),
 (6041, 5764),
 (8188, 1524),
 (2790, 681),
 (6695, 1251),
 (5947, 2201),
 (10692, 5144),
 (13542, 8883),
 (13735, 8353),
 (11845, 3712),
 (13725, 9761),
 (197, 6361),
 (2048, 7637),
 (12653, 2282),
 (13726, 5912),
 (7472, 6464),
 (12928, 8372),
 (14093, 8435),
 (931, 5153),
 (5789, 8568),
 (10973, 3637),
 (12485, 5604),
 (9088, 2639),
 (1889, 5261),
 (1673, 458),
 (9366, 1189),
 (11965, 1891),
 

In [39]:

dataset = Dataset()
dataset.fit(set(beerIds), set(reviewerIds), item_features=reviewerFeatures, user_features=beerFeatures)

In [40]:
interactions = dataset.build_interactions(beerReviewerTuples)

In [41]:
interactions

(<1764x9026 sparse matrix of type '<class 'numpy.int32'>'
 	with 1764 stored elements in COOrdinate format>,
 <1764x9026 sparse matrix of type '<class 'numpy.float32'>'
 	with 1764 stored elements in COOrdinate format>)

In [43]:
beerFeaturesBuilt = dataset.build_item_features(beerFeatures)
reviewerFeaturesBuilt = dataset.build_item_features(reviewerFeatures)

ValueError: Expected tuples of (item_id, features), got 23.0.

In [42]:
# Set the number of threads; you can increase this
# if you have more physical cores available.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
               no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
model = model.fit(interactions, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

AttributeError: 'tuple' object has no attribute 'tocoo'

In [None]:


model = algorithm.fit(trainsetTrainFeatures)
predictions = algorithm.test(valset)


# Score our predictions with MAE
# It is around 0.77, which means the a random guess based on the distribution of the data
# is on average within 0.77 (plus or minus) the true label.
# Not bad! You can beat it though, I'm sure :).
# Smaller MAE is the better. Good luck!
mae = accuracy.mae(predictions,verbose=True)

print("Average MAE: " + str(mae))

In [None]:
print(type(valset))
print(valset[0:10])


In [None]:
print(type(predictions))
print(str(len(predictions)))
print(predictions[0:10])
print(predictions[0])

In [None]:
# Convert the Predictions to a dataframe so we can lookup predictions easy
lstUIds = list(map(lambda x: x.uid, predictions))
lstIIds = list(map(lambda x: x.iid, predictions))
lstTrueRatings = list(map(lambda x: x.r_ui, predictions))
lstRatingEst = list(map(lambda x: x.est, predictions))


# uid == BeerId, iid == ReviewerId, r_ui == Original Ration, est = Predicted rating
dfPredictions = pd.DataFrame({ "uid": lstUIds,"iid": lstIIds, "r_ui": lstTrueRatings, "Predict": lstRatingEst })

dfPredictions.head()
# dfPredictions[dfPredictions.uid == 3519]

In [None]:
print(dfValiIds.shape)
print(dfPredictions.shape)

In [None]:
# join the predictions to the ids, sort by rowid and write to file
dfPredictions = pd.merge(dfValiIds, dfPredictions, how="inner", left_on=["BeerID", "ReviewerID"], right_on=["uid", "iid"])
dfPredictions.head()


Write to a subrun file

In [None]:
if writeSubRunFile:
  dfPredictions.sort_values("RowID")[["RowID", "BeerID", "ReviewerID", "Predict"]].to_csv(subrunDir + filePrefix + "_subrun.csv", index=False)

print("Average MAE: " + str(mae))
print("analyse_maes.append(" + str(mae) + ")")
print(dfPredictions.shape)
dfPredictions.sort_values("RowID").head(8)


## Summary - 200k of data

First Run of Content KNN with just the numbers col came out to around 0.5 (exact value lost)

Adding Scaling: 0.501

## Full Data
