In [12]:
import pickle
#from create_plots import dist_word
#from create_plots import generate_POS_wordcloud
#from create_plots import bigram_wordcould
import nltk
import create_plots
import re
from wordcloud import WordCloud
from itertools import chain
import numpy as np
import random
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import make_scorer
from numpy.random import seed
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd

In [13]:
# set seed
seed(42)

In [14]:
# open dataset
with open('enriched_thomann_data.pickle', 'rb') as f:
        thomann_data = pickle.load(f)

In [15]:
# custom metric
def custom_mse_metric(y_true, y_pred):
    mse_class = 0
    num_classes = len(np.unique(y_true))
    stacked = np.vstack((y_true, y_pred))
    for i in np.unique(stacked[0]):     
        y_true_temp = stacked[0][np.where(stacked[0]==i)]
        y_pred_temp = stacked[1][np.where(stacked[0]==i)]
        mse = np.mean(np.square(y_pred_temp - y_true_temp))
        mse_class += mse
    return mse_class/num_classes

custom_mse_score = make_scorer(custom_mse_metric, greater_is_better=False)

In [16]:
# basic feature engineering
thomann_data["total_word_count"] = [len(review.split()) for review in thomann_data["review"]]
thomann_data["count_nouns"] =[len([word for (word, tag) in thomann_data["taggedWords_neg"][i] if re.match("NN*", tag)]) for i, _ in enumerate(thomann_data["taggedWords_neg"])]
thomann_data["count_verbs"] =[len([word for (word, tag) in thomann_data["taggedWords_neg"][i] if re.match("VB*", tag)]) for i, _ in enumerate(thomann_data["taggedWords_neg"])]
thomann_data["count_conjunction"] =[len([word for (word, tag) in thomann_data["taggedWords_neg"][i] if re.match("CC*", tag)]) for i, _ in enumerate(thomann_data["taggedWords_neg"])]
thomann_data["count_adverb"] =[len([word for (word, tag) in thomann_data["taggedWords_neg"][i] if re.match("RB*", tag)]) for i, _ in enumerate(thomann_data["taggedWords_neg"])]
thomann_data["count_adjectives"] =[len([word for (word, tag) in thomann_data["taggedWords_neg"][i] if re.match("JJ*", tag)]) for i, _ in enumerate(thomann_data["taggedWords_neg"])]



In [17]:
# train- testsplit
y = thomann_data["stars_gesamt"]/20
X = thomann_data.loc[:, ~ thomann_data.columns.isin([x for x in thomann_data.columns if re.match("stars", x)])]
num_classes = len(np.unique(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
del X, y

In [18]:

# features auf traditionelle Feature anpassen

dmatrix_train = xgb.DMatrix(data = X_train[["count_nouns","count_verbs","count_conjunction","count_adverb","count_adjectives","total_word_count"]], label = y_train)
dmatrix_test = xgb.DMatrix(data = X_test[["count_nouns","count_verbs","count_conjunction","count_adverb","count_adjectives","total_word_count"]])

parameter = {
    "eta": 0.1, 
    "seed":0, 
    "objective": "reg:linear",
    "silent": True
}

xgb_model = xgb.train(params=parameter,
                      dtrain=dmatrix_train,
                      num_boost_round=100)

y_pred_xgb = xgb_model.predict(dmatrix_test)
#plot_importance(xgb_model)
del dmatrix_train, dmatrix_test

In [19]:
def classWiseMetric(y_true, y_pred):
    df = pd.DataFrame(np.column_stack([y_pred, y_true]), columns=["Predicted", "Truth"])
    class_mse = 0
    class_mae = 0
    print("MSE for Classes:")
    for i in df.Truth.unique():
        temp = df[df["Truth"]==i]
        mse = mean_squared_error(temp.Truth, temp.Predicted)
        mae = mean_absolute_error(temp.Truth, temp.Predicted)
        print("Class {}: MSE:{} MAE:{}".format(i, mse, mae))
        class_mse += mse
        class_mae += mae
    print()
    print("AVG MSE over Classes {}".format(class_mse/len(df.Truth.unique())))
    print("AVG MAE over Classes {}".format(class_mae/len(df.Truth.unique())))
    print()
    print("Global: MAE: {} MSE {}".format(mean_absolute_error(y_true, y_pred), mean_squared_error(y_true, y_pred)))

In [20]:
# baseline, predict mean of y_train
classWiseMetric(y_test, np.repeat(y_train.mean(),y_test.size))

MSE for Classes:
Class 4.0: MSE:0.19065221053228162 MAE:0.43663739021330006
Class 2.0: MSE:5.9372017713854826 MAE:2.4366373902132996
Class 5.0: MSE:0.3173774301056815 MAE:0.5633626097867
Class 3.0: MSE:2.0639269909588824 MAE:1.4366373902133
Class 1.0: MSE:11.810476551812082 MAE:3.4366373902132996

AVG MSE over Classes 4.0639269909588815
AVG MAE over Classes 1.6619824341279799

Global: MAE: 0.6709366984289776 MSE 0.6835984729835773


In [21]:
# baseline, predict most common class
classWiseMetric(y_test, np.repeat(5,y_test.size))

MSE for Classes:
Class 4.0: MSE:1.0 MAE:1.0
Class 2.0: MSE:9.0 MAE:3.0
Class 5.0: MSE:0.0 MAE:0.0
Class 3.0: MSE:4.0 MAE:2.0
Class 1.0: MSE:16.0 MAE:4.0

AVG MSE over Classes 6.0
AVG MAE over Classes 2.0

Global: MAE: 0.5639810426540285 MSE 1.001672706997491


In [22]:
# baseline, predict most common class
classWiseMetric(y_test, y_pred_xgb)

MSE for Classes:
Class 4.0: MSE:0.2050125746908523 MAE:0.41761376595858374
Class 2.0: MSE:4.936202196784972 MAE:2.1775632172488093
Class 5.0: MSE:0.3133088534555962 MAE:0.5255092131052491
Class 3.0: MSE:1.8068114673592084 MAE:1.3149523457190149
Class 1.0: MSE:10.403542652755759 MAE:3.148956310181391

AVG MSE over Classes 3.532975549009278
AVG MAE over Classes 1.5169189704426096

Global: MAE: 0.6239959333545823 MSE 0.6250284386381199
