In [1]:
import re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from collections import defaultdict
import pickle

In [13]:
# load term matrix
with open('data/title_matrix.pkl', 'rb') as f:
    matrix = pickle.load(f)
    f.close()

# load original reviews df
with open('data/reviews_processed.pkl', 'rb') as f:
    reviews = pickle.load(f)
    f.close()

# load look-up dict
with open('data/title_key.pkl', 'rb') as f:
    vocab = pickle.load(f)
    f.close()

In [14]:
reviews.index = pd.RangeIndex(len(reviews.index))

In [16]:
# filter reviews without title
with_title = reviews[reviews['clean_title'] != '']

In [17]:
filtered_idx = reviews[reviews['clean_title'] != ''].index.values

In [19]:
# create new df
y = with_title['star_rating']

# Try product department instead of class
X = pd.DataFrame({'class':with_title['product_category_department'],
                  'upvotes':with_title['upvotes']})

# change to a 3 class problem
new_ys = []
for score in y:
    if score < 3:
        new_ys.append(0)
    elif score == 3:
        new_ys.append(1)
    else:
        new_ys.append(2)

In [20]:
# create df for features
lemmas = pd.DataFrame(columns=vocab.keys())
for c in tqdm(lemmas.columns.values, desc='Adding data to columns'):
    vocab_index = vocab[c]
    data = []
    for i in filtered_idx:
        data.append(matrix[(i-1, vocab_index)])
    lemmas[c] = data

HBox(children=(IntProgress(value=0, description='Adding data to columns', max=3057, style=ProgressStyle(descri…




In [21]:
X = X.reset_index(drop=True)
lemmas = lemmas.reset_index(drop=True)
X_feats = pd.concat([X, lemmas], axis=1)

# make product class dummy variable
prod_class = pd.get_dummies(X['class'])
prod_class = prod_class.reset_index(drop=True)

# drop original class columns
# concat prod_class
X_feats.drop('class', axis=1, inplace=True)
X_feats = pd.concat([X_feats, prod_class], axis=1)

X_feats.to_csv("data/title_features.csv", index=False)

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_feats, new_ys, test_size=0.2)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [23]:
from sklearn.metrics import f1_score, accuracy_score, classification_report, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LinearRegression

# Fit Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

# Fit Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', 
                        multi_class='multinomial', 
                        n_jobs=-1)
lr.fit(x_train, y_train)

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(x_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [24]:
# Random Forest
y_preds = rf.predict(x_test)
rf_f1 = f1_score(y_test, y_preds, average='weighted')
rf_acc = accuracy_score(y_test, y_preds)
rf_mae = mean_absolute_error(y_test, y_preds)

print("Random Forest")
print("F1_Weighted", rf_f1)
print("Accuracy", rf_acc)
print("MAE", rf_mae)
print("*"*40)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Random Forest
F1_Weighted 0.6656767040982435
Accuracy 0.7440914866581957
MAE 0.37229987293519695
****************************************
              precision    recall  f1-score   support

         Bad       0.14      0.02      0.04       423
     Neutral       0.18      0.03      0.05       496
        Good       0.77      0.96      0.85      3016

   micro avg       0.74      0.74      0.74      3935
   macro avg       0.36      0.34      0.32      3935
weighted avg       0.63      0.74      0.67      3935



In [25]:
#Logistic Regression
y_preds = lr.predict(x_test)
lr_f1 = f1_score(y_test, y_preds, average='weighted')
lr_acc = accuracy_score(y_test, y_preds)
lr_mae = mean_absolute_error(y_test, y_preds)
print("Logistic Regression")
print("F1_Weighted", lr_f1)
print("Accuracy", lr_acc)
print("MAE", lr_mae)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Logistic Regression
F1_Weighted 0.6604496662980864
Accuracy 0.737738246505718
MAE 0.3801778907242694
              precision    recall  f1-score   support

         Bad       0.08      0.01      0.02       423
     Neutral       0.14      0.03      0.05       496
        Good       0.77      0.96      0.85      3016

   micro avg       0.74      0.74      0.74      3935
   macro avg       0.33      0.33      0.31      3935
weighted avg       0.61      0.74      0.66      3935



In [26]:
#Bernoulli NaivesBayes
y_preds = bnb.predict(x_test)
bnb_f1 = f1_score(y_test, y_preds, average='weighted')
bnb_acc = accuracy_score(y_test, y_preds)
bnb_mae = mean_absolute_error(y_test, y_preds)
print("Bernoulli NB")
print("F1_Weighted", bnb_f1)
print("Accuracy", bnb_acc)
print("MAE", bnb_mae)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Bernoulli NB
F1_Weighted 0.6575100249628001
Accuracy 0.7415501905972046
MAE 0.3761118170266836
              precision    recall  f1-score   support

         Bad       0.03      0.00      0.01       423
     Neutral       0.07      0.01      0.02       496
        Good       0.77      0.97      0.85      3016

   micro avg       0.74      0.74      0.74      3935
   macro avg       0.29      0.33      0.29      3935
weighted avg       0.60      0.74      0.66      3935

