In [1]:
import re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from collections import defaultdict
import pickle

In [2]:
with open("data/reviews_processed.pkl", "rb") as f:
    reviews = pickle.load(f)
    f.close()

In [6]:
# load term matrix
with open('data/ngram_matrix.pkl', 'rb') as f:
    matrix = pickle.load(f)
    f.close()

In [7]:
# load look-up dict
with open('data/ngram_vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
    f.close()

In [8]:
# create new df
y = reviews['star_rating']

# Try product department instead of class
X = pd.DataFrame({'class':reviews['product_category_department'],
                  'upvotes':reviews['upvotes']})

In [9]:
# create df for features
lemmas = pd.DataFrame(columns=vocab.keys())

NUM_DOCS = len(X)

for c in tqdm(lemmas.columns.values, desc='Adding data to columns'):
    vocab_index = vocab[c]
    data = []
    for i in range(NUM_DOCS):
        data.append(matrix[(i, vocab_index)])
    lemmas[c] = data

HBox(children=(IntProgress(value=0, description='Adding data to columns', max=2000, style=ProgressStyle(descri…




In [10]:
X = X.reset_index(drop=True)
lemmas = lemmas.reset_index(drop=True)
X_feats = pd.concat([X, lemmas], axis=1)
# make product class dummy variable
prod_class = pd.get_dummies(X['class'])
prod_class = prod_class.reset_index(drop=True)
# drop original class columns
# concat prod_class
X_feats.drop('class', axis=1, inplace=True)
X_feats = pd.concat([X_feats, prod_class], axis=1)
X_feats = X_feats.drop('upvotes', axis=1)

X_feats.to_csv("data/ngram_features.csv", index=False)

In [11]:
# change to a 3 class problem
new_ys = []
for score in y:
    if score < 3:
        new_ys.append(0)
    elif score == 3:
        new_ys.append(1)
    else:
        new_ys.append(2)

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# resplit and fit scaler
x_train, x_test, y_train, y_test = train_test_split(X_feats, new_ys, test_size=0.2, random_state=1234)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  if __name__ == '__main__':


In [27]:
# Fit Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
# Fit Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', 
                        multi_class='multinomial', 
                        n_jobs=-1)
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
from sklearn.metrics import classification_report, f1_score, accuracy_score, mean_absolute_error

# Random Forest
y_preds = rf.predict(x_test)
rf_f1 = f1_score(y_test, y_preds, average='weighted')
rf_acc = accuracy_score(y_test, y_preds)
rf_mae = mean_absolute_error(y_test, y_preds)
print("Random Forest")
print("F1_Weighted", rf_f1)
print("Accuracy", rf_acc)
print("MAE", rf_mae)
print("*"*40)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Random Forest
F1_Weighted 0.7580024499901495
Accuracy 0.8098918083462133
MAE 0.2589975712077721
****************************************
              precision    recall  f1-score   support

         Bad       0.62      0.27      0.38       449
     Neutral       0.42      0.08      0.13       534
        Good       0.83      0.99      0.90      3546

   micro avg       0.81      0.81      0.81      4529
   macro avg       0.62      0.44      0.47      4529
weighted avg       0.76      0.81      0.76      4529



In [30]:
# Logistic Regression
y_preds = lr.predict(x_test)
lr_f1 = f1_score(y_test, y_preds, average='weighted')
lr_acc = accuracy_score(y_test, y_preds)
lr_mae = mean_absolute_error(y_test, y_preds)
print("Logistic Regression")
print("F1_Weighted", lr_f1)
print("Accuracy", lr_acc)
print("MAE", lr_mae)
print("*"*40)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Logistic Regression
F1_Weighted 0.7971321035659268
Accuracy 0.797747847206889
MAE 0.2583351733274454
****************************************
              precision    recall  f1-score   support

         Bad       0.43      0.47      0.45       449
     Neutral       0.37      0.34      0.36       534
        Good       0.91      0.91      0.91      3546

   micro avg       0.80      0.80      0.80      4529
   macro avg       0.57      0.57      0.57      4529
weighted avg       0.80      0.80      0.80      4529



In [37]:
# Bernoulli NB
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(x_train, y_train)

y_preds = bnb.predict(x_test)
bnb_f1 = f1_score(y_test, y_preds, average='weighted')
bnb_acc = accuracy_score(y_test, y_preds)
bnb_mae = mean_absolute_error(y_test, y_preds)
print("Bernoulli NB")
print("F1_Weighted", bnb_f1)
print("Accuracy", bnb_acc)
print("MAE", bnb_mae)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Bernoulli NB
F1_Weighted 0.8059131165362621
Accuracy 0.7948774563921396
MAE 0.24707440936189004
              precision    recall  f1-score   support

         Bad       0.47      0.56      0.51       449
     Neutral       0.35      0.43      0.38       534
        Good       0.94      0.88      0.91      3546

   micro avg       0.79      0.79      0.79      4529
   macro avg       0.59      0.62      0.60      4529
weighted avg       0.82      0.79      0.81      4529



In [32]:
import numpy as np
N = 5
for i, class_name in enumerate(classes):
    indices = np.argsort(bnb.coef_[i])
    feature_names = np.array(X_feats.columns.values)[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(class_name))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

# 'Bad':
  . Top unigrams:
       . tops
       . like
       . dresses
       . size
       . fit
  . Top bigrams:
       . wanted love
       . looked like
       . looks like
       . ordered size
       . really wanted
# 'Neutral':
  . Top unigrams:
       . tops
       . like
       . size
       . fit
       . dresses
  . Top bigrams:
       . wanted love
       . true size
       . did work
       . just did
       . really wanted
# 'Good':
  . Top unigrams:
       . tops
       . love
       . size
       . fit
       . wear
  . Top bigrams:
       . true size
       . looks great
       . usually wear
       . fit perfectly
       . love dress
