In [1]:
import re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from collections import defaultdict
#import spacy
#import textacy
#nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [2]:
# load term matrix
import pickle
with open('data/term_matrix.pkl', 'rb') as f:
    matrix = pickle.load(f)
    f.close()

In [3]:
# load original reviews df
with open('data/reviews_processed.pkl', 'rb') as f:
    reviews = pickle.load(f)
    f.close()

In [4]:
# load look-up dict
with open('data/vocab_key.pkl', 'rb') as f:
    vocab = pickle.load(f)
    f.close()

In [None]:
matrix.shape

In [None]:
# make class balanced
# processed_df = reviews.groupby('star_rating')
# processed_df = pd.DataFrame(processed_df.progress_apply(
#     lambda x: x.sample(processed_df.size().min()).reset_index(drop=True)))
# processed_df.head()

In [5]:
# create new df
y = reviews['star_rating']

# Try product department instead of class
X = pd.DataFrame({'class':reviews['product_category_department'],
                  'upvotes':reviews['upvotes']})

In [None]:
# create df for features
lemmas = pd.DataFrame(columns=vocab.keys())

In [None]:
NUM_DOCS = len(X)

for c in tqdm(lemmas.columns.values, desc='Adding data to columns'):
    vocab_index = vocab[c]
    data = []
    for i in range(NUM_DOCS):
        data.append(matrix[(i, vocab_index)])
    lemmas[c] = data

In [None]:
X = X.reset_index(drop=True)
X.shape

In [None]:
lemmas = lemmas.reset_index(drop=True)
lemmas.shape

In [None]:
X_feats = pd.concat([X, lemmas], axis=1)

In [None]:
X_feats.shape

In [None]:
# make product class dummy variable
prod_class = pd.get_dummies(X['class'])
prod_class = prod_class.reset_index(drop=True)

In [None]:
# drop original class columns
# concat prod_class
X_feats.drop('class', axis=1, inplace=True)
X_feats = pd.concat([X_feats, prod_class], axis=1)

In [None]:
X_feats.shape

In [6]:
# X_feats.to_csv("data/features.csv", index=False)
X_feats = pd.read_csv("data/senti_features.csv")

In [7]:
X_feats = X_feats.drop('upvotes', axis=1)

Unnamed: 0,absolut,wonder,silki,sexi,comfort,love,dress,sooo,pretti,happen,...,moth,bailey,engag,umbrella,bottoms,dresses,intimate,jackets,tops,trend.1
0,0.08991,0.152306,0.03224,0.147115,-0.09285,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.010558,0.024489,0.08054,-0.004734,0.066538,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
2,0.0,0.0,0.0,0.0,-0.012742,0.0,0.003152,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.017284,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.019062,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0


In [9]:
# change to a 3 class problem
new_ys = []
for score in y:
    if score < 3:
        new_ys.append(0)
    elif score == 3:
        new_ys.append(1)
    else:
        new_ys.append(2)

In [28]:
# resplit and fit scaler
x_train, x_test, y_train, y_test = train_test_split(X_feats, new_ys, test_size=0.2, random_state=1234)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [29]:
# Fit Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
# Fit Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', 
                        multi_class='multinomial', 
                        n_jobs=-1)
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
from sklearn.metrics import classification_report, f1_score, accuracy_score, mean_absolute_error

# Random Forest
y_preds = rf.predict(x_test)
rf_f1 = f1_score(y_test, y_preds, average='weighted')
rf_acc = accuracy_score(y_test, y_preds)
rf_mae = mean_absolute_error(y_test, y_preds)
print("Random Forest")
print("F1_Weighted", rf_f1)
print("Accuracy", rf_acc)
print("MAE", rf_mae)
print("*"*40)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Random Forest
F1_Weighted 0.7376889673978086
Accuracy 0.8008390373150805
MAE 0.27997350408478694
****************************************
              precision    recall  f1-score   support

         Bad       0.58      0.20      0.30       449
     Neutral       0.48      0.04      0.08       534
        Good       0.81      0.99      0.89      3546

   micro avg       0.80      0.80      0.80      4529
   macro avg       0.62      0.41      0.42      4529
weighted avg       0.75      0.80      0.74      4529



In [32]:
# Logistic Regression
y_preds = lr.predict(x_test)
lr_f1 = f1_score(y_test, y_preds, average='weighted')
lr_acc = accuracy_score(y_test, y_preds)
lr_mae = mean_absolute_error(y_test, y_preds)
print("Logistic Regression")
print("F1_Weighted", lr_f1)
print("Accuracy", lr_acc)
print("MAE", lr_mae)
print("*"*40)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Logistic Regression
F1_Weighted 0.786516369031218
Accuracy 0.7906822698167366
MAE 0.26849194082578937
****************************************
              precision    recall  f1-score   support

         Bad       0.42      0.43      0.43       449
     Neutral       0.34      0.29      0.32       534
        Good       0.89      0.91      0.90      3546

   micro avg       0.79      0.79      0.79      4529
   macro avg       0.55      0.55      0.55      4529
weighted avg       0.78      0.79      0.79      4529



In [33]:
# Bernoulli NB
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(x_train, y_train)

y_preds = bnb.predict(x_test)
bnb_f1 = f1_score(y_test, y_preds, average='weighted')
bnb_acc = accuracy_score(y_test, y_preds)
bnb_mae = mean_absolute_error(y_test, y_preds)
print("Bernoulli NB")
print("F1_Weighted", bnb_f1)
print("Accuracy", bnb_acc)
print("MAE", bnb_mae)

classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

Bernoulli NB
F1_Weighted 0.8028628498123824
Accuracy 0.7999558401413116
MAE 0.24243762419960257
              precision    recall  f1-score   support

         Bad       0.51      0.50      0.51       449
     Neutral       0.35      0.38      0.36       534
        Good       0.91      0.90      0.91      3546

   micro avg       0.80      0.80      0.80      4529
   macro avg       0.59      0.60      0.59      4529
weighted avg       0.81      0.80      0.80      4529



In [34]:
import numpy as np
N = 5
for i, class_name in enumerate(classes):
  indices = np.argsort(bnb.coef_[i])
  feature_names = np.array(X_feats.columns.values)[indices]
  unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
  print("# '{}':".format(class_name))
  print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))

# 'Bad':
  . Top unigrams:
       . tops
       . look
       . love
       . dresses
       . size
# 'Neutral':
  . Top unigrams:
       . tops
       . look
       . size
       . dresses
       . fit
# 'Good':
  . Top unigrams:
       . tops
       . fit
       . size
       . love
       . look
