In [1]:
import re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from collections import defaultdict
#import spacy
#import textacy
#nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [3]:
# load term matrix
import pickle
with open('data/term_matrix.pkl', 'rb') as f:
    matrix = pickle.load(f)
    f.close()

In [4]:
# load original reviews df
with open('data/reviews_clean.pkl', 'rb') as f:
    reviews = pickle.load(f)
    f.close()

In [None]:
# load look-up dict
with open('data/vocab_key.pkl', 'rb') as f:
    vocab = pickle.load(f)
    f.close()

In [None]:
matrix.shape

In [None]:
# make class balanced
# processed_df = reviews.groupby('star_rating')
# processed_df = pd.DataFrame(processed_df.progress_apply(
#     lambda x: x.sample(processed_df.size().min()).reset_index(drop=True)))
# processed_df.head()

In [5]:
# create new df
y = reviews['star_rating']

# Try product department instead of class
X = pd.DataFrame({'class':reviews['product_category_department'],
                  'upvotes':reviews['upvotes']})

In [None]:
# create df for features
lemmas = pd.DataFrame(columns=vocab.keys())

In [None]:
NUM_DOCS = len(X)

for c in tqdm(lemmas.columns.values, desc='Adding data to columns'):
    vocab_index = vocab[c]
    data = []
    for i in range(NUM_DOCS):
        data.append(matrix[(i, vocab_index)])
    lemmas[c] = data

In [None]:
X = X.reset_index(drop=True)
X.shape

In [None]:
lemmas = lemmas.reset_index(drop=True)
lemmas.shape

In [None]:
X_feats = pd.concat([X, lemmas], axis=1)

In [None]:
X_feats.shape

In [None]:
# make product class dummy variable
prod_class = pd.get_dummies(X['class'])
prod_class = prod_class.reset_index(drop=True)

In [None]:
# drop original class columns
# concat prod_class
X_feats.drop('class', axis=1, inplace=True)
X_feats = pd.concat([X_feats, prod_class], axis=1)

In [None]:
X_feats.shape

In [6]:
# X_feats.to_csv("data/features.csv", index=False)
X_feats = pd.read_csv("data/senti_features.csv")

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_feats, y, test_size=0.2)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [8]:
# Fit Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
# Fit Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', 
                        multi_class='multinomial', 
                        n_jobs=-1)
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
# Evaluate accuracy
from sklearn.metrics import f1_score, accuracy_score, mean_absolute_error

# Random Forest
y_preds = rf.predict(x_test)
rf_f1 = f1_score(y_test, y_preds, average='weighted')
rf_acc = accuracy_score(y_test, y_preds)
rf_mae = mean_absolute_error(y_test, y_preds)

# Logistic Regression
y_preds = lr.predict(x_test)
lr_f1 = f1_score(y_test, y_preds, average='weighted')
lr_acc = accuracy_score(y_test, y_preds)
lr_mae = mean_absolute_error(y_test, y_preds)

In [11]:
print("Random Forest")
print("F1_Weighted", rf_f1)
print("Accuracy", rf_acc)
print("MAE", rf_mae)
print("*"*40)
print("Logistic Regression")
print("F1_Weighted", lr_f1)
print("Accuracy", lr_acc)
print("MAE", lr_mae)

Random Forest
F1_Weighted 0.47929329277221994
Accuracy 0.5846765290351071
MAE 0.6869066018988739
****************************************
Logistic Regression
F1_Weighted 0.5577151185450562
Accuracy 0.5628173989843233
MAE 0.6710090527710312


In [12]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           1       0.13      0.22      0.16       157
           2       0.21      0.19      0.20       331
           3       0.31      0.31      0.31       560
           4       0.36      0.28      0.31       952
           5       0.76      0.79      0.77      2529

   micro avg       0.56      0.56      0.56      4529
   macro avg       0.35      0.36      0.35      4529
weighted avg       0.56      0.56      0.56      4529



In [None]:
# # kNN
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier()
# knn.fit(x_train, y_train)

# y_preds = knn.predict(x_test)
# knn_f1 = f1_score(y_test, y_preds, average='weighted')
# knn_acc = accuracy_score(y_test, y_preds)
# knn_mae = mean_absolute_error(y_test, y_preds)

# print("SVC")
# print("F1_Weighted", knn_f1)
# print("Accuracy", knn_acc)
# print("MAE", knn_mae)

In [13]:
# change to a 3 class problem
new_ys = []
for score in y:
    if score < 3:
        new_ys.append(0)
    elif score == 3:
        new_ys.append(1)
    else:
        new_ys.append(2)

In [14]:
# resplit and fit scaler
x_train, x_test, y_train, y_test = train_test_split(X_feats, new_ys, test_size=0.2)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [15]:
# Fit Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
# Fit Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', 
                        multi_class='multinomial', 
                        n_jobs=-1)
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
# Random Forest
y_preds = rf.predict(x_test)
rf_f1 = f1_score(y_test, y_preds, average='weighted')
rf_acc = accuracy_score(y_test, y_preds)
rf_mae = mean_absolute_error(y_test, y_preds)

# Logistic Regression
# y_preds = lr.predict(x_test)
# lr_f1 = f1_score(y_test, y_preds, average='weighted')
# lr_acc = accuracy_score(y_test, y_preds)
# lr_mae = mean_absolute_error(y_test, y_preds)

In [18]:
print("Random Forest")
print("F1_Weighted", rf_f1)
print("Accuracy", rf_acc)
print("MAE", rf_mae)
print("*"*40)
print("Logistic Regression")
print("F1_Weighted", lr_f1)
print("Accuracy", lr_acc)
print("MAE", lr_mae)

Random Forest
F1_Weighted 0.7276550602608697
Accuracy 0.7968646500331199
MAE 0.28527268712740117
****************************************
Logistic Regression
F1_Weighted 0.7785375977302226
Accuracy 0.7847206888937955
MAE 0.2784279090306911


In [21]:
# kNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

y_preds = knn.predict(x_test)
knn_f1 = f1_score(y_test, y_preds, average='weighted')
knn_acc = accuracy_score(y_test, y_preds)
knn_mae = mean_absolute_error(y_test, y_preds)

print("SVC")
print("F1_Weighted", knn_f1)
print("Accuracy", knn_acc)
print("MAE", knn_mae)

SVC
F1_Weighted 0.7016984057696425
Accuracy 0.7697063369397218
MAE 0.3294325458158534


In [24]:
classes = ['Bad', 'Neutral', 'Good']
print(classification_report(y_test, y_preds, target_names=classes))

              precision    recall  f1-score   support

         Bad       0.64      0.17      0.27       457
     Neutral       0.45      0.03      0.06       544
        Good       0.80      1.00      0.89      3528

   micro avg       0.80      0.80      0.80      4529
   macro avg       0.63      0.40      0.41      4529
weighted avg       0.75      0.80      0.73      4529

