In [1]:
import re
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
from collections import defaultdict
#import spacy
#import textacy
#nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [2]:
# load term matrix
import pickle
with open('data/term_matrix.pkl', 'rb') as f:
    matrix = pickle.load(f)
    f.close()

In [3]:
# load original reviews df
with open('data/reviews_clean.pkl', 'rb') as f:
    reviews = pickle.load(f)
    f.close()

In [17]:
# load look-up dict
with open('data/vocab_key.pkl', 'rb') as f:
    vocab = pickle.load(f)
    f.close()

In [11]:
matrix.shape

(22641, 1000)

In [24]:
# create new df
y = reviews['star_rating']
X = pd.DataFrame({'class':reviews['product_category_class'],
                  'upvotes':reviews['upvotes']})

In [9]:
# create df for features
lemmas = pd.DataFrame(columns=vocab.values())

In [None]:
NUM_DOCS = 22641

for c in tqdm(lemmas.columns.values, desc='Adding data to columns'):
    vocab_index = vocab[c]
    data = []
    for i in tqdm(range(NUM_DOCS), desc='Going though reviews'):
        data.append(matrix[(i, vocab_index)])
    lemmas[c] = data

In [48]:
X = X.reset_index(drop=True)
X.shape

(22641, 2)

In [47]:
lemmas = lemmas.reset_index(drop=True)
lemmas.shape

(22641, 1000)

In [54]:
X_feats = pd.concat([X, lemmas], axis=1)

In [55]:
X_feats.shape

(22641, 1002)

In [56]:
# make product class dummy variable
prod_class = pd.get_dummies(X['class'])
prod_class = prod_class.reset_index(drop=True)

In [57]:
# drop original class columns
# concat prod_class
X_feats.drop('class', axis=1, inplace=True)
X_feats = pd.concat([X_feats, prod_class], axis=1)

In [58]:
X_feats.shape

(22641, 1021)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_feats, y, test_size=0.2)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [61]:
# Fit Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [62]:
# Fit Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', 
                        multi_class='multinomial', 
                        n_jobs=-1)
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [63]:
# Evaluate accuracy
from sklearn.metrics import f1_score, accuracy_score, mean_absolute_error

# Random Forest
y_preds = rf.predict(x_test)
rf_f1 = f1_score(y_test, y_preds, average='weighted')
rf_acc = accuracy_score(y_test, y_preds)
rf_mae = mean_absolute_error(y_test, y_preds)

# Logistic Regression
y_preds = rf.predict(x_test)
lr_f1 = f1_score(y_test, y_preds, average='weighted')
lr_acc = accuracy_score(y_test, y_preds)
lr_mae = mean_absolute_error(y_test, y_preds)

In [66]:
print("Random Forest")
print("F1_Weighted", rf_f1)
print("Accuracy", rf_acc)
print("MAE", rf_mae)
print("*"*40)
print("Logistic Regression")
print("F1_Weighted", lr_f1)
print("Accuracy", lr_acc)
print("MAE", lr_mae)

Random Forest
F1_Weighted 0.4650954952288232
Accuracy 0.5738573636564362
MAE 0.7092073305365423
****************************************
Logistic Regression
F1_Weighted 0.4650954952288232
Accuracy 0.5738573636564362
MAE 0.7092073305365423


In [69]:
X_feats.to_csv('features.csv', index=False)