## Imports and Function Declarations

In [79]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import nltk
from nltk import PorterStemmer # added
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize # added
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer # added

nltk.download('wordnet') # added
nltk.download('omw-1.4') # added
nltk.download('punkt') # added

import textacy
from textacy import preprocessing 
from textacy.preprocessing import remove, normalize, replace

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline

# sklearn preproc can convert string labels into numbers

import warnings 
import contractions

warnings.filterwarnings('ignore')

# import pkg_resources
# from symspellpy import SymSpell

# def get_speller(): 
    
#     speller = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
#     dict_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
#     bidict_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

#     speller.load_dictionary(dict_path, term_index=0, count_index=1)
#     speller.load_bigram_dictionary(bidict_path, term_index=0, count_index=2)
    
#     return speller

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/meganbull/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/meganbull/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/meganbull/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# ! python3 -m pip install bs4 # in case you don't have it installed

# GLOBALS 

F_PATH = 'amazon_reviews_us_Jewelry_v1_00.tsv'

STAR_H = 'star_rating'
REVIEW_H = 'review_body'

COLS=[STAR_H, REVIEW_H]

VAL_STARS = {'1', '2', '3', '4', '5'}

WNL = WordNetLemmatizer()
SW = SnowballStemmer(language='english')


In [3]:
def read_data(f_path=F_PATH):
   df = pd.read_csv(f_path, sep='\t', usecols=COLS, low_memory=False)
   # print(f"Shape before dropping NaN vals: {df.shape}")
   df.dropna(inplace=True)
   # print(f"Shape after dropping NaN vals: {df.shape}")
   return df

def get_sample(df, s_size=20000):

   grouped = df.groupby(STAR_H)
   rat_dfs = [grouped.get_group(rating).sample(n=s_size) for rating in VAL_STARS]
   # sampled[STAR_H] = sampled[STAR_H].astype("category")
   return pd.concat(rat_dfs) 

def gen_clean(text):
   """
   gen text cleanup 
   incl removal: extended ws, html tags, urls
   """
   text = BeautifulSoup(text, "html.parser").text #rm html tags 
   text = replace.urls(text, '')
   text = contractions.fix(text)
   text = remove.punctuation(text)
   text = normalize.whitespace(text)
   
   return text.lower()
   
def rm_stops(text): 
   """
   remove stop words from text 
   """
   stops = set(stopwords.words("english"))
   sans_stops = [tok for tok in word_tokenize(text) if tok not in stops]
   return " ".join(sans_stops).strip()

def lemmatize(text): 

   lemmas = [WNL.lemmatize(w) for w in word_tokenize(text)]
   return " ".join(lemmas).strip()

def print_report(test_labels, test_pred):
   classific_dict = classification_report(test_labels, test_pred, output_dict=True)

   for k,v in classific_dict.items(): 
      if k in VAL_STARS:
         print(f"Rating {k}:")
         print(f"\tPrecision: {v['precision']:.3f}")
         print(f"\tRecall: {v['recall']:.3f}")
         print(f"\tF1-score: {v['f1-score']:.3f}")
      elif k == 'macro avg':
         print("Overall Average:")
         print(f"\tPrecision: {v['precision']:.3f}")
         print(f"\tRecall: {v['recall']:.3f}")
         print(f"\tF1-score: {v['f1-score']:.3f}")


## Read Data
1. Read from file 
2. Drop NaN vals
3. Verify unique `star_rating` values are valid rating classes
4. Randomly select 20,000 samples frome each valid rating class 

In [4]:
df = read_data()
# print(df.head())
print(df[STAR_H].unique())

['5' '1' '4' '3' '2']


In [5]:
sampled = get_sample(df)
print(sampled.shape)
# print(sampled.head())
sampled.to_pickle('samp.pkl')

(100000, 2)


## Data Cleaning
1. Print Average character length of reviews pre-clean
2. Perform general text cleaning incl: 
   - Remove html tags
   - remove URLS
   - resolve contractions 
   - normalize whitespace 
   - all text to lowercase
3. Print Average character length of reviews post-clean/pre-preprocessing


In [6]:
raw_len_avg = sampled[REVIEW_H].str.len().mean()
print(f'Average character length pre-clean: {raw_len_avg}')

sampled[REVIEW_H] = sampled[REVIEW_H].apply(gen_clean)
sampled.sort_index(inplace=True)

cl_len_avg = sampled[REVIEW_H].str.len().mean()
print(f'Average character length post-clean: {cl_len_avg}')

Average character length pre-clean: 189.66679
Average character length post-clean: 183.69641


## Pre-processing
1. Remove english stop words 
2. Perform lemmatization 
3. Print average character length after pre-processing

In [7]:
sampled[REVIEW_H] = sampled[REVIEW_H].apply(rm_stops)
sampled[REVIEW_H] = sampled[REVIEW_H].apply(lemmatize)

preproc_len_avg = sampled[REVIEW_H].str.len().mean()
print(f'Average character length after preproc: {preproc_len_avg}')

Average character length after preproc: 108.95463


## TF-IDF Feature Extraction
1. Extract features from reviews 
2. Split sampled DF 80% training 20% testing 

In [80]:
v = TfidfVectorizer()
v.fit_transform(sampled[REVIEW_H])
feat = v.fit_transform(sampled[REVIEW_H]) # .toarray()
train_data, test_data, train_labels, test_labels = train_test_split(feat, sampled[STAR_H], test_size=0.2)

## Perceptron
1. Fit model 
2. Predict labels for `test_data`
3. Print Precision, Recall, F1-Score for each rating class 
4. Print overall averages of above. 

In [82]:
pipe = Pipeline([
           ('v', TfidfVectorizer()),
           ('clf', Perceptron())
])
params = {
    'v__use_idf': (True, False),
    'v__norm': ('l1', 'l2', None),
    
    # 'clf__penalty': ('l1', 'l2', 'elasticnet', None),
    'clf__random_state': (0, 42), 
    'clf__max_iter': (10, 20, 50, 80), 
    'clf__early_stopping':(True, False), # def: False
    'clf__n_iter_no_change':(2, 3, 4, 5, 6, 7), # def: 5
    'clf__class_weight': ('balanced', None)
}
grid_search = GridSearchCV(pipe, params, n_jobs=-1, verbose=1)
grid_search.fit(sampled[REVIEW_H], sampled[STAR_H])

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits




KeyboardInterrupt: 

In [69]:
p = Perceptron(random_state=42, early_stopping=True, validation_fraction=0.1, n_iter_no_change=5) # set to defaults: n_iter, vf
# balanced class weight with above params decr performance 
p.fit(train_data, train_labels)
p_pred = p.predict(test_data)
print_report(test_labels, p_pred)
# print(f"Accuracy: {metrics.accuracy_score(test_labels, p_pred)}")
p.n_iter_

Accuracy: 0.42155


11

# SVM

In [70]:
svm = LinearSVC()

parameters = {
   'penalty':(),
   'C':[1, 10]
}
clf = GridSearchCV(svm, parameters)

svm.fit(train_data, train_labels)
# sorted(clf.cv_results_.keys())

svm_pred = svm.predict(test_data)
print_report(test_labels, svm_pred)
# print(f"Accuracy: {metrics.accuracy_score(test_labels,svm_pred)}")

Accuracy: 0.48925


In [71]:
svm = LinearSVC(random_state=42, class_weight='balanced')
svm.fit(train_data, train_labels)
svm_pred = svm.predict(test_data)
print_report(test_labels, svm_pred)
# print(f"Accuracy: {metrics.accuracy_score(test_labels,svm_pred)}")

Accuracy: 0.4893


# Logistic Regression

In [67]:
# lr = LogisticRegression(max_iter=200, solver='saga')
# lr = LogisticRegression(max_iter=200, solver='sag') 
lr = LogisticRegression(random_state=42, max_iter=200, class_weight='balanced', solver='lbfgs') # if using need to incr. max_iters
# lr = LogisticRegression(max_iter=200, solver='newton-cg') 

lr.fit(train_data, train_labels)
lr_pred = lr.predict(test_data)
print_report(test_labels, lr_pred)
# print(f"Accuracy: {metrics.accuracy_score(test_labels, lr_pred)}")
lr.n_iter_

Accuracy: 0.51405


array([300], dtype=int32)

# Naive Bayes

In [73]:
nb = MultinomialNB()
nb.fit(train_data, train_labels)
nb_pred = nb.predict(test_data)
print_report(test_labels, nb_pred)
# print(f"Accuracy: {metrics.accuracy_score(test_labels, nb_pred)}")

Accuracy: 0.4991
