## Imports and Function Declarations

In [1]:
# python version 3.10.6

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True) 
nltk.download('punkt', quiet=True) 

from textacy.preprocessing import remove, normalize, replace

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron, LogisticRegression

import warnings 
import contractions

warnings.filterwarnings('ignore')

In [2]:
# GLOBALS 

F_PATH = 'amazon_reviews_us_Jewelry_v1_00.tsv'

STAR_H = 'star_rating'
REVIEW_H = 'review_body'

COLS=[STAR_H, REVIEW_H]

VAL_STARS = {'1', '2', '3', '4', '5'}

WNL = WordNetLemmatizer()

In [3]:
def read_data(f_path=F_PATH):
   df = pd.read_csv(f_path, sep='\t', usecols=COLS, low_memory=False)
   df.dropna(inplace=True)
   return df

def get_sample(df, s_size=20000):

   grouped = df.groupby(STAR_H)
   rat_dfs = [grouped.get_group(rating).sample(n=s_size) for rating in VAL_STARS]
   return pd.concat(rat_dfs) 

def gen_clean(text):
   """
   gen text cleanup 
   incl removal: extended ws, html tags, urls
   """
   text = BeautifulSoup(text, "html.parser").text #rm html tags 
   text = replace.urls(text, '')
   text = contractions.fix(text)
   text = remove.punctuation(text)
   text = normalize.whitespace(text)
   
   return text.lower()
   
def rm_stops(text): 
   """
   remove stop words from text 
   """
   stops = set(stopwords.words("english"))
   sans_stops = [tok for tok in word_tokenize(text) if tok not in stops]
   return " ".join(sans_stops).strip()

def lemmatize(text): 

   lemmas = [WNL.lemmatize(w) for w in word_tokenize(text)]
   return " ".join(lemmas).strip()


## Read Data
1. Read from file.
2. Drop NaN vals.
3. Visually verify unique `star_rating` values are valid rating classes to determine if more rows need to be dropped. 
4. Randomly select 20,000 samples frome each valid rating class 
   - This is done by grouping the original DataFrame by `VAL_STARS`, list of sampled DataFrames from each rating class, and finally, concatenating them together. 

In [4]:
df = read_data()
print(df[STAR_H].unique())

sampled = get_sample(df)

['5' '1' '4' '3' '2']


## Data Cleaning
1. Perform general text cleaning including: 
   - Remove html tags via `BeautifulSoup`'s `html.parser`
   - Remove URLS via `textaCy`'s `.replace.urls` functionality.
   - Resolve contractions with `contractions` library which handles a host of contractions, including slang such as 'y'all'.
   - Replace punctuation with `textaCy` which replaces all instances of punctuation. 
   - Normalize whitespace via `textaCy` which replaces all contiguous zero-width spaces as well as strip leading and trailing whitespace. 
   - Convert all text to lowercase. 
2. Print Average character length of reviews pre and post-clean.


In [5]:
raw_len_avg = sampled[REVIEW_H].str.len().mean()

sampled[REVIEW_H] = sampled[REVIEW_H].apply(gen_clean)

cl_len_avg = sampled[REVIEW_H].str.len().mean()

print(f'{raw_len_avg}, {cl_len_avg}')

189.68736, 183.72778


## Pre-processing
1. Remove english stop words using `NLTK`'s set of stop words and performing list comprhension to for a list of remaining words. <br>
   Returns a stripped string of the concatenated words to account for stops appearing at the beginning or end of a review. 
2. Perform lemmatization using `WordNetLemmatizer` on tokens produced by `nltk.tokenize.word_tokenize`
3. Print average character length before and after pre-processing.

In [6]:
sampled[REVIEW_H] = sampled[REVIEW_H].apply(rm_stops)
sampled[REVIEW_H] = sampled[REVIEW_H].apply(lemmatize)

preproc_len_avg = sampled[REVIEW_H].str.len().mean()
print(f'{cl_len_avg}, {preproc_len_avg}')

183.72778, 108.92808


## TF-IDF Feature Extraction
`use_idf` is included as a parameter due to the findings of the `GridSearchCV` algorithm from `sklearn`. This parameter disables reweighting due to IDF. My original implementation had this parameter set to `True`, however, I noticed an increase in overall average precision across all of the algorithms with the use of `use_idf = False`, necessitating its' inclusion in my performance report due to the competitive nature of grading.

In [7]:
v = TfidfVectorizer(use_idf=False)
feat = v.fit_transform(sampled[REVIEW_H])
X_train, X_test, train_labels, test_labels = train_test_split(feat, sampled[STAR_H], test_size=0.2, random_state=42)

## Perceptron
Hyperparameters were tuned using `GridSearchCV` from the `sklearn` package. Since the code for tuning is not relevant to the results it has not been included.
I found that on average, the perceptron algorithm performed less than 20 iterations and `GridSearchCV` helped me determine the best boundary for the `max_iter` parameter. `random_state` is included for reproducability of results and `class_weight` being set to `balanced` allows the algorithm to adjust for unbalanced datasets. My inclination to include this last parameter stemmed from the conclusion that the train/test split made after TF-IDF would not be proportionally represetative of the rating classes. 

In [8]:
p = Perceptron(random_state=42, class_weight='balanced', max_iter=20, n_iter_no_change=3)
p.fit(X_train, train_labels)
p_pred = p.predict(X_test)
print(classification_report(test_labels, p_pred))

              precision    recall  f1-score   support

           1       0.49      0.48      0.48      4015
           2       0.33      0.29      0.31      4018
           3       0.31      0.22      0.26      3987
           4       0.33      0.49      0.40      3978
           5       0.52      0.51      0.52      4002

    accuracy                           0.40     20000
   macro avg       0.40      0.40      0.39     20000
weighted avg       0.40      0.40      0.39     20000



## SVM
As with the perceptron algorithm, I tuned SVM using `GridSearchCV`. According to the `sklearn` documentation, `dual` should prefer to be `False` when the number of samples is greater than the number of features, which is true in this case. As mentioned above, `random_state` is included for reproducability and is set with 42, as per the recommendation of the `sklearn` glossary. The grid search tested a number of `max_iter` values but ultimately found this led to the best performance without risk of overfitting. `penalty` is set to `l1` since our features are sparse. 

In [9]:
svm = LinearSVC(penalty='l1', dual=False, random_state=42, max_iter=300)
svm.fit(X_train, train_labels)
svm_pred = svm.predict(X_test)
print(classification_report(test_labels, svm_pred))

              precision    recall  f1-score   support

           1       0.56      0.68      0.61      4015
           2       0.40      0.33      0.36      4018
           3       0.42      0.35      0.38      3987
           4       0.46      0.43      0.44      3978
           5       0.62      0.75      0.68      4002

    accuracy                           0.51     20000
   macro avg       0.49      0.51      0.49     20000
weighted avg       0.49      0.51      0.50     20000



## Logistic Regression
Again, hyperparameters were tuned with the use of `GridSearchCV`. Grid search was especially helpful in this case due to the variety of solvers that can be used in this algorithm. Out of the four mentioned in the documentation to be most suitable for multiclass problems, grid search found `sag` to be optimal. However, it should be noted the difference in performance between solvers was not significant. `max_iter` is set to 400 instead of the default 100 due to warnings of convergence failure on lower max iterations. `class_weight` set to `balanced` also appeared to improve this algorithm and was included with the grid search results.  

In [10]:
lr = LogisticRegression(random_state=42, max_iter=400, class_weight='balanced', solver='sag')
lr.fit(X_train, train_labels)
lr_pred = lr.predict(X_test)
print(classification_report(test_labels, lr_pred))

              precision    recall  f1-score   support

           1       0.59      0.64      0.61      4015
           2       0.42      0.38      0.40      4018
           3       0.42      0.40      0.41      3987
           4       0.47      0.45      0.46      3978
           5       0.65      0.71      0.68      4002

    accuracy                           0.52     20000
   macro avg       0.51      0.52      0.51     20000
weighted avg       0.51      0.52      0.51     20000



## Naive Bayes
Although not needed due to the lack of variance in parameters as compared to previous algorithms, `GridSearchCV` was also used to tune hyperparameters. Setting `alpha=1` is representative of an added smoothing parameter while `fit_prior=False` disallows the algorithm from learning class prior probabilites. 

In [11]:
nb = MultinomialNB(alpha=1, fit_prior=False)
nb.fit(X_train, train_labels)
nb_pred = nb.predict(X_test)
print(classification_report(test_labels, nb_pred))

              precision    recall  f1-score   support

           1       0.61      0.62      0.61      4015
           2       0.42      0.39      0.41      4018
           3       0.42      0.43      0.42      3987
           4       0.45      0.44      0.45      3978
           5       0.65      0.68      0.66      4002

    accuracy                           0.51     20000
   macro avg       0.51      0.51      0.51     20000
weighted avg       0.51      0.51      0.51     20000

