In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize # added

nltk.download('wordnet') # added
nltk.download('omw-1.4') # added
nltk.download('punkt') # added

import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# sklearn has preprocessing 
# can convert strinbg labels into numbers

import contractions

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/meganbull/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/meganbull/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/meganbull/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# ! python3 -m pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz

f_path = 'amazon_reviews_us_Jewelry_v1_00.tsv'

STAR_H = 'star_rating'
REVIEW_H = 'review_body'

cols=[STAR_H, REVIEW_H]

valid_ratings = {'1', '2', '3', '4', '5'}

## Read Data

In [3]:
df = pd.read_csv(f_path, sep='\t', usecols=cols, low_memory=False)
print(f"Shape before dropping NaN vals: {df.shape}")
df.dropna(inplace=True)
print(f"Shape after dropping NaN vals: {df.shape}")
df.head()

Shape before dropping NaN vals: (1767051, 2)
Shape after dropping NaN vals: (1766807, 2)


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [4]:
df[STAR_H].unique()

array(['5', '1', '4', '3', '2'], dtype=object)

 ## We select 20000 reviews randomly from each rating class.



In [5]:
s_size = 20000

grouped = df.groupby(STAR_H)
rat_dfs = [grouped.get_group(rating).sample(n=s_size) for rating in valid_ratings]

sampled = pd.concat(rat_dfs)
# sampled[STAR_H] = sampled[STAR_H].astype("category")
print(sampled.shape)
sampled.head()

(100000, 2)


Unnamed: 0,star_rating,review_body
115945,4,Great shade. I also purchased the long neckla...
1753716,4,"These are dainty, lovely earrings...especially..."
1300065,4,This is a beautifully delicate amethyst rosary...
1016218,4,I loved the two longer chains. The smallest on...
1518214,4,Awesome product would order again as well as t...


# Data Cleaning



- convert the all reviews into the lower case. (**DONE**)
- remove the HTML and URLs from the reviews (**DONE**)
- remove non-alphabetical characters (**DONE**)
- remove extra spaces (**DONE**)
- perform contractions on the reviews, e.g., won’t -> will not (**DONE**)

In [6]:
raw_len_avg = sampled[REVIEW_H].str.len().mean()

print(f'Average character length pre-clean: {raw_len_avg}')

Average character length pre-clean: 190.29024


In [7]:
def gen_clean(text):
    """
    gen text cleanup 
    incl removal: extended ws, html tags, urls
    """
    text = BeautifulSoup(text, "html.parser").text #rm html tags 
    text = re.sub(r'http\S+', r'', text)
    text = contractions.fix(text)

    for c in text: 
        if not c.isalpha():
            text = text.replace(c, ' ')

    text = re.sub(" +", " ", text)
    
    return text.lower()

sampled[REVIEW_H] = sampled[REVIEW_H].apply(gen_clean)
sampled.sort_index(inplace=True)




In [8]:
cl_len_avg = sampled[REVIEW_H].str.len().mean()

print(f'Average character length post-clean: {cl_len_avg}')

Average character length post-clean: 184.19912


# Pre-processing

## remove the stop words 

In [9]:
def rm_stops(text): 
   """
   remove stop words from text 
   """
   stops = set(stopwords.words("english"))
   sans_stops = [tok for tok in word_tokenize(text) if tok not in stops]
   return " ".join(sans_stops).strip()

sampled[REVIEW_H] = sampled[REVIEW_H].apply(rm_stops)


## perform lemmatization  

In [10]:
wnl = WordNetLemmatizer()

def lemmatize(text): 
   lemmas = [wnl.lemmatize(w) for w in word_tokenize(text)]
   return " ".join(lemmas)
   
sampled[REVIEW_H] = sampled[REVIEW_H].apply(lemmatize)


In [11]:
preproc_len_avg = sampled[REVIEW_H].str.len().mean()

print(f'Average character length after preproc: {preproc_len_avg}')

Average character length after preproc: 108.34514


# TF-IDF Feature Extraction

In [21]:
v = TfidfVectorizer()
feat = v.fit_transform(sampled[REVIEW_H]).toarray()

In [22]:
train_data, test_data, train_labels, test_labels = train_test_split(feat, sampled[STAR_H], test_size=0.2)

In [23]:
def print_report(test_labels, test_pred):
   classific_dict = classification_report(test_labels, test_pred, output_dict=True)

   for k,v in classific_dict.items(): 
      if k in valid_ratings:
         print(f"Rating {k}:")
         print(f"\tPrecision: {v['precision']:.3f}")
         print(f"\tRecall: {v['recall']:.3f}")
         print(f"\tF1-score: {v['f1-score']:.3f}")
      elif k == 'macro avg':
         print("Overall Average:")
         print(f"\tPrecision: {v['precision']:.3f}")
         print(f"\tRecall: {v['recall']:.3f}")
         print(f"\tF1-score: {v['f1-score']:.3f}")

# Perceptron

In [24]:
p = Perceptron(random_state=42, max_iter=2000, n_iter_no_change=5) 
# , early_stopping=True, validation_fraction=0.1, n_iter_no_change=5)
# p = Perceptron(random_state=42, eta0=0.1)
p.fit(train_data, train_labels)
p_pred = p.predict(test_data)
print_report(test_labels, p_pred)
print(f"Accuracy: {metrics.accuracy_score(test_labels, p_pred)}")

# SVM

In [None]:
svm = LinearSVC()
svm.fit(train_data, train_labels)
svm_pred = svm.predict(test_data)
print_report(test_labels, svm_pred)
print(f"Accuracy: {metrics.accuracy_score(test_labels,svm_pred)}")

# Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=200, solver='saga')
# lr = LogisticRegression(max_iter=200, solver='sag') 
# lr = LogisticRegression(max_iter=200, solver='lbfgs') # if using need to incr. max_iters
# lr = LogisticRegression(max_iter=200, solver='newton-cg') 

lr.fit(train_data, train_labels)
lr_pred = lr.predict(test_data)
print_report(test_labels, lr_pred)
print(f"Accuracy: {metrics.accuracy_score(test_labels, lr_pred)}")

Rating 1:
	Precision: 0.577
	Recall: 0.626
	F1-score: 0.601
Rating 2:
	Precision: 0.415
	Recall: 0.395
	F1-score: 0.405
Rating 3:
	Precision: 0.417
	Recall: 0.402
	F1-score: 0.409
Rating 4:
	Precision: 0.454
	Recall: 0.424
	F1-score: 0.438
Rating 5:
	Precision: 0.642
	Recall: 0.685
	F1-score: 0.663
Overall Average:
	Precision: 0.501
	Recall: 0.506
	F1-score: 0.503


# Naive Bayes

In [None]:
nb = GaussianNB()
nb.fit(train_data, train_labels)
nb_pred = nb.predict(test_data)
print_report(test_labels, nb_pred)
print(f"Accuracy: {metrics.accuracy_score(test_labels, nb_pred)}")

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.