In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup

import contractions

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/meganbull/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/meganbull/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/meganbull/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# ! python3 -m pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz

f_path = 'amazon_reviews_us_Jewelry_v1_00.tsv'

STAR_H = 'star_rating'
REVIEW_H = 'review_body'

cols=[STAR_H, REVIEW_H]

valid_ratings = {'1', '2', '3', '4', '5'}

## Read Data

In [3]:
df = pd.read_csv(f_path, sep='\t', usecols=cols, low_memory=False)
print(f"Shape before dropping NaN vals: {df.shape}")
df.dropna(inplace=True)
print(f"Shape after dropping NaN vals: {df.shape}")
df.head()

Shape before dropping NaN vals: (1767051, 2)
Shape after dropping NaN vals: (1766807, 2)


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [4]:
df[STAR_H].unique()

array(['5', '1', '4', '3', '2'], dtype=object)

 ## We select 20000 reviews randomly from each rating class.



In [5]:
s_size = 20000

grouped = df.groupby(STAR_H)
rat_dfs = [grouped.get_group(rating).sample(n=s_size) for rating in valid_ratings]

sampled = pd.concat(rat_dfs)

print(sampled.shape)
sampled.head()

(100000, 2)


Unnamed: 0,star_rating,review_body
560904,5,These plugs are gorgeous and fit as described....
372508,5,Received the product before the estimated arri...
960622,5,i loved it. i gave it to my mother and she rea...
708886,5,Love It
1149496,5,"Love, love, love them! They look very classy ..."


# Data Cleaning



- convert the all reviews into the lower case. (**DONE**)
- remove the HTML and URLs from the reviews (**DONE**)
- remove non-alphabetical characters (**DONE**)
- remove extra spaces (**DONE**)
- perform contractions on the reviews, e.g., won’t -> will not (**DONE**)

In [6]:
raw_len_avg = sampled[REVIEW_H].str.len().mean()

print(f'Average character length pre-clean: {raw_len_avg}')

Average character length pre-clean: 190.39115


In [7]:
def gen_clean(text):
    """
    gen text cleanup 
    incl removal: extended ws, html tags, urls
    """
    text = BeautifulSoup(text, "html.parser").text #rm html tags 
    text = re.sub(r'http\S+', r'', text)
    text = contractions.fix(text)

    for c in text: 
        if not c.isalpha():
            text = text.replace(c, ' ')

    text = re.sub(" +", " ", text)
    
    return text.lower()

sampled[REVIEW_H] = sampled[REVIEW_H].apply(gen_clean)
sampled.sort_index(inplace=True)




In [8]:
cl_len_avg = sampled[REVIEW_H].str.len().mean()

print(f'Average character length post-clean: {cl_len_avg}')

Average character length post-clean: 184.29363


# Pre-processing

## remove the stop words 

In [9]:
def rm_stops(text): 
   """
   remove stop words from text 
   """
   stops = set(stopwords.words("english"))
   sans_stops = [tok for tok in word_tokenize(text) if tok not in stops]
   return " ".join(sans_stops).strip()

sampled[REVIEW_H] = sampled[REVIEW_H].apply(rm_stops)


## perform lemmatization  

In [10]:
wnl = WordNetLemmatizer()

def lemmatize(text): 
   lemmas = [wnl.lemmatize(w) for w in word_tokenize(text)]
   return " ".join(lemmas)
   
sampled[REVIEW_H] = sampled[REVIEW_H].apply(lemmatize)


In [11]:
preproc_len_avg = sampled[REVIEW_H].str.len().mean()

print(f'Average character length after preproc: {preproc_len_avg}')

Average character length after preproc: 108.40071


# TF-IDF Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Perceptron

# SVM

# Logistic Regression

# Naive Bayes