In [208]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
import string
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xmh91\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz

## Read Data

In [275]:
# Load the raw Amazon Kitchen reviews (tab-separated, gzip-compressed).
# `error_bad_lines` / `warn_bad_lines` were deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the replacement and likewise drops malformed rows silently.
df = pd.read_csv("amazon_reviews_us_Kitchen_v1_00.tsv.gz", sep='\t', on_bad_lines='skip')

## Keep Reviews and Ratings

In [276]:
# Keep only the rating and the review text, then discard rows that have no review text.
review_columns = ['star_rating', 'review_body']
df2 = df.loc[:, review_columns].dropna(subset=['review_body'])

# Labelling Reviews:
## Reviews rated 4 or 5 are labelled 1 (positive) and reviews rated 1 or 2 are labelled 0 (negative). Reviews rated 3 are discarded.

In [278]:
# Drop rows that cannot be labelled: neutral 3-star reviews and rows whose rating is
# missing. NaN compares False against every threshold, so a missing rating would
# otherwise fall through to the default label and be counted as positive.
unlabelled = df2['star_rating'].isna() | (df2['star_rating'] == 3)
# Build a new frame instead of dropping in place so the cell can be re-run safely.
df2 = df2.loc[~unlabelled].copy()

# Label samples according to rating: 4-5 stars -> 1 (positive), 1-2 stars -> 0 (negative)
df2['label'] = (df2['star_rating'] >= 4).astype(int)

#df2['label'].value_counts().plot.bar()

 ## We select 200000 reviews randomly with 100,000 positive and 100,000 negative reviews.



In [280]:
# Balanced sample: 100,000 positive + 100,000 negative reviews (200,000 in total).
# The earlier value of 1000000 was ten times the intended size, and `sample` raises
# ValueError when asked for more rows than a class contains.
N_PER_CLASS = 100000
SEED = 42  # fixed seed so the sample and the split are reproducible across runs

# select reviews
df_pos = df2.loc[df2['label'] == 1].sample(N_PER_CLASS, random_state=SEED)
df_neg = df2.loc[df2['label'] == 0].sample(N_PER_CLASS, random_state=SEED)
# select pos and neg train and test data (80/20 within each class keeps both splits balanced)
train_pos, test_pos = train_test_split(df_pos, test_size=0.2, random_state=SEED)
train_neg, test_neg = train_test_split(df_neg, test_size=0.2, random_state=SEED)
# combine pos and neg data; DataFrame.append was removed in pandas 2.0, pd.concat
# is the replacement and keeps the original index labels just as append did
train_data = pd.concat([train_pos, train_neg])
test_data = pd.concat([test_pos, test_neg])

# Data Cleaning

## Convert all reviews to lower case.

In [287]:
# Lower-case the review text of both splits.
for split in (train_data, test_data):
    split['review_body'] = split['review_body'].str.lower()
train_data.head(50)

Unnamed: 0,star_rating,review_body,label
3476398,5.0,bought this one to replace the exact model. w...,1
2974696,5.0,"i love these grips. i use it when grilling, se...",1
1233331,5.0,perfect for my huge ham which was why i purcha...,1
2756099,5.0,mesh is fine - no problems with pulp. used to ...,1
147917,5.0,great!,1
1077298,5.0,bright color. typical flask,1
1549988,5.0,arrived just in time for the holidays. this wa...,1
1116929,5.0,"it's a nice set, but i put my sash in the drye...",1
383814,5.0,worked perfect for our magic bullet,1
3410975,5.0,and i'm not sure that i can do this knife just...,1


## remove the HTML and URLs from the reviews

In [289]:
def remove_html_url(s):
    """Return the visible text of `s` with HTML markup and URLs removed.

    `s` is parsed as HTML; <style> and <script> elements are dropped together
    with their contents, the remaining text fragments are stripped and joined
    with single spaces, and every run of non-space characters starting with
    "http" is deleted.
    """
    parsed = BeautifulSoup(s, "html.parser")

    # decompose() removes the element and everything inside it from the tree
    for element in parsed.find_all(['style', 'script']):
        element.decompose()

    visible_text = ' '.join(parsed.stripped_strings)
    return re.sub(r"http\S+", "", visible_text)

# Remove HTML markups and URL in text format
train_data['review_body'] = [ remove_html_url(review) for review in train_data['review_body'] ]
test_data['review_body'] = [ remove_html_url(review) for review in test_data['review_body'] ]

print(train_data.loc[3901394, ['review_body']].review_body)



In [297]:
# To be removed: temporary spot-check of a single review.
# The index label comes from one particular random sample, so only print it when
# that row is actually present; otherwise the lookup raises KeyError.
if 1095000 in train_data.index:
    print(train_data.loc[1095000, 'review_body'])

bought for a second home my husband's words get rid of it very hard to fill with water leaks all over when pouring coffee every single time awful does not work as intended 


## remove non-alphabetical characters. -> perform contractions on the reviews.

In [291]:
def remove_non_alphabetical(s):
    """Blank out every character that is not an ASCII letter or an apostrophe.

    Each disallowed character is replaced by exactly one space, so the result
    has the same length as `s`. Both the straight (') and the typographic (’)
    apostrophe are kept.
    """
    disallowed = re.compile(r"[^a-zA-Z'’]")
    return disallowed.sub(' ', s)

# Replace non-alphabetical characters in every review of both splits.
train_data['review_body'] = train_data['review_body'].map(remove_non_alphabetical)
test_data['review_body'] = test_data['review_body'].map(remove_non_alphabetical)

## Remove the extra spaces between the words. -> remove non-alphabetical characters

In [319]:
# Collapse every run of whitespace to a single space and trim both ends.
# Without the trim, leading/trailing spaces survive and turn into empty tokens
# when the text is later split on " ".
train_data['review_body'] = [ re.sub(r'\s+', ' ', review).strip() for review in train_data['review_body'] ]
test_data['review_body'] = [ re.sub(r'\s+', ' ', review).strip() for review in test_data['review_body'] ]
print(train_data.head(50))

         star_rating                                        review_body  label
3476398          5.0  bought one replace exact model wore old coffee...      1
2974696          5.0  love grips use grilling serving broiling etc c...      1
1233331          5.0  perfect huge ham purchased cleaned nicely matc...      1
2756099          5.0  mesh fine problems pulp used make green juice ...      1
147917           5.0                                             great       1
1077298          5.0                         bright color typical flask      1
1549988          5.0  arrived time holidays great purchase use twice...      1
1116929          5.0                 nice set put sash dryer get lines       1
383814           5.0                        worked perfect magic bullet      1
3410975          5.0  sure knife justice wonderfully primitive look ...      1
1714893          5.0      great grinder processes fast bagger bag label      1
4218325          5.0  get compliments constantly guy

## perform contractions on the reviews. ->  Remove the extra spaces between the words.

In [299]:
 def contractionfunction(s):

    contractions = {
        "a'ight": "alright",
        "ain't": "am not",
        "amn't": "am not",
        "arencha": "are not you",
        "aren't": "are not",
        "‘bout": "about",
        "cannot": "can not",
        "can't": "cannot",
        "cap’n": "captain",
        "cause": "because",
        "’cept": "except",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "dammit": "damn it",
        "daren't": "dare not",
        "daresn't": "dare not",
        "dasn't": "dare not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "dunno": "do not know",
        "d'ye": "did you",
        "e'en": "even",
        "e'er": "ever",
        "em": "them",
        "everybody's": "everybody is",
        "everyone's": "everyone is",
        "fo’c’sle": "forecastle",
        "’gainst": "against",
        "g'day": "good day",
        "gimme": "give me",
        "giv'n": "given",
        "gonna": "going to",
        "gon't": "go not",
        "gotta": "got to",
        "hadn't": "had not",
        "had've": "had have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "helluva": "hell of a",
        "he's": "he is",
        "here's": "here is",
        "how'd": "how did",
        "howdy": "how do you do",
        "how'll": "how will",
        "how're": "how are",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'm": "i am",
        "imma": "i am about to",
        "i'm'o": "i am going to",
        "innit": "is it not",
        "ion": "i do not",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "iunno": "i do not know",
        "kinda": "kind of",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "may've": "may have",
        "methinks": "i think",
        "mightn't": "might not",
        "might've": "might have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "must've": "must have",
        "‘neath": "beneath",
        "needn't": "need not",
        "nal": "and all",
        "ne'er": "never",
        "o'clock": "of the clock",
        "o'er": "over",
        "ol'": "old",
        "oughtn't": "ought not",
        "‘round": "around",
        "s": "is",
        "shalln't": "shall not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "somebody's": "somebody is",
        "someone's": "someone is",
        "something's": "something is",
        "so're": "so are",
        "so’s": "so has",
        "so’ve": "so have",
        "that'll": "that will",
        "that're": "that are",
        "that's": "that is",
        "that'd": "that would",
        "there'd": "there would",
        "there'll": "there will",
        "there're": "there are",
        "there's": "there is",
        "these're": "these are",
        "these've": "these have",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "this's": "this is",
        "those're": "those are",
        "those've": "those have",
        "thout": "without",
        "’til": "until",
        "tis": "it is",
        "to've": "to have",
        "twas": "it was",
        "tween": "between",
        "twere": "it were",
        "wanna": "want to",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "whatcha": "what are you",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are/what were",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "where'd": "where did",
        "where'll": "where will",
        "where're": "where are",
        "where's": "where is",
        "where've": "where have",
        "which'd": "which had",
        "which'll": "which will",
        "which're": "which are",
        "which's": "which is",
        "which've": "which have",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "willn't": "will not",
        "won't": "will not",
        "wonnot": "will not",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd've": "you all would have",
        "y'all'd'n't've": "you all would not have",
        "y'all're": "you all are",
        "y'all'ren't": "you all are not",
        "y'at": "you at",
        "yes’m": "yes madam",
        "yessir": "yes sir",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "'s": ""
    }
    
    
    for word in s.split(" "):
        if word in contractions:
            s = s.replace(word, contractions[word])
    return s

# Expand contractions in every review of both splits.
train_data['review_body'] = train_data['review_body'].map(contractionfunction)
test_data['review_body'] = test_data['review_body'].map(contractionfunction)

# Pre-processing

## remove the stop words 

In [320]:
from nltk.corpus import stopwords

# The stopwords corpus is a separate NLTK download from WordNet. Fetch it here
# (a no-op when it is already installed) so the lookup below does not raise
# LookupError on a machine that has never downloaded it.
nltk.download('stopwords', quiet=True)

# A set gives constant-time membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

def remove_stopwords(s):
    """Return `s` without the space-separated tokens that appear in `stop_words`.

    The text is split on single spaces and the surviving tokens are re-joined
    with single spaces, in their original order.
    """
    kept = []
    for token in s.split(" "):
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Strip stop words from every review of both splits.
train_data['review_body'] = train_data['review_body'].map(remove_stopwords)
test_data['review_body'] = test_data['review_body'].map(remove_stopwords)

## perform lemmatization  

In [352]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged on its own with `nltk.pos_tag` (which needs an NLTK
    tagger model to be installed) and the first letter of the resulting tag
    selects the WordNet category: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other tag falls back to noun.
    """
    # pos_tag returns [(word, tag)]; only the tag's first letter is needed
    first_letter = nltk.pos_tag([word])[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to noun
    return wordnet.NOUN



# Smoke-test the lemmatizer on one explicit token. `word` was never defined at
# notebook level (it only existed as leftover kernel state), so this cell raised
# NameError on a fresh kernel. The sample is the token shown in the recorded output.
sample_word = "husband's"
print(lemmatizer.lemmatize(sample_word, get_wordnet_pos(sample_word)))

husband's


# TF-IDF Feature Extraction

# Perceptron

# SVM

# Logistic Regression

# Naive Bayes