# Sentiment Analysis - Amazon Reviews

## Binary Sentiment Analysis
## Data Cleaning -> Data Preprocessing -> TF-IDF -> ML Algorithms (Perceptron, SVM, Logistic Regression, Naive Bayes)

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet', quiet=True)
import re
from bs4 import BeautifulSoup

In [2]:
#! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz

## Read Data

In [3]:
 # Load the raw reviews TSV, silently skipping malformed rows.
 # `on_bad_lines="skip"` replaces `error_bad_lines=False, warn_bad_lines=False`,
 # which were deprecated in pandas 1.3 and removed in pandas 2.0.
 data=pd.read_csv('amazon_reviews_us_Kitchen_v1_00.tsv', sep="\t", on_bad_lines="skip")

## Keep Reviews and Ratings

In [4]:
# Keep only the review text and its star rating.
df = data.loc[:, ["review_body", "star_rating"]]

# Labelling Reviews:

In [5]:
# Drop rows with a missing review or rating, then count reviews per star rating.
df = df.dropna(axis=0, how="any")
df_grouped = df.groupby(by='star_rating')
df_grouped.count()

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1.0,426870
2.0,241939
3.0,349539
4.0,731701
5.0,3124595


In [39]:
#statistics of the 3 classes
# Number of rows per star rating, indexed by the rating label. The previous
# `.get_group(r).count()[1]` selected a column by integer position on a
# string-labelled Series, which recent pandas deprecates. `df` has no missing
# values (dropna above), so the per-column count equals the group size.
rating_sizes = df_grouped.size()
count_3 = rating_sizes.loc[3.0]
count_less = rating_sizes.loc[1.0] + rating_sizes.loc[2.0]
count_more = rating_sizes.loc[4.0] + rating_sizes.loc[5.0]
# Tuple order: (more than 3, less than 3, exactly 3).
counts=count_more, count_less, count_3
print("Count of reviews having rating less than 3: ",count_less)
print("Count of reviews having rating  3: ",count_3)
print("Count of reviews having rating more than 3: ",count_more)

Count of reviews having rating less than 3:  668809
Count of reviews having rating  3:  349539
Count of reviews having rating more than 3:  3856296


## Reviews rated 4 or 5 are labelled 1 (positive) and reviews rated 1 or 2 are labelled 0 (negative). Reviews rated 3 are discarded.

In [7]:
# Drop the 3-star reviews first, then take a single explicit copy. This avoids
# duplicating the full frame before filtering and guarantees the label
# assignment below writes to an independent frame, not to a slice of `df`.
df_labeled=df[df["star_rating"]!=3.0].copy()
# Binary label: 1 for ratings above 3, 0 for the remaining ratings.
df_labeled["star_rating"]=(df_labeled["star_rating"]>3).astype("int64")

In [9]:
# Show five random labelled reviews as a sanity check.
df_labeled.sample(n=5)

Unnamed: 0,review_body,star_rating
1252980,Awesome,1
2370796,I use this everytime I cook.... handy tool.,1
2813246,2 1/2 weeks into juicing as a lifestyle change...,0
1672884,Very happy with Kitchen Aid Convection Oven. K...,1
4076561,This can opener will make you curse the day yo...,0


In [10]:
# Number of reviews in each of the two classes that are kept.
label_groups = df_labeled.groupby('star_rating')
label_groups.count()

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
0,668809
1,3856296


## Handling Class Imbalance 
### We randomly select 200,000 reviews: 100,000 positive and 100,000 negative.

In [11]:
# Balance the classes by drawing the same number of reviews from each label.
# A fixed random_state makes the subset, and therefore every downstream
# statistic and metric, reproducible across kernel restarts. The draws were
# previously unseeded, so recorded outputs will differ after a re-run.
df_0=df_labeled[df_labeled["star_rating"]==0].sample(n=100000, replace=False, random_state=42)
df_1=df_labeled[df_labeled["star_rating"]==1].sample(n=100000, replace=False, random_state=42)
# Negative reviews first, then positive; original row indices are kept.
df_subset=pd.concat([df_0,df_1])

 ## average character length before cleaning



In [12]:
# Mean number of characters per review in the balanced subset, before cleaning.
raw_lengths = df_subset["review_body"].str.len()
char_len_before = sum(raw_lengths) / len(df_subset)
print(char_len_before)

322.26184


 ## sample reviews



In [14]:
# Show five random reviews from the balanced subset.
df_subset.sample(n=5)

Unnamed: 0,review_body,star_rating
3710130,"Basically, it is a nice unit and works well as...",0
669251,Returning because the &#34;bagel&#34; function...,0
4788646,Coffee presses work pretty much the same way. ...,1
3656991,I just purchased this pan after having impress...,1
3489238,"Pretty chrome, spins easily, holds all the k c...",1


# Data Cleaning

## Convert all reviews to lower case.

In [15]:
# Lower-case every review and write the result back to the same column.
lowercased_reviews = df_subset['review_body'].str.lower()
df_subset['review_body'] = lowercased_reviews

## remove the HTML and URLs from the reviews

In [16]:
def remove_html_tags(text):
    """Strip HTML tags and URL-bearing tokens from a review.

    ``text`` is coerced to ``str`` first. Every ``<...>`` span is removed
    (non-greedy, so each tag is matched on its own), then every
    whitespace-delimited token containing ``http`` followed by at least one
    more non-space character is removed.

    Returns the cleaned string; surrounding whitespace is left untouched.
    """
    without_tags = re.sub('<.*?>', '', str(text))
    without_urls = re.sub(r"\S*http\S+", "", without_tags)
    return without_urls

# Replace each review with its tag-free, URL-free version.
reviews_without_markup = df_subset['review_body'].apply(remove_html_tags)
df_subset['review_body'] = reviews_without_markup

## Expand contractions in the reviews.

In [17]:
import contractions
def contractionfunction(s):
    """Return ``s`` after passing it through ``contractions.fix``.

    Thin wrapper so the call can be handed to ``Series.apply``; the expansion
    itself is done entirely by the third-party ``contractions`` package.
    """
    return contractions.fix(s)

# Run the contraction fixer over every review.
expanded_reviews = df_subset['review_body'].apply(contractionfunction)
df_subset['review_body'] = expanded_reviews

## remove non-alphabetical characters

In [18]:
def remove_non_alpha(text):
    """Replace every run of non-letter characters with a single space.

    ``text`` is coerced to ``str`` first. Only ASCII letters (a-z, A-Z) are
    kept; digits, punctuation, whitespace and non-ASCII characters are each
    collapsed, run by run, into one space. Leading and trailing spaces
    produced this way are not trimmed here.
    """
    return re.sub('[^a-zA-Z]+', ' ', str(text))

# Keep letters only in every review.
letters_only_reviews = df_subset['review_body'].apply(remove_non_alpha)
df_subset['review_body'] = letters_only_reviews

## Remove the extra spaces between the words

In [19]:
def remove_extra_space(text):
    """Trim surrounding whitespace and collapse runs of spaces to one space.

    ``text`` is coerced to ``str`` *before* stripping, matching the other
    cleaning helpers in this notebook. Previously ``text.strip()`` ran first,
    so any non-string value raised ``AttributeError`` instead of being
    cleaned.

    Only the space character is collapsed; interior tabs and newlines are
    left as they are.
    """
    return re.sub(' +', ' ', str(text).strip())

# Normalise spacing in every review.
single_spaced_reviews = df_subset['review_body'].apply(remove_extra_space)
df_subset['review_body'] = single_spaced_reviews

 ## average character length after cleaning



In [20]:
# Mean number of characters per review once the cleaning steps have run.
cleaned_lengths = df_subset["review_body"].str.len()
char_len_after = sum(cleaned_lengths.tolist()) / len(df_subset)
print(char_len_after)

308.18849


# Pre-processing

## Load the English stop-word list (the stop words are removed in the lemmatization step below)

In [21]:
# Download the NLTK data needed for preprocessing: the stop-word corpus and
# the tokenizer models used by nltk.word_tokenize. Newer NLTK releases load
# the tokenizer from 'punkt_tab' rather than 'punkt', so both are requested
# to keep the notebook runnable across NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)
from nltk.corpus import stopwords
# A set gives constant-time membership checks during stop-word filtering.
stopwords_set = set(stopwords.words("english"))


## Lemmatize the reviews and remove the stop words

In [22]:
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance, reused for every token.
wnl=WordNetLemmatizer()


def lemmatize_remove_stopwords(text):
    """Tokenize ``text``, drop stop words, lemmatize the rest, and re-join.

    Tokens come from ``nltk.word_tokenize``. A token is discarded when it is
    in ``stopwords_set``; every remaining token is passed through
    ``wnl.lemmatize`` with its default arguments. The surviving lemmas are
    joined with single spaces and returned as one string.
    """
    kept_lemmas = []
    for token in nltk.word_tokenize(text):
        if token in stopwords_set:
            continue
        kept_lemmas.append(wnl.lemmatize(token))
    return ' '.join(kept_lemmas)

# Preprocess every review and write the result back to the same column.
preprocessed_reviews = df_subset['review_body'].apply(lemmatize_remove_stopwords)
df_subset['review_body'] = preprocessed_reviews

 ## average character length after preprocessing



In [23]:
# Mean number of characters per review after stop-word removal and lemmatization.
prep_lengths = df_subset["review_body"].str.len()
char_len_after_prep = sum(prep_lengths.tolist()) / len(df_subset)
# (before, after) pairs: one for the cleaning stage, one for preprocessing.
char_len_clean = (char_len_before, char_len_after)
char_len_prep = (char_len_after, char_len_after_prep)
print(char_len_after_prep)

188.97449


 ## sample review



In [24]:
# Show five random reviews after preprocessing.
df_subset.sample(n=5)

Unnamed: 0,review_body,star_rating
2982855,shattered bumped side wooden table glass thin ...,0
3942189,wife received set quickly corrosion far moth u...,1
1791931,perfect fridge check meat temp requirement rea...,1
3494843,le day bought turned yesterday flash burst sou...,0
1969468,fionally uused first time filled water coffee ...,0


 ##  train-test split



In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced subset for testing; the split is seeded.
review_texts = df_subset["review_body"]
review_labels = df_subset["star_rating"]
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
)

# TF-IDF Feature Extraction

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF with default settings. The vectorizer is fitted on the training
# reviews only; the test reviews are transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Perceptron

In [27]:
from sklearn.linear_model import Perceptron
# Fit a Perceptron on the TF-IDF training matrix, then predict both splits.
clf = Perceptron(tol=1e-3, random_state=0).fit(X_train_tfidf, y_train)
y_train_pred, y_test_pred = (
    clf.predict(split_matrix) for split_matrix in (X_train_tfidf, X_test_tfidf)
)

In [28]:
from sklearn.metrics import classification_report
# Each report is built twice: as a dict (read by the summary cell that
# follows) and as formatted text (printed here).
cl_report_train = classification_report(y_train, y_train_pred, output_dict=True)
cl_report_train_string = classification_report(y_train, y_train_pred)
cl_report_test = classification_report(y_test, y_test_pred, output_dict=True)
cl_report_test_string = classification_report(y_test, y_test_pred)
print("----------Perceptron----------", "TRAIN Classification Report", cl_report_train_string, sep="\n")
print("\nTEST Classification Report", cl_report_test_string, sep="\n")

----------Perceptron----------
TRAIN Classification Report
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     80007
           1       0.91      0.91      0.91     79993

    accuracy                           0.91    160000
   macro avg       0.91      0.91      0.91    160000
weighted avg       0.91      0.91      0.91    160000


TEST Classification Report
              precision    recall  f1-score   support

           0       0.86      0.86      0.86     19993
           1       0.86      0.86      0.86     20007

    accuracy                           0.86     40000
   macro avg       0.86      0.86      0.86     40000
weighted avg       0.86      0.86      0.86     40000



In [29]:
# (accuracy, macro precision, macro recall, macro F1) for the Perceptron on each split.
perceptron_train = (cl_report_train["accuracy"],) + tuple(
    cl_report_train["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)
perceptron_test = (cl_report_test["accuracy"],) + tuple(
    cl_report_test["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)

# SVM

In [30]:
from sklearn.svm import LinearSVC
# Fit a linear SVM on the TF-IDF training matrix, then predict both splits.
clf = LinearSVC(random_state=0, tol=1e-5).fit(X_train_tfidf, y_train)
y_train_pred, y_test_pred = (
    clf.predict(split_matrix) for split_matrix in (X_train_tfidf, X_test_tfidf)
)

In [31]:
from sklearn.metrics import classification_report
# Each report is built twice: as a dict (read by the summary cell that
# follows) and as formatted text (printed here).
cl_report_train = classification_report(y_train, y_train_pred, output_dict=True)
cl_report_train_string = classification_report(y_train, y_train_pred)
cl_report_test = classification_report(y_test, y_test_pred, output_dict=True)
cl_report_test_string = classification_report(y_test, y_test_pred)
print("----------SVM----------", "TRAIN Classification Report", cl_report_train_string, sep="\n")
print("\nTEST Classification Report", cl_report_test_string, sep="\n")

----------SVM----------
TRAIN Classification Report
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     80007
           1       0.93      0.93      0.93     79993

    accuracy                           0.93    160000
   macro avg       0.93      0.93      0.93    160000
weighted avg       0.93      0.93      0.93    160000


TEST Classification Report
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     19993
           1       0.90      0.90      0.90     20007

    accuracy                           0.90     40000
   macro avg       0.90      0.90      0.90     40000
weighted avg       0.90      0.90      0.90     40000



In [32]:
# (accuracy, macro precision, macro recall, macro F1) for the SVM on each split.
svm_train = (cl_report_train["accuracy"],) + tuple(
    cl_report_train["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)
svm_test = (cl_report_test["accuracy"],) + tuple(
    cl_report_test["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)

# Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
# Fit logistic regression on the TF-IDF training matrix, then predict both splits.
clf = LogisticRegression(random_state=0, max_iter=250).fit(X_train_tfidf, y_train)
y_train_pred, y_test_pred = (
    clf.predict(split_matrix) for split_matrix in (X_train_tfidf, X_test_tfidf)
)

In [34]:
from sklearn.metrics import classification_report
# Each report is built twice: as a dict (read by the summary cell that
# follows) and as formatted text (printed here).
cl_report_train = classification_report(y_train, y_train_pred, output_dict=True)
cl_report_train_string = classification_report(y_train, y_train_pred)
cl_report_test = classification_report(y_test, y_test_pred, output_dict=True)
cl_report_test_string = classification_report(y_test, y_test_pred)
print("----------Logistic Regression----------", "TRAIN Classification Report", cl_report_train_string, sep="\n")
print("\nTEST Classification Report", cl_report_test_string, sep="\n")

----------Logistic Regression----------
TRAIN Classification Report
              precision    recall  f1-score   support

           0       0.91      0.92      0.91     80007
           1       0.92      0.91      0.91     79993

    accuracy                           0.91    160000
   macro avg       0.91      0.91      0.91    160000
weighted avg       0.91      0.91      0.91    160000


TEST Classification Report
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     19993
           1       0.90      0.90      0.90     20007

    accuracy                           0.90     40000
   macro avg       0.90      0.90      0.90     40000
weighted avg       0.90      0.90      0.90     40000



In [35]:
# (accuracy, macro precision, macro recall, macro F1) for logistic regression on each split.
lr_train = (cl_report_train["accuracy"],) + tuple(
    cl_report_train["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)
lr_test = (cl_report_test["accuracy"],) + tuple(
    cl_report_test["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)

# Naive Bayes

In [36]:
from sklearn.naive_bayes import MultinomialNB
# Fit multinomial Naive Bayes on the TF-IDF training matrix, then predict both splits.
clf = MultinomialNB().fit(X_train_tfidf, y_train)
y_train_pred, y_test_pred = (
    clf.predict(split_matrix) for split_matrix in (X_train_tfidf, X_test_tfidf)
)

In [37]:
from sklearn.metrics import classification_report
# Each report is built twice: as a dict (read by the summary cell that
# follows) and as formatted text (printed here).
cl_report_train = classification_report(y_train, y_train_pred, output_dict=True)
cl_report_train_string = classification_report(y_train, y_train_pred)
cl_report_test = classification_report(y_test, y_test_pred, output_dict=True)
cl_report_test_string = classification_report(y_test, y_test_pred)
print("----------Naive Bayes----------", "TRAIN Classification Report", cl_report_train_string, sep="\n")
print("\nTEST Classification Report", cl_report_test_string, sep="\n")

----------Naive Bayes----------
TRAIN Classification Report
              precision    recall  f1-score   support

           0       0.88      0.89      0.89     80007
           1       0.89      0.88      0.88     79993

    accuracy                           0.88    160000
   macro avg       0.88      0.88      0.88    160000
weighted avg       0.88      0.88      0.88    160000


TEST Classification Report
              precision    recall  f1-score   support

           0       0.87      0.88      0.87     19993
           1       0.88      0.87      0.87     20007

    accuracy                           0.87     40000
   macro avg       0.87      0.87      0.87     40000
weighted avg       0.87      0.87      0.87     40000



In [38]:
# (accuracy, macro precision, macro recall, macro F1) for Naive Bayes on each split.
nb_train = (cl_report_train["accuracy"],) + tuple(
    cl_report_train["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)
nb_test = (cl_report_test["accuracy"],) + tuple(
    cl_report_test["macro avg"][metric] for metric in ("precision", "recall", "f1-score")
)

## The SVM model performs best overall, with 93% accuracy on the training set and 90% on the test set. Logistic Regression matches it on the test set (90%) but has a lower training accuracy (91%).