# Amazon Food Reviews - Sentiment Analysis.

# Objective

To determine the sentiment of a given review. (Negative/ Neutral/ Positive)

# Dataset Description

- ProductId : Unique identifier for the product
- UserId : Unqiue identifier for the user
- ProfileName : Name of customer
- HelpfulnessNumerator : Number of users who found the review helpful
- HelpfulnessDenominator : Number of users who indicated whether they found the review helpful or not
- Score : Rating between 1 and 5
- Time : Timestamp for the review
- Summary : Brief summary of the review
- Text : Review text

 Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

Reading the file

In [2]:
review=pd.read_csv("AmazonFoodReviews.csv",encoding='utf8')

In [3]:
review.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
review.shape

(299438, 10)

Checking for the null values

In [5]:
review.isnull().sum()[review.isnull().sum()>0]

ProfileName     9
Summary        11
dtype: int64

Filling the null values

In [6]:
review.ProfileName=review.ProfileName.fillna('C. F. Hill "CFH"')
review.Summary=review.Summary.fillna('Delicious!')

In [7]:
review.isnull().sum()[review.isnull().sum()>0]

Series([], dtype: int64)

Converting the 5 star rating into 3 star rating scores for easy classification.

In [8]:
review.Score=review.Score.replace({2:1, 3:2, 4:3 ,5:3})

In [9]:
review.columns[review.select_dtypes=='object']

array([], shape=(0, 10), dtype=object)

In [10]:
review=review.iloc[ : , [6,9]]

In [11]:
review.head(2)

Unnamed: 0,Score,Text
0,3,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...


In [12]:
review.Text

0         I have bought several of the Vitality canned d...
1         Product arrived labeled as Jumbo Salted Peanut...
2         This is a confection that has been around a fe...
3         If you are looking for the secret ingredient i...
4         Great taffy at a great price.  There was a wid...
                                ...                        
299433    i really have to say I love this chamomile tea...
299434    Better than many I have tried.  With a little ...
299435    I ordered this for my boyfriend because he has...
299436    I know nettle tea has a pretty distinct taste,...
299437    These reviews are wrong.It may work but I coul...
Name: Text, Length: 299438, dtype: object

Converting all the strings to lower case

In [13]:
review.Text=review.Text.str.lower()

In [14]:
from nltk.corpus import stopwords

In [15]:
l1 = stopwords.words("english")

In [16]:
import string

In [17]:
def text_process(mess):            ### creating a function
    """                                                        ## a docstring
    1. remove the punctuation
    2. remove the stopwords
    3. return the list of clean textwords
    
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = "".join(nopunc)
    
    return [ word for word in nopunc.split() if word not in l1]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
review_count=CountVectorizer(analyzer=text_process).fit(review['Text'])

In [20]:
review_x = review_count.transform(review['Text'])

In [21]:
review_x.shape

(299438, 173203)

In [22]:
from sklearn.model_selection import train_test_split

x_train ,x_test ,y_train ,y_test = train_test_split(review_x ,review.Score ,test_size=.2)

In [23]:
print(x_train.shape , y_train.shape)

(239550, 173203) (239550,)


In [24]:
print(x_test.shape , y_test.shape)

(59888, 173203) (59888,)


# Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [26]:
lr.fit(x_train,y_train)

LogisticRegression()

In [27]:
pred=lr.predict(x_test)

In [28]:
from sklearn.metrics import confusion_matrix
tab=confusion_matrix(y_test,pred)
tab

array([[ 6071,   534,  2098],
       [  890,  1476,  2275],
       [ 1119,   835, 44590]], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score
print('Accuracy = ',accuracy_score(y_test,pred)*100)

Accuracy =  87.05750734704782


# Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()

In [31]:
rf.fit(x_train,y_train)

RandomForestClassifier()

In [32]:
pred=rf.predict(x_test)

In [33]:
from sklearn.metrics import confusion_matrix
tab=confusion_matrix(y_test,pred)
tab

array([[ 3805,     5,  4893],
       [   70,  1455,  3116],
       [   40,     8, 46496]], dtype=int64)

In [34]:
from sklearn.metrics import accuracy_score
print('Accuracy = ',accuracy_score(y_test,pred)*100)

Accuracy =  86.42131979695431


# Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()

In [36]:
dt.fit(x_train,y_train)

DecisionTreeClassifier()

In [37]:
pred=dt.predict(x_test)

In [38]:
from sklearn.metrics import confusion_matrix
tab=confusion_matrix(y_test,pred)
tab

array([[ 5382,   528,  2793],
       [  756,  1903,  1982],
       [ 2274,  1380, 42890]], dtype=int64)

In [39]:
from sklearn.metrics import accuracy_score
print('Accuracy = ',accuracy_score(y_test,pred)*100)

Accuracy =  83.78139193160567


# Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()

In [41]:
nb.fit(x_train,y_train)

MultinomialNB()

In [42]:
pred=nb.predict(x_test)

In [43]:
from sklearn.metrics import confusion_matrix
tab=confusion_matrix(y_test,pred)
tab

array([[ 5516,   459,  2728],
       [  877,   927,  2837],
       [ 1585,   909, 44050]], dtype=int64)

In [44]:
from sklearn.metrics import accuracy_score
print('Accuracy = ',accuracy_score(y_test,pred)*100)

Accuracy =  84.31238311514828


# AdaBoost Classifier

In [45]:
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier()

In [46]:
ada.fit(x_train,y_train)

AdaBoostClassifier()

In [47]:
pred=ada.predict(x_test)

In [48]:
from sklearn.metrics import confusion_matrix
tab=confusion_matrix(y_test,pred)
tab

array([[ 3037,   150,  5516],
       [  614,   203,  3824],
       [ 1035,   190, 45319]], dtype=int64)

In [49]:
from sklearn.metrics import accuracy_score
print('Accuracy = ',accuracy_score(y_test,pred)*100)

Accuracy =  81.0830216403954


# The Highest Accuracy was seen to be 87.05%