Sentiment Analysis for Amazon Reviews

Importing the libraries

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

Importing Dataset

In [19]:
dataset = pd.read_csv('1429_1.csv')

# Taking only useful data from the dataset
dataset = dataset.iloc[:,[17,16,14,11]].values

  exec(code_obj, self.user_global_ns, self.user_ns)


Taking Care of Missing Data

In [20]:
# Discarding observations having missing data
data = pd.DataFrame(dataset)
dataset = data.dropna(axis = 0,how = 'any')
X = dataset.iloc[:, :-1].values

# Storing rating separately to use it later
rating = X[:, -1]

y = dataset.iloc[:, -1].values

# Encoding True/False value at 1/0
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()
y = le.fit_transform(y)

In [21]:
dataset.head(-1)

Unnamed: 0,0,1,2,3
0,Kindle,This product so far has not disappointed. My c...,5.0,True
1,very fast,great for beginner or experienced person. Boug...,5.0,True
2,Beginner tablet for our 9 year old son.,Inexpensive tablet for him to use and learn on...,5.0,True
3,Good!!!,I've had my Fire HD 8 two weeks now and I love...,4.0,True
4,Fantastic Tablet for kids,I bought this for my grand daughter when she c...,5.0,True
...,...,...,...,...
34619,Wonderful unit,Can use it for best streaming. Can watch all t...,5.0,True
34620,Works great,I am now able to stream tv and movies from aro...,4.0,True
34621,the best,"best streaming device , very portable , amazin...",5.0,False
34622,Love it,Simply the best to watch tv series and movies....,5.0,True


In [22]:
print(X)
print(y)

[['Kindle'
  'This product so far has not disappointed. My children love to use it and I like the ability to monitor control what content they see with ease.'
  5.0]
 ['very fast'
  'great for beginner or experienced person. Bought as a gift and she loves it'
  5.0]
 ['Beginner tablet for our 9 year old son.'
  'Inexpensive tablet for him to use and learn on, step up from the NABI. He was thrilled with it, learn how to Skype on it already...'
  5.0]
 ...
 ['Love it'
  'Simply the best to watch tv series and movies. It works even better if you are an Amazon Prime subscriber, with access to a many free goodies.'
  5.0]
 ['Try it, you will like it'
  'I was looking for ways to cut cost from a raising cable bill and a friend suggested I try the Amazon Fire. At first I didn���t know if this was something I could do. Once I was able to maneuver through the process, I love it.'
  4.0]
 ['Great little device'
  'I enjoy my kindle tv, it beats paying for cable every month ������'
  4.0]]
[1 1 1

Cleaning the Dataset

In [23]:
import re
import nltk
# Downloading stopwords which does not have much signifcance
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stemmer will reduce words in their root form
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
all_stopwords = stopwords.words('english')

# Removing some stopwords which have significance effect in building this model
rem = ['not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', 
       "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', 
       "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't", 
       'just', 'too', 'very', 'no', 'nor', 'only', 'own', 'same', 'again', 'against', 'but',]
for s in rem:
    all_stopwords.remove(s)
    
def find_clean_text(temp):
    # Removing all characters other than alphabet
    temp = re.sub('[^a-zA-Z]', ' ', temp)
    temp = temp.lower()
    temp = temp.split()
    temp = [ps.stem(word) for word in temp if not word in set(all_stopwords)]
    temp = ' '.join(temp)
    return temp

[nltk_data] Downloading package stopwords to /home/nick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
corpus = []
for i in range(X.shape[0]):
    # Concatenating both title and detailed review
    temp = X[i][0] + ' ' + X[i][1]
    temp = find_clean_text(temp)
    corpus.append(temp)

Creating the Bag of Words model

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)
X  = cv.fit_transform(corpus).toarray()

# Adding rating in the matrix of feature X
rating = rating.reshape(rating.shape[0],1)
X = np.append(X,rating,axis=1)

Splitting the dataset into the Training set and Test set

In [26]:
# Splitting dataset into test set and train set which have equal percentage of data with both positive and negative review
#This is done as precautionary measure considering that very small number (approx 1300 out of 34000) data have negative review.
pos_x = []
pos_y = []
neg_x = []
neg_y = []
for i in range(X.shape[0]):
    if y[i]==1:
        pos_x.append(X[i])
        pos_y.append(y[i])
    else:
        neg_x.append(X[i])
        neg_y.append(y[i])

from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(pos_x, pos_y, test_size = 0.20)
X_train, X_test, y_train, y_test = train_test_split(neg_x, neg_y, test_size = 0.20)

for i in range(len(X_train1)):
    X_train.append(X_train1[i])
    y_train.append(y_train1[i])
for i in range(len(X_test1)):
    X_test.append(X_test1[i])
    y_test.append(y_test1[i])
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

Training the Multinomial Naive Bayes on the Training set

In [27]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [28]:
from sklearn.metrics import confusion_matrix, accuracy_score

print('Result on training set :')
print('Confusion matrix :')
print(confusion_matrix(y_train,classifier.predict(X_train)))
print('accuracy : ',accuracy_score(y_train, classifier.predict(X_train)))

print('Result on test set :')
y_pred = classifier.predict(X_test)
print('Confusion matrix :')
print(confusion_matrix(y_test, y_pred))
print('accuracy',accuracy_score(y_test, y_pred))

Result on training set :
Confusion matrix :
[[  777   330]
 [  909 25233]]
accuracy :  0.9545304414840912
Result on test set :
Confusion matrix :
[[ 177  100]
 [ 234 6302]]
accuracy 0.9509760751504477


Making Prediction on some reviews from test set

In [29]:
random.seed(13)
index = random.randrange(dataset.shape[0])
print('Title :',dataset[0][index],'\nReview :',dataset[1][index],'\nRating :',dataset[2][index])
print("True value :", dataset[3][index])
print("Prediction :",classifier.predict(np.append(cv.transform([find_clean_text(dataset[0][index]+' '+dataset[1][index])]).toarray(),[[dataset[3][index]]],axis=1)))

Title : My 3 yr old grandson loves it! 
Review : My grandson really enjoys this tablet, ease of use and lots of different educational programs. Must remember to charge tablet everynight. 
Rating : 5.0
True value : True
Prediction : [1]


In [32]:
index = random.randrange(dataset.shape[0])
print('Title :',dataset[0][index],'\nReview :',dataset[1][index],'\nRating :',dataset[2][index])
print("True value :", dataset[3][index])
print("Prediction :",classifier.predict(np.append(cv.transform([find_clean_text(dataset[0][index]+' '+dataset[1][index])]).toarray(),[[dataset[3][index]]],axis=1)))

Title : Hate Amazon 
Review : Hate Amazon! The app store doesn't have cool apps! 
Rating : 3.0
True value : False
Prediction : [0]


In [33]:
index = random.randrange(dataset.shape[0])
print('Title :',dataset[0][index],'\nReview :',dataset[1][index],'\nRating :',dataset[2][index])
print("True value :", dataset[3][index])
print("Prediction :",classifier.predict(np.append(cv.transform([find_clean_text(dataset[0][index]+' '+dataset[1][index])]).toarray(),[[dataset[3][index]]],axis=1)))

Title : Nice product for the price 
Review : We are used to iPads in our house. This has taken some learning to get used to a slightly different format, but it works great!! The price point is perfect for kids to use without the worry. 
Rating : 4.0
True value : True
Prediction : [1]
