<a href="https://colab.research.google.com/github/niteshpd/aai/blob/master/Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Fine Food Reviews Sentiment Analysis

### Data Description



### Objective

Given a review, determine whether it is positive (rating 4 or 5) or negative (rating 1 or 2).

 ## Loading the data

In [None]:
# Import libraries

import sqlite3
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Open the reviews SQLite database (path is relative to the working directory).
con = sqlite3.connect('amazon/database.sqlite')  # also used by the inspection queries further down

# Load every review except the neutral ones (Score = 3), so only reviews that
# will be labelled positive or negative remain.
filtered_data = pd.read_sql_query("SELECT * FROM Reviews \
                WHERE Score !=3", con)


# Replace scores with the constants 'positive' (Score=4,5) and 'negative' (Score=1,2)

def partition(x):
    """Map a numeric rating to a sentiment label.

    Ratings below 3 are labelled 'negative'; every other rating is
    labelled 'positive'.
    """
    return 'negative' if x < 3 else 'positive'

# Relabel the numeric 'Score' column in place with 'positive' / 'negative'.
actualScore = filtered_data['Score']
filtered_data['Score'] = positiveNegative = actualScore.map(partition)

In [None]:
print(filtered_data.shape)
filtered_data.head()

## Data Cleaning : Deduplication

In [None]:
# Example of the duplication problem: one user, one timestamp, identical
# summary and text, repeated across several product IDs.
# The result is bound to `duplicate_example` rather than `display`, so that
# IPython's built-in display() helper is not shadowed, and the user id is
# written as a single-quoted SQL string literal (double quotes denote
# identifiers in standard SQL).
duplicate_example = pd.read_sql_query("""SELECT * FROM Reviews WHERE
                            Score !=3 AND UserId = 'AR5J8UI46CURR'
                            ORDER BY ProductID""", con)
duplicate_example

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [None]:
# Order the reviews by product ID (ascending) ahead of deduplication
sorted_data = filtered_data.sort_values(by='ProductId')

In [None]:
# Dropping the duplicates: rows that share user, profile name, timestamp and
# text are treated as one review and only the first occurrence is kept.
# The key columns are given as a list (an ordered sequence of labels) rather
# than a set.
final = sorted_data.drop_duplicates(subset=["UserId", "ProfileName", "Time", "Text"], keep='first')
final.shape

(364173, 10)

In [None]:
# Percentage of reviews (rows) still left after deduplication.
# Row counts are compared instead of .size, because .size is rows x columns
# and the ratio would be distorted once `final` has a different number of
# columns than `filtered_data`.
len(final) / len(filtered_data) * 100

69.25890143662969

In [None]:
# Two reviews whose HelpfulnessNumerator exceeds HelpfulnessDenominator.
# The OR is parenthesised so the Score filter applies to both Ids (AND binds
# tighter than OR), and the result is not bound to `display`, which would
# shadow IPython's display() helper.
suspect_helpfulness = pd.read_sql_query("""SELECT * FROM Reviews WHERE
            Score !=3 AND (Id=44737 OR Id=64422) ORDER BY
            ProductId""", con)
suspect_helpfulness

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [None]:
# Keep only rows with consistent helpfulness counts (numerator <= denominator).
# .copy() makes `final` an independent frame, so adding a column to it later
# does not write into a filtered slice (SettingWithCopyWarning).
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator].copy()

In [None]:
print(final.shape)

# Positive and Negative reviews left in our dataset
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

## Text Preprocessing (Stop-word Removal, Stemming, Lemmatization)

Steps:
1. Remove HTML Tags
2. Remove Punctuations and Special Characters
3. Check that the word is English and purely alphabetic (not alphanumeric)
4. Check if the length of the word is greater than 2
5. Convert to lowercase
6. Remove Stopwords
7. Snowball stemming the word

Collect the words used to describe positive and negative reviews.

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) #Set of StopWords in English
sno = nltk.stem.SnowballStemmer('english') #Initialising the Snowball Stemmer

def cleanhtml(sentence):
    """Replace every HTML tag in ``sentence`` with a single space.

    A tag is the shortest run of characters from '<' up to and including the
    next '>'. The pattern must end with '>': without it the lazy ``.*?``
    matches nothing and only the '<' character itself gets replaced.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    str
        The text with each tag replaced by one space.
    """
    cleanr = re.compile('<.*?>')
    return re.sub(cleanr, ' ', sentence)  # re.sub = substitute

def cleanpunc(sentence):
    """Strip or space out punctuation in ``sentence``.

    First pass deletes the characters ? ! ' " # and |.
    Second pass turns . , ) ( and / into a single space each.
    """
    without_marks = re.compile(r'[?|!|\'|"|#]').sub(r'', sentence)
    return re.compile(r'[.|,|)|(|\|/]').sub(r' ', without_marks)
print(sno.stem('tasty')) # Stem of the word 'Tasty'

tasti


In [None]:
# Steps to implement checks mentioned in the pre-processing

final_string = []        # one cleaned review per row of `final`, stored as UTF-8 bytes
all_positive_words = []  # stemmed words from positive reviews
all_negative_words = []  # stemmed words from negative reviews

# Pair every review with its label once, instead of re-reading
# final['Score'].values[i] for each individual word.
for sent, label in zip(final['Text'].values, final['Score'].values):
    filtered_sentence = []
    sent = cleanhtml(sent)  # Remove HTML tags
    for words in sent.split():
        for cleaned_words in cleanpunc(words).split():
            # Keep purely alphabetic tokens longer than two characters.
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            # Stored as bytes on purpose: avg_w2vec later calls
            # .decode('utf-8') on each cleaned review.
            s = sno.stem(lowered).encode('utf8')
            filtered_sentence.append(s)
            if label == 'positive':
                all_positive_words.append(s)
            elif label == 'negative':
                all_negative_words.append(s)
    final_string.append(b" ".join(filtered_sentence))  # list of words -> one bytes string

In [None]:
final['CleanedText'] = final_string # Adding a Column of CleanedText to main DataFrame

In [None]:
# Store the final table in a SQLite file so preprocessing need not be repeated.
# The unused cursor is dropped, and the connection is committed and closed
# explicitly so the file handle is released even if the write fails.

conn = sqlite3.connect('final.sqlite')
conn.text_factory = str
try:
    final.to_sql('Reviews', conn, schema=None, if_exists='replace')
    conn.commit()
finally:
    conn.close()

## /////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [None]:
# Loading data from the saved SQLite file

con = sqlite3.connect('final.sqlite')  # Connection creation
try:
    final = pd.read_sql_query("SELECT * FROM Reviews", con)
finally:
    con.close()  # nothing below queries this connection again

In [None]:
# Storing labels in variable 'y' and saving CleanedText in variable cleaned_text

cleaned_text = list(final['CleanedText'])
# A single explicit mapping gives an integer Series directly. The chained
# replace() calls depended on pandas silently downcasting an object column
# to int, which newer pandas versions warn about.
y = final['Score'].map({'positive': 1, 'negative': 0})

In [None]:
y.value_counts()

1    307061
0     57110
Name: Score, dtype: int64

## Bag of Words(BoW)

In [None]:
# BoW: learn the vocabulary and build the sparse document-term count matrix.
# NOTE(review): this vectorizes the raw 'Text' column rather than the
# preprocessed 'CleanedText' column — confirm that is intended.

count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)

In [None]:
print(type(final_counts))
final_counts.get_shape()

<class 'scipy.sparse.csr.csr_matrix'>


(364171, 115281)

## Bi-Grams and n-Grams

In [None]:
# Frequency tables of the stemmed words collected from each sentiment class.
freq_dist_positive, freq_dist_negative = (
    nltk.FreqDist(word_list)
    for word_list in (all_positive_words, all_negative_words)
)
print("Most common Positive words :",freq_dist_positive.most_common(20))
print("\nMost common Negative words :",freq_dist_negative.most_common(20))

Most common Positive words : [(b'like', 139150), (b'tast', 128631), (b'good', 112216), (b'flavor', 109473), (b'love', 107034), (b'use', 103627), (b'great', 102818), (b'product', 99504), (b'one', 95360), (b'tri', 86237), (b'tea', 83824), (b'coffe', 78610), (b'make', 74835), (b'get', 71962), (b'food', 64752), (b'amazon', 57832), (b'would', 55297), (b'time', 55225), (b'buy', 53903), (b'realli', 52569)]

Most common Negative words : [(b'tast', 34489), (b'like', 32284), (b'product', 29504), (b'one', 20420), (b'flavor', 19561), (b'would', 17901), (b'tri', 17676), (b'use', 15275), (b'good', 14977), (b'coffe', 14677), (b'get', 13758), (b'buy', 13690), (b'order', 12846), (b'food', 12742), (b'dont', 11683), (b'tea', 11657), (b'amazon', 11258), (b'even', 10983), (b'box', 10841), (b'make', 9816)]


In [None]:
# bi-gram, tri-gram and n-gram
# Stop words such as "not" should not be removed before building n-grams.
# NOTE: this rebinds `count_vect`; the unigram vectorizer fitted in the BoW
# section is no longer reachable under that name after this cell runs.

count_vect = CountVectorizer(ngram_range=(1,2)) # Includes unigrams and bigrams
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [None]:
final_bigram_counts.get_shape()

(364171, 2910192)

## TF-IDF

In [None]:
# TF-IDF weighted unigrams and bigrams, fitted on the raw review text.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)

In [None]:
final_tf_idf.get_shape()

In [None]:
# get_feature_names() is gone from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides and keep
# `features` a plain list of strings either way.
if hasattr(tf_idf_vect, 'get_feature_names_out'):
    features = tf_idf_vect.get_feature_names_out().tolist()
else:
    features = tf_idf_vect.get_feature_names()
len(features)

In [None]:
features[100000:100010]

In [None]:
print(final_tf_idf[3,:].toarray()[0])

In [None]:
# Top TF-IDF Features

def top_tfidf_feats(row, features, top_n):
    """Return the ``top_n`` highest-weighted features of one TF-IDF row.

    Parameters
    ----------
    row : 1-D numpy array
        Dense TF-IDF weights for a single document.
    features : sequence of str
        Feature names, indexable by the positions used in ``row``.
    top_n : int
        Number of features to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'tfidf', ordered by descending weight.
        The columns are present even when no rows are selected.
    """
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns at construction keeps an empty selection valid;
    # assigning .columns afterwards fails on a frame that has no columns.
    return pd.DataFrame(top_feats, columns=['Feature', 'tfidf'])

top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0], features,25)

In [None]:
top_tfidf

## Word2Vec

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

# We use the pretrained Google News word vectors.
# NOTE(review): earlier notes quote about 3.3GB once loaded and around 10GB
# of RAM in use — confirm available memory before running this cell.
# model[word] returns the vector for a word in the pretrained vocabulary;
# words outside that vocabulary (e.g. some stemmed forms) have no vector.
# Download 'GoogleNews-vectors-negative300.bin' into the working directory.

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# `model` is already a KeyedVectors object, so index it directly; the extra
# `.wv` hop is a deprecated alias on KeyedVectors.
model['computer'] # We can get the vector for any word in the vocabulary

In [None]:
# Cosine similarity between two words, called on the KeyedVectors object
# itself rather than through the deprecated `.wv` alias.
model.similarity('woman','man')

In [None]:
# Nearest neighbours of 'woman' in the pretrained space (no deprecated `.wv` alias).
model.most_similar('woman')

In [None]:
# Some stemmed words may have no vector, depending on the training data of
# the pretrained model. An unknown word raises KeyError, so report it rather
# than leaving a cell that stops a full run.
try:
    similar_to_stem = model.most_similar('tasti')
except KeyError as err:
    similar_to_stem = None
    print("'tasti' is not in the pretrained vocabulary:", err)
similar_to_stem

In [None]:
# The unstemmed form, queried directly on the KeyedVectors object.
model.most_similar('tasty')

In [None]:
# Training our own Word2Vec model using our own text corpus
import gensim

# One token list per review: HTML stripped, punctuation cleaned, only purely
# alphabetic tokens kept, lower-cased. No stop-word removal or stemming is
# applied here. (The unused counter and the no-op else/continue branch of the
# loop version are gone.)
list_of_sent = [
    [
        cleaned_words.lower()
        for w in cleanhtml(sent).split()
        for cleaned_words in cleanpunc(w).split()
        if cleaned_words.isalpha()
    ]
    for sent in final['Text'].values
]

In [None]:
print(final['Text'].values[0])
print("*****************************************************")
print(list_of_sent[0])

In [None]:
# Train word vectors on the tokenised reviews.
#   min_count : minimum number of occurrences for a word to be trained on
#   size      : dimension of each word vector
#   workers   : number of cores used for training

w2v_model = gensim.models.Word2Vec(
    list_of_sent,
    min_count=5,
    size=50,
    workers=12,
)

In [None]:
words = list(w2v_model.wv.vocab)
print(len(words))

In [None]:
w2v_model.wv.most_similar('tasty')

In [None]:
w2v_model.wv.most_similar('like')

In [None]:
# Look up the feature index of 'like' and read it back. The index is taken
# from the lookup itself instead of a hard-coded position, which is only
# valid for one particular fitted vocabulary (`count_vect` is rebound more
# than once in this notebook).
if hasattr(count_vect, 'get_feature_names_out'):
    count_vect_feat = count_vect.get_feature_names_out().tolist()
else:
    count_vect_feat = count_vect.get_feature_names()
like_index = count_vect_feat.index('like')
print(count_vect_feat[like_index])

In [None]:
# Index the KeyedVectors object directly; going through `.wv` triggers a
# deprecation warning on this object.
model['women']

  """Entry point for launching an IPython kernel.


array([-1.39648438e-01,  1.64062500e-01,  1.61132812e-02,  1.07910156e-01,
        2.18750000e-01, -2.63671875e-01, -1.13281250e-01, -1.19628906e-01,
       -6.68945312e-02,  9.61914062e-02, -1.21093750e-01, -1.71875000e-01,
        1.23046875e-01,  1.93359375e-01, -2.99072266e-02,  1.51367188e-01,
       -1.13769531e-01,  1.46484375e-02, -5.58471680e-03,  1.10351562e-01,
       -1.64062500e-01,  3.24707031e-02, -1.09863281e-01, -3.41796875e-01,
       -1.45507812e-01, -2.09960938e-01, -1.82617188e-01,  1.78710938e-01,
       -1.31835938e-01, -1.76757812e-01,  2.42614746e-03, -2.53906250e-02,
       -1.73828125e-01, -1.86523438e-01, -4.05273438e-02,  9.91210938e-02,
       -6.00585938e-02, -1.35742188e-01, -6.59179688e-02,  1.26953125e-01,
       -6.29882812e-02,  9.96093750e-02,  5.17578125e-02, -4.44335938e-02,
       -1.62353516e-02, -9.42382812e-02, -7.42187500e-02, -2.92968750e-02,
        1.78710938e-01,  1.23046875e-01,  2.51953125e-01, -9.08203125e-02,
       -1.76239014e-03,  

In [None]:
# Data Splitting: hold out 20% as the test set, then carve 25% of the
# remainder off as the cross-validation set.

X_1, X_test, y_1, y_test = train_test_split(
    final_counts, y, test_size=0.2, random_state=1
)
X_tr, X_cv, y_tr, y_cv = train_test_split(
    X_1, y_1, test_size=0.25, random_state=1
)

## KNN

### Bow

In [None]:

# Data Splitting: 20% test, then 25% of the remaining 80% as CV
# (60% train / 20% CV / 20% test overall).
# NOTE(review): `final_counts` was produced by a vectorizer fitted on the
# whole corpus before this split, so the vocabulary has seen CV/test text.

X_1, X_test, y_1, y_test = train_test_split(final_counts, y, test_size=0.2, random_state = 1)
X_tr, X_cv, y_tr, y_cv = train_test_split(X_1, y_1, test_size = 0.25, random_state = 1)

# Score every odd k from 1 to 29 on the CV split. The earlier throwaway fit
# and the predict() whose result was discarded are removed.
for i in range(1,30,2):
    neigh = KNeighborsClassifier(n_neighbors = i, n_jobs=1)
    neigh.fit(X_tr, y_tr)
    pred = neigh.predict(X_cv)

    acc = accuracy_score(y_cv, pred, normalize=True) * float(100)
    # Two decimals: %d truncated the accuracy to a whole percent, which hides
    # the differences between neighbouring values of k.
    print('\n CV Accuracy for k = %d is %.2f%%' %(i,acc))

### TF-IDF

In [None]:
final_tf_idf

<364171x2910192 sparse matrix of type '<class 'numpy.float64'>'
	with 45049660 stored elements in Compressed Sparse Row format>

### Avg W2Vec

In [None]:
def avg_w2vec(sentences, vectors=None):
    """Average the word vectors of every sentence.

    Parameters
    ----------
    sentences : iterable of bytes or str
        Whitespace-separated tokens. Bytes are decoded as UTF-8.
    vectors : optional
        Any object supporting ``word in vectors`` and ``vectors[word]``.
        Defaults to the notebook-level pretrained ``model``.

    Returns
    -------
    list of numpy.ndarray
        One vector per sentence. A sentence with no known word yields the
        zero vector instead of NaNs from a division by zero.
    """
    if vectors is None:
        vectors = model
    # Use the dimensionality the vectors report; fall back to 300.
    dim = getattr(vectors, 'vector_size', 300)
    all_vectors = []
    for each in sentences:
        text = each.decode('utf-8') if isinstance(each, bytes) else each
        total = np.zeros(dim)
        known = 0
        for word in text.split():
            # Membership test and lookup on the vectors object itself,
            # without the deprecated .vocab / .wv attributes.
            if word in vectors:
                total += vectors[word]
                known += 1
        all_vectors.append(total / known if known else total)
    return all_vectors

In [None]:
X = avg_w2vec(cleaned_text)

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Data Splitting for the averaged word vectors: 20% test, then 25% of the
# remainder as the cross-validation set.

X_1, X_test, y_1, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_tr, X_cv, y_tr, y_cv = train_test_split(
    X_1, y_1, test_size=0.25, random_state=1
)

In [None]:
neigh = KNeighborsClassifier(n_neighbors=5, n_jobs = -1)
# Fit on the first 50,000 rows of the training split only. Fitting on
# X[:50000] drew rows from the full data set, which can include the very CV
# rows that are scored afterwards and so inflates the CV accuracy.
neigh.fit(X_tr[:50000], y_tr.iloc[:50000])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
# Score a fixed window of the CV split and express the accuracy in percent.
cv_window = slice(25000, 50000)
pred = neigh.predict(X_cv[cv_window])
acc = float(100) * accuracy_score(y_cv[cv_window], pred, normalize=True)

In [None]:
acc

85.11999999999999