# Amazon Fine Food Review

### Loading the dataset

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os

In [2]:
#using SQLite Table to read data
con = sqlite3.connect('/home/monodeepdas112/Datasets/amazon-fine-food-reviews/database.sqlite')

#filtering only positive and negative reviews
#not taking into consideration the reviews with Score=3
filtered_data = pd.read_sql_query('select * from Reviews where score != 3', con)


#replacing the score column to contain only positive or negative rather than continuous range of values
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

actualScore = filtered_data['Score']
positiveNegativeScore = actualScore.map(partition)
filtered_data['Score'] = positiveNegativeScore

In [3]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Data Cleanup : Deduplication
Removing all the duplicate entries in order to get unbiased results.

In [4]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


#### As we can see we have same product review at the same timestamp for different product IDs from the same person.

In [5]:
#Sorting data according to ProductId in ascending order
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [6]:
#Deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [7]:
#Checking to see how much percentage of data still remains
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100

69.25890143662969

After removing duplicates we are left with the 69% approx of the original data.
Logically HelpfulnessNumerator <= HelpfulnessDenominator. But in some data points it is not so we remove those data points.

In [8]:
final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
final.shape

(364171, 10)

In [9]:
#How many positive and negative reviews are there in our dataset ?
final['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

## BoW Code Sample

In [10]:
count_vec = CountVectorizer()
final_counts = count_vec.fit_transform(final['Text'].values)

In [11]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [12]:
final_counts.shape

(364171, 115281)

#### There are 115281 unique words in the dataset
#### There are 364171 text reviews

## Text Preprocessing

We will do the following:
1. Remove the html tags
2. Remove all the punctuation marks or special characters
3. Check if the word is made up of english letters and not alpha numerical.
4. Check to see if length of the word is greater than 2 (as there are no adjective in 2-letters)
5. Convert the words to lower case
6. Remove Stopwords
7. Finally Snowball Stemming the words

In [13]:
#find all the text than contain html tags
i=0;
for sent in final['Text'].values:
    if(len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1


6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [14]:
#removing stop words
stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initializing the Snowball Stemmer

def cleanHTML(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext

def cleanPunctuation(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r'', sentence)
    return cleaned

print(stop)
print('---------------------------------------------------------------------------------------')
print(sno.stem('tasty'))

{'until', 'why', 'doesn', 'who', 'ma', 'at', 'only', 'am', 'there', 'during', "shouldn't", 'does', 'from', "needn't", "haven't", 'those', 'now', 'll', 'herself', 'once', 'most', 'your', 'that', 'd', 'here', 'then', 'same', 'didn', 'were', 'doing', 'an', 'won', 'i', 's', 'will', 'he', 'her', 'themselves', 'further', 'haven', 'no', "wouldn't", 'for', 'of', 'these', 'under', 'up', 'yourself', "you'll", "shan't", 'because', "doesn't", "weren't", 've', 'wasn', "won't", 'whom', 'o', 'is', 'was', 'being', 'aren', 't', 'shan', 're', 'any', 'ain', 'me', 'weren', "you'd", "you've", 'having', 'own', "don't", 'don', 'in', 'shouldn', 'very', "isn't", 'about', 'or', 'between', 'their', 'which', 'after', 'can', 'it', 'both', 'are', 'to', 'its', 'a', 'out', 'hasn', 'by', 'our', 'did', 'while', 'into', 'how', 'mightn', 'so', "mightn't", "hadn't", 'have', 'ourselves', 'hadn', 'she', 'other', "didn't", 'with', 'below', 'my', 'we', 'this', 'myself', 'down', 'should', 'needn', 'few', 'has', 'more', "mustn'

In [20]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# this code takes a while to run as it needs to run on 500k sentences.
if not os.path.isfile('/home/monodeepdas112/Datasets/amazon-fine-food-reviews/final.sqlite'):
    final_string=[]
    all_positive_words=[] # store words from +ve reviews here
    all_negative_words=[] # store words from -ve reviews here.
    for i, sent in enumerate(tqdm(final['Text'].values)):
        filtered_sentence=[]
        sent = cleanHTML(sent)#removing HTML tags
        for w in sent.split():
            for cleaned_words in cleanPunctuation(w).split():
                if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                    if(cleaned_words.lower() not in stop):
    #                     s=(sno.stem(cleaned_words.lower())).encode('utf8')
                        s=(sno.stem(cleaned_words.lower()))
                        filtered_sentence.append(s)
                        if (final['Score'].values)[i] == 'positive':
                            all_positive_words.append(s) #list of all words used to describe positive reviews
                        if(final['Score'].values)[i] == 'negative':
                            all_negative_words.append(s) #list of all words used to describe negative reviews reviews
    #                             print('Negative : ', len(all_negative_words))
        str1 = ' '.join(filtered_sentence) #final string of cleaned words
    #     str1 = b" ".join(filtered_sentence) #final string of cleaned words
        #print("***********************************************************************")
        final_string.append(str1)

    #############---- storing the data into .sqlite file ------########################
    final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review 
    final['CleanedText']=final['CleanedText']
    # final['CleanedText']=final['CleanedText'].str.decode("utf-8")
        # store final table into an SQlLite table for future.
    conn = sqlite3.connect('/home/monodeepdas112/Datasets/amazon-fine-food-reviews/final.sqlite')
    c=conn.cursor()
    conn.text_factory = str
    final.to_sql('Reviews', conn,  schema=None, if_exists='replace', \
                 index=True, index_label=None, chunksize=None, dtype=None)
    conn.close()


    with open('positive_words.pkl', 'wb') as f:
        pickle.dump(all_positive_words, f)
    with open('negitive_words.pkl', 'wb') as f:
        pickle.dump(all_negative_words, f)
else:
    con = sqlite3.connect('/home/monodeepdas112/Datasets/amazon-fine-food-reviews/final.sqlite')
    final = pd.read_sql_query('select * from Reviews', con)

## Bag Of Words

In [21]:
#BoW
count_vect = CountVectorizer() #in scikit-learn
final_counts = count_vect.fit_transform(final['CleanedText'].values)
print("the type of count vectorizer ",type(final_counts))
print("the shape of out text BOW vectorizer ",final_counts.get_shape())
print("the number of unique words ", final_counts.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (364171, 113489)
the number of unique words  113489


### Bi-Grams and n-Grams

In [22]:
with open('positive_words.pkl', 'rb') as f:
    all_positive_words = pickle.load(f)
with open('negitive_words.pkl', 'rb') as f:
    all_negative_words = pickle.load(f)
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
print('Most Common Positive Words : ', freq_dist_positive.most_common(20))
print('Most Common Negative Words : ', freq_dist_negative.most_common(20))

Most Common Positive Words :  [('like', 138531), ('tast', 126159), ('good', 107583), ('love', 106314), ('flavor', 106287), ('use', 103251), ('great', 98289), ('one', 94769), ('product', 86413), ('tri', 85388), ('tea', 80626), ('coffe', 75775), ('make', 74686), ('get', 71759), ('food', 62462), ('would', 55402), ('time', 53612), ('buy', 53479), ('realli', 52433), ('eat', 51179)]
Most Common Negative Words :  [('tast', 33876), ('like', 32136), ('product', 27341), ('one', 20203), ('flavor', 18758), ('would', 17927), ('tri', 17641), ('use', 15171), ('good', 14597), ('coffe', 14187), ('get', 13733), ('buy', 13563), ('order', 12739), ('food', 12287), ('tea', 11259), ('even', 11034), ('box', 10518), ('make', 9806), ('time', 9580), ('bag', 9459)]


In [23]:
#bi-gram, tri-gram and n-gram
count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [24]:
final_bigram_counts.shape

(364171, 2910192)

## TF-IDF

In [25]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['CleanedText'].values)
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (364171, 2951873)
the number of unique words including both unigrams and bigrams  2951873


In [26]:
final_tf_idf.shape

(364171, 2951873)

In [27]:
features = tf_idf_vect.get_feature_names()
len(features)
print("some sample features(unique words in the corpus)",features[100000:100010])

some sample features(unique words in the corpus) ['anoth need', 'anoth negat', 'anoth neighbor', 'anoth nervous', 'anoth neurotoxin', 'anoth never', 'anoth new', 'anoth newer', 'anoth newman', 'anoth next']


In [28]:
# source: https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)

In [29]:
top_tfidf

Unnamed: 0,feature,tfidf
0,grew read,0.192335
1,flimsi take,0.192335
2,rosi movi,0.192335
3,version paperback,0.192335
4,read sendak,0.192335
5,paperback seem,0.192335
6,sendak book,0.192335
7,keep page,0.192335
8,incorpor love,0.192335
9,page open,0.192335


## Word2Vec

In [30]:
with open('/home/monodeepdas112/Datasets/google_w2v_for_amazon.pkl', 'rb') as f:
    # model is dict object, we can directly access any word vector using model[word]
    model = pickle.load(f)

In [34]:
# Train your own Word2Vec model using your own text corpus
i=0
list_of_sent=[]
for sent in final['CleanedText'].values:
    list_of_sent.append(sent.split())

In [28]:
w2v_model=Word2Vec(list_of_sent, min_count=5, size=50, workers=4)
#This took almost 10 hrs to execute

In [29]:
w2v_words = list(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

number of words that occured minimum 5 times  22563
sample words  ['witti', 'littl', 'book', 'make', 'son', 'laugh', 'loud', 'recit', 'car', 'drive', 'along', 'alway', 'sing', 'refrain', 'learn', 'whale', 'india', 'droop', 'love', 'new', 'word', 'introduc', 'silli', 'classic', 'will', 'bet', 'still', 'abl', 'memori', 'colleg', 'grew', 'read', 'sendak', 'watch', 'realli', 'rosi', 'movi', 'incorpor', 'howev', 'miss', 'hard', 'cover', 'version', 'paperback', 'seem', 'kind', 'flimsi', 'take', 'two', 'hand']


In [31]:
w2v_model.wv.most_similar('eat')

[('sneak', 0.7494032382965088),
 ('eaten', 0.721808671951294),
 ('ate', 0.7115689516067505),
 ('nibbl', 0.6728719472885132),
 ('munch', 0.6715096235275269),
 ('gobbl', 0.670685350894928),
 ('scarf', 0.6677533388137817),
 ('hungri', 0.6459101438522339),
 ('feed', 0.6402752995491028),
 ('starv', 0.6366564035415649)]

In [36]:
#average word2Vec
sent_vectors = []
for sent in list_of_sent:
    sent_vec = np.zeros(50)
    cnt_words = 0
    for word in sent:
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
        sent_vec /= cnt_words
        sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

13252045
50


In [38]:
#tf-idf word2Vec
tfidf_features = tf_idf_vect.get_feature_names()

tfidf_sentence_vectors = []
row=0
for sent in list_of_sent:
    sent_vec = np.zeros(50)
    weight_sum = 0
    for word in sent:
        try:
            vec = w2v_model.wv[word]
            tfidf = final_tf_idf[row, tfidf_features.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            pass
        sent_vec /= weight_sum
        tfidf_sentence_vectors.append(sent_vec)
        row += 1
print(len(tfidf_sentence_vectors))
print(len(tfidf_sentence_vectors[0]))

13252045
50
