In [57]:
# # Initial Setup for Google Colab

# # Run this cell and select the kaggle.json file downloaded
# # from the Kaggle account settings page.
# from google.colab import files
# files.upload()


# # Let's make sure the kaggle.json file is present.
# !ls -lha kaggle.json

# # Next, install the Kaggle API client.
# !pip install -q kaggle

# # The Kaggle API client expects this file to be in ~/.kaggle,
# # so move it there.
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/

# # This permissions change avoids a warning on Kaggle tool startup.
# !chmod 600 ~/.kaggle/kaggle.json


# # List available datasets.
# !kaggle datasets list


# #Setup amazon fine food review dataset locally
# !kaggle datasets download -d snap/amazon-fine-food-reviews


# !unzip amazon-fine-food-reviews.zip



In [58]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

!pip install -U gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
import mpld3

# Open the Amazon Fine Food Reviews SQLite database file; `con` is reused
# by the later read_sql_query cells.
con = sqlite3.connect('database.sqlite')



# Load only clearly positive or negative reviews, i.e. omit the reviews
# with Score = 3 (ambiguous: neither clearly +ve nor -ve).
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, con)

# Give reviews with Score>3 a positive rating, 
# and reviews with a score<3 a negative rating.

def partition(x):
    """Map a numeric review score to a sentiment label.

    Scores strictly below 3 become 'negative'; every other value becomes
    'positive'.
    """
    return 'negative' if x < 3 else 'positive'


# Replace the numeric Score column with sentiment labels: scores below 3
# become 'negative', all remaining scores become 'positive' (see partition).
# NOTE: this overwrites filtered_data['Score'] in place, so the mapping must
# not be applied a second time to the already-labelled column.
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition)
filtered_data['Score'] = positiveNegative

Requirement already up-to-date: gensim in /home/srinu/anaconda3/lib/python3.6/site-packages (3.6.0)


In [59]:
print(filtered_data.shape) # (rows, columns) of the frame loaded with Score != 3
filtered_data.head()

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Data Cleaning: Deduplication

In [60]:
# Motivating example for de-duplication: list the non-neutral reviews of user
# AR5J8UI46CURR ordered by product, so that repeated review text and
# timestamps across different ProductIds can be inspected.
# This cell only inspects the rows; it does not remove anything.
#
# The user id is written as a single-quoted SQL string literal. A
# double-quoted token is an identifier in SQL and SQLite only falls back to
# treating it as a string when no such column exists.
# NOTE(review): the variable name `display` shadows IPython's display()
# helper for the rest of the session; kept so existing references still work.
display = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId = 'AR5J8UI46CURR'
ORDER BY ProductId
""", con)
display.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [61]:
# Order the reviews by ProductId (ascending, missing values last) so the
# de-duplication step sees them in a defined product order.

sorted_data = filtered_data.sort_values(by='ProductId')

In [62]:
# De-duplicate: rows sharing the same user, profile name, timestamp and text
# are treated as one review; the first occurrence in sorted order is kept.

duplicate_key = ["UserId", "ProfileName", "Time", "Text"]
final = sorted_data.drop_duplicates(subset=duplicate_key)
final.shape

(364173, 10)

In [63]:
# Percentage of the filtered reviews that survive de-duplication.

len(final) / len(filtered_data) * 100

69.25890143662969

In [64]:
# Inspect two example reviews (Id 44737 and 64422) in which
# HelpfulnessNumerator is larger than HelpfulnessDenominator; such rows are
# removed in the next cell.
#
# `Id IN (...)` is used instead of `Score != 3 AND Id=44737 OR Id=64422`:
# AND binds tighter than OR, so the original condition applied the Score
# filter to the first Id only.
display = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id IN (44737, 64422)
ORDER BY ProductId
""", con)

display.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [65]:
# Keep only rows whose helpful-vote count does not exceed the total vote count.
final = final[final['HelpfulnessNumerator'].le(final['HelpfulnessDenominator'])]

In [66]:
# Before starting the next phase of preprocessing, show how many rows and
# columns are left after de-duplication and the helpfulness filter.
print(final.shape)

# Class balance: number of 'positive' and 'negative' reviews in the dataset.
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

In [67]:
# Locate the first review whose text contains an HTML tag and print its
# position followed by the raw text.
import re

i = 0
for sent in final['Text'].values:
    if re.search('<.*?>', sent) is None:
        # No tag in this review: advance the position counter and move on.
        i += 1
        continue
    print(i)
    print(sent)
    break

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [68]:
# Download the NLTK stop-word corpus and print the value nltk.download() returns.
print(nltk.download("stopwords"))
stop = set(stopwords.words('english')) #set of English stop words used to filter review tokens
# NOTE(review): the set printed by this cell includes negations such as 'no', 'nor' and "isn't";
# dropping them may blur sentiment - confirm this is intended.
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    """Return `sentence` with every `<...>` span replaced by a single space.

    The match is non-greedy, so each tag is replaced separately and the
    text between tags is preserved.
    """
    return re.compile('<.*?>').sub(' ', sentence)
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    """Strip or space out punctuation in `sentence`.

    * ``? ! ' " # |`` are deleted outright.
    * ``. , ) ( \\ /`` are each replaced by a single space, so the caller can
      split the result into separate words.

    Inside a regex character class ``|`` is a literal pipe, not alternation,
    so the classes list the characters directly. The backslash is escaped as
    ``\\\\`` so that it is actually matched; the previous ``\\|`` only matched
    a pipe and left backslashes in the text.
    """
    cleaned = re.sub(r'[?!\'"#|]', r'', sentence)
    cleaned = re.sub(r'[.,)(\\/]', r' ', cleaned)
    return cleaned
print(stop) # show the stop-word set that will be filtered out of the reviews
print('************************************')
print(sno.stem('tasty')) # sanity check: print the Snowball stem of a sample word

[nltk_data] Downloading package stopwords to /home/srinu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
{'because', 'against', 'are', 'the', 'if', 'own', 'our', 'through', 'shouldn', 'her', "you're", 'on', "it's", 'them', 'once', 'your', 'that', 'yourselves', 'wasn', "aren't", 'further', 'and', 'both', 'will', 'below', 've', 'into', "couldn't", 'herself', 'been', 'now', 'themselves', 'whom', 'hadn', 'me', "wasn't", 'do', 'any', 'yours', 'then', 'above', 'how', 'under', 'i', 'what', 'theirs', 're', 's', 'mightn', "shouldn't", 'most', 'ours', 'those', 'about', 'these', 'between', 'just', 'more', 'aren', 'isn', 'down', 'where', "she's", 'we', 'was', "hadn't", 'it', 'didn', 'only', 'for', 'at', "weren't", 'him', "isn't", 'shan', 'mustn', 'its', 'or', 'nor', "should've", 'an', 'who', 'of', 'to', 'than', 'his', 'has', 'ain', 'haven', "mustn't", 'weren', 'being', 'd', 'during', "you'll", "won't", "haven't", 'too', 'am', 'by', 'be', 'again', 'few', 'no', 'their', "mig

In [None]:
# Pre-process every review in `final`:
#   1. replace HTML tags with spaces (cleanhtml),
#   2. strip/space out punctuation word by word (cleanpunc),
#   3. keep purely alphabetic words longer than two characters,
#   4. lower-case them, drop stop words and stem what is left.
# This cell takes a while to run because it walks every word of every review.
final_string = []        # one cleaned, space-joined review (UTF-8 bytes) per row
all_positive_words = []  # stemmed words (bytes) seen in positive reviews
all_negative_words = []  # stemmed words (bytes) seen in negative reviews

# Pair each review with its label once, instead of re-reading
# final['Score'].values for every single word.
for sent, score in zip(final['Text'].values, final['Score'].values):
    filtered_sentence = []
    for w in cleanhtml(sent).split():
        for cleaned_words in cleanpunc(w).split():
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            # Stored as UTF-8 bytes because the cell that builds the
            # CleanedText column decodes these values from UTF-8.
            s = sno.stem(lowered).encode('utf8')
            filtered_sentence.append(s)
            if score == 'positive':
                all_positive_words.append(s)
            elif score == 'negative':
                all_negative_words.append(s)
    # An empty word list yields b"", so every row still gets exactly one entry.
    final_string.append(b" ".join(filtered_sentence))

In [70]:
# Attach the pre-processed reviews as a text column. final_string holds
# UTF-8 encoded bytes, so each entry is decoded to str.
final['CleanedText'] = [cleaned.decode("utf-8") for cleaned in final_string]

# Keep the sentiment labels in their own variable.
labels = final['Score']

In [71]:
# Show the first rows; the processed review is in the CleanedText column.
print(final.head(3))


# Persist the cleaned table (including its index) to a separate SQLite file
# for later reuse, replacing any existing 'Reviews' table there.
conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, if_exists='replace')

# The labels are already held separately, so drop the Score column and keep
# only the remaining columns in `final`.
final = final.drop(columns="Score")

            Id   ProductId          UserId            ProfileName  \
138706  150524  0006641040   ACITT7DI6IDDL        shari zychinski   
138688  150506  0006641040  A2IW4PEEKO2R0U                  Tracy   
138689  150507  0006641040  A1S4A3IQ2MU7V4  sally sue "sally sue"   

        HelpfulnessNumerator  HelpfulnessDenominator     Score        Time  \
138706                     0                       0  positive   939340800   
138688                     1                       1  positive  1194739200   
138689                     1                       1  positive  1191456000   

                                           Summary  \
138706                   EVERY book is educational   
138688  Love the book, miss the hard cover version   
138689               chicken soup with rice months   

                                                     Text  \
138706  this witty little book makes my son laugh at l...   
138688  I grew up reading these Sendak books, and watc...   
138689  Th

# BOW (Bag Of Words)

In [72]:
# Bag of Words: learn the vocabulary from the cleaned reviews and build the
# sparse document-term count matrix with scikit-learn's CountVectorizer.
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['CleanedText'].values)

bow_shape = final_counts.shape  # (number of reviews, vocabulary size)
print("the type of count vectorizer ", type(final_counts))
print(final_counts)
print("the shape of out text BOW vectorizer ", bow_shape)
print("the number of unique words ", bow_shape[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
  (0, 12831)	1
  (0, 39045)	1
  (0, 127)	1
  (0, 59768)	1
  (0, 5927)	1
  (0, 69571)	1
  (0, 12140)	1
  (0, 56782)	1
  (0, 32011)	1
  (0, 70102)	1
  (0, 42338)	1
  (0, 36800)	1
  (0, 18801)	1
  (0, 31263)	1
  (0, 69067)	1
  (0, 35433)	1
  (0, 29095)	1
  (0, 51915)	1
  (0, 56924)	1
  (0, 1892)	1
  (0, 1732)	1
  (0, 18765)	1
  (0, 9539)	1
  (0, 51570)	2
  (0, 36759)	1
  :	:
  (364170, 69424)	1
  (364170, 55559)	2
  (364170, 53955)	1
  (364170, 16499)	1
  (364170, 22249)	1
  (364170, 56249)	1
  (364170, 9062)	3
  (364170, 53435)	1
  (364170, 57671)	1
  (364170, 2750)	1
  (364170, 4237)	1
  (364170, 68461)	1
  (364170, 41124)	1
  (364170, 62791)	1
  (364170, 53023)	1
  (364170, 21181)	1
  (364170, 29616)	1
  (364170, 50229)	1
  (364170, 33980)	1
  (364170, 25576)	1
  (364170, 26437)	1
  (364170, 28270)	1
  (364170, 12831)	1
  (364170, 36800)	1
  (364170, 58171)	1
the shape of out text BOW vectorizer  (364171, 71624)
the nu

In [73]:
from sklearn.preprocessing import normalize

# L1-normalise each review (row) of the count matrix.
final_counts_norm = normalize(final_counts, norm='l1')

#  t-SNE using Scikit-Learn

 #   1.Bag Of Words

In [None]:
from sklearn.manifold import TSNE

# Work on the first 20000 normalised BoW rows and convert them to a dense array.
# NOTE(review): a dense 20000 x vocabulary float array is very large (on the
# order of 10 GB for the ~71k-word vocabulary printed by the BoW cell, assuming
# float64) - consider fewer rows or a TruncatedSVD step first.
data_1000 = final_counts_norm[0:20000, :]
top_1000 = data_1000.toarray()
labels_1000 = labels[0:20000]

# One perplexity value drives both the model and the plot title, so the two
# can no longer disagree (the model previously used 5000 while the title said 50).
perplexity = 50
model = TSNE(n_components=2, random_state=0, perplexity=perplexity, n_iter=5000)
tsne_data = model.fit_transform(top_1000)

# Build the plotting frame directly so the coordinates stay numeric. The
# labels are attached by position (np.asarray) because `labels` may carry a
# non-sequential index that would not align with the fresh 0..n-1 index.
tsne_df = pd.DataFrame(data=tsne_data, columns=("Dim_1", "Dim_2"))
tsne_df["label"] = np.asarray(labels_1000)

# Plot one scatter per class on an explicit figure, so that `fig` and the
# scatter artists needed by the mpld3 tooltips actually exist.
fig, ax = plt.subplots(figsize=(6, 6))
for label_value, group in tsne_df.groupby("label"):
    scatter = ax.scatter(group["Dim_1"], group["Dim_2"], label=label_value)
    # Tooltip labels must match the plotted points one-to-one.
    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=group["label"].tolist())
    mpld3.plugins.connect(fig, tooltip)
ax.set_xlabel("Dim_1")
ax.set_ylabel("Dim_2")
ax.legend(title="label")
ax.set_title('With perplexity = {}'.format(perplexity))
plt.show()