In [82]:
import sqlite3
import pandas as pd
import numpy as np
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

from nltk.stem.porter import PorterStemmer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [47]:
con = sqlite3.connect('./database.sqlite')

In [48]:
data = pd.read_sql_query('''SELECT * FROM Reviews WHERE Score != 3 LIMIT 5000''', con)
print(data.head())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [49]:
def partition(score):
    if score > 3:
        return 'positive'
    return 'negative'

actualScore = data['Score']
positiveNegative = actualScore.map(partition)
data['Score'] = positiveNegative

In [50]:
print(data.head())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator     Score        Time  \
0                     1                       1  positive  1303862400   
1                     0                       0  negative  1346976000   
2                     1                       1  positive  1219017600   
3                     3                       3  negative  1307923200   
4                     0                       0  positive  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitalit

Data Cleaning

-Deduplication
-Removing false records

In [51]:
sortedData = data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

deDuplicatedData = sortedData.drop_duplicates(subset={'UserId', 'ProfileName', 'Time', 'Text'}, keep='first', inplace=False)
print(deDuplicatedData.shape)

(4986, 10)


Now removing those records where HelpfulnessNumerator > HelpfulnessDenominator

In [52]:
finalData = deDuplicatedData[deDuplicatedData.HelpfulnessDenominator >= deDuplicatedData.HelpfulnessNumerator]
finalData.shape
#print(finalData['Text'])

(4986, 10)

Data Pre-processing

For example

In [53]:
data1501 = finalData['Text'].values[1501]
print(data1501)

Chips Ahoy, chewy oatmeal cookies are so moist that it is difficult to separate a single cookie from the package without it breaking into pieces.  My tastebuds say they are too sweet, not cooked enough, lack ample chocolate chips and oatmeal flakes.  If you like cookie dough, you most likely will enjoy these.  I'll stick with a crunchy, less sweet cookie.


Removing HTML tags from data using bs4's Beautiful Soup

In [54]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(data1501, 'lxml')
text = soup.get_text()
print(text)

Chips Ahoy, chewy oatmeal cookies are so moist that it is difficult to separate a single cookie from the package without it breaking into pieces.  My tastebuds say they are too sweet, not cooked enough, lack ample chocolate chips and oatmeal flakes.  If you like cookie dough, you most likely will enjoy these.  I'll stick with a crunchy, less sweet cookie.


Decontracting i.e. changing don't to do not ,.... etc.

In [55]:
def decontracting(str):
    str = re.sub(r"wont't", 'will not', str)
    str = re.sub(r"can\'t", 'cannot', str)

    str = re.sub(r"n\'t", " not", str)
    str = re.sub(r"\'re", " are", str)
    str = re.sub(r"\'s", " is", str)
    str = re.sub(r"\'d", " would", str)
    str = re.sub(r"\'ll", " will", str)
    str = re.sub(r"\'t", " not", str)
    str = re.sub(r"\'ve", " have", str)
    str = re.sub(r"\'m", " am", str)
    return str    


In [56]:
data1501 = decontracting(data1501)
print(data1501)

Chips Ahoy, chewy oatmeal cookies are so moist that it is difficult to separate a single cookie from the package without it breaking into pieces.  My tastebuds say they are too sweet, not cooked enough, lack ample chocolate chips and oatmeal flakes.  If you like cookie dough, you most likely will enjoy these.  I will stick with a crunchy, less sweet cookie.


Now removing stopwords

In [57]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

Now combining all the pre-processing rules in entire data-set of 'Text'

In [58]:
processedReviews = []

for sentence in tqdm(finalData['Text'].values):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracting(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    processedReviews.append(sentence.strip())


100%|██████████| 4986/4986 [00:03<00:00, 1644.22it/s]


Now processing with the same rules summary of every review


In [59]:
processedSummary = []

for sentence in tqdm(finalData['Summary'].values):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracting(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    processedSummary.append(sentence.strip())

100%|██████████| 4986/4986 [00:03<00:00, 1457.15it/s]


Featurization

Bag of Words

In [60]:
countVect = CountVectorizer()
countVect.fit_transform(processedReviews)
print(countVect.get_feature_names()[:10])

finalCount = countVect.transform(processedReviews)
print("the type of count vectorizer ",type(finalCount))
print("the shape of out text BOW vectorizer ",finalCount.get_shape())
print("the number of unique words ", finalCount.get_shape()[1])

['aa', 'aahhhs', 'aback', 'abandon', 'abates', 'abbott', 'abby', 'abdominal', 'abiding', 'ability']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (4986, 12997)
the number of unique words  12997


In [62]:
freqDist = nltk.FreqDist(countVect.get_feature_names())
print(freqDist.most_common(20))

[('aa', 1), ('aahhhs', 1), ('aback', 1), ('abandon', 1), ('abates', 1), ('abbott', 1), ('abby', 1), ('abdominal', 1), ('abiding', 1), ('ability', 1), ('able', 1), ('abor', 1), ('aboulutely', 1), ('absence', 1), ('absent', 1), ('absoloutely', 1), ('absolute', 1), ('absolutely', 1), ('absolutley', 1), ('absolutly', 1)]


In [71]:
allWords = []
for eachSentence in processedReviews:
    for word in eachSentence.split(' '):
        allWords.append(word)
freqDist = nltk.FreqDist(allWords)
print(freqDist.most_common(20))


[('not', 4590), ('like', 1985), ('good', 1735), ('great', 1511), ('taste', 1360), ('one', 1314), ('product', 1288), ('would', 1237), ('flavor', 1164), ('love', 1093), ('coffee', 1049), ('food', 1013), ('chips', 1003), ('tea', 885), ('no', 868), ('really', 829), ('get', 805), ('best', 785), ('much', 760), ('amazon', 713)]


bi-gram, tri-gram

In [72]:
count_vect = CountVectorizer(ngram_range=(1,2), min_df=10, max_features=5000)
final_bigram_counts = count_vect.fit_transform(processedReviews)
print("the type of count vectorizer ",type(final_bigram_counts))
print("the shape of out text BOW vectorizer ",final_bigram_counts.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_bigram_counts.get_shape()[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (4986, 3142)
the number of unique words including both unigrams and bigrams  3142


In [79]:
tfIdfVect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tfIdfVect.fit(processedReviews)
print(tfIdfVect.get_feature_names()[0:10])


['ability', 'able', 'able find', 'able get', 'absolute', 'absolutely', 'absolutely delicious', 'absolutely love', 'absolutely no', 'according']


In [80]:
finalTfIdfVect = tfIdfVect.transform(processedReviews)
print(type(finalTfIdfVect))
print(finalTfIdfVect.get_shape())
print(finalTfIdfVect.get_shape()[1])

<class 'scipy.sparse.csr.csr_matrix'>
(4986, 3142)
3142


Word2Vector

In [105]:
import gensim

listOfSent = []

for item in processedReviews:
    temp = item.split(' ')
    listOfSent.append(temp)

print(listOfSent[0])

w2v_model = gensim.models.Word2Vec(listOfSent, min_count=5, vector_size=50, workers=4)


['product', 'available', 'victor', 'traps', 'unreal', 'course', 'total', 'fly', 'genocide', 'pretty', 'stinky', 'right', 'nearby']


In [109]:
print(len(w2v_model.wv.index_to_key))

3818


In [114]:
w2v_model.wv.most_similar('like')

[('taste', 0.993860125541687),
 ('strong', 0.9918496012687683),
 ('tastes', 0.9872565865516663),
 ('sweet', 0.9866679310798645),
 ('bitter', 0.9830662608146667),
 ('nice', 0.9772301316261292),
 ('flavor', 0.9746538996696472),
 ('really', 0.9728959202766418),
 ('rich', 0.9719640612602234),
 ('smooth', 0.9694072008132935)]