In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import sqlite3

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import string
import re
import os
import pickle
from tqdm import tqdm
from bs4 import BeautifulSoup

import seaborn as sns

from sklearn import metrics
from sklearn.metrics import confusion_matrix, auc, roc_curve

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from gensim.models import Word2Vec, KeyedVectors 

## [1]. Reading Data

In [2]:
!ls


ai-14-tf-idf-code-sample-ref.ipynb
ai-14-tf-idf-code-sample-self.ipynb
database.sqlite


In [3]:
conn = sqlite3.connect('../data/data_01_amazon_fine_food_review/database.sqlite')
print(conn)

<sqlite3.Connection object at 0x0000026D7ADC0E30>


In [4]:
df = pd.read_sql_query("SELECT * FROM Reviews LIMIT 5427", conn)

In [5]:
print(df.head(n = 3))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  
2  "Delight" says it all  This is a confection that has been around a fe...  


### [1.] Available columns in DataFrame

In [6]:
print(df.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


### [1.] Columns and their DataType

In [7]:
print(df.dtypes)

Id                         int64
ProductId                 object
UserId                    object
ProfileName               object
HelpfulnessNumerator       int64
HelpfulnessDenominator     int64
Score                      int64
Time                       int64
Summary                   object
Text                      object
dtype: object


> For Amazon fine food review database, It has one column `score`
<br>


### [1.]. Filter out reviews having score equal to 3 as  these are consider as neutral

In [8]:
df = df.loc[(df['Score'] < 3) | (df['Score'] > 3)]
print(df.head(n = 3))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  
2  "Delight" says it all  This is a confection that has been around a fe...  


### [1.]. Update existing column score based on belown condition
- Reviews corresponding to score :
  - 4 and 5, would be consider as positive review (1)
  - 1 and 2, would be consider as negative review (0)


In [9]:
df['Score'] = df['Score'].map(lambda score: 1 if score > 3 else 0)
print(df.head(n = 3))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      1  1303862400   
1                     0                       0      0  1346976000   
2                     1                       1      1  1219017600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  
2  "Delight" says it all  This is a confection that has been around a fe...  


In [10]:
# DataFrame shape
print("DataFrame shape : {}".format(df.shape))

DataFrame shape : (5000, 10)


In [11]:
display = pd.read_sql_query("SELECT UserId, ProductId, ProfileName, Score, Time, Text, COUNT(*) AS record FROM Reviews GROUP BY UserId HAVING record > 1", conn)

print("DataFrame shape : {}".format(display.shape))
print(display.head(n = 5))

DataFrame shape : (80668, 7)
               UserId   ProductId             ProfileName  Score        Time  \
0  #oc-R115TNMSPFT9I7  B007Y59HVM                 Breyton      2  1331510400   
1  #oc-R11D9D7SHXIJB9  B005HG9ET0  Louis E. Emory "hoppy"      5  1342396800   
2  #oc-R11DNU2NBKQ23Z  B007Y59HVM        Kim Cieszykowski      1  1348531200   
3  #oc-R11O5J5ZVQE25C  B005HG9ET0           Penguin Chick      5  1346889600   
4  #oc-R12KPBODL2B5ZD  B007OSBE1U   Christopher P. Presta      1  1348617600   

                                                Text  record  
0  Overall its just OK when considering the price...       2  
1  My wife has recurring extreme muscle spasms, u...       3  
2  This coffee is horrible and unfortunately not ...       2  
3  This will be the bottle that you grab from the...       3  
4  I didnt like this coffee. Instead of telling y...       2  


#### [1..1]. Get one user record

In [12]:
print(display[display['UserId'] == '#oc-R12KPBODL2B5ZD'])

               UserId   ProductId            ProfileName  Score        Time  \
4  #oc-R12KPBODL2B5ZD  B007OSBE1U  Christopher P. Presta      1  1348617600   

                                                Text  record  
4  I didnt like this coffee. Instead of telling y...       2  


In [13]:
record_count = display['record'].sum()
print("Record count : {}".format(record_count))

Record count : 393063


## [2]. Exploratory Data Analysis

### [2.]. Data Cleaning: Deduplication

#### [2.]. Sort DataFrame based on ProductId

In [14]:
dup_records = df[df.duplicated(subset = ['UserId', 'ProfileName', 'Time', 'Text'])]
print("Duplicate records DataFrame shape: {}".format(dup_records.shape))
print(dup_records.head(n = 3))

Duplicate records DataFrame shape: (14, 10)
        Id   ProductId          UserId   ProfileName  HelpfulnessNumerator  \
29      30  B0001PB9FY  A3HDKO7OW0QNK4  Canadian Fan                     1   
574    575  B000G6RYNE  A3PJZ8TU8FDQ1K  Jared Castle                     2   
2309  2310  B0001VWE0M   AQM74O8Z4FMS0      Sunshine                     0   

      HelpfulnessDenominator  Score        Time  \
29                         1      1  1107820800   
574                        2      1  1231718400   
2309                       0      0  1127606400   

                                          Summary  \
29                The Best Hot Sauce in the World   
574   One bite and you'll become a "chippoisseur"   
2309                               Below standard   

                                                   Text  
29    I don't know if it's the cactus or the tequila...  
574   I'm addicted to salty and tangy flavors, so wh...  
2309  Too much of the white pith on this orange pee

In [15]:
sorted_df = df.sort_values("ProductId", axis = 0, ascending  = True, inplace = False, kind = 'quicksort', na_position = 'last')
print(sorted_df.head(n = 10))
print(sorted_df.shape)

        Id   ProductId          UserId                           ProfileName  \
2773  2774  B00002NCJC  A196AJHU9EASJN                          Alex Chaffee   
2774  2775  B00002NCJC  A13RRPGE79XFFH                              reader48   
1243  1244  B00002Z754  A3B8RCEI0FXFI6                             B G Chase   
1244  1245  B00002Z754  A29Z5PI9BW2PU3                                Robbie   
3203  3204  B000084DVR  A1UGDJP1ZJWVPF          T. Moore "thoughtful reader"   
3202  3203  B000084DVR  A3DKGXWUEP1AI2           Glenna E. Bauer "Puppy Mum"   
1160  1161  B000084E1U  A3DH85EYHW4AQH                          Eric Hochman   
2379  2380  B0000CGFV4  A3LSUKN4IFS6VD                         Internet Diva   
5096  5097  B0000D94SZ  A2R91PG1XPNO0B  Kotton Kandee "not so secretshopper"   
2255  2256  B0000DC5IY    AGYZZ3QXV9S8      Dr. Glenn B. Gero "NJnaturaldoc"   

      HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
2773                     0                    

In [16]:
final = sorted_df.drop_duplicates(subset = ['UserId', 'ProfileName', 'Time', 'Text'])
print(final.shape)
print(final.head())

(4986, 10)
        Id   ProductId          UserId                   ProfileName  \
2773  2774  B00002NCJC  A196AJHU9EASJN                  Alex Chaffee   
2774  2775  B00002NCJC  A13RRPGE79XFFH                      reader48   
1243  1244  B00002Z754  A3B8RCEI0FXFI6                     B G Chase   
1244  1245  B00002Z754  A29Z5PI9BW2PU3                        Robbie   
3203  3204  B000084DVR  A1UGDJP1ZJWVPF  T. Moore "thoughtful reader"   

      HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
2773                     0                       0      1  1282953600   
2774                     0                       0      1  1281052800   
1243                    10                      10      1   962236800   
1244                     7                       7      1   961718400   
3203                     1                       1      1  1177977600   

                             Summary  \
2773                   thirty bucks?   
2774                    Flies Begone 

In [17]:
#Checking how much % of data still remains
rem_data = np.divide(final['Id'].size, df['Id'].size)
print(rem_data)

0.9972


### [2.]. Data Cleaning: invalid data check

In [34]:
data_sanity_i = final[final['HelpfulnessNumerator'] <= final['HelpfulnessDenominator']]  
print("Invalid DataFrame shape: {}".format(data_sanity_i.shape))
print(data_sanity_i.head())

Invalid DataFrame shape: (4986, 10)
        Id   ProductId          UserId                   ProfileName  \
2773  2774  B00002NCJC  A196AJHU9EASJN                  Alex Chaffee   
2774  2775  B00002NCJC  A13RRPGE79XFFH                      reader48   
1243  1244  B00002Z754  A3B8RCEI0FXFI6                     B G Chase   
1244  1245  B00002Z754  A29Z5PI9BW2PU3                        Robbie   
3203  3204  B000084DVR  A1UGDJP1ZJWVPF  T. Moore "thoughtful reader"   

      HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
2773                     0                       0      1  1282953600   
2774                     0                       0      1  1281052800   
1243                    10                      10      1   962236800   
1244                     7                       7      1   961718400   
3203                     1                       1      1  1177977600   

                             Summary  \
2773                   thirty bucks?   
2774        

#### [2.]. Data Cleaning: After data sanity group data based on negative and positive review

In [35]:
aggregated_scores = data_sanity_i['Score'].value_counts()
print(aggregated_scores)

1    4178
0     808
Name: Score, dtype: int64


## [3]. Text processing

#### [3.]. Get Text insight

In [36]:
def display_series_n_rows(series_list, n = 5):
    
    for index, item in enumerate(series_list):
        print(item)
        print("===="*20)
        if index == n:
            break
    
    return

In [37]:
# Print randomly data
text_list = data_sanity_i['Text'].values
print("Series.values data type : {}".format(type(text_list)))
print("Columns Text lenght : {}".format(text_list.size))

Series.values data type : <class 'numpy.ndarray'>
Columns Text lenght : 4986


#### [3.]. Remove URL's from text

In [38]:
text_0 = "Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby."
print(text_0)

Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.


In [39]:
print(re.sub('https\S+|http\S+', "", text_0))

Why is this $[...] when the same product is available for $[...] here?<br /> /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.


#### [3.]. Perform decontracted operation

In [40]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [41]:
text_1 = "i'm, we're"
text_1 = decontracted(text_1)
print(text_1)

i am, we are


#### [3.]. Python/BeautifulSoup - Remove all tags from an element

In [42]:
from bs4 import BeautifulSoup

In [43]:
def remove_all_tags(text):
    
    soup = BeautifulSoup(text, "lxml")

    return soup.get_text()

In [44]:
text_2 = "<strong>hi</strong><p>Keep thinking positive</p>"
text_2 = remove_all_tags(text_2)
print(text_2)

hiKeep thinking positive


#### [3.]. Remove words with number

In [45]:
text_3 = "We are45 going567to organi97996z7868e event shortly0088"
text_3 = re.sub("\S*\d\S*", "", text_3)
print(text_3)

We    event 


#### [3.]. Remove special characters

In [46]:
text_4 = "Why is this $[...] when the @$#^*&\(\) same product is available for $[...] here?"
text_4 = re.sub("[^A-Za-z0-9]+", " ", text_4)
print(text_4)

Why is this when the same product is available for here 


#### [3.]. Remove stop characters

In [47]:
stopwords = set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [48]:
text_5 = "I am not going anywhere till target did not get achieved."
text_5 = " ".join(word.lower() for word in text_5.split() if word.lower() not in stopwords) 
print(text_5)

not going anywhere till target not get achieved.


In [49]:
def preprocessing_text(raw_txt):
    
    cleaned_txt = ""
    cleaned_txt = re.sub('https\S+|http\S+', "", raw_txt)
    cleaned_txt = decontracted(cleaned_txt)
    cleaned_txt = BeautifulSoup(cleaned_txt, "lxml").get_text()
    cleaned_txt = re.sub("\S*\d\S*", "", cleaned_txt)
    cleaned_txt = re.sub("[^A-Za-z0-9]+", " ", cleaned_txt)
    cleaned_txt = " ".join(word.lower() for word in cleaned_txt.split() if word.lower() not in stopwords) 
    
    return cleaned_txt

In [50]:
data_sanity_i['Text'].head()

2773    Why is this $[...] when the same product is av...
2774    We have used the Victor fly bait for 3 seasons...
1243    I just received my shipment and could hardly w...
1244    This was a really good idea and the final prod...
3203    I'm glad my 45lb cocker/standard poodle puppy ...
Name: Text, dtype: object

In [51]:
data_sanity_i['Text'] = data_sanity_i['Text'].map(preprocessing_text)

In [52]:
print(data_sanity_i.head())

        Id   ProductId          UserId                   ProfileName  \
2773  2774  B00002NCJC  A196AJHU9EASJN                  Alex Chaffee   
2774  2775  B00002NCJC  A13RRPGE79XFFH                      reader48   
1243  1244  B00002Z754  A3B8RCEI0FXFI6                     B G Chase   
1244  1245  B00002Z754  A29Z5PI9BW2PU3                        Robbie   
3203  3204  B000084DVR  A1UGDJP1ZJWVPF  T. Moore "thoughtful reader"   

      HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
2773                     0                       0      1  1282953600   
2774                     0                       0      1  1281052800   
1243                    10                      10      1   962236800   
1244                     7                       7      1   961718400   
3203                     1                       1      1  1177977600   

                             Summary  \
2773                   thirty bucks?   
2774                    Flies Begone   
1243  WO

## <font color=black>[3]. Featurization</font>

### <font color=blue>[3.]. Bag of Words</font>

In [57]:
count_vect = CountVectorizer()
count_vect.fit(data_sanity_i['Text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [58]:
feat_list = count_vect.get_feature_names()
print("Some features name: ", feat_list[:5])
print("=="*20)

Some features name:  ['aa', 'aahhhs', 'aback', 'abandon', 'abates']


In [59]:
final_counts = count_vect.transform(data_sanity_i['Text'])
final_counts_shape = final_counts.get_shape()
print("Shape of out text BOW vectorizer: {}".format(final_counts_shape))

Shape of out text BOW vectorizer: (4986, 12997)


In [60]:
print("Type of Count vectorizer : {}".format(type(final_counts)))
print("Number of unique words {}".format(final_counts_shape[1]))

Type of Count vectorizer : <class 'scipy.sparse.csr.csr_matrix'>
Number of unique words 12997


### <font color=blue>[3.]. Bi-Grams and n-Grams

In [61]:
count_vect_ii = CountVectorizer(ngram_range = (1, 2), min_df = 10, max_features = 5000)

In [62]:
final_bigram_counts = count_vect_ii.fit_transform(data_sanity_i['Text'])
final_bigram_shape = final_bigram_counts.get_shape()

In [63]:
print("Type of Counter Vectorizer : {}".format(type(final_bigram_counts)))
print("Shape of out text BOW vectorizer : {}".format(final_bigram_shape))
print("Number of unique words : {}".format(final_bigram_shape[1]))

Type of Counter Vectorizer : <class 'scipy.sparse.csr.csr_matrix'>
Shape of out text BOW vectorizer : (4986, 3144)
Number of unique words : 3144
