In [28]:
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import sqlite3

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import string
import re
import os
import pickle
from tqdm import tqdm

import seaborn as sns

from sklearn import metrics
from sklearn.metrics import confusion_matrix, auc, roc_curve

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from gensim.models import Word2Vec, KeyedVectors 

## [1]. Reading Data

In [29]:
# Shell escape: list the files in the notebook's current working directory.
!ls


ai-14-tf-idf-code-sample-ref.ipynb
ai-14-tf-idf-code-sample-self.ipynb
database.sqlite


In [30]:
# Location of the SQLite file, relative to the notebook's working directory.
DB_PATH = '../data/data_01_amazon_fine_food_review/database.sqlite'

# sqlite3.connect() creates a brand-new, empty database when the target file
# does not exist, so a wrong path would only show up later as a missing-table
# error on the first query. Check up front and fail with a clear message.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError("SQLite database not found: {}".format(DB_PATH))

# Connection shared by the queries in the following cells.
conn = sqlite3.connect(DB_PATH)
print(conn)

<sqlite3.Connection object at 0x000001AE45031030>


In [31]:
# Pull a small sample of the Reviews table into a DataFrame.
# NOTE(review): 5427 looks chosen so that 5000 rows are left once the
# score == 3 reviews are filtered out further down - confirm before changing.
df = pd.read_sql_query(
    sql="SELECT * FROM Reviews LIMIT 5427",
    con=conn,
)

In [32]:
# Peek at the first three rows of the loaded sample.
print(df.head(3))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  
2  "Delight" says it all  This is a confection that has been around a fe...  


### [1.] Available columns in DataFrame

In [33]:
# Show the column labels of the loaded frame.
print(df.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


### [1.] Columns and their DataType

In [34]:
# Show the dtype pandas assigned to each column.
print(df.dtypes)

Id                         int64
ProductId                 object
UserId                    object
ProfileName               object
HelpfulnessNumerator       int64
HelpfulnessDenominator     int64
Score                      int64
Time                       int64
Summary                   object
Text                      object
dtype: object


> The Amazon Fine Food Reviews database has a `Score` column, which the steps below turn into a sentiment label.
<br>


### [1.]. Filter out reviews with a score equal to 3, as these are considered neutral

In [35]:
# Drop the neutral reviews (Score == 3); only scores below or above 3 remain.
# Score is an integer column, so `!= 3` selects exactly the rows that
# `< 3 or > 3` would.
# .copy() turns the filtered result into an independent frame: later cells
# modify its 'Score' column, and writing into a filtered slice is the situation
# pandas reports as SettingWithCopyWarning (hidden in this notebook by
# warnings.filterwarnings('ignore')).
df = df.loc[df['Score'] != 3].copy()
print(df.head(n = 3))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  
2  "Delight" says it all  This is a confection that has been around a fe...  


### [1.]. Update the existing `Score` column based on the conditions below
- Reviews with a score of:
  - 4 or 5 are considered positive reviews (1)
  - 1 or 2 are considered negative reviews (0)


In [36]:
# Turn the rating into a binary sentiment label:
#   Score > 3  -> 1 (positive), anything else -> 0 (negative).
# The vectorised comparison replaces the per-row Python lambda, and .assign()
# returns a new frame rather than writing a column into the filtered frame,
# which avoids pandas' chained-assignment (SettingWithCopy) warning.
# NOTE: this cell is not idempotent - running it a second time on the already
# binarised column turns every label into 0, because neither 0 nor 1 is > 3.
df = df.assign(Score=(df['Score'] > 3).astype('int64'))
print(df.head(n = 3))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      1  1303862400   
1                     0                       0      0  1346976000   
2                     1                       1      1  1219017600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  
2  "Delight" says it all  This is a confection that has been around a fe...  


In [37]:
# (rows, columns) of the frame at this point in the notebook.
print(f"DataFrame shape : {df.shape}")

DataFrame shape : (5000, 10)


In [38]:
# One row per UserId that has more than one review in the full Reviews table,
# with the number of reviews in the `record` column.
# NOTE(review): the non-aggregated columns (ProductId, Score, Text, ...) come
# from a single row per group chosen by SQLite - presumably arbitrary; verify
# before relying on them.
# NOTE(review): the name `display` shadows IPython's built-in display() helper
# for the rest of the session; it is kept because later cells reference it.
display = pd.read_sql_query(
    sql="SELECT UserId, ProductId, ProfileName, Score, Time, Text, COUNT(*) AS record FROM Reviews GROUP BY UserId HAVING record > 1",
    con=conn,
)

print(f"DataFrame shape : {display.shape}")
print(display.head())

DataFrame shape : (80668, 7)
               UserId   ProductId             ProfileName  Score        Time  \
0  #oc-R115TNMSPFT9I7  B007Y59HVM                 Breyton      2  1331510400   
1  #oc-R11D9D7SHXIJB9  B005HG9ET0  Louis E. Emory "hoppy"      5  1342396800   
2  #oc-R11DNU2NBKQ23Z  B007Y59HVM        Kim Cieszykowski      1  1348531200   
3  #oc-R11O5J5ZVQE25C  B005HG9ET0           Penguin Chick      5  1346889600   
4  #oc-R12KPBODL2B5ZD  B007OSBE1U   Christopher P. Presta      1  1348617600   

                                                Text  record  
0  Overall its just OK when considering the price...       2  
1  My wife has recurring extreme muscle spasms, u...       3  
2  This coffee is horrible and unfortunately not ...       2  
3  This will be the bottle that you grab from the...       3  
4  I didnt like this coffee. Instead of telling y...       2  


#### [1..1]. Get one user record

In [39]:
# Show the aggregated row belonging to one specific UserId.
print(display.loc[display['UserId'].eq('#oc-R12KPBODL2B5ZD')])

               UserId   ProductId            ProfileName  Score        Time  \
4  #oc-R12KPBODL2B5ZD  B007OSBE1U  Christopher P. Presta      1  1348617600   

                                                Text  record  
4  I didnt like this coffee. Instead of telling y...       2  


In [41]:
# Add up the per-user counts: the total number of reviews written by users
# that appear more than once in the Reviews table.
record_count = display.loc[:, 'record'].sum()
print(f"Record count : {record_count}")

Record count : 393063


## [2]. Exploratory Data Analysis

### [2.]. Data Cleaning: Deduplication

#### [2.]. Sort DataFrame based on ProductId

In [42]:
# Rows that repeat an earlier row on all of the listed columns. duplicated()
# leaves the first occurrence unflagged by default, so only the repeats are
# selected here.
df_un = df.loc[
    df.duplicated(
        subset=[
            'HelpfulnessNumerator',
            'HelpfulnessDenominator',
            'Score',
            'Time',
            'Summary',
            'Text',
        ]
    )
]
print(df_un.head(10))

        Id   ProductId          UserId             ProfileName  \
29      30  B0001PB9FY  A3HDKO7OW0QNK4            Canadian Fan   
2309  2310  B0001VWE0M   AQM74O8Z4FMS0                Sunshine   
2323  2324  B0001VWE0C   AQM74O8Z4FMS0                Sunshine   
3885  3886  B005GX7GVW   AS1FCKNKY95ID  Juli A. Lee "JingleJL"   
3886  3887  B005GX7GVW  A1I34N9LFOSCX7                  Smeggy   
4640  4641  B0002NYO9I   A5DVX3B075B09           Patricia Kays   
4641  4642  B0002NYO9I  A376TWN7I4HMZ8                  helios   

      HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
29                       1                       1      1  1107820800   
2309                     0                       0      0  1127606400   
2323                     0                       0      0  1127606400   
3885                     1                       1      1  1336953600   
3886                     0                       0      1  1349136000   
4640                     0       