In [1]:
import numpy as np
import pandas as pd

# for basic visualizations
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# for advanced visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff
import pyrsm as rsm

In [2]:
df = pd.read_csv("amazon_baby.csv")
df.shape

(183531, 3)

In [3]:
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
# Classify the rating to different sentiment level. 
df['sentiment']=rsm.ifelse(df.rating>=4,'positive',rsm.ifelse(df.rating==3,'neutual','negative'))
df.head()

Unnamed: 0,name,review,rating,sentiment
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,neutual
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,positive
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,positive
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,positive
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,positive


In [5]:
df.isnull().sum()

name         318
review       829
rating         0
sentiment      0
dtype: int64

In [6]:
# Drop rows with Null value
df=df.dropna(axis=0, how="any", thresh=None, subset=None, inplace=False)
df.isnull().sum()

name         0
review       0
rating       0
sentiment    0
dtype: int64

In [7]:
df.shape

(182384, 4)

## TF-IDF

In [8]:
df=df[["review","sentiment"]]

In [9]:
import numpy as np
import pandas as pd
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
nltk.download('punkt') # downloads you a model
nltk.download('stopwords') # <--- this is new
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from collections import defaultdict
freq = defaultdict(int)
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
from math import log

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# return a list of tokens
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    # step 1: get sentences
    sentences = sent_tokenize(doc)
    # step 2: get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        # step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens if w.lower() not in stop]

In [11]:
for i in range(5):
    test_case1 = df.review[i]
    print(pre_processing_by_nltk(test_case1, stemming = True, need_sent = False))

['flannel', 'wipe', 'ok', ',', 'opinion', 'worth', 'keep', '.', 'also', 'order', 'someims', 'vims', 'cloth', 'wipes-ocean', 'blue-12', 'countwhich', 'larger', ',', 'nicer', ',', 'softer', 'textur', 'seem', 'higher', 'qualiti', '.', 'use', 'cloth', 'wipe', 'hand', 'face', 'usingthirsti', '6', 'pack', 'fab', 'wipe', ',', 'boyfor', '8', 'month', 'need', 'replac', 'becaus', 'start', 'get', 'rough', 'stink', 'issu', 'strip', 'longer', 'handl', '.']
['came', 'earli', 'wa', 'disappoint', '.', 'love', 'planet', 'wise', 'bag', 'wipe', 'holder', '.', 'kep', 'osocozi', 'wipe', 'moist', 'doe', 'leak', '.', 'highli', 'recommend', '.']
['veri', 'soft', 'comfort', 'warmer', 'look', '...', 'fit', 'full', 'size', 'bed', 'perfectli', '...', 'would', 'recommend', 'anyon', 'look', 'thi', 'type', 'quilt']
['thi', 'product', 'well', 'worth', 'purchas', '.', 'found', 'anyth', 'els', 'like', 'thi', ',', 'posit', ',', 'ingeni', 'approach', 'lose', 'binki', '.', 'love', 'thi', 'product', 'much', 'ownership', 'd

### frequency vector

In [12]:
DF = defaultdict(float)
for doc in tqdm(df.review):
    tokens = pre_processing_by_nltk(doc)
    for token in set(tokens):
        DF[token] += 1

100%|██████████| 182384/182384 [09:58<00:00, 304.85it/s]


In [13]:
IDF, vocab = dict(), dict()
for token in tqdm(DF):
    if DF[token] < 100:
        pass
    else:
        vocab[token] = len(vocab)
        IDF[token] = log(1 + len(df.review) / DF[token])
IDF['<UNK>'] = 1
vocab['<UNK>'] = len(vocab)
print(len(DF), len(vocab), len(IDF))

100%|██████████| 118284/118284 [00:00<00:00, 850088.08it/s]

118284 3502 3502





In [14]:
def tf(doc, vocab):
    tokens = pre_processing_by_nltk(doc)
    for i, token in enumerate(tokens):
        if token not in vocab:
            tokens[i] = '<UNK>'
    x = [0] * len(vocab)
    for token in tokens:
        token_id = vocab[token]
        x[token_id] += 1   
    return x
X = []
for doc in tqdm(df.review):
    X.append(tf(doc, vocab))

100%|██████████| 182384/182384 [10:31<00:00, 288.98it/s]


In [15]:
def y_to_numeric(y):
    if y == "negative": 
        return 0
    elif y == "neutual": 
        return 1
    else: # 2 
        return 2 
y = df.sentiment.apply(y_to_numeric)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [17]:
clf = LogisticRegression().fit(X_train, y_train)
print("AUROC:",roc_auc_score(y_test, clf.predict_proba(X_test), multi_class='ovr'))
print("Micro F-1:",f1_score(y_test,clf.predict(X_test) ,average="micro"))
print("Macro F-1:",f1_score(y_test,clf.predict(X_test) ,average="macro"))


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



AUROC: 0.8868345735648764
Micro F-1: 0.8403651616086849
Macro F-1: 0.6052111846831921


##  tf-idf vector

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,preprocessor=None,
                        tokenizer=pre_processing_by_nltk,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)

In [74]:
X = tfidf.fit_transform(df.review)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [70]:
clf = LogisticRegression().fit(X_train, y_train)
print("AUROC:",roc_auc_score(y_test, clf.predict_proba(X_test), multi_class='ovr'))
print("Micro F-1:",f1_score(y_test,clf.predict(X_test) ,average="micro"))
print("Macro F-1:",f1_score(y_test,clf.predict(X_test) ,average="macro"))

AUROC: 0.8771597718205223
Micro F-1: 0.743298969072165
Macro F-1: 0.6467183480164836
