In [2]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [11]:
# Load the list of important words first, then the review table.
with open('important_words.json', 'r') as words_file:
    imp_words = json.load(words_file)
data = pd.read_csv('amazon_baby_subset.csv')

In [12]:
# Preview the first rows of the loaded review table.
data.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


### remove punctuation

In [26]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) produce an empty string
        instead of the literal words ``'None'`` / ``'nan'``.

    Returns
    -------
    str
        The cleaned text.
    """
    import string  # local import: the notebook's import cell does not provide it

    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return str(text).translate(str.maketrans('', '', string.punctuation))

In [27]:
# Store a punctuation-free copy of every review next to the original text.
data['review_clean'] = data['review'].map(remove_punctuation)

In [28]:
# Preview the table again to inspect the new review_clean column.
data.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


### Word counts for the important words

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# Number of missing values per column (summing booleans counts the True entries).
data.isna().sum()

name             90
review          241
rating            0
sentiment         0
review_clean      0
dtype: int64

In [31]:
# Replace missing review text with an empty string; other columns are untouched.
data = data.fillna({'review': ''})

In [32]:
# Re-count missing values per column after filling the review column.
data.isna().sum()

name            90
review           0
rating           0
sentiment        0
review_clean     0
dtype: int64

In [33]:
# (rows, columns) of the review table at this point.
data.shape

(53072, 5)

In [37]:
# Tokenise each cleaned review once, then count every important word in the
# shared token lists instead of re-splitting the text for each word.
review_tokens = data['review_clean'].str.split()
for word in imp_words:
    # Bind the current word as a default argument so the lambda does not
    # depend on the loop variable's later values.
    data[word] = review_tokens.apply(lambda tokens, target=word: tokens.count(target))

In [38]:
# 1 if the review mentions "perfect" at least once, else 0.
data['contain_perfect'] = data['perfect'].ge(1).astype(int)

In [39]:
# Preview the table with the per-word count columns and contain_perfect added.
data.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,word,baby,one,great,love,...,picture,completely,wish,buying,babies,won,tub,almost,either,contain_perfect
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [41]:
# Number of reviews whose contain_perfect flag is 1.
ctr = data['contain_perfect'].sum()
print(ctr)

2955


## Prepare the feature matrix

In [42]:
def get_numpy_data(df, feat, label):
    """Extract a feature matrix (with intercept column) and labels as NumPy arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. A ``'constant'`` column filled with 1 is added to it
        in place (this side effect is kept for backward compatibility).
    feat : sequence of str
        Names of the feature columns; ``'constant'`` is prepended to them.
    label : str or list of str
        Label column(s). A list selects a 2-D array of shape (n, len(label));
        a single name selects a 1-D array.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(feat_mat, label_arr)``.
    """
    df['constant'] = 1
    columns = ['constant'] + list(feat)

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    feat_mat = df[columns].to_numpy()
    label_arr = df[label].to_numpy()

    return feat_mat, label_arr

In [50]:
# Use the important-word count columns as features and 'sentiment' as the label.
# Passing the label as a one-element list selects a 2-D (n, 1) label array.
feat = imp_words
label = ['sentiment']
feat_mat,label_arr = get_numpy_data(data,feat,label)

  
  if __name__ == '__main__':


In [51]:
# Report both shapes, then flatten the (n, 1) label column into a 1-D vector.
for shape in (feat_mat.shape, label_arr.shape):
    print(shape)
label_arr = label_arr.ravel()
print(label_arr.shape)

(53072, 194)
(53072, 1)
(53072,)


## gradient ascent

In [52]:
def predict_probability(feat_mat, wts):
    """Produce the probabilistic estimate P(y_i = +1 | x_i, w).

    The score of each row is the dot product of its features with ``wts``;
    the probability is the logistic (sigmoid) function of that score.
    """
    return 1 / (1 + np.exp(-np.dot(feat_mat, wts)))

In [53]:
def feature_derivative(error, feat):
    """Return the dot product of the error vector with the feature values."""
    return np.dot(error, feat)

In [69]:
def compute_log_likelihood(feat_mat, sent, wts):
    """Log likelihood of the observed labels under the logistic model.

    Computes  ℓ(w) = Σ_i [ (1[y_i = +1] − 1) · s_i − ln(1 + exp(−s_i)) ]
    where s_i = w · h(x_i) is the score of row i.

    Parameters
    ----------
    feat_mat : array-like, shape (n, d)
        Feature matrix.
    sent : array-like, shape (n,)
        Labels; entries equal to +1 are treated as positive.
    wts : array-like, shape (d,)
        Coefficients.

    Returns
    -------
    float
        The summed log likelihood.
    """
    indicator = (np.asarray(sent) == +1)
    scores = np.dot(feat_mat, wts)
    # logaddexp(0, -s) == ln(1 + exp(-s)) but does not overflow to inf when
    # the score is a large negative number.
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))
    return lp

In [70]:
from math import sqrt
def logistic_regression(feat_mat, sent, init_wts, step, max_iter):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Parameters
    ----------
    feat_mat : numpy.ndarray, shape (n, d)
        Feature matrix.
    sent : array-like, shape (n,)
        Labels; entries equal to +1 are treated as positive.
    init_wts : array-like, shape (d,)
        Starting coefficients. They are copied, so the caller's array is
        not modified.
    step : float
        Step size applied to the gradient at each iteration.
    max_iter : int
        Number of gradient-ascent iterations.

    Returns
    -------
    numpy.ndarray, shape (d,)
        The coefficients after ``max_iter`` iterations.

    Notes
    -----
    The log likelihood is printed on a thinning schedule: every iteration up
    to 15, every 10th up to 100, every 100th up to 1000, every 1000th up to
    10000, and every 10000th after that.
    """
    # Force a float copy: with integer initial weights the updates would be
    # truncated when written back into an integer array.
    wts = np.array(init_wts, dtype=float)

    # The indicator 1[y_i = +1] does not change between iterations.
    indicator = (np.asarray(sent) == +1)

    for itr in range(max_iter):
        prob = predict_probability(feat_mat, wts)
        errors = indicator - prob

        # errors · feat_mat gives the derivative for every coefficient at
        # once; all of them use the probabilities computed above, exactly as
        # a per-coefficient loop would.
        wts = wts + step * feature_derivative(errors, feat_mat)

        should_report = (
            itr <= 15
            or (itr <= 100 and itr % 10 == 0)
            or (itr <= 1000 and itr % 100 == 0)
            or (itr <= 10000 and itr % 1000 == 0)
            or itr % 10000 == 0
        )
        if should_report:
            lp = compute_log_likelihood(feat_mat, sent, wts)
            print('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(max_iter))), itr, lp) )
    return wts

In [71]:
# Training configuration for the gradient-ascent run.
sent = label_arr
init_wts = np.zeros(feat_mat.shape[1])  # one weight per column of feat_mat
step = 1e-7
max_iter = 301

In [72]:
# Run gradient ascent; the log likelihood is printed as training progresses.
wts = logistic_regression(feat_mat,sent,init_wts,step,max_iter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

### predicting sentiments

In [100]:
# A strictly positive score w·h(x) is predicted as +1, anything else as -1.
score = np.dot(feat_mat, wts)
is_positive = score > 0
pred = np.where(is_positive, +1, -1).tolist()
# ctr holds the number of positive predictions.
ctr = int(np.count_nonzero(is_positive))

In [101]:
# Number of reviews predicted as positive (+1).
ctr

25126

## Accuracy

In [103]:
# Number of predictions that match the true sentiment labels.
ctr = int(np.sum(np.asarray(pred) == label_arr))
ctr

39903

In [105]:
# Accuracy = correctly classified reviews / total number of predictions.
accuracy = ctr/len(pred)
accuracy

0.7518653904130238

### words contribution to sentiments

In [111]:
# Skip the first weight (the constant term) and pair each remaining weight
# with its word, ordered from the largest coefficient to the smallest.
coef = wts[1:]
word_coef = sorted(zip(imp_words, coef), key=lambda pair: pair[1], reverse=True)

In [112]:
# Display the (word, coefficient) pairs, largest coefficient first.
word_coef

[('great', 0.0665460841704577),
 ('love', 0.06589076292212326),
 ('easy', 0.06479458680257838),
 ('little', 0.045435626308421365),
 ('loves', 0.04497640139490604),
 ('well', 0.030135001092107077),
 ('perfect', 0.029739937104968462),
 ('old', 0.020077541034775378),
 ('nice', 0.018408707995268992),
 ('daughter', 0.017703199905701694),
 ('soft', 0.01757027224560289),
 ('fits', 0.01688247107140872),
 ('happy', 0.01680529588976808),
 ('baby', 0.015565696580423507),
 ('recommend', 0.015408450108008665),
 ('also', 0.015216196422918844),
 ('best', 0.014991791565630264),
 ('comfortable', 0.0132539900815849),
 ('car', 0.012685935745813375),
 ('clean', 0.012018174433365525),
 ('son', 0.011944817713693955),
 ('bit', 0.011708248093123262),
 ('works', 0.011703160621987424),
 ('size', 0.010715966516270301),
 ('stroller', 0.009909164635972736),
 ('room', 0.009783241021568061),
 ('price', 0.009572733543590181),
 ('play', 0.00917842898398431),
 ('easily', 0.009032818138954407),
 ('kids', 0.0085828430043