In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
data=pd.read_csv('amazon_baby.csv')
data.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [3]:
data.shape

(183531, 3)

In [4]:
data.isna().astype(int).sum()

name      318
review    829
rating      0
dtype: int64

In [5]:
## fill nan values for review
data.fillna({'review':''},inplace = True)

In [6]:
data.isna().astype(int).sum()

name      318
review      0
rating      0
dtype: int64

### remove punctuation

In [29]:
def remove_punctuation(text):
    import string
    return text.translate( text.maketrans('','',string.punctuation) ) 

In [30]:
data['review_clean'] = data['review'].apply(remove_punctuation)

In [31]:
data.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


### Extract sentiments

In [32]:
data = data[data['rating']!=3]

In [33]:
data['sentiment']=data['rating'].apply(lambda rating : +1 if rating>3 else -1)

In [34]:
data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


### index for train and test in json

In [36]:
import json

In [38]:
with open("module-2-assignment-train-idx.json",'r') as f:
    train_idx = json.load(f)

In [39]:
with open("module-2-assignment-test-idx.json",'r') as f:
    test_idx = json.load(f)

In [247]:
train = data.iloc[train_idx,:]
test = data.iloc[test_idx,:]

In [44]:
train.shape

(133416, 5)

In [45]:
test.shape

(33336, 5)

In [46]:
train.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


In [77]:
train = train.iloc[0:200,:]
test = test.iloc[0:100,:]

In [78]:
train.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


## Build word count vector for each review

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [80]:
cv = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words

cv_fit = cv.fit_transform(train['review_clean'])
train_matrix = cv_fit.toarray()

In [87]:
cvt = cv.transform(test['review_clean'])
test_matrix = cvt.toarray()

In [88]:
train_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [89]:
test_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [111]:
cols = cv.get_feature_names()

## Logistic regression

In [90]:
from sklearn.linear_model import LogisticRegression

In [112]:
x_train = pd.DataFrame(train_matrix,columns=cols)
y_train = train['sentiment']

In [226]:
x_test = pd.DataFrame(test_matrix,columns=cols)
y_test = test['sentiment']

In [115]:
x_train.shape

(200, 2125)

In [116]:
x_test.shape

(100, 2125)

In [117]:
reg = LogisticRegression()
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [127]:
coef = reg.coef_
print(type(coef))
coef = coef.reshape(2125,)
coef.shape

<class 'numpy.ndarray'>


(2125,)

In [130]:
## positive and negative weights
pos=0
neg=0
for i in range(len(coef)):
    if(coef[i]<0):
        neg+=1
    else:
        pos+=1
print(pos)
print(neg)

1683
442


In [132]:
test.iloc[0]['review']

"This has been an easy way for my nanny to record all the key events that happen with my baby when I'm not at home.  Would highly recommend it to someone who wants to stay informed of what your baby is up to while you're not home.The only reason this isn't a 5 is because I think there could have been some more standarad pre-printed options.I plan on ordering another one when we run out of pages in this journal."

In [133]:
test.iloc[1]['review']

"I love this journal and our nanny uses it everyday to track on our daughter's sleep, eating and other activities. The layout and design make it very easy to fill in quickly with a comments column to add in details (i.e. we ask the nanny to specify what food she had for lunch, amount of milk she took, specifics of play time--tummy time, walk to park, etc.) I love knowing the details of my baby's day, and I also use it to track weekly on overall schedule patterns and changes (i.e. baby's nap time has moved naturally as she gets older).My only complaint is that there is no real space to communicate back to the nanny, as I sometimes want to reply to the nanny's comments and/or leave her notes about the previous night (i.e. Baby didn't sleep well or baby has rash, etc) or other instructions (Please use sun screen on the baby - it's hot out today) that I often forget to tell her in the rush of the morning. I just stick in post-its now for my notes. All in all, I highly recommend this baby t

### predict sentiment

In [149]:
sample_test_x = x_test[0:4]
sample_test_y = y_test[0:4]

In [150]:
pred = reg.predict(sample_test_x)

In [158]:
def class_pred(scores):
    pred_ = []
    for score in scores:
        if(score>0):
            pred_.append(+1)
        else:
            pred_.append(-1)
    return pred_

### scores

In [159]:
## using function
score = reg.decision_function(sample_test_x)
score

array([-2.09936524,  7.93825516,  2.27431591, 10.03027968])

In [160]:
pred_ = class_pred(score)

In [161]:
print(list(pred))
print(pred_)
print(list(sample_test_y))

[-1, 1, 1, 1]
[-1, 1, 1, 1]
[1, 1, 1, 1]


## probability

In [163]:
reg.predict_proba(sample_test_x)

array([[8.90841468e-01, 1.09158532e-01],
       [3.56701261e-04, 9.99643299e-01],
       [9.32725624e-02, 9.06727438e-01],
       [4.40438985e-05, 9.99955956e-01]])

In [165]:
def prob(scores):
    prob_=[]
    for score in scores:
        probs = 1/(1+np.exp(-score))
        prob_.append(probs)
    return prob_

In [168]:
## matches with using function
prob_ = prob(score)
prob_

[0.1091585319945551,
 0.9996432987386792,
 0.9067274375795634,
 0.9999559561015103]

## for test data

In [192]:
pred = reg.predict(x_test)
probab = reg.predict_proba(x_test)[:,1]
x_test['prob']=probab

In [194]:
x_test.head(2)

Unnamed: 0,1,10,100,10month,10monthsold,10mthold,11,11month,12,14,...,young,younger,youngest,your,youre,youthful,yr,zealand,zipper,prob
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0.109159
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.999643


In [210]:
x_test.sort_values(by='prob',ascending=False,inplace=True)
x_test.tail()

Unnamed: 0,1,10,100,10month,10monthsold,10mthold,11,11month,12,14,...,young,younger,youngest,your,youre,youthful,yr,zealand,zipper,prob
65,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.707826
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.439581
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.187665
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0.109159
87,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.006445


In [223]:
ind = list(x_test.index[0:20])
ind=ind+[1]

In [224]:
test.iloc[ind]

Unnamed: 0,name,review,rating,review_clean,sentiment
284,Crown Crafts The Original NoJo BabySling by Dr...,I am the the mother of a 14 week old who has r...,5,I am the the mother of a 14 week old who has r...,1
557,Baby Trend Diaper Champ,We bought this one because it can be used with...,4,We bought this one because it can be used with...,1
312,Baby Trend Diaper Champ,Ok - newsflash. Diapers are just smelly. We'...,4,Ok newsflash Diapers are just smelly Weve h...,1
369,Baby Trend Diaper Champ,We wanted to have a convenient locking recepta...,2,We wanted to have a convenient locking recepta...,-1
358,Baby Trend Diaper Champ,I purchased this pail about a year ago and fou...,2,I purchased this pail about a year ago and fou...,-1
320,Baby Trend Diaper Champ,I originally put this item on my baby registry...,5,I originally put this item on my baby registry...,1
477,Baby Trend Diaper Champ,The boy is now 1 year old and we have had it s...,4,The boy is now 1 year old and we have had it s...,1
149,Pedal Farm Tractor,We chose this tractor and trailer for a family...,5,We chose this tractor and trailer for a family...,1
287,Crown Crafts The Original NoJo BabySling by Dr...,We purchased the Babysling when our daughter w...,5,We purchased the Babysling when our daughter w...,1
458,Baby Trend Diaper Champ,I'm SO glad that we asked the sales associate ...,5,Im SO glad that we asked the sales associate a...,1


In [221]:
ind = list(x_test.index[-20:])
type(ind)
ind=ind+[1]

In [222]:
test.iloc[ind]

Unnamed: 0,name,review,rating,review_clean,sentiment
464,Baby Trend Diaper Champ,I returned the Diaper G**** that I was given b...,5,I returned the Diaper G that I was given but k...,1
401,Baby Trend Diaper Champ,"Simply put, the Diaper Champ is much easier to...",5,Simply put the Diaper Champ is much easier to ...,1
255,The First Years Massaging Action Teether,My son is obsessed with this teether!! He was ...,5,My son is obsessed with this teether He was fi...,1
120,Modern House Giraffe Children Height Measureme...,It is now haning in my granddaughters bedroom....,5,It is now haning in my granddaughters bedroom ...,1
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1
515,Baby Trend Diaper Champ,"This pail is by far the best! Great price, n...",5,This pail is by far the best Great price no ...,1
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1
470,Baby Trend Diaper Champ,"This is easy, but horribly smelly. I suggest ...",2,This is easy but horribly smelly I suggest sa...,-1
215,North American Bear Company Sleepyhead Bunny B...,"This is a perfect baby gift: washable, no par...",5,This is a perfect baby gift washable no parts...,1
123,Modern House Safari Animal Family removable Vi...,"the instructions were really clear, howerver t...",5,the instructions were really clear howerver th...,1


## Accuracy

In [228]:
## predition on test data
pred = reg.predict(x_test)
pred.shape

(100,)

In [229]:
y_test.shape

(100,)

In [234]:
y_test = np.array(y_test)

In [235]:
correct = 0
for i in range(100):
    if(pred[i]==y_test[i]):
        correct+=1
correct

87

In [241]:
## mannual accuracy
total = 100
accuracy = 87/100
accuracy

0.87

In [242]:
## using function => both matches
reg.score(x_test,y_test)

0.87

In [244]:
## predition on train data
pred = reg.predict(x_train)
reg.score(x_train,y_train)

1.0

## using specific words

In [286]:
train = data.iloc[train_idx,:]
test = data.iloc[test_idx,:]
print(train.shape)
print(test.shape)

(133416, 5)
(33336, 5)


In [287]:
train = train.iloc[0:10000,:]
test = test.iloc[0:1000,:]

In [288]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [289]:
cv = CountVectorizer(vocabulary=significant_words)

In [290]:
cv_fit = cv.fit_transform(train['review_clean'])
train_matrix = cv_fit.toarray() 

In [291]:
train_matrix.shape

(10000, 20)

In [292]:
cvt = cv.transform(test['review_clean'])
test_matrix = cvt.toarray()
test_matrix.shape

(1000, 20)

## train model using specific words

In [293]:
x_train = pd.DataFrame(train_matrix)
y_train = train['sentiment']
x_test = pd.DataFrame(test_matrix)
y_test = test['sentiment']

In [295]:
reg = LogisticRegression()
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [296]:
table = pd.DataFrame({'word':significant_words,
                                         'coefficient':reg.coef_.flatten()})

In [297]:
table

Unnamed: 0,word,coefficient
0,love,1.593449
1,great,0.818823
2,easy,1.221013
3,old,0.099083
4,little,0.45845
5,perfect,1.21343
6,loves,1.666157
7,well,0.356571
8,able,0.467533
9,car,0.090478


In [299]:
pos = table[table['coefficient']>0].count()
pos

word           11
coefficient    11
dtype: int64

In [300]:
score_train = reg.score(x_train,y_train)
score_test = reg.score(x_test,y_test)

In [301]:
print(score_train)
print(score_test)

0.8231
0.82


## majority class prediction

In [302]:
train = data.iloc[train_idx,:]
test = data.iloc[test_idx,:]
print(train.shape)
print(test.shape)

(133416, 5)
(33336, 5)


In [305]:
pos = test[test['sentiment']>0].count()
neg = test[test['sentiment']<=0].count()
print(pos)
print(neg)

name            28048
review          28095
rating          28095
review_clean    28095
sentiment       28095
dtype: int64
name            5234
review          5241
rating          5241
review_clean    5241
sentiment       5241
dtype: int64


In [307]:
acc = pos/len(test)
acc

name            0.841373
review          0.842783
rating          0.842783
review_clean    0.842783
sentiment       0.842783
dtype: float64