In [188]:
import pandas as pd

In [189]:
data = pd.read_csv("amazon_baby.csv")
data.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [190]:
import string

def remove_punctuation(text):
    if isinstance(text, str):
        return text.translate(None, string.punctuation)
    return str(text)

In [191]:
data['review'] = data['review'].map(lambda s: remove_punctuation(s))
data.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,These flannel wipes are OK but in my opinion n...,3
1,Planetwise Wipe Pouch,it came early and was not disappointed i love ...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase I h...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried nonstop when I tried...,5


In [192]:
data = data[data['rating'] != 3]
data.reset_index(inplace = True, drop = True)
data.head()

Unnamed: 0,name,review,rating
0,Planetwise Wipe Pouch,it came early and was not disappointed i love ...,5
1,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
2,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase I h...,5
3,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried nonstop when I tried...,5
4,Stop Pacifier Sucking without tears with Thumb...,When the Binky Fairy came to our house we didn...,5


In [193]:
data['sentiment'] = data['rating'].map(lambda rating : +1 if rating > 3 else -1)
data.head()

Unnamed: 0,name,review,rating,sentiment
0,Planetwise Wipe Pouch,it came early and was not disappointed i love ...,5,1
1,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1
2,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase I h...,5,1
3,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried nonstop when I tried...,5,1
4,Stop Pacifier Sucking without tears with Thumb...,When the Binky Fairy came to our house we didn...,5,1


In [194]:
import numpy as np

data[data['review'] == '']

Unnamed: 0,name,review,rating,sentiment
159986,"Summer Infant Bentwood Bassinet with Motion, B...",,5,1


In [195]:
def read_index(filename):
    with open(filename, 'r') as f:
        first_line = f.readline()
    first_line = first_line.translate(None,'[]').strip().split(',')
    first_line = [int(x) for x in first_line]
    return first_line

In [196]:
train_index_path = "train_index.json"
test_index_path = "text_index.json"

train_index = read_index(train_index_path)
test_index = read_index(test_index_path)

In [197]:
train_data = data.loc[train_index]
test_data = data.loc[test_index]

print train_data[train_data['review'] == '']
print test_data[test_data['review'] == '']

test_data.head()

Empty DataFrame
Columns: [name, review, rating, sentiment]
Index: []
                                                     name review  rating  \
159986  Summer Infant Bentwood Bassinet with Motion, B...              5   

        sentiment  
159986          1  


Unnamed: 0,name,review,rating,sentiment
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,1
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,1
14,Nature's Lullabies First Year Sticker Calendar,I love this little calender you can keep track...,5,1
18,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,1
24,"Lamaze Peekaboo, I Love You",One of babys first and favorite books and it i...,4,1


In [199]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review'])
test_matrix = vectorizer.transform(test_data['review'])

In [200]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

# Quiz question: How many weights are >= 0?

In [203]:
print sum(sum(clf.coef_ > 0))

86407


In [204]:
print sum(sum(clf.coef_ < 0))

35306


In [206]:
sample_test_data = test_data[10:13]
print sample_test_data

                                                 name  \
53                          Our Baby Girl Memory Book   
64  Wall Decor Removable Decal Sticker - Colorful ...   
82  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  sentiment  
53  Absolutely love it and all of the Scripture in...       5          1  
64  Would not purchase again or recommend The deca...       2         -1  
82  Was so excited to get this product for my baby...       1         -1  


In [207]:
sample_test_matrix = vectorizer.transform(sample_test_data['review'])
scores = clf.decision_function(sample_test_matrix)
print scores

[  5.58841957  -3.18030646 -10.43006632]


In [208]:
predictions = clf.predict(sample_test_matrix)
print predictions

[ 1 -1 -1]


In [209]:
import math

def sigmoid(x):
    return 1./(1.+math.exp(-x))

##Quiz question: Of the three data points in sample_test_data, which one (first, second, or third) has the lowest probability of being classified as a positive review?

In [211]:
probs_hand = [sigmoid(x) for x in scores]
probs_clf = clf.predict_proba(sample_test_matrix)
print probs_hand
print probs_clf

[0.996273006954279, 0.039913588665066814, 2.953023672678484e-05]
[[  3.72699305e-03   9.96273007e-01]
 [  9.60086411e-01   3.99135887e-02]
 [  9.99970470e-01   2.95302367e-05]]


In [214]:
test_data['prob'] = clf.predict_proba(test_matrix)[:,1]

##Quiz Question: Which of the following products are represented in the 20 most positive reviews?

In [218]:
top_prob = test_data.sort(['prob'], ascending=[False])[:20]
top_prob.head(20)

Unnamed: 0,name,review,rating,sentiment,prob
60039,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",Its always fun to write a review on those prod...,5,1,1
45751,"P'Kolino Silly Soft Seating in Tias, Green",Ive purchased both the PKolino Little Reader C...,4,1,1
127998,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall 38in and thin 2...,5,1,1
134472,"Baby Jogger City Mini GT Single Stroller, Shad...",Amazing Love Love Love it All 5 STARS all the...,5,1,1
104290,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,1,1
90999,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,1,1
108269,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram RoccoI bought this pram from Europe...,5,1,1
164117,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2 Its e...,4,1,1
124541,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n Play la...,4,1,1
79040,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,1,1


##Quiz Question: Which of the following products are represented in the 20 most negative reviews?

In [255]:
bot_prob = test_data.sort(['prob'], ascending=[True])[:20]
bot_prob.head(20)

Unnamed: 0,name,review,rating,sentiment,prob
14637,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with FisherPrice prod...,2,-1,9.360864e-16
109216,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,-1,1.912891e-15
70004,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,-1,8.233473e-14
44284,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,-1,1.288161e-13
141135,VTech Communications Safe &amp; Sounds Full Co...,This is my second video monitoring system the ...,1,-1,2.500884e-13
85925,The First Years True Choice P400 Premium Digit...,Note we never installed batteries in these uni...,1,-1,3.531563e-13
48396,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,-1,3.266446e-11
73893,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,-1,3.335426e-11
103564,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITORI purchased this m...,1,-1,9.687584e-11
9768,Philips AVENT Newborn Starter Set,Its 3am in the morning and needless to say thi...,1,-1,1.017452e-10


##Quiz Question: What is the accuracy of the sentiment_model on the test_data? Round your answer to 2 decimal places (e.g. 0.76).

##Quiz Question: Does a higher accuracy value on the training_data always imply that the classifier is better?



In [222]:
correct = sum((test_data['prob'] > 0.5) == test_data['sentiment'])

In [223]:
print correct

27285


In [224]:
print "Accuracy - {0}".format(correct/float(test_data.shape[0]))

Accuracy - 0.818484521238


In [225]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [226]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review'])

In [227]:
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [228]:
coefficients = pd.DataFrame({'word':significant_words,
                             'coefficient':simple_model.coef_.flatten()})

In [229]:
coefficients

Unnamed: 0,coefficient,word
0,1.36369,love
1,0.944,great
2,1.192538,easy
3,0.085513,old
4,0.520186,little
5,1.509812,perfect
6,1.673074,loves
7,0.50376,well
8,0.190909,able
9,0.058855,car


#Quiz Question: Consider the coefficients of simple_model. How many of the 20 coefficients (corresponding to the 20 significant_words) are positive for the simple_model?

#Quiz Question: Are the positive words in the simple_model also positive words in the sentiment_model?



In [230]:
print sum(simple_model.coef_.flatten() > 0)

10


In [238]:
coef_total = pd.DataFrame({'word':vectorizer.get_feature_names(),
                             'coefficient_full':clf.coef_.flatten()})

In [239]:
coef_total[coef_total['word'].isin(significant_words)]

Unnamed: 0,coefficient_full,word
7386,0.39277,able
20190,-1.388537,broke
22122,0.12651,car
34453,-2.190808,disappointed
37640,1.364923,easy
39961,-0.465854,even
48789,1.239894,great
61494,-0.275259,less
62602,0.642572,little
63567,1.589741,love


In [242]:
full_coef = pd.merge(coef_total, coefficients)

In [243]:
full_coef

Unnamed: 0,coefficient_full,word,coefficient
0,0.39277,able,0.190909
1,-1.388537,broke,-1.651576
2,0.12651,car,0.058855
3,-2.190808,disappointed,-2.348298
4,1.364923,easy,1.192538
5,-0.465854,even,-0.51138
6,1.239894,great,0.944
7,-0.275259,less,-0.209563
8,0.642572,little,0.520186
9,1.589741,love,1.36369


#Quiz Question: Which model (sentiment_model or simple_model) has higher accuracy on the TRAINING set?

In [247]:
accuracy_trian_sentiment_model = sum(clf.predict(train_matrix) == train_data['sentiment'])/float(train_data.shape[0])
accuracy_trian_simple_model = sum(simple_model.predict(train_matrix_word_subset) == train_data['sentiment'])/float(train_data.shape[0])
print accuracy_trian_sentiment_model
print accuracy_trian_simple_model

0.968534508605
0.866822570007


#Quiz Question: Which model (sentiment_model or simple_model) has higher accuracy on the TEST set?

In [248]:
accuracy_test_sentiment_model = sum(clf.predict(test_matrix) == test_data['sentiment'])/float(test_data.shape[0])
accuracy_test_simple_model = sum(simple_model.predict(test_matrix_word_subset) == test_data['sentiment'])/float(test_data.shape[0])
print accuracy_test_sentiment_model
print accuracy_test_simple_model

0.932445404368
0.869360451164


#Quiz Question: Enter the accuracy of the majority class classifier model on the test_data. Round your answer to two decimal places (e.g. 0.76).

#Quiz Question: Is the sentiment_model definitely better than the majority class classifier (the baseline)?


In [250]:
train_data['sentiment'].value_counts()

 1    112164
-1     21252
dtype: int64

In [254]:
accuracy_test_majority_class = sum(1 == test_data['sentiment'])/float(test_data.shape[0])
print accuracy_test_majority_class

0.842782577394
