In [55]:
import pandas as pd
import string
import numpy as np

In [56]:
products = pd.read_csv('amazon_baby.csv')

In [57]:
products

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5
...,...,...,...
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5


In [58]:
def remove_punctuation(text):
    text = text.translate(str.maketrans('','',string.punctuation))
    return text

In [59]:
products = products.fillna({'review':''}) 
products['review_clean'] = products['review'].apply(remove_punctuation)

In [60]:
products = products[products['rating'] != 3]

In [61]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [62]:
traindata = pd.read_json('module-2-assignment-train-idx.json')
testdata = pd.read_json('module-2-assignment-test-idx.json')

In [110]:
train_data = products.iloc[traindata.values.reshape((-1,))]
test_data = products.iloc[testdata.values.reshape((-1,))]

In [111]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [112]:
train_matrix.shape

(133416, 121712)

In [113]:
test_matrix.shape

(33336, 121712)

In [67]:
from sklearn.linear_model import LogisticRegression

In [68]:
sentiment_model = LogisticRegression()

In [126]:
sentiment_model.fit(train_matrix,train_data['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [127]:
l = sentiment_model.coef_.tolist()[0]

In [128]:
postive_weight_count = [i for i in l if i >= 0]

In [129]:
len(postive_weight_count)

90344

In [130]:
sample_test_data = test_data[10:13]

In [131]:
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [132]:
sample_test_data.loc[59]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [133]:
sample_test_data.loc[71]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [134]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  4.82745546  -3.0040986  -10.02082736]


In [135]:
for i in range(len(scores)):
    if scores[i] > 0:
        print(+1)
    else:
        print(-1)

1
-1
-1


In [136]:
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1])

In [137]:
for i in range(len(scores)):
    print(1/(1+np.exp(-scores[i])))

0.9920567315603764
0.04724105540094916
4.446217101904015e-05


In [138]:
sentiment_model.predict_proba(sample_test_matrix)

array([[7.94326844e-03, 9.92056732e-01],
       [9.52758945e-01, 4.72410554e-02],
       [9.99955538e-01, 4.44621710e-05]])

## Find the most positive (and negative) review 

In [139]:
test_prob = sentiment_model.predict_proba(test_matrix)[:,1]

In [140]:
test_data['prediction'] = test_prob

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['prediction'] = test_prob


In [141]:
test_data.sort_values('prediction',ascending=False).iloc[0:21,:]

Unnamed: 0,name,review,rating,review_clean,sentiment,prediction
168697,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1,1.0
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1,1.0
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1,1.0
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,1.0
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,1.0
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1,1.0
133651,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1,1.0
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1,1.0
114796,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1,1.0
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1,1.0


In [142]:
test_data.sort_values('prediction',ascending=True).iloc[0:21,:]

Unnamed: 0,name,review,rating,review_clean,sentiment,prediction
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,2.084771e-14
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,3.66862e-14
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,5.070359e-12
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,9.538352e-12
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,1.93824e-10
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,4.7136e-10
95420,One Step Ahead Hide-Away Extra Long Bed Rail,"I bought a brand new 56"" hide-away bed safety ...",1,I bought a brand new 56 hideaway bed safety ra...,-1,4.801182e-10
176046,Baby Trend Inertia Infant Car Seat - Horizon,"I really wanted to love this seat; however, I ...",1,I really wanted to love this seat however I wo...,-1,7.556315e-10
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,9.784371e-10
96572,"Baby Jogger Summit XC Double Stroller, Red/Black",Received Jogger as a shower gift so it sat in ...,1,Received Jogger as a shower gift so it sat in ...,-1,1.842551e-09


## Compute accuracy of the classifier
 

In [143]:
def compute_accuracy(model,matrix,data):
    predictions = model.predict(matrix)
    true_predictions = data[data['sentiment'] == predictions]
    accuracy = true_predictions.shape[0]/data.shape[0]
    return accuracy

In [144]:
# predictions = sentiment_model.predict(test_matrix)
# true_predictions = test_data[test_data['sentiment'] == predictions]
# accuracy = true_predictions.shape[0]/test_data.shape[0] 
# accuracy

In [145]:
compute_accuracy(sentiment_model,test_matrix,test_data)

0.9325953923686106

## Learn another classifier with fewer words

In [146]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [147]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

## Train a logistic regression model on a subset of data 

In [148]:
log_reg_simple = LogisticRegression()

In [149]:
simple_model = log_reg_simple.fit(train_matrix_word_subset,train_data['sentiment'])

In [150]:
table = pd.DataFrame({'word':significant_words,'coefficient':simple_model.coef_.flatten()})

In [151]:
table.sort_values('coefficient',ascending=False)

Unnamed: 0,word,coefficient
6,loves,1.673269
5,perfect,1.510263
0,love,1.363697
2,easy,1.192219
1,great,0.94395
4,little,0.520174
7,well,0.50376
8,able,0.190937
3,old,0.085424
9,car,0.058813


## accuracy of the sentiment_model on the train_data

In [152]:
compute_accuracy(sentiment_model,train_matrix,train_data)

0.9475700065959105

In [153]:
compute_accuracy(simple_model,train_matrix_word_subset,train_data)

0.8668225700065959

In [154]:
compute_accuracy(sentiment_model,test_matrix,test_data)

0.9325953923686106

In [155]:
compute_accuracy(simple_model,test_matrix_word_subset,test_data)

0.8693604511639069

In [162]:
from scipy.stats import mode
from sklearn.metrics import accuracy_score
# predict the majority class
def majority_class(y):
    return mode(y)[0]
 
# define dataset
# class1 = train_data.index[train_data['sentiment']==1]
# class_1 = train_data.index[train_data['sentiment']==1]
y = test_data['sentiment'].values
# make predictions
yhat = [majority_class(y) for _ in range(len(y))]
# # calculate accuracy
accuracy = accuracy_score(y, yhat)
print('Accuracy: %.3f' % accuracy)


Accuracy: 0.843
