In [53]:
import pandas as pd 
import numpy as np

In [113]:
products = pd.read_csv('./amazon_baby.csv')

In [14]:
products.columns

Index(['name', 'review', 'rating'], dtype='object')

In [23]:
products.iloc[269]

name      The First Years Massaging Action Teether
review                    A favorite in our house!
rating                                           5
Name: 269, dtype: object

In [114]:
products = products.fillna({'review':''})

In [115]:
def remove_punctuation(text):
    import string
    return text.translate(string.punctuation) 

products['review_clean'] = products['review'].apply(remove_punctuation)

In [116]:
products = products[products['rating'] != 3]

In [117]:
len(products)

166752

In [118]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [119]:
train = pd.read_json('./module-2-assignment-train-idx.json')
test = pd.read_json('./module-2-assignment-test-idx.json')

In [181]:
train_set = products.iloc[train[0]]
test_set = products.iloc[test[0]]


In [182]:
train_set.reset_index(inplace=True)
test_set.reset_index(inplace=True)

In [121]:
print(len(train_set))
print(len(test_set))

133416
33336


## Build the word count vector for each review  
We will now compute the word count for each word that appears in the reviews. A vector consisting of word counts is often referred to as bag-of-word features. Since most words occur in only a few reviews, word count vectors are sparse. For this reason, scikit-learn and many other tools use sparse matrices to store a collection of word count vectors. Refer to appropriate manuals to produce sparse word count vectors. General steps for extracting word count vectors are as follows:

Learn a vocabulary (set of all words) from the training data. Only the words that show up in the training data will be considered for feature extraction.

Compute the occurrences of the words in each review and collect them into a row vector.

Build a sparse matrix where each row is the word count vector for the corresponding review. Call this matrix train_matrix.

Using the same mapping between words and columns, convert the test data into a sparse matrix test_matrix.

The following cell uses CountVectorizer in scikit-learn. Notice the token_pattern argument in the constructor.

-> Keep in mind that the test data must be transformed in the same way as the training data.

In [351]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_set['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_set['review_clean'])

In [352]:
from sklearn.linear_model import LogisticRegression

In [353]:
sentiment_model = LogisticRegression(max_iter=300, solver='liblinear')

In [354]:
sentiment_model.fit(train_matrix, train_set['sentiment'])

In [355]:
sentiment_model.coef_

array([[-5.46105796e-01, -6.16721745e-01, -4.19879244e-02, ...,
         2.84991992e-02,  2.94107540e-03, -8.35351630e-06]])

In [356]:
len(sentiment_model.coef_[sentiment_model.coef_>=0])


39906

In [357]:
len(sentiment_model.coef_[sentiment_model.coef_<=0])

17589

In [358]:
sample_test_data = test_set[10:13]
print(sample_test_data['rating'])
sample_test_data

10    5
11    2
12    1
Name: rating, dtype: int64


Unnamed: 0,index,name,review,rating,review_clean,sentiment,predict
10,59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1,0.993393
11,71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend. The dec...,-1,0.045747
12,91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1,2.1e-05


In [359]:
sample_test_data[0:1]['review']

10    Absolutely love it and all of the Scripture in...
Name: review, dtype: object

In [360]:
sample_test_data[1:2]['review']

11    Would not purchase again or recommend. The dec...
Name: review, dtype: object

In [361]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  5.01301893  -3.03780145 -10.77591345]


In [362]:
y_predict = [1 if scores[i]>0 else -1 for i in range(len(scores))]

In [198]:
y_predict

[1, -1, -1]

In [199]:
probability = lambda x: 1/(1+np.exp(-x))

In [200]:
probability(scores)

array([9.93393146e-01, 4.57470509e-02, 2.08963858e-05])

In [201]:
sentiment_model.classes_

array([-1,  1])

In [202]:
sentiment_model.predict_proba(sample_test_matrix)

array([[6.60685383e-03, 9.93393146e-01],
       [9.54252949e-01, 4.57470509e-02],
       [9.99979104e-01, 2.08963858e-05]])

** Quiz Question:** Of the three data points in **sample_test_data**, which one (first, second, or third) has the **lowest probability** of being classified as a positive review? third

In [216]:
sentiment_model.predict_proba(test_matrix)[:,1]

array([0.71133173, 0.99999905, 0.92336235, ..., 0.99998084, 0.99999793,
       0.97855212])

In [222]:
test_set['predict'] = sentiment_model.predict_proba(test_matrix)[:,1]
test_set


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['predict'] = sentiment_model.predict_proba(test_matrix)[:,1]


Unnamed: 0,index,name,review,rating,review_clean,sentiment,predict
0,9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,0.711332
1,10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,0.999999
2,16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,"I love this little calender, you can keep trac...",1,0.923362
3,20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,0.999979
4,28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,"One of baby's first and favorite books, and it...",1,0.982756
...,...,...,...,...,...,...,...
33331,183507,Maxboost iPhone 5S/5 Case - Protective Snap-on...,got this for my wife and she loves it would de...,5,got this for my wife and she loves it would de...,1,0.975400
33332,183515,Maxboost iPhone 5S/5 Case - Protective Snap-on...,I love this phone case! My iPhone is always ...,5,I love this phone case! My iPhone is always ...,1,0.999085
33333,183522,Airline Seat Belt Extender - The Best Extensio...,I bought this as a father's day gift for my da...,5,I bought this as a father's day gift for my da...,1,0.999981
33334,183524,Squeasy Snacker 6oz Silicone Reusable Food Pou...,"I love this product, it makes my life easier. ...",5,"I love this product, it makes my life easier. ...",1,0.999998


In [225]:
test_set.sort_values(by='predict', ascending=False)[0:20]

Unnamed: 0,index,name,review,rating,review_clean,sentiment,predict
20743,114796,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1,1.0
27048,149002,The Original CJ's BuTTer (All Natural Mango Su...,I'm going to try to review the scents I've tri...,4,I'm going to try to review the scents I've tri...,1,1.0
25554,140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall (38in) and thin...,1,1.0
30535,168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,"We are big Best Bottoms fans here, but I wante...",1,1.0
11923,66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,It's always fun to write a review on those pro...,1,1.0
14482,80155,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,"I just tried this hands free breastpump bra, a...",1,1.0
9125,50315,"P'Kolino Silly Soft Seating in Tias, Green",I've purchased both the P'Kolino Little Reader...,4,I've purchased both the P'Kolino Little Reader...,1,1.0
26830,147949,"Baby Jogger City Mini GT Single Stroller, Shad...","Amazing, Love, Love, Love it !!! All 5 STARS a...",5,"Amazing, Love, Love, Love it !!! All 5 STARS a...",1,1.0
32782,180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2. It's...,1,1.0
30634,168697,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Graco's FastAction Jogging Stroller definitely...,1,1.0


In [226]:
test_set.sort_values(by='predict', ascending=True)[0:20]

Unnamed: 0,index,name,review,rating,review_clean,sentiment,predict
21700,120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,8.477054e-16
28184,155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,"This is my second video monitoring system, the...",-1,1.880181e-15
17069,94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note: we never installed batteries in these un...,-1,3.869109e-15
2931,16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with Fisher-Price pro...,-1,4.29108e-15
8818,48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,3.206418e-14
13939,77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,5.644861e-12
1810,9915,Cosco Alpha Omega Elite Convertible Car Seat,I bought this car seat after both seeing the ...,1,I bought this car seat after both seeing the ...,-1,1.786747e-11
9655,53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,6.560592e-11
1942,10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,"It's 3am in the morning and needless to say, t...",-1,1.672529e-10
14711,81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,2.005202e-10


In [371]:
def get_classification_accuracy(model, data, true_labels):
    # First get the predictions
    predict = model.predict(data)

    # Compute the number of correctly classified examples
    correctly = len(true_labels[predict==true_labels])

    # Then compute accuracy by dividing num_correct by total number of examples
    accuracy = correctly/len(true_labels)
    
    return accuracy

In [374]:
accuracy_test = get_classification_accuracy(sentiment_model, test_matrix, test_set['sentiment'])

In [236]:
accuracy_test

0.9320254379649628

## Learn another classifier with fewer words

In [237]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [363]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_set['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_set['review_clean'])

In [364]:
simple_model = LogisticRegression(max_iter=300, solver='liblinear')

In [365]:
simple_model.fit(train_matrix_word_subset, train_set['sentiment'])

In [366]:
simple_model_coef_table = pd.DataFrame({
    'word': significant_words,
    'coefficient': simple_model.coef_.flatten()
})

In [321]:
simple_model_coef_table 

Unnamed: 0,word,coefficient
0,love,1.35644
1,great,0.943518
2,easy,1.183268
3,old,0.082803
4,little,0.514544
5,perfect,1.506902
6,loves,1.677632
7,well,0.530083
8,able,0.19309
9,car,0.056478


In [278]:
def get_term(dict, search_index):
    return list(dict.keys())[list(dict.values()).index(search_index)]

In [325]:
len(sentiment_model.coef_.flatten())

57495

In [326]:
len(vectorizer.vocabulary_.keys())

57495

In [367]:
for word in significant_words:
    msk = np.array(list(vectorizer.vocabulary_.keys()))==word
    print(word, ':', sentiment_model.coef_.flatten()[msk])

love : [-0.31693024]
great : [3.10001156e-07]
easy : [4.81290213e-07]
old : [0.0012269]
little : [0.09819408]
perfect : [0.00480142]
loves : [0.14113531]
well : [1.01749067]
able : [0.25757998]
car : [0.23342418]
broke : [0.14273662]
less : [2.03749988e-05]
even : [0.42490424]
waste : [-0.25387283]
disappointed : [9.79710334e-07]
work : [0.00381257]
product : [0.00240414]
money : [-2.37370528e-08]
would : [0.38223236]
return : [0.05082679]


In [375]:
accuracy_sentiment = get_classification_accuracy(sentiment_model, train_matrix, train_set['sentiment'])
accuracy_sentiment

0.9644870180488098

In [376]:
accuracy_simple = get_classification_accuracy(simple_model, train_matrix_word_subset, train_set['sentiment'])
accuracy_simple

0.8672273190621814

In [377]:
accuracy_sentiment_TEST = get_classification_accuracy(sentiment_model, test_matrix, test_set['sentiment'])
accuracy_sentiment_TEST

0.9320254379649628

In [380]:
accuracy_simple_TEST = get_classification_accuracy(simple_model, test_matrix_word_subset, test_set['sentiment'])
accuracy_simple_TEST

0.8702003839692825

In [382]:
majority_class_classifier = len(test_set[test_set['sentiment']==1])/(len(test_set))
majority_class_classifier

0.8427825773938085