In [4]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt
%matplotlib inline
# Review table; the columns used further down are 'review' and 'rating'.
products = pd.read_csv('data/amazon_baby.csv')
# Row selections for the train/test split: each JSON file is parsed into a
# frame and the column labelled 0 is kept.
train_idx, test_idx = (
    pd.read_json(idx_path)[0]
    for idx_path in (
        'data/module-2-assignment-train-idx.json',
        'data/module-2-assignment-test-idx.json',
    )
)

In [234]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    The characters removed are exactly those listed in
    ``string.punctuation``; all other characters are left untouched.
    """
    import string

    # Mapping a character to None makes str.translate() drop it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)


def score_to_lebel(score:np.ndarray):
    """Threshold decision scores at zero.

    Returns the element-wise result of ``score > 0``: True where the score
    is strictly positive, False otherwise (a score of exactly 0 gives False).
    """
    is_positive = score > 0
    return is_positive


def score_to_probability(score: np.ndarray):
    """Map decision scores to probabilities with the logistic (sigmoid) function.

    Computes ``1 / (1 + exp(-score))`` element-wise, but in the equivalent
    form ``exp(-log(1 + exp(-score)))``.  ``np.logaddexp(0, -score)``
    evaluates ``log(1 + exp(-score))`` without ever forming ``exp(-score)``
    itself, so strongly negative scores no longer overflow (the naive
    formula emits a RuntimeWarning there, or raises under a strict
    ``np.errstate``).

    Parameters
    ----------
    score : np.ndarray
        Raw decision scores; any real values.

    Returns
    -------
    Probabilities in ``[0, 1]``, same shape as ``score``.
    """
    return np.exp(-np.logaddexp(0, -score))


def accur(y,y_predict):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to numpy arrays and compared position by
    position.  This means plain lists work, and two pandas Series are
    compared by position even when their index labels differ (comparing
    such Series directly with ``==`` raises).

    Parameters
    ----------
    y : array-like
        True labels.
    y_predict : array-like
        Predicted labels; must have the same shape as ``y``.

    Returns
    -------
    Accuracy in ``[0, 1]``; ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If ``y`` and ``y_predict`` do not have the same shape.
    """
    y_true = np.asarray(y)
    y_hat = np.asarray(y_predict)
    if y_true.shape != y_hat.shape:
        raise ValueError(
            "y and y_predict must have the same shape, got "
            f"{y_true.shape} and {y_hat.shape}"
        )
    if y_true.size == 0:
        # Accuracy is undefined without samples; avoid the 0/0 warning.
        return float('nan')
    matches = y_true == y_hat
    return matches.sum() / matches.size

In [272]:
# Missing reviews become empty strings so remove_punctuation() always receives a str.
products = products.fillna({'review':''})
products['review_clean'] = products['review'].apply(remove_punctuation)
# Drop the 3-star reviews. .copy() makes the filtered frame independent of the
# frame it was sliced from, so adding the 'sentiment' column below does not
# trigger pandas' SettingWithCopyWarning.
products = products[products['rating'] != 3].copy()
# Ratings above 3 are labelled +1, all remaining ratings -1.
products['sentiment'] = products['rating'].apply(lambda r:1 if r>3 else -1)

# train_idx / test_idx are applied positionally (.iloc) to the filtered frame.
# reset_index() gives each split a fresh 0..n-1 index and keeps the previous
# row labels in an 'index' column.
train_data = products.iloc[train_idx].reset_index()
test_data = products.iloc[test_idx].reset_index()

from sklearn.feature_extraction.text import CountVectorizer
# Use this token pattern to keep single-letter words
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [273]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with the library's default settings on
# the training word counts; the target is the +1/-1 'sentiment' column.
sentiment_model = LogisticRegression().fit(train_matrix, train_data['sentiment'])
sentiment_model  # last expression of the cell: display the fitted estimator

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [274]:
# Number of learned word weights that are non-negative (>= 0).
# A single vectorized comparison replaces the element-by-element Python loop;
# int() keeps `positive` a plain Python integer as before.
positive = int((sentiment_model.coef_[0] >= 0).sum())
positive

85877

In [277]:
# Worked example on three test reviews (row positions 10, 11 and 12):
# vectorize their cleaned text and ask the model for the raw decision scores.
sample_test_data = test_data.iloc[10:13]
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  5.60840687  -3.12665506 -10.42354879]


In [278]:
# Print the hand-written helpers next to the model's own methods for the
# sample: thresholded labels, predicted classes, sigmoid probabilities and
# predict_proba output.
sample_outputs = (
    score_to_lebel(scores),
    sentiment_model.predict(sample_test_matrix),
    score_to_probability(scores),
    sentiment_model.predict_proba(sample_test_matrix),
)
for sample_output in sample_outputs:
    print(sample_output)

[ True False False]
[ 1 -1 -1]
[9.96346491e-01 4.20210525e-02 2.97233236e-05]
[[3.65350948e-03 9.96346491e-01]
 [9.57978948e-01 4.20210525e-02]
 [9.99970277e-01 2.97233236e-05]]


In [282]:
# Attach the model's outputs for every test review as new columns:
# predicted class, raw decision score, and column 1 of predict_proba.
test_predictions = {
    'predicted_sentiment': sentiment_model.predict(test_matrix),
    'fx': sentiment_model.decision_function(test_matrix),
    'prob': sentiment_model.predict_proba(test_matrix)[:, 1],
}
for prediction_column, prediction_values in test_predictions.items():
    test_data[prediction_column] = prediction_values

In [288]:
# The 20 test reviews with the lowest decision score 'fx'; 'prob' is the
# secondary ordering column.
test_data.nsmallest(n=20, columns=['fx', 'prob'])

Unnamed: 0,index,name,review,rating,review_clean,sentiment,predicted_sentiment,fx,prob
2931,16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,-1,-34.626044,9.16432e-16
21700,120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,-1,-33.93069,1.836912e-15
13939,77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,-1,-30.172757,7.872963e-14
8818,48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,-1,-29.616507,1.373138e-13
28184,155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,-1,-29.22792,2.025236e-13
17069,94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,-1,-28.455362,4.385238e-13
9655,53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,-1,-24.074759,3.503202e-11
14711,81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,-1,-23.989734,3.814091e-11
20594,113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,-1,-23.086585,9.410738e-11
1942,10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,-1,-22.994054,1.032308e-10


In [280]:
# Accuracy of the full model's predictions on the test set.
accur(y=test_data['sentiment'], y_predict=test_data['predicted_sentiment'])

0.9321154307655387

In [289]:
# Hand-picked vocabulary for a much smaller model.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

# With an explicit vocabulary, only these 20 words are counted.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(
    train_data['review_clean']
)
test_matrix_word_subset = vectorizer_word_subset.transform(
    test_data['review_clean']
)

# Same classifier type as the full model, trained on the reduced features.
simple_model = LogisticRegression().fit(
    train_matrix_word_subset,
    train_data['sentiment'],
)

In [290]:
# Pair every word of the subset vocabulary with its learned coefficient.
# zip() follows the length of significant_words, so the table stays correct
# if the word list is edited (no hard-coded count of 20).
word_coef_table = dict(zip(significant_words, simple_model.coef_[0]))

for word, coefficient in word_coef_table.items():
    print(coefficient, word)

1.3636897593104937 love
0.9439995905719677 great
1.1925382734890588 easy
0.08551277946303547 old
0.5201857627181392 little
1.5098124766917815 perfect
1.6730738925932755 loves
0.5037604577675278 well
0.1909085720643394 able
0.058854671152739374 car
-1.651576344965244 broke
-0.20956286453464426 less
-0.5113796317990279 even
-2.0336986139402082 waste
-2.3482982195022064 disappointed
-0.6211687736417648 work
-0.32055623673460826 product
-0.8980307377150831 money
-0.3621667422737741 would
-2.109331090318681 return


In [291]:
# Treat each significant word as its own one-word document, encode it with the
# full vocabulary, and see which class the full model predicts for it.
a = vectorizer.transform(
    significant_words
)
sentiment_model.predict(
    a
)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,
        1,  1, -1], dtype=int64)

In [292]:
# Training-set accuracy: full model first, then the 20-word model.
train_evaluations = (
    (sentiment_model, train_matrix),
    (simple_model, train_matrix_word_subset),
)
for fitted_model, train_features in train_evaluations:
    print(accur(train_data['sentiment'], fitted_model.predict(train_features)))

0.967934880374168
0.8668225700065959


In [293]:
# Test-set accuracy: full model first, then the 20-word model.
test_evaluations = (
    (sentiment_model, test_matrix),
    (simple_model, test_matrix_word_subset),
)
for fitted_model, test_features in test_evaluations:
    print(accur(test_data['sentiment'], fitted_model.predict(test_features)))

0.9321154307655387
0.8693604511639069


In [294]:
# Majority-class baseline: always predict the most frequent label of the
# training set (derived from the data instead of being hard-coded), then
# measure how often that constant prediction is right on the test set.
majority_label = train_data['sentiment'].mode()[0]
test_data['majority'] = majority_label
print(accur(test_data['sentiment'], test_data['majority']))

0.8427825773938085


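Note: fancy (boolean) indexing makes the DataFrame smaller, so the positions of the data points change while their index labels do not; select rows by position with `.iloc` (or call `reset_index()` first).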