In [1]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is present locally before it is read.
nltk.download('stopwords')

# English stopword list; text_process looks words up in it.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anmol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the raw review table from the CSV in the working directory.
products = pd.read_csv('amazon_baby.csv')

# Drop the 3-star reviews. The explicit .copy() gives an independent frame, so the
# column assignments below do not write into a filtered slice
# (which raises pandas' SettingWithCopyWarning).
products = products[products['rating'] != 3].copy()

# Binary label: 1 for ratings of 4 and above, 0 otherwise.
products['sentiment'] = (products['rating'] >= 4).astype(int)

# Missing reviews are NaN; fill them first so they become empty strings instead of
# the literal text 'nan', which would otherwise be counted as a word downstream.
products['review'] = products['review'].fillna('').astype(str)

products.head(10)

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1
6,A Tale of Baby\'s Days with Peter Rabbit,"Lovely book, it\'s bound tightly so you may no...",4,1
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,1
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,1
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,1


In [4]:
# Share of the remaining reviews labelled positive (sentiment == 1).
products['sentiment'].sum() / products.shape[0]

0.8411233448474381

In [4]:
def text_process(review):
    """
    Clean a review string by removing stopwords and punctuation.

    Steps:
    1. Split the text on whitespace.
    2. Drop tokens that consist only of punctuation.
    3. Drop tokens that are stopwords (case-insensitive). The check is made on
       the token both with all punctuation removed and with only its
       surrounding punctuation trimmed, so contractions such as "don't" are
       dropped instead of surviving as "dont".
    4. Remove every ``string.punctuation`` character from the remaining tokens.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (a string, not a list).
    """
    # Set membership is constant-time; the module-level `stop` list would
    # otherwise be scanned linearly for every word.
    stop_words = set(stop)
    # Translation table that deletes every punctuation character.
    drop_punct = str.maketrans('', '', string.punctuation)

    cleaned = []
    for token in review.split():
        bare = token.translate(drop_punct)
        if not bare:
            # The token was made up solely of punctuation.
            continue
        if bare.lower() in stop_words:
            continue
        # Second check with inner punctuation (e.g. the apostrophe of a
        # contraction) still in place.
        if token.strip(string.punctuation).lower() in stop_words:
            continue
        cleaned.append(bare)
    return ' '.join(cleaned)

# products['filtered_review'] = products.review.apply(text_process)

In [5]:
# Preview the first five rows of the labelled review table.
products.head(n=5)

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1


In [6]:
# Word-count vectorizer for the full review corpus, configured with the built-in
# English stop-word list, min_df=10 and at most 5000 features.
cv = CountVectorizer(
    stop_words='english',
    min_df=10,
    max_features=5000,
)

# Document-term count matrix: one row per review in `products`.
f = cv.fit_transform(products['review'])

In [7]:
# Total count of every vocabulary term over all reviews (column sums of `f`).
features_count = f.sum(axis=0).tolist()[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise.
features_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# One row per term, most frequent first.
features = pd.DataFrame(
    list(zip(features_names, features_count)),
    columns=['features', 'count'],
).sort_values(by=['count'], ascending=False)
features

Unnamed: 0,features,count
320,baby,72181
1905,great,55791
2284,just,49867
4709,use,47473
2434,like,45066
2514,love,41994
3747,seat,39386
1408,easy,38873
2469,little,37680
2909,old,36231


In [8]:
# Hold out 20% of the reviews. The frame index is split alongside the features
# and labels so predictions can later be written back to the matching rows.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    f,
    products['sentiment'],
    products.index,
    random_state=0,
    test_size=0.2,
)

# Logistic regression with default settings, fitted on the training counts.
lreg = LogisticRegression()
model = lreg.fit(X_train, y_train)

# Class predictions for the held-out reviews.
yhat = model.predict(X_test)

# Score on the held-out split (shown as the cell output).
model.score(X_test, y_test)



0.9218913975592936

In [9]:
# Write column 1 of predict_proba (the probability of class 1 for the 0/1
# sentiment labels) back to the rows each split came from.
for row_labels, X_part in ((indices_train, X_train), (indices_test, X_test)):
    products.loc[row_labels, 'predicted_sentiment'] = model.predict_proba(X_part)[:, 1]

# Training-split reviews of one product, highest predicted probability first.
diaper_champ_train = (
    (products['name'] == 'Baby Trend Diaper Champ')
    & (products.index.isin(indices_train))
)
products.loc[diaper_champ_train].sort_values(by=['predicted_sentiment'], ascending=False)

Unnamed: 0,name,review,rating,sentiment,predicted_sentiment
376,Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper pa...",5,1,1.000000
451,Baby Trend Diaper Champ,"As a first time mother, I wanted to get the be...",5,1,0.999999
414,Baby Trend Diaper Champ,We have been using our Diaper Champ for almost...,5,1,0.999999
477,Baby Trend Diaper Champ,The boy is now 1 year old and we have had it s...,4,1,0.999999
604,Baby Trend Diaper Champ,I have been using this diaper pail for 41/2 mo...,5,1,0.999998
320,Baby Trend Diaper Champ,I originally put this item on my baby registry...,5,1,0.999997
328,Baby Trend Diaper Champ,Diaper Champ or Diaper Genie? That was my dile...,5,1,0.999997
420,Baby Trend Diaper Champ,Baby Luke can turn a clean diaper to a dirty d...,5,1,0.999995
512,Baby Trend Diaper Champ,Bottom line: Using regular bags saves big time...,5,1,0.999989
593,Baby Trend Diaper Champ,My baby is now 8 months and the can has been h...,5,1,0.999987


In [10]:
# Pair each vocabulary term of `cv` with the coefficient the fitted model
# assigned to it, largest weight first.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available and fall back otherwise.
vocab_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
pd.DataFrame(
    list(zip(vocab_terms, model.coef_[0])),
    columns=['word', 'weight'],
).sort_values(by=['weight'], ascending=False)

Unnamed: 0,word,weight
2969,outstanding,2.706037
3161,pleasantly,2.544716
1339,downsides,2.273461
568,brilliant,2.205229
200,amazed,2.116980
823,cleans,2.077604
2424,lifesaver,1.990134
3072,penny,1.982461
3920,skeptical,1.958506
148,adores,1.934203


In [12]:
# Hand-picked sentiment words that form the entire vocabulary of the second vectorizer.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate',
]

# NOTE(review): scikit-learn's documentation says min_df is ignored when an
# explicit vocabulary is supplied — confirm, and consider dropping the argument.
cv2 = CountVectorizer(
    stop_words='english',
    min_df=2,
    vocabulary=selected_words,
)

# Per-review counts of the selected words only.
f2 = cv2.fit_transform(products['review'])

In [13]:
# Total count of every selected word over all reviews (column sums of `f2`).
features_count = f2.sum(axis=0).tolist()[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise.
features_names = (
    cv2.get_feature_names_out()
    if hasattr(cv2, 'get_feature_names_out')
    else cv2.get_feature_names()
)

# One row per selected word, most frequent first.
features = pd.DataFrame(
    list(zip(features_names, features_count)),
    columns=['features', 'count'],
).sort_values(by=['count'], ascending=False)
features

Unnamed: 0,features,count
1,great,55791
4,love,41994
6,bad,4183
0,awesome,3892
3,amazing,2628
2,fantastic,1664
7,terrible,1146
5,horrible,1110
10,hate,1107
8,awful,687


In [15]:
# Dense view of the selected-word counts: one row per review, one column per word.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available and fall back otherwise.
selected_columns = (
    cv2.get_feature_names_out()
    if hasattr(cv2, 'get_feature_names_out')
    else cv2.get_feature_names()
)
pd.DataFrame(f2.toarray(), columns=selected_columns)

Unnamed: 0,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,2,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,2,0,0,0,0,0,0


In [14]:
# Train the second model on the selected-word counts `f2`.
# The original cell passed `f` (the full-vocabulary matrix) here, which would simply
# refit the first model; the coefficients would then no longer line up with
# cv2's eleven words when they are zipped together further down.
# The split arguments (random_state, test_size, index) are the same as for the
# first model.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    f2,
    products['sentiment'],
    products.index,
    random_state=0,
    test_size=0.2,
)

# Logistic regression with default settings, fitted on the selected-word counts.
lreg = LogisticRegression()
model = lreg.fit(X_train, y_train)

# Class predictions for the held-out reviews.
yhat = model.predict(X_test)

# Score on the held-out split (shown as the cell output).
model.score(X_test, y_test)

0.8458217144913196

In [15]:
# Write column 1 of predict_proba from the current `model` back to the rows each
# split came from, in a separate column from the earlier predictions.
for row_labels, X_part in ((indices_train, X_train), (indices_test, X_test)):
    products.loc[row_labels, 'predicted_sentiment2'] = model.predict_proba(X_part)[:, 1]

# Training-split reviews of one product, highest predicted probability first.
diaper_champ_train2 = (
    (products['name'] == 'Baby Trend Diaper Champ')
    & (products.index.isin(indices_train))
)
products.loc[diaper_champ_train2].sort_values(by=['predicted_sentiment2'], ascending=False)

Unnamed: 0,name,review,rating,sentiment,predicted_sentiment,predicted_sentiment2
329,Baby Trend Diaper Champ,I LOVE LOVE LOVE this product! It is SO much e...,4,1,0.999905,0.998251
446,Baby Trend Diaper Champ,I received my Diaper Champ at my baby shower f...,5,1,0.999558,0.995853
459,Baby Trend Diaper Champ,"Love it, love it, love it! This lives up to t...",5,1,0.999357,0.995853
367,Baby Trend Diaper Champ,"Let me just say, I LOVE THIS PRODUCT!! I used...",5,1,0.999857,0.988857
604,Baby Trend Diaper Champ,I have been using this diaper pail for 41/2 mo...,5,1,0.999998,0.988471
426,Baby Trend Diaper Champ,I love this diaper pale and wouldn\'t dream of...,5,1,0.997222,0.988471
394,Baby Trend Diaper Champ,Works great - no smells. LOVE that it uses re...,5,1,0.998306,0.988471
376,Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper pa...",5,1,1.000000,0.983703
622,Baby Trend Diaper Champ,Love the Diaper Champ. I had planned to get t...,4,1,0.998515,0.983703
469,Baby Trend Diaper Champ,I have a 10 year old daughter and an 8 month o...,5,1,0.999828,0.983703


In [16]:
# Held-out reviews of the same product, ordered from highest to lowest rating.
diaper_champ_test = (
    (products['name'] == 'Baby Trend Diaper Champ')
    & (products.index.isin(indices_test))
)
products.loc[diaper_champ_test].sort_values(by=['rating'], ascending=False)

Unnamed: 0,name,review,rating,sentiment,predicted_sentiment,predicted_sentiment2
644,Baby Trend Diaper Champ,I see that there are complaints of stinkiness ...,5,1,0.982142,0.792238
403,Baby Trend Diaper Champ,I agree it\'s better than the genie. I own th...,5,1,0.961535,0.589820
493,Baby Trend Diaper Champ,My husband and I registered for the Diaper Cha...,5,1,0.995029,0.792238
486,Baby Trend Diaper Champ,"This is my second child. With my first, I went...",5,1,0.999462,0.938162
481,Baby Trend Diaper Champ,I love this pail. You just drop the diaper in...,5,1,0.950094,0.938162
480,Baby Trend Diaper Champ,The diaper champ is fantastic. You can use reg...,5,1,0.981935,0.903686
534,Baby Trend Diaper Champ,Wow! This is fabulous. It was a toss-up betw...,5,1,0.995359,0.775760
548,Baby Trend Diaper Champ,This is one of the best things that we got as ...,5,1,0.999841,0.589820
553,Baby Trend Diaper Champ,This is such an easy and clean way to dispose ...,5,1,0.902718,0.792238
582,Baby Trend Diaper Champ,I love this thing! I don\'t notice any smell w...,5,1,0.972297,0.938162


In [17]:
# Pair each selected word of `cv2` with the coefficient the current model
# assigned to it, largest weight first.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available and fall back otherwise.
selected_terms = (
    cv2.get_feature_names_out()
    if hasattr(cv2, 'get_feature_names_out')
    else cv2.get_feature_names()
)
pd.DataFrame(
    list(zip(selected_terms, model.coef_[0])),
    columns=['word', 'weight'],
).sort_values(by=['weight'], ascending=False)

Unnamed: 0,word,weight
4,love,1.38093
0,awesome,1.171903
3,amazing,1.054181
2,fantastic,0.900392
1,great,0.865964
9,wow,-0.097348
6,bad,-0.97525
10,hate,-1.433518
8,awful,-2.055797
7,terrible,-2.209991


In [18]:
# Re-display the held-out score of the current model.
model.score(X=X_test, y=y_test)

0.8458217144913196

In [19]:
# Keep only the reviews of one product and renumber the rows from 0.
is_diaper_champ = products['name'] == 'Baby Trend Diaper Champ'
product = products[is_diaper_champ].reset_index(drop=True)

# NOTE: this refits the shared `cv` vectorizer on the single-product reviews,
# replacing the vocabulary it learned from the full corpus.
product_cv = cv.fit_transform(product['review'])

# 80/20 split of the single-product reviews.
X_train, X_test, y_train, y_test = train_test_split(
    product_cv,
    product['sentiment'],
    random_state=0,
    test_size=0.2,
)

# Logistic regression with default settings, fitted on this product only.
lreg = LogisticRegression()
model = lreg.fit(X_train, y_train)

# Class predictions for the held-out reviews of this product.
yhat = model.predict(X_test)

# Per-class probabilities for the held-out reviews (shown as the cell output).
model.predict_proba(X_test)

array([[  4.33349397e-03,   9.95666506e-01],
       [  4.38298778e-03,   9.95617012e-01],
       [  1.98317886e-03,   9.98016821e-01],
       [  5.38950315e-04,   9.99461050e-01],
       [  7.43263920e-01,   2.56736080e-01],
       [  4.98076089e-02,   9.50192391e-01],
       [  3.90172841e-03,   9.96098272e-01],
       [  2.62604304e-01,   7.37395696e-01],
       [  3.85798633e-01,   6.14201367e-01],
       [  3.16138319e-01,   6.83861681e-01],
       [  6.34269999e-01,   3.65730001e-01],
       [  1.67882346e-01,   8.32117654e-01],
       [  1.82338983e-02,   9.81766102e-01],
       [  3.69536341e-02,   9.63046366e-01],
       [  7.83051928e-02,   9.21694807e-01],
       [  6.95673038e-01,   3.04326962e-01],
       [  7.06611466e-05,   9.99929339e-01],
       [  5.87003388e-02,   9.41299661e-01],
       [  1.79633403e-04,   9.99820367e-01],
       [  2.64174026e-01,   7.35825974e-01],
       [  2.07802810e-01,   7.92197190e-01],
       [  1.45654292e-02,   9.85434571e-01],
       [  

In [20]:
# Held-out score of the single-product model.
model.score(X=X_test, y=y_test)

0.81666666666666665

In [21]:
# Dense view of the single-product counts: one row per review, one column per term.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available and fall back otherwise.
product_columns = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
pd.DataFrame(product_cv.toarray(), columns=product_columns)

Unnamed: 0,13,able,ago,air,away,babies,baby,bad,bag,bags,...,won,wonderful,work,worked,works,worth,wrap,year,years,yes
0,0,0,1,0,0,0,0,0,3,1,...,0,0,0,0,0,0,0,1,1,0
1,0,0,0,1,0,0,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,2,0,1,0,4,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
6,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
7,0,0,0,1,0,2,1,0,1,2,...,0,0,0,0,1,1,0,0,0,0
8,0,0,0,0,0,0,1,0,2,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,3,1,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Total count of every term over the single-product reviews (column sums of `product_cv`).
features_count = product_cv.sum(axis=0).tolist()[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise.
features_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# One row per term, most frequent first.
features = pd.DataFrame(
    list(zip(features_names, features_count)),
    columns=['features', 'count'],
).sort_values(by=['count'], ascending=False)
features

Unnamed: 0,features,count
42,diaper,724
19,champ,361
9,bags,292
226,use,291
8,bag,230
43,diapers,203
69,genie,197
190,smell,187
145,pail,164
101,just,136


In [23]:
# Pair each term currently held by `cv` with the coefficient the current model
# assigned to it, largest weight first.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available and fall back otherwise.
product_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
pd.DataFrame(
    list(zip(product_terms, model.coef_[0])),
    columns=['word', 'weight'],
).sort_values(by=['weight'], ascending=False)

Unnamed: 0,word,weight
118,love,1.179376
226,use,1.071238
20,change,0.969481
170,recommend,0.928057
227,used,0.818329
72,glad,0.737967
235,week,0.716313
197,special,0.698778
158,problems,0.669353
10,best,0.665752
