In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

In [3]:
import string

In [4]:
products = pd.read_csv(os.getcwd() + '/amazon_baby_subset.csv/amazon_baby_subset.csv')

In [5]:
##Reviews file
important_words = pd.read_json(os.getcwd() + '/important_words.json/important_words.json')

In [6]:
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [7]:
products.shape

(53072, 4)

In [8]:
important_words.head()

Unnamed: 0,0
0,baby
1,one
2,great
3,love
4,use


In [9]:
important_words.shape

(193, 1)

In [10]:
#Remove Punctuations
table = str.maketrans(string.punctuation, len(string.punctuation)*" ")
def remove_punctuation(text):
    return text.translate(table)

In [11]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

### Apply the remove_punctuation function on every element of the review column and assign the result to the new column review_clean. Note. Many data frame packages support apply operation for this type of task. Consult appropriate manuals.

In [12]:
products['review_clean'] = list(map(remove_punctuation, products['review']))

In [13]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried non stop when I trie...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago ...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of baby s first and favorite books and it...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this ...


In [14]:
len(important_words)

193

### For each word in important_words, we compute a count for the number of times the word occurs in the review.

In [15]:
from collections import Counter

In [16]:
important_words[0]

0            baby
1             one
2           great
3            love
4             use
5           would
6            like
7            easy
8          little
9            seat
10            old
11           well
12            get
13           also
14         really
15            son
16           time
17         bought
18        product
19           good
20       daughter
21           much
22          loves
23       stroller
24            put
25         months
26            car
27          still
28           back
29           used
          ...    
163       started
164      anything
165          last
166       company
167          come
168      returned
169         maybe
170          took
171         broke
172         makes
173          stay
174       instead
175          idea
176          head
177          said
178          less
179          went
180       working
181          high
182          unit
183         seems
184       picture
185    completely
186          wish
187       

In [17]:
import timeit

In [18]:
help(timeit.time.time)

Help on built-in function time in module time:

time(...)
    time() -> floating point number
    
    Return the current time in seconds since the Epoch.
    Fractions of a second may be present if the system clock provides them.



In [19]:
#Count the frequency of a word in the review.
start_time = timeit.time.time()
for word in important_words[0]:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
end_time = timeit.time.time()
print("Time required for lambda and apply:- %.3f seconds" %((end_time-start_time)))

Time required for lambda and apply:- 58.464 seconds


In [20]:
def check_presence(word, text):
    return (text.split().count(word))

start_time = timeit.time.time()
for word in important_words[0]:
    #List Comprehension
    products[word] = [check_presence(word, rev) for rev in products['review']] 
end_time = timeit.time.time()

print("Time required for List Comprehension:- %.3f seconds" %((end_time-start_time)))


Time required for List Comprehension:- 55.441 seconds


In [21]:
0 in products.columns

False

In [22]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried non stop when I trie...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago ...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of baby s first and favorite books and it...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this ...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


### Quiz Question. How many reviews contain the word perfect?

In [23]:
if('perfect' in products.columns):
    print(products.loc[products['perfect']==1,'perfect'].shape[0])

2165


In [24]:
products.loc[products['perfect']==1, :].shape

(2165, 198)

In [25]:
#Convert data frame to multi-dimensional array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    feature_matrix = dataframe[['constant'] + list(features)]

    feature_matrix, label_array =  list(map(lambda x:x.as_matrix(), 
                                       [feature_matrix, dataframe[label]])) 
    return (feature_matrix, label_array)

In [26]:
feature_matrix, sentiment = get_numpy_data(products, important_words[0], 'sentiment')

  


In [27]:
products[['constant'] + important_words[0].tolist()].head()

Unnamed: 0,constant,baby,one,great,love,use,would,like,easy,little,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
feature_matrix

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

### Quiz Question: How many features are there in the feature_matrix?

In [29]:
feature_matrix.shape

(53072, 194)

In [30]:
194

194

### Quiz Question: Assuming that the intercept is present, how does the number of features in feature_matrix relate to the number of features in the logistic regression model?

In [31]:
#Both are same
#We learn the weights

## Predict Probability

In [32]:
#Implement the Link Function
def predict_probability(feature_matrix, coefficients):
        
    score = np.dot(feature_matrix, coefficients)
    
    probability_positive = ((1+np.e**(-1*score))**-1)
    
    return (probability_positive)

In [33]:

def cosine_similarity(a, b):
    #a and b are arrays
    def find_mod(vec):
        return (sum([ele**2 for ele in vec]))**(0.5)
    
    mod_a, mod_b = list(map(find_mod, [a,b]))
    
    print(mod_a, mod_b)
    return (np.dot(a,b)/(mod_a*mod_b))

A = np.array([1,2,3])
B = np.array([1, 2, 3])
#A, B = list(map(lambda x: np.array, [A,B]))
print(A, B)
cosine_similarity(A, B)

[1 2 3] [1 2 3]
3.7416573867739413 3.7416573867739413


1.0

## Compute derivative of log likelihood with respect to a single coefficient

In [34]:
def feature_derivative(errors, feature):
    return (np.dot(feature, errors))

In [35]:
np.unique(sentiment)

array([-1,  1])

In [36]:
indicator = pd.Series(sentiment).map({-1:0, 1:1})

## Log Likelihood

In the main lecture, our focus was on the likelihood. In the advanced optional video, however, we introduced a transformation of this likelihood---called the log-likelihood---that simplifies the derivation of the gradient and is more numerically stable. Due to its numerical stability, we will use the log-likelihood instead of the likelihood to assess the algorithm.

In [37]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

### Taking gradient steps

In [38]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients,
                        step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    
    for itr in range(max_iter):
        #coeff = initial_coefficients[i]
        
        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)
        prediction = predict_probability(feature_matrix,
                                         coefficients)
        
        errors = np.array(indicator) - prediction
        
        for j in range(len(coefficients)):
            derivative = feature_derivative(errors,
                                            feature_matrix[:, j])
            coefficients[j] += step_size*derivative
                    
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print ('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
        
    return (coefficients)

## 14. Now, let us run the logistic regression solver with the parameters below:

    feature_matrix = feature_matrix extracted
    sentiment = sentiment extracted
    initial_coefficients = a 194-dimensional vector filled with zeros
    step_size = 1e-7
    max_iter = 301

In [39]:
initial_coefficients = np.zeros(194)
step_size = 1e-7
max_iter = 301
initial_coefficients.shape

(194,)

In [40]:


dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])

correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),          1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_predictions = np.array( [ 1./(1+np.exp(-correct_scores[0])), 1./(1+np.exp(-correct_scores[1])) ] )

print ('The following outputs must match ')
print ('------------------------------------------------')
print ('correct_predictions           =', correct_predictions)
print ('output of predict_probability =', predict_probability(dummy_feature_matrix, dummy_coefficients))



The following outputs must match 
------------------------------------------------
correct_predictions           = [0.98201379 0.26894142]
output of predict_probability = [0.98201379 0.26894142]


In [41]:
sentiment

array([ 1,  1,  1, ..., -1, -1, -1])

In [42]:
dummy_sentiment = np.array([1, -1])

In [43]:
compute_log_likelihood(dummy_feature_matrix, dummy_sentiment, dummy_coefficients)

-0.3314116154360326

In [44]:
feature_matrix.shape

(53072, 194)

In [45]:
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])
dummy_sentiment = np.array([-1, 1])

correct_indicators  = np.array( [ -1==+1,                                       1==+1 ] )
correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),                     1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_first_term  = np.array( [ (correct_indicators[0]-1)*correct_scores[0],  (correct_indicators[1]-1)*correct_scores[1] ] )
correct_second_term = np.array( [ np.log(1. + np.exp(-correct_scores[0])),      np.log(1. + np.exp(-correct_scores[1])) ] )

correct_ll          =      sum( [ correct_first_term[0]-correct_second_term[0], correct_first_term[1]-correct_second_term[1] ] ) 

print ('The following outputs must match ')
print ('------------------------------------------------')
print ('correct_log_likelihood           =', correct_ll)
print ('output of compute_log_likelihood =', compute_log_likelihood(dummy_feature_matrix, dummy_sentiment, dummy_coefficients))


The following outputs must match 
------------------------------------------------
correct_log_likelihood           = -5.331411615436032
output of compute_log_likelihood = -5.331411615436032


In [46]:
np.array(indicator).reshape(53072, 1).shape

(53072, 1)

In [47]:
dummy_prediction = predict_probability(dummy_feature_matrix, dummy_coefficients)

In [48]:
dummy_indicator = np.array([-1==+1, +1==+1])
dummy_indicator

array([False,  True])

In [49]:
dummy_prediction

array([0.98201379, 0.26894142])

In [50]:
dummy_error = dummy_indicator - dummy_prediction

In [51]:
dummy_result = pd.DataFrame({'Indicator':dummy_indicator, 
                             'Prediction':dummy_prediction,
                             'Error':dummy_error},
                             columns = ['Indicator', 'Prediction', 'Error'])

In [52]:
dummy_result

Unnamed: 0,Indicator,Prediction,Error
0,False,0.982014,-0.982014
1,True,0.268941,0.731059


In [53]:
initial_coefficients = np.zeros(194)

In [54]:
max_iter

301

In [55]:

coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients,
                        step_size, max_iter)


iteration   0: log likelihood of observed labels = -36782.24149905
iteration   1: log likelihood of observed labels = -36777.77993493
iteration   2: log likelihood of observed labels = -36773.32246359
iteration   3: log likelihood of observed labels = -36768.86907436
iteration   4: log likelihood of observed labels = -36764.41975666
iteration   5: log likelihood of observed labels = -36759.97449997
iteration   6: log likelihood of observed labels = -36755.53329383
iteration   7: log likelihood of observed labels = -36751.09612785
iteration   8: log likelihood of observed labels = -36746.66299174
iteration   9: log likelihood of observed labels = -36742.23387522
iteration  10: log likelihood of observed labels = -36737.80876812
iteration  11: log likelihood of observed labels = -36733.38766031
iteration  12: log likelihood of observed labels = -36728.97054176
iteration  13: log likelihood of observed labels = -36724.55740245
iteration  14: log likelihood of observed labels = -36720.1482

In [56]:
coefficients

array([ 4.73673054e-03,  1.36807710e-02, -5.24032888e-03,  5.06832503e-02,
        6.51628882e-02,  7.55938637e-03, -5.38797585e-02, -4.01267488e-03,
        6.20295470e-02,  4.49642128e-02,  1.73838517e-03,  1.47559181e-02,
        2.06982177e-02, -2.93239973e-02,  1.37731585e-02,  1.01583220e-03,
        1.06265309e-02, -1.70365851e-02, -1.22182219e-02, -3.61762595e-02,
        2.14703057e-03,  1.64555566e-02, -5.82086591e-03,  4.48773640e-02,
        7.20303166e-03,  9.41287916e-04, -1.64434290e-05,  9.53392509e-03,
        8.11709255e-03, -1.94594532e-02,  1.30086724e-03,  1.18890164e-02,
       -1.14665743e-02, -3.02141443e-02,  2.26859843e-02,  1.65406585e-02,
        7.08093759e-04, -1.05719575e-02, -7.41837552e-04, -1.02562826e-02,
       -2.81967169e-03,  5.41935965e-03,  4.59599698e-03,  4.90897395e-03,
       -2.48482415e-04,  3.62836558e-03, -2.76060363e-03, -1.94944998e-02,
        1.24059131e-02,  1.31786668e-03, -1.89826154e-02,  6.95054917e-03,
        1.26710591e-02, -

### Quiz question: As each iteration of gradient ascent passes, does the log likelihood increase or decrease? 

In [57]:
print("Log Likelihood Increases")

Log Likelihood Increases


## Predicting sentiments

15. Recall from lecture that class predictions for a data point x can be computed from the coefficients w using the following formula:

In [58]:
def predict_sentiment(feature_mat, coeff):
    score = np.dot(feature_mat, coeff)
    
    #According to Logistic Function
    if(score>0):
        return (+1)
    else:
        return (-1)
    

### Quiz question: How many reviews were predicted to have positive sentiment? 

In [59]:
predicted_sentiments = [predict_sentiment(feature_matrix[i,:], coefficients)
                        for i in range(len(feature_matrix))]
print("Number of predicted positive sentiments:- "+
      str(len([ele for ele in predicted_sentiments if(ele==1)])))

Number of predicted positive sentiments:- 24714


In [60]:
feature_matrix.shape

(53072, 194)

In [61]:
print("Number of predicted negative sentiments:- "+
      str(len([ele for ele in predicted_sentiments if(ele==-1)])))

Number of predicted negative sentiments:- 28358


### Quiz question: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)

In [62]:
'sentiment' in products.columns

True

In [63]:
#Accuracy  = #Sentiments Predicted as Positive / Total # Observations
#Actually positive sentiments
actual_sentiments = products['sentiment']

results = pd.DataFrame({'Actual': actual_sentiments, 'Predicted': predicted_sentiments},
                       columns=['Actual', 'Predicted'])
results.head()

Unnamed: 0,Actual,Predicted
0,1,1
1,1,-1
2,1,1
3,1,1
4,1,1


In [64]:
def calculate_accuracy(a, b):
    if(a==b):
        return 1
    else:
        return 0


In [65]:
results_predictions = [calculate_accuracy(act, pred) 
                             for act, pred in zip(results['Actual'],
                                                  results['Predicted'])]
results_predictions

[1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,


In [66]:
#Accuracy 
correct_count = len([ele for ele in results_predictions if(ele==1)])
print("Accuracy:- %.2f" %(correct_count/len(results_predictions)))

Accuracy:- 0.74


## Which words contribute most to positive & negative sentiments

In [67]:
coefficients[1:].shape

(193,)

In [68]:
important_words[0].ravel()

array(['baby', 'one', 'great', 'love', 'use', 'would', 'like', 'easy',
       'little', 'seat', 'old', 'well', 'get', 'also', 'really', 'son',
       'time', 'bought', 'product', 'good', 'daughter', 'much', 'loves',
       'stroller', 'put', 'months', 'car', 'still', 'back', 'used',
       'recommend', 'first', 'even', 'perfect', 'nice', 'bag', 'two',
       'using', 'got', 'fit', 'around', 'diaper', 'enough', 'month',
       'price', 'go', 'could', 'soft', 'since', 'buy', 'room', 'works',
       'made', 'child', 'keep', 'size', 'small', 'need', 'year', 'big',
       'make', 'take', 'easily', 'think', 'crib', 'clean', 'way',
       'quality', 'thing', 'better', 'without', 'set', 'new', 'every',
       'cute', 'best', 'bottles', 'work', 'purchased', 'right', 'lot',
       'side', 'happy', 'comfortable', 'toy', 'able', 'kids', 'bit',
       'night', 'long', 'fits', 'see', 'us', 'another', 'play', 'day',
       'money', 'monitor', 'tried', 'thought', 'never', 'item', 'hard',
       'plast

In [69]:
#Positive Reviews
'''
The list containing important words is present.
'''
words_coeff_tuple = [(word, coefficient) for word, coefficient in zip(important_words[0].ravel(), coefficients[1:])]
words_coeff_tuple

[('baby', 0.013680770964074688),
 ('one', -0.005240328876921257),
 ('great', 0.0506832503387355),
 ('love', 0.06516288823238833),
 ('use', 0.0075593863720944815),
 ('would', -0.0538797585447843),
 ('like', -0.004012674876131312),
 ('easy', 0.062029546990689015),
 ('little', 0.04496421284998458),
 ('seat', 0.0017383851737173646),
 ('old', 0.014755918079191233),
 ('well', 0.02069821773359),
 ('get', -0.02932399727767066),
 ('also', 0.01377315851278538),
 ('really', 0.0010158322026321705),
 ('son', 0.010626530928533847),
 ('time', -0.017036585087027243),
 ('bought', -0.01221822188862161),
 ('product', -0.03617625947473382),
 ('good', 0.002147030566264864),
 ('daughter', 0.01645555655729781),
 ('much', -0.005820865907973413),
 ('loves', 0.04487736399389728),
 ('stroller', 0.007203031662434025),
 ('put', 0.000941287915687742),
 ('months', -1.644342902241027e-05),
 ('car', 0.009533925094999023),
 ('still', 0.008117092546400051),
 ('back', -0.019459453150996592),
 ('used', 0.00130086723525915

In [70]:
words_coeff_tuple = sorted(words_coeff_tuple, key=lambda x:x[1], reverse=True)

In [71]:
words_coeff_tuple

[('love', 0.06516288823238833),
 ('easy', 0.062029546990689015),
 ('great', 0.0506832503387355),
 ('little', 0.04496421284998458),
 ('loves', 0.04487736399389728),
 ('perfect', 0.022685984298359947),
 ('well', 0.02069821773359),
 ('fits', 0.01708208087034107),
 ('nice', 0.016540658492544564),
 ('daughter', 0.01645555655729781),
 ('happy', 0.015558951246856233),
 ('old', 0.014755918079191233),
 ('best', 0.013930675730989597),
 ('also', 0.01377315851278538),
 ('baby', 0.013680770964074688),
 ('works', 0.012671059093092074),
 ('soft', 0.012405913106793417),
 ('recommend', 0.011889016377716874),
 ('bit', 0.011081029826280007),
 ('son', 0.010626530928533847),
 ('comfortable', 0.009810283260449552),
 ('car', 0.009533925094999023),
 ('easily', 0.008207289516803398),
 ('still', 0.008117092546400051),
 ('play', 0.007800334817552406),
 ('size', 0.007729922669456849),
 ('use', 0.0075593863720944815),
 ('stroller', 0.007203031662434025),
 ('room', 0.0069505491719928075),
 ('lot', 0.006583022773509

### Ten "most positive" words 
18. Compute the 10 words that have the most positive coefficient values. These words are associated with positive sentiment.

In [72]:
top_10_positive_words = [top_tuple[0] for top_tuple in words_coeff_tuple[:10]]
top_10_positive_words

['love',
 'easy',
 'great',
 'little',
 'loves',
 'perfect',
 'well',
 'fits',
 'nice',
 'daughter']

### Quiz question: Which word is not present in the top 10 "most positive" words? 

### Ten "most negative" words

19. Next, we repeat this exerciese on the 10 most negative words. That is, we compute the 10 words that have the most negative coefficient values. These words are associated with negative sentiment.

In [73]:
words_coeff_tuple[-15:]

[('buy', -0.01898261536532469),
 ('tried', -0.019163565631510185),
 ('back', -0.019459453150996592),
 ('could', -0.01949449976413961),
 ('disappointed', -0.019732758233784013),
 ('monitor', -0.02068181121460328),
 ('thought', -0.020756425233543412),
 ('work', -0.02094666567861414),
 ('money', -0.0223526167156249),
 ('waste', -0.02315534512199584),
 ('return', -0.024570662566524983),
 ('get', -0.02932399727767066),
 ('even', -0.030214144320453017),
 ('product', -0.03617625947473382),
 ('would', -0.0538797585447843)]

### Quiz question: Which word is not present in the top 10 "most negative" words?