In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the review data; the path is relative to the notebook's working directory.
products = pd.read_csv("amazon_baby_subset.csv")
# Preview the first rows to check the columns that were read.
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [6]:
# Class balance: how many rows carry each distinct sentiment label.
products['sentiment'].value_counts()

 1    26579
-1    26493
dtype: int64

## Reading important_words.json

In [7]:
def read_index(filename):
    """Load the list of important words from a JSON file.

    Parameters
    ----------
    filename : str
        Path to a JSON file whose top-level value is an array of words,
        e.g. ``["baby", "one", "great"]``.

    Returns
    -------
    list of str
        The words in file order, with surrounding whitespace stripped.
    """
    # Function-scope import: the notebook's import cell does not load json.
    import json

    with open(filename, 'r') as f:
        # Parse the whole document with a real JSON parser. Splitting the
        # first line on commas breaks on multi-line files and on words that
        # contain a comma, and str.translate(None, ...) is Python-2-only.
        words = json.load(f)
    # str() keeps native strings on Python 2 (json yields unicode there),
    # matching what the hand-rolled parser used to return.
    return [str(word).strip() for word in words]

In [8]:
# Load the vocabulary used as bag-of-words features and show the first few entries.
important_words = read_index('important_words.json')
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(important_words[:5])

['baby', 'one', 'great', 'love', 'use']


## Remove punctuation

In [9]:
# Replace missing reviews with the empty string so later string operations see a value.
products = products.fillna({'review':''})  # fill in N/A's in the review column

In [10]:
import string

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Parameters
    ----------
    text : str or any
        The text to clean. Byte strings and unicode strings are both
        cleaned; any other value is returned as ``str(text)`` unchanged,
        as before.

    Returns
    -------
    str
        The cleaned string (or the plain string form of a non-string input).
    """
    # (str, type(u'')) covers byte and unicode strings on Python 2 and is
    # simply str on Python 3. Previously unicode input skipped cleaning.
    if not isinstance(text, (str, type(u''))):
        return str(text)
    # Filtering characters works on every Python version;
    # str.translate(None, chars) exists only for Python 2 byte strings.
    return ''.join(ch for ch in text if ch not in string.punctuation)

In [11]:
# Store a punctuation-free copy of every review in a new column, then preview the frame.
products['review_clean'] = products['review'].map(remove_punctuation)
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


## Adding word count

In [12]:
# Tokenise each cleaned review once up front; splitting inside the loop
# repeated the same work for every important word.
review_tokens = products['review_clean'].apply(lambda s: s.split())
for word in important_words:
    # One column per word: how many times it appears as a whole token.
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

In [13]:
# Show every column name, including the word-count columns just added.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(products.columns.values)

['name' 'review' 'rating' 'sentiment' 'review_clean' 'baby' 'one' 'great'
 'love' 'use' 'would' 'like' 'easy' 'little' 'seat' 'old' 'well' 'get'
 'also' 'really' 'son' 'time' 'bought' 'product' 'good' 'daughter' 'much'
 'loves' 'stroller' 'put' 'months' 'car' 'still' 'back' 'used' 'recommend'
 'first' 'even' 'perfect' 'nice' 'bag' 'two' 'using' 'got' 'fit' 'around'
 'diaper' 'enough' 'month' 'price' 'go' 'could' 'soft' 'since' 'buy' 'room'
 'works' 'made' 'child' 'keep' 'size' 'small' 'need' 'year' 'big' 'make'
 'take' 'easily' 'think' 'crib' 'clean' 'way' 'quality' 'thing' 'better'
 'without' 'set' 'new' 'every' 'cute' 'best' 'bottles' 'work' 'purchased'
 'right' 'lot' 'side' 'happy' 'comfortable' 'toy' 'able' 'kids' 'bit'
 'night' 'long' 'fits' 'see' 'us' 'another' 'play' 'day' 'money' 'monitor'
 'tried' 'thought' 'never' 'item' 'hard' 'plastic' 'however' 'disappointed'
 'reviews' 'something' 'going' 'pump' 'bottle' 'cup' 'waste' 'return'
 'amazon' 'different' 'top' 'want' 'problem' 

## Quiz question: How many reviews contain the word perfect?



In [14]:
# Number of reviews whose 'perfect' count is non-zero. The vectorised
# Series.sum() replaces a Python-level loop over the boolean column.
print((products['perfect'] > 0).sum())

2955


## Convert data frame to multi-dimensional array



In [15]:
def get_numpy_data(dataframe, features, label):
    """Extract a feature matrix (with intercept column) and a label vector.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data. NOTE: a ``'constant'`` column of ones is added to (or
        overwritten in) this frame in place, as the original did.
    features : sequence of str
        Column names to use as features; any sequence is accepted.
    label : str
        Name of the target column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``feature_matrix`` whose first column is the constant 1 followed by
        the requested features, and ``label_array`` with the target values.
    """
    dataframe['constant'] = 1
    # list(...) lets callers pass a tuple or an Index as well as a list.
    feature_columns = ['constant'] + list(features)
    # .values works on every pandas version; as_matrix() was removed in 1.0.
    feature_matrix = dataframe[feature_columns].values
    label_array = dataframe[label].values
    return (feature_matrix, label_array)

In [16]:
# Build the model inputs: one column per important word (plus the constant
# column added by get_numpy_data) and the sentiment labels as the target.
feature_matrix, target = get_numpy_data(products, important_words, 'sentiment')

## Quiz question: How many features are there in the feature_matrix?

In [17]:
# (number of rows, number of columns) of the feature matrix.
feature_matrix.shape

(53072, 194)

## Quiz question: Assuming that the intercept is present, how does the number of features in feature_matrix relate to the number of features in the logistic regression model?

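They are equal (1 to 1): feature_matrix already contains the constant column for the intercept, so each of its 194 columns corresponds to exactly one coefficient of the model.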

## Estimating conditional probability with link function

In [18]:
import math

def predict_probability(feature_matrix, coefficients):
    """Produce a probabilistic estimate of P(y_i = +1 | x_i, w).

    Parameters
    ----------
    feature_matrix : array-like
        One row of feature values per data point.
    coefficients : array-like
        One weight per column of ``feature_matrix``.

    Returns
    -------
    numpy.ndarray
        One estimate per row, each between 0 and 1.
    """
    score = np.dot(feature_matrix, coefficients)
    # sigmoid(s) = 1 / (1 + exp(-s)) = exp(-log(1 + exp(-s))).
    # np.logaddexp(0, -s) evaluates log(1 + exp(-s)) without overflowing,
    # whereas math.exp(-s) raises OverflowError for very negative scores.
    # Returning an array (not a lazy map) also keeps `indicator - predictions`
    # working on Python 3.
    predictions = np.exp(-np.logaddexp(0., -score))

    return predictions

## Compute derivative of log likelihood with respect to a single coefficient


In [19]:
def feature_derivative(errors, feature):
    """Return the sum of the element-wise products of ``errors`` and ``feature``.

    This is the derivative of the log likelihood with respect to the single
    coefficient that multiplies ``feature``.
    """
    return np.multiply(errors, feature).sum()

## Log-likelihood function

In [23]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Parameters
    ----------
    feature_matrix : array-like
        One row of feature values per data point.
    sentiment : array-like
        Observed labels; entries equal to +1 count as positive.
    coefficients : array-like
        One weight per column of ``feature_matrix``.

    Returns
    -------
    float
        Sum over data points of ``(1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i))``.
    """
    # asarray makes the comparison element-wise even for a plain list;
    # `some_list == 1` would otherwise be the single value False.
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(feature_matrix, coefficients)
    # logaddexp(0, -s) == log(1 + exp(-s)) but stays finite for very negative
    # scores, where np.exp(-s) overflows to inf and makes the result -inf.
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0., -scores))
    return lp

## Gradient Ascent

In [29]:
from math import sqrt

def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent on the log likelihood.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array, one row per data point and one column per coefficient.
    sentiment : array-like
        Observed labels; entries equal to +1 count as positive.
    initial_coefficients : array-like
        Starting value for each coefficient (copied, never modified).
    step_size : float
        Multiplier applied to each derivative before updating a coefficient.
    max_iter : int
        Number of gradient-ascent iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficients after ``max_iter`` iterations.
    """
    # Force a float copy: with integer initial values numpy would build an
    # int array and every fractional in-place update would be truncated.
    coefficients = np.array(initial_coefficients, dtype=float)

    # Indicator for (y_i = +1). It does not depend on the coefficients, so it
    # is computed once; asarray keeps it element-wise for list input.
    indicator = (np.asarray(sentiment) == +1)

    # range() works on Python 2 and 3 (xrange is Python-2-only).
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_i,w) with the current coefficients
        predictions = predict_probability(feature_matrix, coefficients)

        # Errors are indicator - predictions
        errors = indicator - predictions

        for j in range(len(coefficients)):  # loop over each coefficient
            # feature_matrix[:,j] is the feature column associated with coefficients[j]
            derivative = feature_derivative(errors, feature_matrix[:, j])
            # Step uphill: add the step size times the derivative
            coefficients[j] += step_size * derivative

        # Report the log likelihood on a thinning schedule: every iteration up
        # to 15, then every 10 up to 100, every 100 up to 1000, and so on.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # Pad the iteration number to the digit count of max_iter so the
            # lines align. Single-argument print() is the same on Python 2 and 3.
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

## Logistic regression run

In [30]:
# Train from all-zero coefficients; keyword arguments make the step size and
# iteration count explicit at the call site.
log_reg = logistic_regression(
    feature_matrix,
    target,
    initial_coefficients=np.zeros(feature_matrix.shape[1]),
    step_size=1e-7,
    max_iter=301,
)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

## Quiz question: As each iteration of gradient ascent passes, does the log likelihood increase or decrease?



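It increases: the log likelihood rises at every reported iteration (from -36780.92 at iteration 0 to -36700.50 at iteration 14).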

## Predicting Sentiments

In [55]:
def predict_log_reg(feature_matrix, coefficients):
    """Predict a class label (+1 or -1) for every row of ``feature_matrix``.

    Parameters
    ----------
    feature_matrix : array-like
        One row of feature values per data point.
    coefficients : array-like
        One weight per column of ``feature_matrix``.

    Returns
    -------
    list of int
        ``1`` where the score is strictly positive, ``-1`` otherwise
        (a score of exactly 0 maps to ``-1``).
    """
    scores = np.dot(feature_matrix, coefficients)
    # A list comprehension always yields a real list; map() is lazy on
    # Python 3, which would break slicing and element-wise comparison.
    return [1 if s > 0 else -1 for s in scores]

## Quiz question: How many reviews were predicted to have positive sentiment?



In [69]:
# Predict a label for every review with the trained coefficients.
preds = predict_log_reg(feature_matrix, log_reg)
print(preds[:5])
# Count the negative predictions first, then the positive ones.
print(sum(1 for p in preds if p == -1))
print(sum(1 for p in preds if p == 1))

[1, -1, 1, 1, 1]
27946
25126


## Measuring accuracy



In [71]:
# Compare the first few true labels and predictions side by side.
print(target[:20])
print(preds[:20])
# Number of matching labels; asarray keeps the comparison element-wise.
num_correct = np.sum(target == np.asarray(preds))
print(num_correct)
print(len(target))
# Derive the ratio from the values above instead of pasting in numbers
# from an earlier output, which go stale when the model changes.
print(float(num_correct) / len(target))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1]
39903
53072
0.751865390413


In [44]:
def accuracy(true_labels, pred_labels):
    """Fraction of predictions that equal the true label.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    pred_labels : array-like
        Predicted labels, in the same order and of the same length.

    Returns
    -------
    float
        Number of matching positions divided by ``len(true_labels)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Convert both sides so the comparison is element-wise. Two plain lists
    # compare to a single bool, and sum() of a bool raises TypeError.
    true_arr = np.asarray(true_labels)
    pred_arr = np.asarray(pred_labels)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "true_labels and pred_labels must have the same shape, got %s and %s"
            % (true_arr.shape, pred_arr.shape))
    return float(np.sum(true_arr == pred_arr)) / float(len(true_labels))

## Quiz question: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)



In [47]:
# NOTE(review): the output recorded under this cell (0.363...) disagrees with
# the 0.7519 computed from 39903 matches out of 53072 in the previous cell.
# This cell's execution count (In [47]) precedes the cells that define
# predict_log_reg and preds (In [55], In [69]), so the saved value presumably
# came from an earlier `preds`. Re-run after Restart & Run All to confirm.
accuracy(target, preds)

0.36305396442568588

## Which words contribute most to positive & negative sentiments



In [48]:
# Drop the first coefficient (the intercept) so the remaining ones line up
# one-to-one with important_words.
coefficients = list(log_reg[1:])
# Pair each word with its coefficient and order from largest to smallest.
word_coefficient_tuples = sorted(
    zip(important_words, coefficients),
    key=lambda pair: pair[1],
    reverse=True,
)

## Quiz question: Which word is not present in the top 10 "most positive" words?



In [49]:
# First ten entries of the (word, coefficient) list.
word_coefficient_tuples[:10]

[('great', 0.066546084170457681),
 ('love', 0.065890762922123258),
 ('easy', 0.064794586802578394),
 ('little', 0.045435626308421372),
 ('loves', 0.044976401394906038),
 ('well', 0.030135001092107077),
 ('perfect', 0.029739937104968462),
 ('old', 0.020077541034775385),
 ('nice', 0.018408707995268992),
 ('daughter', 0.017703199905701694)]

## Quiz question: Which word is not present in the top 10 "most negative" words?


In [51]:
# Last ten entries of the (word, coefficient) list.
word_coefficient_tuples[-10:]

[('monitor', -0.02448210054589172),
 ('return', -0.026592778462247283),
 ('back', -0.027742697230661334),
 ('get', -0.028711552980192585),
 ('disappointed', -0.028978976142317068),
 ('even', -0.030051249236035808),
 ('work', -0.03306951529475273),
 ('money', -0.038982037286487116),
 ('product', -0.04151103339210889),
 ('would', -0.053860148445203128)]