In [18]:
import pandas as pd
import numpy as np
from math import sqrt

Loading Data

In [19]:
products = pd.read_csv('amazon_baby_subset.csv')

Sneak peek of data!

In [20]:
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [21]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53072 entries, 0 to 53071
Data columns (total 4 columns):
name         52982 non-null object
review       52831 non-null object
rating       53072 non-null int64
sentiment    53072 non-null int64
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


Listing names of first 10 products

In [22]:
products['name'][:10]

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

Total Positive and Negative Reviews

In [23]:
print('Products with Positive review:',products[products['sentiment']==1]
      ['sentiment'].count())
print('Products with Negative review:',products[products['sentiment']==-1]
      ['sentiment'].count())

Products with Positive review: 26579
Products with Negative review: 26493


Loading words that will matter.

In [24]:
important_words = pd.read_json('important_words.json')

Replacing empty reviews with NA

In [25]:
products = products.fillna({'review':''})

A function to remove punctuation and replace it with whitespace

In [26]:
def remove_punctuation(text):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

Applying remove punctuation function on every review and storing results in 
a new column named clean_review

In [27]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [28]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...


Making bag of words

By making a column for each important word and filling the column with 
count of occurrence of each important word in each review.

In [29]:
for word in important_words[0]:
    products[word] = products['review_clean'].apply(lambda s : s.split()
                                                    .count(word))

In [30]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,seems,picture,completely,wish,buying,babies,won,tub,almost,either
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


Number of reviews containing word Perfect (Quiz Question)

In [31]:
print(products[products['perfect'] > 0]['name'].count())

2945


function for converting data to Numpy

In [32]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.values
    label_sarray = dataframe[label]
    label_array = label_sarray.values
    return(feature_matrix, label_array)

In [33]:
feature_matrix, sentiment = get_numpy_data(dataframe=products,features=
                                           important_words[0].tolist(),
                                           label='sentiment')

Logistic Regression Implementation Starts here.

In [34]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    # YOUR CODE HERE
    score = np.dot(feature_matrix,coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions = np.divide(1,(1 + np.exp((-1)*score)))
    
    # return predictions
    return predictions

In [35]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors,feature)
        # Return the derivative
    return derivative

In [36]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

In [1]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients,
                        step_size, max_iter,l2_regularization=0):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy 
    # array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix=feature_matrix,
                                         coefficients=coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

#         for j in xrange(len(coefficients)): # loop over each coefficient
#             # Recall that feature_matrix[:,j] is the feature column associated
#             # with coefficients[j] compute the derivative for coefficients[j].
#             #Save it in a variable called derivative
#             # YOUR CODE HERE
#             derivative = feature_derivative(errors=errors,
#                                             feature=feature_matrix[j])
        partials = np.dot(feature_matrix.T,errors)
        # add the step size times the derivative to the current coefficient
        # YOUR CODE HERE
        coefficients = np.add(coefficients, np.multiply(step_size,partials) )

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and
                                                           itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment,
                                        coefficients)
            print ('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

testing our Logistic Regression

In [67]:
initial_coefficients = np.zeros(194)
coefficients = logistic_regression_2(feature_matrix=feature_matrix, 
                                   sentiment=sentiment,
                                   initial_coefficients=initial_coefficients, 
                                   step_size=1e-7, max_iter=301,
                                   l2_regularization=0.00001)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13466209
iteration   2: log likelihood of observed labels = -36769.35791028
iteration   3: log likelihood of observed labels = -36763.58741210
iteration   4: log likelihood of observed labels = -36757.82315017
iteration   5: log likelihood of observed labels = -36752.06510723
iteration   6: log likelihood of observed labels = -36746.31326614
iteration   7: log likelihood of observed labels = -36740.56760986
iteration   8: log likelihood of observed labels = -36734.82812146
iteration   9: log likelihood of observed labels = -36729.09478413
iteration  10: log likelihood of observed labels = -36723.36758118
iteration  11: log likelihood of observed labels = -36717.64649600
iteration  12: log likelihood of observed labels = -36711.93151212
iteration  13: log likelihood of observed labels = -36706.22261315
iteration  14: log likelihood of observed labels = -36700.5197

For Class Prediction

In [62]:
predictions = predict_probability(feature_matrix=feature_matrix,
                                         coefficients=coefficients)

In [63]:
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = -1

In [64]:
len(predictions[predictions == 1])

25126

In [65]:
len(predictions[predictions == -1])

27946

Calculatin accuracy

In [66]:
accuracy = (predictions == products['sentiment']).sum() / len(products)
print("Accuracy: %.2f" % accuracy)

Accuracy: 0.75


In [67]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in 
                           zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda 
                                 x:x[1], reverse=True)

In [68]:
word_coefficient_tuples

[(0, 0.015565696580423515)]

##### This implementation is with L2 Regularization

In [64]:
def logistic_regression_2(feature_matrix, sentiment, initial_coefficients,
                        step_size, max_iter,l2_regularization):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy 
    # array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix=feature_matrix,
                                         coefficients=coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

#         for j in xrange(len(coefficients)): # loop over each coefficient
#             # Recall that feature_matrix[:,j] is the feature column associated
#             # with coefficients[j] compute the derivative for coefficients[j].
#             #Save it in a variable called derivative
#             # YOUR CODE HERE
#             derivative = feature_derivative(errors=errors,
#                                             feature=feature_matrix[j])
        partials = np.dot(feature_matrix.T,errors)
        # add the step size times the derivative to the current coefficient
        # YOUR CODE HERE
        coefficient_0 = coefficients[0]
        reg_factor = np.multiply(coefficients,2*l2_regularization)
        if l2_regularization > 0:
            reg_factor[0] = coefficient_0
        
        coefficients = np.add(coefficients, np.multiply(step_size,partials))
        #now applying regularization
        coefficients = np.subtract(coefficients,reg_factor)

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and
                                                           itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment,
                                        coefficients)
            print ('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
#             print(reg_factor)
    return coefficients