# Predicting sentiment from product reviews

In [428]:
from __future__ import division

import time

import graphlab
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

# Data preparation

In [95]:
# Load the GraphLab SFrame, export it to CSV, then read that CSV back with
# pandas so the rest of the notebook works on a plain DataFrame.
gl_path = "E:\\Machine Learning\\U.W\\Classification\\amazon_baby.gl/"
csv_path = "E:\\Machine Learning\\U.W\\Classification\\amazon_baby.csv"

graphlab.SFrame(gl_path).save(csv_path, format="csv")
products = pd.read_csv(csv_path)

In [96]:
type(products)

pandas.core.frame.DataFrame

In [97]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


## Build the word count vector for each review

Let us explore a specific example of a baby product.

In [98]:
products.iloc[269, :]

name      The First Years Massaging Action Teether
review                    A favorite in our house!
rating                                           5
Name: 269, dtype: object

In [99]:
products.iloc[269, :]["review"]

'A favorite in our house!'

Now, we will perform 2 simple data transformations:

1. Remove punctuation.
2. Transform the reviews into word-counts.

**The function below is different from the one used with "SFrame" in GraphLab.**

In [100]:
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` removed."""
    import string
    from functools import reduce

    # Strip one punctuation character per step, starting from the input text.
    return reduce(
        lambda stripped, mark: stripped.replace(mark, ""),
        string.punctuation,
        text,
    )

In [101]:
# Replace missing reviews with the placeholder "0" so the string helpers
# applied to this column never receive NaN. The result is assigned back
# instead of calling fillna(..., inplace=True) on the selected column: the
# in-place form is chained assignment and is not guaranteed to update
# `products` itself.
# NOTE(review): the placeholder later shows up as the token "0" in the word
# counts; an empty string may be a better fill value -- confirm.
products["review"] = products["review"].fillna("0")

In [102]:
products["review_no_punc"] = products["review"].apply(remove_punctuation)

In [103]:
products.head()

Unnamed: 0,name,review,rating,review_no_punc
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


In [104]:
products["review_no_punc"][269]

'A favorite in our house'

**The function above works as well as in Graphlab.**

**Knowledge of Python:**

1. When *products* is a DataFrame, *products["review"]* is a pandas.Series, *products[["review"]]* is a DataFrame, *products["review"][0]* is a string, and *products[["review"]].loc[0]* is a pandas.Series.
2. The *replace* method used above is a string method, so it must be applied to each review individually (here via *apply*).

**The function below is different than Graphlab**

In [105]:
def word_count(text):
    """Map each whitespace-separated token of `text` to its number of occurrences."""
    counts = {}

    for token in text.split():
        # Unseen tokens start from zero; then count this occurrence.
        counts[token] = counts.get(token, 0) + 1

    return counts

In [106]:
products["word_count"] = products["review_no_punc"].apply(word_count)

In [107]:
products.head()

Unnamed: 0,name,review,rating,review_no_punc,word_count
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...,"{u'and': 5, u'stink': 1, u'months': 1, u'order..."
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,"{u'and': 3, u'love': 1, u'it': 3, u'highly': 1..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,"{u'and': 2, u'quilt': 1, u'it': 1, u'comfortab..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,"{u'and': 3, u'ingenious': 1, u'What': 1, u'lov..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,"{u'and': 2, u'all': 1, u'help': 1, u'cried': 1..."


**The function above works as well as Graphlab.**

## Extract sentiments

We will **ignore** all reviews with *rating = 3*, since they tend to have a neutral sentiment.

In [108]:
# Drop the neutral (3-star) reviews. The explicit .copy() makes `products` an
# independent frame, so adding columns to it afterwards is not flagged as a
# write to a slice of the unfiltered data (SettingWithCopyWarning).
products = products[products["rating"] != 3].copy()

len(products)

166752

Now, we will assign reviews with a rating of 4 or higher to be *positive* reviews, while the ones with rating of 2 or lower are *negative*. For the sentiment column, we use +1 for the positive class label and -1 for the negative class label.

In [109]:
def rating_to_sentiment(rating):
    """Return +1 for a rating above 3 and -1 otherwise."""
    if rating > 3:
        return +1
    return -1


products["sentiment"] = products["rating"].apply(rating_to_sentiment)

products.head()

Unnamed: 0,name,review,rating,review_no_punc,word_count,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,"{u'and': 3, u'love': 1, u'it': 3, u'highly': 1...",1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,"{u'and': 2, u'quilt': 1, u'it': 1, u'comfortab...",1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,"{u'and': 3, u'ingenious': 1, u'What': 1, u'lov...",1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,"{u'and': 2, u'all': 1, u'help': 1, u'cried': 1...",1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,"{u'and': 2, u'cute': 1, u'would': 1, u'help': ...",1


## Split data into training and test sets

Let's perform a train/test split with 80% of the data in the training set and 20% of the data in the test set. We pass `random_state=42` to `train_test_split` so that the split is reproducible and everyone gets the same result.

**The method below is different than Graphlab.**

In [110]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split reproducible between runs.
train_data, test_data = train_test_split(products, random_state=42, test_size=0.2)

print(len(train_data))
print(len(test_data))

133401
33351


**In Graphlab, the size is 133416/33336, pretty close.**

# Train a sentiment classifier with logistic regression

We will now use logistic regression to create a sentiment classifier on the training data. This model will use the column **word_count** as a feature and the column **sentiment** as the target.

In [111]:
products.head()

Unnamed: 0,name,review,rating,review_no_punc,word_count,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,"{u'and': 3, u'love': 1, u'it': 3, u'highly': 1...",1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,"{u'and': 2, u'quilt': 1, u'it': 1, u'comfortab...",1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,"{u'and': 3, u'ingenious': 1, u'What': 1, u'lov...",1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,"{u'and': 2, u'all': 1, u'help': 1, u'cried': 1...",1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,"{u'and': 2, u'cute': 1, u'would': 1, u'help': ...",1


In [112]:
products = products.reset_index(drop=True)

In [113]:
products.head()

Unnamed: 0,name,review,rating,review_no_punc,word_count,sentiment
0,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,"{u'and': 3, u'love': 1, u'it': 3, u'highly': 1...",1
1,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,"{u'and': 2, u'quilt': 1, u'it': 1, u'comfortab...",1
2,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,"{u'and': 3, u'ingenious': 1, u'What': 1, u'lov...",1
3,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,"{u'and': 2, u'all': 1, u'help': 1, u'cried': 1...",1
4,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,"{u'and': 2, u'cute': 1, u'would': 1, u'help': ...",1


**The code below transforms the ["word_count"] to sparse matrix.**

In [21]:
# NOTE: this cell is disabled -- every statement below is commented out.
# It turned each review's word_count dict into a small (id, feature, value)
# frame and appended it to `df_new`, one review at a time, over all of
# `products`. Growing a DataFrame with append inside a loop re-creates the
# frame on every iteration, so the cost grows roughly quadratically with the
# number of reviews.
# start = time.time()

# df_new = pd.DataFrame()

# for i in range(len(products)):
#     ddd = pd.DataFrame.from_dict(products["word_count"][i], orient="index")
#     ddd = ddd.reset_index()
#     ddd.columns = ["feature", "value"]
#     ddd.insert(loc=0, column="id", value=i)
    
#     df_new = df_new.append(ddd, ignore_index=True)
    
    
# runtime = time.time() - start

**The row-by-row conversion above (now commented out) ran for a whole day!**

To save time, we only use the first 1000 observations.

In [363]:
# Work on the first 1000 reviews only, split 80/20 with a fixed seed.
train_data_1000_df, test_data_1000_df = train_test_split(products.iloc[:1000, :], test_size=0.2, random_state=42)

print(len(train_data_1000_df))
print(len(test_data_1000_df))

# Put the training rows first and the test rows after them in one frame.
# pd.concat is used instead of DataFrame.append, which is deprecated and no
# longer available in recent pandas releases.
products_1000_df = pd.concat([train_data_1000_df, test_data_1000_df])

print(len(products_1000_df))

800
200
1000


In [364]:
products_1000_df = products_1000_df.reset_index(drop=True)

In [365]:
# Flatten the per-review word_count dicts into long format: one
# (id, feature, value) row per distinct word of each review, where `id` is the
# review's index label in products_1000_df.
# Building the rows directly avoids .apply(pd.Series), which creates one Series
# per review and a mostly-NaN column for every distinct word before stacking.
# Counts are stored as floats, which is what the stacked frame contained.
long_rows = [
    (review_id, word, float(count))
    for review_id, counts in products_1000_df["word_count"].items()
    for word, count in counts.items()
]
df_new = pd.DataFrame(long_rows, columns=["id", "feature", "value"])

In [366]:
from sklearn.preprocessing import LabelEncoder

In [367]:
# Encode every distinct word in df_new["feature"] as an integer and store the
# codes next to the words in a new "feature_id" column.
f = LabelEncoder()
df_new_label = f.fit_transform(df_new["feature"])

df_new["feature_id"] = df_new_label

In [368]:
from scipy.sparse import csr_matrix

# Coordinate-style inputs: v[k] is the count of word j[k] in review i[k].
v = np.array(df_new["value"])
i = np.array(df_new["id"])
j = np.array(df_new["feature_id"])

# One row per review, one column per distinct word. The row count is taken
# from the review frame rather than from the largest id in df_new: a review
# without any words contributes no rows to df_new, and if it were the last
# review the matrix would come out shorter than the label vector.
row = len(products_1000_df)
col = j.max() + 1

mat = csr_matrix((v, (i, j)), shape=(row, col))
features_toarray = mat.toarray()

In [369]:
# Labels as a single-column 2-D array (one row per review).
sentiment_toarray = np.array(products_1000_df["sentiment"]).reshape(-1, 1)

In [370]:
print features_toarray.shape
print sentiment_toarray.shape

(1000L, 7064L)
(1000L, 1L)


In [371]:
products_1000_array = np.hstack((features_toarray, sentiment_toarray))

In [372]:
print products_1000_array.shape

products_1000_array

(1000L, 7065L)


array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.]])

In [373]:
# The rows of products_1000_array are split by position into a training part
# and a test part. The boundary is taken from the size of the training frame
# instead of a hard-coded 800, which would silently go stale if the sample
# size or the test fraction changed.
n_train_1000 = len(train_data_1000_df)
train_data_1000_arr = products_1000_array[:n_train_1000, :]
test_data_1000_arr = products_1000_array[n_train_1000:, :]

print(len(train_data_1000_arr))
print(len(test_data_1000_arr))

800
200


# Train a sentiment classifier with logistic regression

In [374]:
from sklearn.linear_model import LogisticRegression

sentiment_model = LogisticRegression(solver="lbfgs").fit(train_data_1000_arr[:, :-1], train_data_1000_arr[:, -1])

In [375]:
sentiment_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [376]:
weights = sentiment_model.coef_
intercept = sentiment_model.intercept_

print weights
print intercept

[[-0.00472237  0.00015791  0.03777541 ...  0.          0.00669177
   0.0250207 ]]
[1.10986443]


In [377]:
# Tally the learned coefficients by sign.
num_positive_weights = int(np.sum(weights > 0))
num_negative_weights = int(np.sum(weights < 0))
num_zero_weights = int(np.sum(weights == 0))

print("Number of positive weights: %s" % num_positive_weights)
print("Number of negative weights: %s" % num_negative_weights)
print("Number of zero weights: %s" % num_zero_weights)

Number of positive weights: 3993
Number of negative weights: 2132
Number of zero weights: 939


## Making predictions with logistic regression

In [378]:
sample_test_data = test_data_1000_df.iloc[10:13, :]

print sample_test_data["rating"]
sample_test_data

811    1
76     5
636    5
Name: rating, dtype: int64


Unnamed: 0,name,review,rating,review_no_punc,word_count,sentiment
811,Safety 1st Tot-Lok Magnetic Key,Not the Tot-Lok key shown in the picture. It ...,1,Not the TotLok key shown in the picture It is...,"{u'customer': 1, u'picture': 1, u'shown': 1, u...",-1
76,Cloth Diaper Pins Stainless Steel Traditional ...,"As another reviewer noted, these are great for...",5,As another reviewer noted these are great for ...,"{u'and': 4, u'dont': 1, u'just': 2, u'Great': ...",1
636,"Summer Infant Ultimate Crib Sheet, 52&quot; x ...",I have 4 daughters and was greatful for this c...,5,I have 4 daughters and was greatful for this c...,"{u'and': 4, u'IDEA': 1, u'all': 2, u'placing':...",1


In [379]:
sample_test_data.iloc[0, :]["review"]

"Not the Tot-Lok key shown in the picture.  It is similar, but not the Tot-Lok key. The customer service is terrible. Don't buy from this seller."

In [380]:
sample_test_data.iloc[1, :]["review"]

"As another reviewer noted, these are great for matching socks before washing.  Now once the socks come out of the washer, I hang them to dry with the pins still on.  The socks stay on the line to dry without clothespins, they just hang with one sock on each side of the line.  After they are dried, I just open the pin and leave it by the hamper on a shelf.  As I put dirty socks into the hamper, I grab a pin and use it.  Has saved a lot of time and I don't have the problem of lost socks or mismatched socks on my feet.  Great idea and am glad I read about this here on the reviews.  Wish I had done this my whole life."

In [381]:
sample_test_data.iloc[2, :]["review"]

'I have 4 daughters and was greatful for this crib sheet.  I had purchased 1 over the telephone several years ago from an ad.  After I received it, I called and purchased 3 more.  My kids are a little older now and no longer in baby beds, but I still use them when they are sick by placing them under their pillows.  It still saves me from changing sheets all of the time.  I have purchased a sheet for everyone I know having a baby for the past 5 years and they all end up buying more of them. THIS IS THE BEST IDEA FOR A CRIB SHEET THAT WAS EVER INVENTED!!!!!'

In [389]:
# Linear score (intercept + features . weights) for every test row; only rows
# 10..12 are kept, the same positions that sample_test_data was sliced from.
scores = intercept + np.dot(test_data_1000_arr[:, :-1], weights.T)[10:13]

In [390]:
scores

array([[-0.46432869],
       [ 4.53245859],
       [ 5.27352481]])

### Predicting sentiment

In [391]:
sample_test_data

Unnamed: 0,name,review,rating,review_no_punc,word_count,sentiment
811,Safety 1st Tot-Lok Magnetic Key,Not the Tot-Lok key shown in the picture. It ...,1,Not the TotLok key shown in the picture It is...,"{u'customer': 1, u'picture': 1, u'shown': 1, u...",-1
76,Cloth Diaper Pins Stainless Steel Traditional ...,"As another reviewer noted, these are great for...",5,As another reviewer noted these are great for ...,"{u'and': 4, u'dont': 1, u'just': 2, u'Great': ...",1
636,"Summer Infant Ultimate Crib Sheet, 52&quot; x ...",I have 4 daughters and was greatful for this c...,5,I have 4 daughters and was greatful for this c...,"{u'and': 4, u'IDEA': 1, u'all': 2, u'placing':...",1


In [392]:
# Threshold each score at zero: a positive score maps to +1, anything else to -1.
prediction = [1 if score_row[0] > 0 else -1 for score_row in scores]

prediction

[-1, 1, 1]

In [394]:
print "Class predictions according to SKlearn:"
print sentiment_model.predict(test_data_1000_arr[10:13, :-1])

Class predictions according to SKlearn:
[-1.  1.  1.]


### Probability predictions

In [395]:
scores

array([[-0.46432869],
       [ 4.53245859],
       [ 5.27352481]])

In [396]:
1 / (1+np.exp(-scores))

array([[0.38595944],
       [0.98936022],
       [0.99490062]])

In [397]:
# predict_proba returns one probability per class for each review, so the
# output is labelled as probabilities rather than class predictions.
print("Class probabilities according to SKlearn:")
print(sentiment_model.predict_proba(test_data_1000_arr[10:13, :-1]))

Class predictions according to SKlearn:
[[0.61404056 0.38595944]
 [0.01063978 0.98936022]
 [0.00509938 0.99490062]]


# Find the most positive (and negative) review

In [398]:
sentiment_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [410]:
# Store the second predict_proba column for every test review in a "predict"
# column. .assign returns a new frame instead of writing into the frame handed
# back by train_test_split, which pandas may flag as a write to a copy.
test_data_1000_df = test_data_1000_df.assign(
    predict=sentiment_model.predict_proba(test_data_1000_arr[:, :-1])[:, 1]
)

# Show a preview only rather than dumping every test row.
test_data_1000_df.head(10)

Unnamed: 0,name,review,rating,review_no_punc,word_count,sentiment,predict
521,Baby Trend Diaper Champ,I just wanted to say that I love my Diaper Cha...,5,I just wanted to say that I love my Diaper Cha...,"{u'and': 1, u'feed': 1, u'love': 1, u'just': 1...",1,0.994392
737,Safety 1st Power Strip Cover,My little one looooves plug ins. This product...,4,My little one looooves plug ins This product ...,"{u'all': 2, u'less': 1, u'moments': 1, u'despi...",1,0.117232
740,Safety 1st Grow with Me Portable Booster Seat,My first son got this booster when he was 2 ye...,5,My first son got this booster when he was 2 ye...,"{u'all': 1, u'they': 1, u'just': 1, u'reviewer...",1,0.930052
660,"Summer Infant, Ultimate Training Pad - Twin Ma...",This is my second one and it works like a char...,5,This is my second one and it works like a char...,"{u'and': 2, u'dont': 1, u'backing': 1, u'is': ...",1,0.965302
411,Baby Trend Diaper Champ,I am really disappointed with the Diaper Champ...,2,I am really disappointed with the Diaper Champ...,"{u'just': 1, u'grossOtherwise': 1, u'Babies': ...",-1,0.247608
678,Odorless Diaper Pail by Safety 1st,I liked the idea of this pail using regular ki...,1,I liked the idea of this pail using regular ki...,"{u'liked': 1, u'is': 2, u'idea': 1, u'Also': 1...",-1,0.777873
626,"Summer Infant Ultimate Crib Sheet, 52&quot; x ...",The reviews that say this product is soft are ...,2,The reviews that say this product is soft are ...,"{u'and': 1, u'all': 2, u'old': 1, u'Somewhere'...",-1,0.349377
513,Baby Trend Diaper Champ,I had tried a Diaper Genie at a friend's house...,5,I had tried a Diaper Genie at a friends house ...,"{u'all': 3, u'seemed': 1, u'refills': 2, u'We'...",1,0.095494
859,Safety 1st Space Saver Fold-Up Bath Tub,We first saw this fold-up infant tub in the NI...,5,We first saw this foldup infant tub in the NIC...,"{u'infant': 3, u'just': 1, u'over': 1, u'washc...",1,0.989376
136,Pedal Farm Tractor,We bought this tractor to accompany the traile...,5,We bought this tractor to accompany the traile...,"{u'and': 5, u'load': 1, u'love': 1, u'into': 1...",1,0.968170


In [416]:
test_data_1000_df[["name", "predict"]].sort_values(by=["predict"], ascending=False).head(20)

Unnamed: 0,name,predict
377,Baby Trend Diaper Champ,1.0
261,Crown Crafts The Original NoJo BabySling by Dr...,1.0
408,Baby Trend Diaper Champ,0.999998
365,Baby Trend Diaper Champ,0.999996
299,Baby Trend Diaper Champ,0.999995
429,Baby Trend Diaper Champ,0.999994
584,Basic Comfort Rest EZ II Pregnancy Wedge,0.999911
486,Baby Trend Diaper Champ,0.999821
208,Fisher Price - Baby Bowling,0.999779
55,Our Baby Girl Memory Book,0.999725


In [431]:
test_data_1000_df[["name", "predict"]].sort_values(by=["predict"]).head(20)

Unnamed: 0,name,predict
277,Sassy Busy Bugs Bar,0.000588
998,Safety 1st Deluxe 4-in-1 Bath Station,0.007901
986,Safety 1st Deluxe 4-in-1 Bath Station,0.008755
902,Safety 1st Deluxe 4-in-1 Bath Station,0.01112
749,Graco ultraclear baby monitor,0.020646
363,Baby Trend Diaper Champ,0.021105
901,Safety 1st Deluxe 4-in-1 Bath Station,0.026961
822,Safety 1st Hospital's Choice Accu Scan Ear The...,0.030559
942,Safety 1st Deluxe 4-in-1 Bath Station,0.049139
978,Safety 1st Deluxe 4-in-1 Bath Station,0.059762


## Compute accuracy of the classifier

In [432]:
def get_classification_accuracy(model, data, true_labels):
    """Return the fraction of rows in `data` that `model` labels correctly.

    Parameters
    ----------
    model : object with a ``predict(data)`` method
        Fitted classifier.
    data : array-like
        Feature rows, one per example.
    true_labels : array-like
        Expected labels, in the same order as the rows of `data`.

    Returns
    -------
    float
        Number of matching predictions divided by ``len(data)``.

    Raises
    ------
    ValueError
        If the predictions and `true_labels` do not have the same shape.
    ZeroDivisionError
        If `data` is empty.
    """
    predictions = np.asarray(model.predict(data))
    # Plain arrays are compared strictly by position, whatever index the
    # labels carried.
    expected = np.asarray(true_labels)
    if predictions.shape != expected.shape:
        raise ValueError(
            "predictions have shape %s but true_labels have shape %s"
            % (predictions.shape, expected.shape)
        )

    num_correct = int(np.sum(predictions == expected))
    # Explicit float division: the result must not depend on whether
    # `from __future__ import division` is active in the calling session.
    return num_correct / float(len(data))

In [433]:
get_classification_accuracy(sentiment_model, test_data_1000_arr[:, :-1], test_data_1000_df["sentiment"])

0.81

## Learn another classifier with fewer words

In [436]:
significant_words = ["love", "great", "easy", "old", "little", "perfect", "loves",
                    "well", "able", "car", "broke", "less", "even", "waste", "disappointed",
                    "work", "product", "money", "would", "return"]

In [437]:
len(significant_words)

20

In [556]:
# Keep only the significant words from the first review's word counts.
# The source dict gets its own name: binding it to `dict` would shadow the
# built-in type for every cell executed afterwards.
first_review_counts = products["word_count"][0]
dict_new = {
    word: count
    for word, count in first_review_counts.items()
    if word in significant_words
}

dict_new

{'disappointed': 1, 'love': 1}

In [557]:
def dict_trim(text, words=None):
    """Return the entries of a word-count dict whose word is in `words`.

    Parameters
    ----------
    text : dict
        Mapping of word -> count (despite the name, this is not a string).
    words : container of str, optional
        Words to keep. When omitted, the notebook-level `significant_words`
        list is used.

    Returns
    -------
    dict
        A new dict holding only the kept entries; `text` is left unchanged.
    """
    if words is None:
        words = significant_words

    # Iterate the items once instead of indexing text.keys()[i], which
    # rebuilds the key list on every access and only works on Python 2.
    return {word: count for word, count in text.items() if word in words}

In [563]:
# Work on a copy: a plain assignment would only give products_1000_df a second
# name, and the column added below would then appear in that frame as well.
products_1000_df_simple = products_1000_df.copy()

products_1000_df_simple["word_count_subset"] = products_1000_df_simple["word_count"].apply(dict_trim)

In [568]:
# Keep only the reviews whose trimmed word-count dict is not empty.
has_significant_word = products_1000_df_simple["word_count_subset"] != {}
products_1000_df_simple_subset = products_1000_df_simple[has_significant_word]

In [569]:
len(products_1000_df_simple_subset) 

850

In [572]:
# Split the reviews that contain significant words 80/20 with a fixed seed.
train_data_1000_df_simple, test_data_1000_df_simple = train_test_split(products_1000_df_simple_subset, test_size=0.2, random_state=42)

print(len(train_data_1000_df_simple))
print(len(test_data_1000_df_simple))

# Training rows first, test rows after them, renumbered from 0.
# pd.concat is used instead of DataFrame.append, which is deprecated and no
# longer available in recent pandas releases.
products_1000_df_simple_subset = pd.concat([train_data_1000_df_simple, test_data_1000_df_simple])
products_1000_df_simple_subset = products_1000_df_simple_subset.reset_index(drop=True)

print(len(products_1000_df_simple_subset))

680
170
850


In [574]:
# Flatten the trimmed word-count dicts into long format: one
# (id, feature, value) row per significant word of each review, where `id` is
# the review's index label in products_1000_df_simple_subset.
# Building the rows directly avoids creating one Series per review with
# .apply(pd.Series) before stacking. Counts are stored as floats, which is
# what the stacked frame contained.
long_rows_simple = [
    (review_id, word, float(count))
    for review_id, counts in products_1000_df_simple_subset["word_count_subset"].items()
    for word, count in counts.items()
]
df_new_simple = pd.DataFrame(long_rows_simple, columns=["id", "feature", "value"])

In [576]:
f = LabelEncoder()
df_new_simple_label = f.fit_transform(df_new_simple["feature"])

df_new_simple["feature_id"] = df_new_simple_label

In [580]:
# Coordinate-style inputs: v[k] is the count of word j[k] in review i[k].
v = np.array(df_new_simple["value"])
i = np.array(df_new_simple["id"])
j = np.array(df_new_simple["feature_id"])

# One row per review, one column per encoded word. The row count is taken from
# the review frame rather than from the largest id in df_new_simple, so the
# matrix always has exactly as many rows as there are labels.
row = len(products_1000_df_simple_subset)
col = j.max() + 1

mat = csr_matrix((v, (i, j)), shape=(row, col))
features_toarray_simple = mat.toarray()

In [581]:
features_toarray_simple.shape

(850L, 20L)

In [590]:
# Labels as a single-column 2-D array (one row per review).
sentiment_toarray_simple = np.array(products_1000_df_simple_subset["sentiment"]).reshape(-1, 1)

In [591]:
print features_toarray_simple.shape
print sentiment_toarray_simple.shape

(850L, 20L)
(850L, 1L)


In [592]:
products_1000_array_simple = np.hstack((features_toarray_simple, sentiment_toarray_simple))

In [594]:
print products_1000_array_simple.shape

products_1000_array_simple

(850L, 21L)


array([[ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  2.,  1.],
       [ 0.,  0.,  0., ...,  0.,  4., -1.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.]])

In [596]:
# The rows of products_1000_array_simple are split by position into a training
# part and a test part. The boundary is taken from the size of the training
# frame instead of a hard-coded 680, which would silently go stale if the
# number of retained reviews or the test fraction changed.
n_train_simple = len(train_data_1000_df_simple)
train_data_1000_arr_simple = products_1000_array_simple[:n_train_simple, :]
test_data_1000_arr_simple = products_1000_array_simple[n_train_simple:, :]

print(len(train_data_1000_arr_simple))
print(len(test_data_1000_arr_simple))

680
170


## Train a logistic regression model on a subset of data

In [597]:
simple_model = LogisticRegression(solver="lbfgs").fit(train_data_1000_arr_simple[:, :-1], train_data_1000_arr_simple[:, -1])

In [598]:
simple_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [605]:
get_classification_accuracy(simple_model, test_data_1000_arr_simple[:, :-1], test_data_1000_df_simple["sentiment"])

0.8176470588235294

# Comparing models

In [606]:
get_classification_accuracy(sentiment_model, train_data_1000_arr[:, :-1], train_data_1000_df["sentiment"])

0.99875

In [607]:
get_classification_accuracy(simple_model, train_data_1000_arr_simple[:, :-1], train_data_1000_df_simple["sentiment"])

0.8058823529411765

In [608]:
get_classification_accuracy(sentiment_model, test_data_1000_arr[:, :-1], test_data_1000_df["sentiment"])

0.81

In [610]:
get_classification_accuracy(simple_model, test_data_1000_arr_simple[:, :-1], test_data_1000_df_simple["sentiment"])

0.8176470588235294

## Baseline: Majority class prediction

In [611]:
num_positive = (train_data_1000_df["sentiment"] == 1).sum()
num_negative = (train_data_1000_df["sentiment"] == -1).sum()

print num_positive
print num_negative

598
202


In [612]:
print (test_data_1000_df["sentiment"] == 1).sum()
print (test_data_1000_df["sentiment"] == -1).sum()

149
51


In [613]:
(test_data_1000_df["sentiment"]==1).sum()/len(test_data_1000_df)

0.745