In [163]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import accuracy_score

## 2.3.4 Sentiment Analysis - Iterate and Evaluate Classifier

#### #1. Evaluate Classifier As-Is

In [164]:
#Evaluate using Amazon data

#import data
df = pd.read_csv('amazon.txt', delimiter='\t', header=None)
df.columns = ['message', 'positive']

#look at class balance
df.positive.value_counts()

1    500
0    500
Name: positive, dtype: int64

In [165]:
#standardize to all lowercase
df.message = df.message.apply(lambda x: str(x).lower())

#generate positive keywords
keywords =  ['great', 'good', 'excellent', 'love', 'like', 'liked', 'quality', 'awesome', 'enjoy', 'impressed', 
            'satisfied', 'well', 'recommend', 'sweet', 'sweetest', 'secure', 'best', 'easy', 
            'winner', 'comfortable', 'impressive', 'inexpensive', 'beautiful', 'pretty', 'nice', 'funny', 
            'value', 'ideal', 'happy'
            ]

#write loop to check for keywords in message and populate data columns
for key in keywords:
    df[str(key)] = df.message.str.contains(str(key), case=False)

In [166]:
# **************************************// NO DATA SPLIT //*******************************************
    
#specify outcome (target) and inputs (data)
data = df[keywords]
target = df['positive']

#import Bernoulli NB classifier
from sklearn.naive_bayes import BernoulliNB

#create instance of classifier
bnb = BernoulliNB()

#train model
bnb.fit(data, target)

#classify prediction and store as new variable
df['predicted'] = bnb.predict(data)

#print accuracy
print('Model accuracy (all training data): ', accuracy_score(df.positive, df.predicted))

Model accuracy (all training data):  0.762


In [167]:
# ************************// HOLD OUT 30% DATA FOR TEST SET //***************************

#specify variables
x = data = df[keywords]
y = target = df['positive']

#re-initiate classifier  ##is this a necessary step for each scenario?
bnb = BernoulliNB()

# Split data 70% train, 30% test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=17)

# Train model, predict outcome with training set
y_pred_train = bnb.fit(x_train, y_train).predict(x_train)

# Test using test data
y_pred_test = bnb.predict(x_test)


print('Hold Out Accuracy - training set:', accuracy_score(y_train, y_pred_train))
print('Hold Out Accuracy - test set:', accuracy_score(y_test, y_pred_test))

Hold Out Accuracy - training set: 0.758571428571
Hold Out Accuracy - test set: 0.75


In [168]:
# **************************// 5-WAY CROSS VALIDATION //***********************************

#import cross validation score
from sklearn.model_selection import cross_val_score

#re-initiate classifier  
bnb = BernoulliNB()

#specify variables
x = data = df[keywords]
y = target = df['positive']

#setup cross validation.  cv=5 means 5-fold cross validation using KFold
scores = cross_val_score(bnb, x, y, cv=5)

print(scores)
print(np.mean(scores))


[ 0.8    0.76   0.72   0.735  0.73 ]
0.749


Based on the above results, the model does not appear to be overfit in any of the test scenarios.  When using all of the data as the train/test set, the accuracy is 76%.  When splitting the into train(70%)/test(30%), the model has the same accuracy on the training set as it does the unseen test data.  When using 5-way cross validation (Kfolds), the accuracy ranges from 72-80%. 
<br><br>
To summarize:
* All data: 76%
* 70/30 split: 75% both sets
* Cross validated: 72-80%, average of 75%

#### #2. Modify features and evaluate classifier again.

In [169]:
#count instances of each word in message to identify most common
df.sum()

message        so there is no way for me to plug it in here i...
positive                                                     500
great                                                         98
good                                                          75
excellent                                                     27
love                                                          24
like                                                          33
liked                                                       True
quality                                                       49
awesome                                                        5
enjoy                                                       True
impressed                                                      9
satisfied                                                      4
well                                                          41
recommend                                                     27
sweet                    

In [170]:
#Keep words with counts > 20
top_words = ['great', 'good', 'excellent', 'love', 'like', 'quality', 'well', 'recommend', 'best', 'nice']
print('top_words length:', len(top_words))
print('keywords length:', len(keywords))
print('# features reduced by:', len(keywords) - len(top_words))

top_words length: 10
keywords length: 29
# features reduced by: 19


In [171]:
#re-import data to start with clean version
df2 = pd.read_csv('amazon.txt', delimiter='\t', header=None)
df2.columns = ['message', 'positive']

#standardize to all lowercase
df2.message = df.message.apply(lambda x: str(x).lower())

#write loop to check for condensed keywords in message and populate data columns
for word in top_words:
    df2[str(word)] = df2.message.str.contains(str(word), case=False)

#confirm df looks how we want it to
df2.head()

Unnamed: 0,message,positive,great,good,excellent,love,like,quality,well,recommend,best,nice
0,so there is no way for me to plug it in here i...,0,False,False,False,False,False,False,False,False,False,False
1,"good case, excellent value.",1,False,True,True,False,False,False,False,False,False,False
2,great for the jawbone.,1,True,False,False,False,False,False,False,False,False,False
3,tied to charger for conversations lasting more...,0,False,False,False,False,False,False,False,False,False,False
4,the mic is great.,1,True,False,False,False,False,False,False,False,False,False


In [172]:
# **********************// Re-run model: no train/test split or cross validation //***************************

#specify outcome (target) and inputs (data)
data = df2[top_words]
target = df2['positive']

#re-initiate classifier
bnb = BernoulliNB()

#train model
bnb.fit(data, target)

#classify prediction and store as new variable
df2['predicted'] = bnb.predict(data)

#print accuracy
print('Model accuracy, reduced keywords (all training data): ', accuracy_score(df2.positive, df2.predicted))

Model accuracy, reduced keywords (all training data):  0.731


In [173]:
# ************************// HOLD OUT 30% DATA FOR TEST SET //***************************

#specify variables
x = data = df2[top_words]
y = target = df2['positive']

#re-initiate classifier 
bnb = BernoulliNB()

# Split data 70% train, 30% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=17)

# Train model, predict outcome with training set
y_pred_train = bnb.fit(x_train, y_train).predict(x_train)

# Test using test data
y_pred_test = bnb.predict(x_test)


print('Hold Out Accuracy - training set:', accuracy_score(y_train, y_pred_train))
print('Hold Out Accuracy - test set:', accuracy_score(y_test, y_pred_test))

Hold Out Accuracy - training set: 0.722857142857
Hold Out Accuracy - test set: 0.73


In [174]:
# **************************// 10-WAY CROSS VALIDATION //***********************************

#re-initiate classifier  
bnb = BernoulliNB()

#specify variables
x = data = df2[top_words]
y = target = df2['positive']

#setup cross validation.  cv=10 means 10-fold cross validation using KFold
scores = cross_val_score(bnb, x, y, cv=10)

print(scores)
print(np.mean(scores))

[ 0.83  0.75  0.8   0.73  0.71  0.69  0.67  0.71  0.75  0.63]
0.727


Summary of results:

* Keyword features reduced from 21 to 10
* Modeling with full data set gives 73% accuracy (vs. 76% from before)
* Modeling with 70/30 split gives 73% accuracy with test set (vs. 75% from before)
* Modeling with 10-way cross validation gives an average accuracy of 73% (vs. 75% before), larger variation observed in test results relative to 5-way cross validation.

Conclusions:
* All model configurations approximately the same on overall accuracy.  Likely because the dataset is fairly large (n=1000) and equally balanced between positive and negative sentiments.  Model features can be reduced by 50% while still maintaining same level of accuracy.