# The beginning of this report is copied over from the previous exercise. Scroll further down to see the beginning of 2.3.4.

Pick one of the company data files and build your own classifier. When you're satisfied with its performance (at this point just using the accuracy measure shown in the example), test it on one of the other datasets to see how well these kinds of classifiers translate from one context to another.

Include your model and a brief writeup of your feature engineering and selection process to submit and review with your mentor.

In [1]:
# Import the necessary features for this exercise
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import sklearn
from sklearn.preprocessing import normalize
from sklearn.naive_bayes import BernoulliNB

# Import the data
df = pd.read_table('C:\\Users\\maken\\sentiment_labelled_sentences\\sentiment_labelled_sentences\\amazon_cells_labelled.txt', header = None, names = ['review', 'positive'])

df.describe()
df.head()

Unnamed: 0,review,positive
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [2]:
keywords = ['good', 'excellent', 'great', 'well']

for key in keywords:
    df[str(key)] = df.review.str.contains(
        str(key),
        case=False
    )

# Print the headers to check how the previous method is working. 
df.head()

Unnamed: 0,review,positive,good,excellent,great,well
0,So there is no way for me to plug it in here i...,0,False,False,False,False
1,"Good case, Excellent value.",1,True,True,False,False
2,Great for the jawbone.,1,False,False,True,False
3,Tied to charger for conversations lasting more...,0,False,False,False,False
4,The mic is great.,1,False,False,True,False


In [3]:
# Convert the int column of positive into a boolean
df['positive'] = (df['positive'] == 1)
df.head()

Unnamed: 0,review,positive,good,excellent,great,well
0,So there is no way for me to plug it in here i...,False,False,False,False,False
1,"Good case, Excellent value.",True,True,True,False,False
2,Great for the jawbone.,True,False,False,True,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False
4,The mic is great.,True,False,False,True,False


In [4]:
data = df[keywords]
target = df['positive']

bnb = BernoulliNB()

bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 325


In [5]:
# 32.5% mislabeled. Maybe we should improve the prediction model by using better keywords. 

# Let's find a way to extract the most common words used in positive reviews and then we will update our keyword list. 

# Create a feature for positive reviews.
positive = np.where(df['positive'] == True)
#print(positive[0])




# Read through the strings, count the occurrence of words.
word_count = {}
for rowindex in positive[0]:
    for review in df.loc[rowindex, ['review']].str.lower().str.replace(',','').str.replace('.','').str.split(' '):
        for word in review:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1

from collections import OrderedDict
od = OrderedDict(sorted(word_count.items(), key = lambda x:x[1], reverse=True))
print(od)

OrderedDict([('the', 237), ('and', 188), ('i', 153), ('is', 141), ('it', 124), ('a', 104), ('this', 103), ('to', 86), ('great', 82), ('phone', 80), ('my', 72), ('very', 69), ('for', 65), ('with', 64), ('good', 61), ('of', 49), ('works', 45), ('on', 44), ('have', 38), ('was', 36), ('in', 34), ('product', 33), ('that', 32), ('well', 31), ('quality', 30), ('headset', 30), ('sound', 26), ('so', 26), ('excellent', 24), ('has', 24), ('one', 23), ('are', 22), ('battery', 22), ('use', 21), ('had', 21), ('nice', 21), ('price', 21), ('but', 21), ('you', 20), ('best', 20), ('as', 20), ('love', 20), ('recommend', 19), ("i've", 19), ('all', 19), ('than', 19), ('like', 18), ('would', 17), ('case', 16), ('from', 16), ("it's", 16), ('ear', 16), ('any', 15), ('not', 15), ('really', 15), ('-', 14), ('comfortable', 14), ('easy', 14), ('your', 14), ('happy', 13), ('these', 13), ('new', 12), ('up', 12), ('fine', 12), ('just', 12), ('been', 12), ('no', 12), ('better', 12), ('am', 12), ('can', 11), ('car', 1

In [6]:
# Select your keywords from this list. 

new_keywords = ['great', 'very', 'good', 'works', 'excellent', 'recommend', 'happy', 'better', 'comfortable', 'worked']

for key in new_keywords:
    df[str(key)] = df.review.str.contains(
        str(key),
        case=False
    )

# Print the headers to check how the previous method is working. 
df.head()

Unnamed: 0,review,positive,good,excellent,great,well,very,works,recommend,happy,better,comfortable,worked
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,True,True,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,True,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,True,False,False,False,False,False,False,False,False


In [20]:
data = df[new_keywords]
amazon_target = df['positive']

amazon_bnb = BernoulliNB()

amazon_bnb.fit(data, amazon_target)

amazon_pos_pred = amazon_bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (amazon_target != amazon_pos_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 289


In [8]:
# By increasing the number of keywords and using keywords that are most common, we improved the inaccuracy from 32.5% to 28.9%

# Now see how the model performs on another dataset. 

In [9]:
imdb = pd.read_table('C:\\Users\\maken\\sentiment_labelled_sentences\\sentiment_labelled_sentences\\imdb_labelled.txt', header = None, names = ['review', 'positive'])
# Convert the int column of positive into a boolean
imdb['positive'] = (imdb['positive'] == 1)
imdb.head()

Unnamed: 0,review,positive
0,"A very, very, very slow-moving, aimless movie ...",False
1,Not sure who was more lost - the flat characte...,False
2,Attempting artiness with black & white and cle...,False
3,Very little music or anything to speak of.,False
4,The best scene in the movie was when Gerardo i...,True


In [10]:
# Test the first model

keywords = ['good', 'excellent', 'great', 'well']

for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(
        str(key),
        case=False
    )

# Print the headers to check how the previous method is working. 
imdb.head()

Unnamed: 0,review,positive,good,excellent,great,well
0,"A very, very, very slow-moving, aimless movie ...",False,False,False,False,False
1,Not sure who was more lost - the flat characte...,False,False,False,False,False
2,Attempting artiness with black & white and cle...,False,False,False,False,False
3,Very little music or anything to speak of.,False,False,False,False,False
4,The best scene in the movie was when Gerardo i...,True,False,False,False,False


In [11]:
imdb_data = imdb[keywords]
imdb_positive_target = imdb['positive']

imdb_bnb = BernoulliNB()

imdb_bnb.fit(data, target)

imdb_pos_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 748 points : 330


In [12]:
# 33.0% inaccuracy. Let's test our new model.

for key in new_keywords:
    imdb[str(key)] = imdb.review.str.contains(
        str(key),
        case=False
    )

# Print the headers to check how the previous method is working. 
imdb.head()

Unnamed: 0,review,positive,good,excellent,great,well,very,works,recommend,happy,better,comfortable,worked
0,"A very, very, very slow-moving, aimless movie ...",False,False,False,False,False,True,False,False,False,False,False,False
1,Not sure who was more lost - the flat characte...,False,False,False,False,False,False,False,False,False,False,False,False
2,Attempting artiness with black & white and cle...,False,False,False,False,False,False,False,False,False,False,False,False
3,Very little music or anything to speak of.,False,False,False,False,False,True,False,False,False,False,False,False
4,The best scene in the movie was when Gerardo i...,True,False,False,False,False,False,False,False,False,False,False,False


In [13]:
data = imdb[new_keywords]
target = imdb['positive']

bnb = BernoulliNB()

bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 748 points : 334


In [16]:
# 44.65% inaccuracy
# So, the new model is more accurate for predicting sentiment with amazon reviews, 
# but significantly worse at predicting the sentiment of imdb reviews.  

# 2.3.4 Starts Here

It's time to revisit your classifier from the previous assignment. Using the evaluation techniques we've covered here, look at your classifier's performance in more detail. Then go back and iterate. Repeat this process until you have five different versions of your classifier. Once you've iterated, answer these questions to compare the performance of each:

Do any of your classifiers seem to overfit?

Which seem to perform the best? Why?

What features seemed to be most impactful to performance?

Write up your iterations and answers to the above questions in a few pages. Submit a link below and go over it with your mentor to see if they have any other ideas on how you could improve your classifier's performance.

In [71]:
# accuracy_results will take the necessary data and print the results. 
def accuracy_results(dataframe, fraction):
    
    amazon_training_set = dataframe.sample(frac=fraction)
    
    # create a testing set that is all of the data minus the training set
    amazon_testing_set = dataframe[~dataframe.index.isin(amazon_training_set.index)]
    
    amazon_data = amazon_training_set[new_keywords]
    
    amazon_target = amazon_training_set['positive']

    amazon_training_set_bnb = BernoulliNB()

    amazon_training_set_bnb.fit(amazon_data, amazon_target)

    amazon_pos_pred = amazon_training_set_bnb.predict(amazon_testing_set[new_keywords])

    print("Number of mislabeled points out of a total {} points: {}".format(
        amazon_testing_set[new_keywords].shape[0],
        (amazon_pos_pred != amazon_testing_set['positive']).sum()
    ))
    
    amazon_conf_matrix = confusion_matrix(amazon_testing_set['positive'], amazon_pos_pred)
    print(amazon_conf_matrix)
    
    false_positives = amazon_conf_matrix[0,1]
    print("Number of false positives: {}".format(false_positives))
    
    false_negatives = amazon_conf_matrix[1,0]
    print("Number of false negatives: {}".format(false_negatives))
    
    true_positives = amazon_conf_matrix[1,1]
    print("Number of true positives: {}".format(true_positives))
    
    true_negatives = amazon_conf_matrix[0,0]
    print("Number of true negatives: {}".format(true_negatives))
    
    sensitivity = true_positives / (true_positives + false_negatives)
    print("Sensitivity: {}".format(sensitivity))
    
    specificity = true_negatives / (true_negatives + false_positives)
    print("Specificity: {}".format(specificity))

In [72]:
# Now pass the credentials you want for the first testing and training set. If you run it twice, it will grab different data.
accuracy_results(df, .3)

Number of mislabeled points out of a total 700 points: 195
[[317  39]
 [156 188]]
Number of false positives: 39
Number of false negatives: 156
Number of true positives: 188
Number of true negatives: 317
Sensitivity: 0.5465116279069767
Specificity: 0.8904494382022472


In [69]:
# The low sensitivity shows that the model is not as good at predicting positive values. 
# The high specificity shows that the model is good at predicting the negative values.

In [73]:
accuracy_results(df, .4)

Number of mislabeled points out of a total 600 points: 178
[[260  39]
 [139 162]]
Number of false positives: 39
Number of false negatives: 139
Number of true positives: 162
Number of true negatives: 260
Sensitivity: 0.5382059800664452
Specificity: 0.8695652173913043


In [74]:
accuracy_results(df, .7)

Number of mislabeled points out of a total 300 points: 91
[[128  24]
 [ 67  81]]
Number of false positives: 24
Number of false negatives: 67
Number of true positives: 81
Number of true negatives: 128
Sensitivity: 0.5472972972972973
Specificity: 0.8421052631578947


In [75]:
# The model appears to be consistent since the sensitivity and specificity are similar across different sample sizes. 

In [108]:
# Make a folds function with sklearn's KFold
from sklearn.model_selection import KFold

def kfolds(dataframe, num_splits):
    kfold_data = dataframe
    kf = KFold(n_splits=num_splits)
    splits = kf.split(kfold_data)  

   
    training_sets = []
    testing_sets = [] 
    for training, testing in splits:
        training_sets.append(dataframe.loc[training])
        testing_sets.append(dataframe.loc[testing])
    
    # Need to make arrays for sensitivites and specificities so that we can calculate the mean.
    sensitivities = np.array([])
    specificities = np.array([])
    for training in training_sets[:-1]:
        testing = testing_sets[-1]
 
        training_data = training[new_keywords]
    
        training_target = training['positive']
        
        model_bnb = BernoulliNB()

        model_bnb.fit(training_data, training_target)

        pos_pred = model_bnb.predict(testing[new_keywords])
    
        print("Number of mislabeled points out of a total {} points: {}".format(
            testing[new_keywords].shape[0],
            (pos_pred != testing['positive']).sum()
        ))
    
        amazon_conf_matrix = confusion_matrix(testing['positive'], pos_pred)
        print(amazon_conf_matrix)

        false_positives = amazon_conf_matrix[0,1]
    
        false_negatives = amazon_conf_matrix[1,0]
    
        true_positives = amazon_conf_matrix[1,1]
    
        true_negatives = amazon_conf_matrix[0,0]

        
        sensitivity = true_positives / (true_positives + false_negatives)
        sensitivities = np.append(sensitivities, [sensitivity])
        #sensitivities.append(sensitivity)

        specificity = true_negatives / (true_negatives + false_positives)
        specificities = np.append(specificities, [specificity])
        #specificities.append(specificity)
    
    #print('Sensitivity: {}'.format(sensitivities.mean()))
    #print('Specificity: {}'.format(specificities.mean()))
    #print(sensitivities.mean())
    #print(specificities.mean())
    
    return sensitivities.mean(), specificities.mean()

In [None]:
# Create a dataframe to store the results for any different kfolds calls
stats_data = np.array(number_of_splits, sensitivity, specificity)

In [109]:
# Call the kfolds function and set the num_splits to 3 (the default)
kfolds(df, 3)

Number of mislabeled points out of a total 333 points: 98
[[153  23]
 [ 75  82]]
Number of mislabeled points out of a total 333 points: 101
[[153  23]
 [ 78  79]]


(0.51273885350318471, 0.86931818181818177)

In [105]:
kfolds(df, 10)

Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Number of mislabeled points out of a total 100 points: 33
[[51  7]
 [26 16]]
Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Number of mislabeled points out of a total 100 points: 35
[[51  7]
 [28 14]]
Sensitivity: 0.33862433862433866
Specificity: 0.8793103448275862


In [104]:
kfolds(df, 20)

Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 14
[[29  4]
 [10  7]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mislabeled points out of a total 50 points: 15
[[29  4]
 [11  6]]
Number of mi

Do any of your classifiers seem to overfit?
    
    No.
    
Which seem to perform the best? Why?
    
    The first model (using accuracy_results) function, appears to perform the best due to its accuracy in predicting values as well as the consistency of both the sensitivity and the specificity across varying sample sizes. 
    
What features seemed to be most impactful to performance?
    
    A large k-folds value has an impact on performance. 