# Simple Keyword Model

In [49]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

amazon = pd.read_table("amazon.txt",header=None)
yelp = pd.read_table("yelp.txt")

amazon.columns = ['text','positive']
yelp.columns = ['text','positive']

# Make outcome column boolean
amazon['positive'] = amazon['positive'] == 1


# Positive and negative keywords
positive_words = ['awesome', 'superb','perfect','enjoyable','outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great',
                 'amazing','love','loved','enjoy','nice','awesome','best','super']
negative_words = ['bad', 'terrible','useless', 'hate', 'sucks','worst','awful','ineffective','lame','stupid','cheap','unreliable',
                 'fail','poor','broke','broken',"doesn't work"]

# Making the df column names for each variable
#for word in positive_words + negative_words:
   #amazon[word] = amazon['text'].str.contains(' ' + word + ' ',case=False) | amazon['text'].str.lower().str.split(' ').str.get(0).str.contains(word)

for word in positive_words + negative_words:
    #amazon[word] = amazon['text'].apply(lambda x: [w in x for w in positive_words + negative_words])
    amazon[word] = amazon['text'].apply(lambda x: word in x)
    
# Set up the outcome variable and model vars
variables = amazon[positive_words + negative_words]
outcome = amazon['positive']

# Import model, instantiate, and train
from sklearn.naive_bayes import BernoulliNB
bernoulli = BernoulliNB()
bernoulli.fit(variables, outcome)

# Store predictions
model1predictions = bernoulli.predict(variables)

# Add array for whether or not prediction was accurate for said observation
model1results =  model1predictions == amazon['positive']
model1_accuracy = 1- (model1results.value_counts(dropna=False).loc[False] / len(model1results))
#model1_sensitivity

from sklearn.metrics import confusion_matrix
c = confusion_matrix(outcome, model1predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(model1_accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(model1_accuracy-0.5))
confusion

Accuracy: 0.637
Sensitivity: 0.316
Specificity: 0.958
Improvement over baseline: 0.137


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,479,21
Pos,342,158


### Assessment

The overall accuracy is 0.637, with a specificity of 0.96 and sensitivity of 0.32. Interestingly, this simplified model is very good at predicting negative reviews but pretty terrible at guessing positive reviews. I don't think it's accurate enough to bother using on the other data considering the other versions we have available.

# Common words in pos/neg reviews model

In [43]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

amazon = pd.read_table("amazon.txt",header=None)
yelp = pd.read_table("yelp.txt",header=None)

amazon.columns = ['text','positive']
yelp.columns = ['text','positive']

# Make outcome column boolean
amazon['positive'] = amazon['positive'] == 1
yelp['positive'] = yelp['positive'] == 1

In [44]:
# Converting all text columns into a list of words
amazon['text'] = np.array(amazon['text'].str.lower().str.split(" |,|\.|!|\?"))

# Creating positive and negative subsets to calculate most popular words for pos & neg
amazon_pos = amazon[amazon['positive'] == 1]
amazon_neg = amazon[amazon['positive'] == 0]

# Calculate most common words in positive & negative reviews
x = sum(amazon_pos['text'], [])
y = sum(amazon_neg['text'], [])

from collections import Counter

for item in x:
    c = Counter(x)
    common_tuples = c.most_common(200)
    top_pos_words = [i[0] for i in common_tuples]

for item in y:
    c = Counter(y)
    common_tuples = c.most_common(200)
    top_neg_words = [i[0] for i in common_tuples]

In [45]:
# Remove overlapping words for each
pos_unique = [x for x in top_pos_words if x not in top_neg_words]
neg_unique = [x for x in top_neg_words if x not in top_pos_words]

In [46]:
print(len(pos_unique),len(neg_unique))

74 74


In [47]:
print(pos_unique)

['works', 'excellent', 'price', 'best', 'nice', 'love', 'easy', 'comfortable', 'happy', 'fine', 'been', 'far', 'clear', 'device', 'cell', 'fits', 'camera', 'got', 'working', 'highly', 'pretty', 'years', 'without', 'everything', 'cool', 'wear', 'lot', 'jabra', 'people', 'found', 'both', 'light', 'perfectly', 'value', 'impressed', 'say', 'priced', 'sturdy', 'gets', 'little', 'tried', 'definitely', 'pleased', 'small', 'voice', 'awesome', 'overall', 'range', 'amazon', 'cases', 'original', 'ears', 'seems', 'keyboard', 'their', 'several', 'most', 'headsets', 'verizon', 'order', 'free', 'shipping', 'pictures', 'leather', 'fast', 'comfortably', 'job', '&', 'glad', 'phones', 'look', 'charm', 'being', 'simple']


In [48]:
print(neg_unique)

['money', 'first', 'then', 'do', 'poor', 'waste', "doesn't", 'bad', 'what', 'could', 'worst', 'will', 'calls', 'off', 'same', 'piece', 'hear', 'charge', 'disappointed', 'enough', 'thing', 'terrible', 'plug', 'volume', 'design', 'horrible', 'customer', 'junk', 'unit', 'by', 'how', "didn't", 'talk', 'broke', 'over', 'useless', 'back', 'however', 'last', 'went', '3', 'days', 'buttons', 'months', 'completely', 'stay', 'company', 'never', 'crap', 'difficult', 'cheap', 'way', 'dropped', 'we', 'big', 'week', 'within', 'down', '1', 'signal', 'put', 'some', 'disappointment', 'return', 'old', 'nokia', 'want', 'anything', 'disappointing', 'picture', 'low', 'anyone', 'none', 'easily']


In [49]:
print(amazon['text'])

0      [so, there, is, no, way, for, me, to, plug, it...
1                     [good, case, , excellent, value, ]
2                           [great, for, the, jawbone, ]
3      [tied, to, charger, for, conversations, lastin...
4                                [the, mic, is, great, ]
5      [i, have, to, jiggle, the, plug, to, get, it, ...
6      [if, you, have, several, dozen, or, several, h...
7      [if, you, are, razr, owner, , , you, must, hav...
8          [needless, to, say, , i, wasted, my, money, ]
9             [what, a, waste, of, money, and, time, , ]
10               [and, the, sound, quality, is, great, ]
11     [he, was, very, impressed, when, going, from, ...
12     [if, the, two, were, seperated, by, a, mere, 5...
13                         [very, good, quality, though]
14     [the, design, is, very, odd, , as, the, ear, "...
15     [highly, recommend, for, any, one, who, has, a...
16          [i, advise, everyone, do, not, be, fooled, ]
17                             

In [50]:
# Creating the features based on our words
for word in pos_unique + neg_unique:
    amazon[word] = amazon['text'].apply(lambda x: word in x)
    yelp[word] = yelp['text'].apply(lambda x: word in x) #Creating the necessary feature for Yelp to use down the road
    
# Set up the outcome variable and model vars
variables = amazon[pos_unique + neg_unique]
outcome = amazon['positive']

# Import model, instantiate, and train
from sklearn.naive_bayes import BernoulliNB
bernoulli = BernoulliNB()
bernoulli.fit(variables, outcome)

# Store predictions
model2predictions = bernoulli.predict(variables)

# Add array for whether or not prediction was accurate for said observation
model2results =  model2predictions == amazon['positive']
model2_accuracy = 1- (model2results.value_counts(dropna=False).loc[False] / len(model2results))

from sklearn.metrics import confusion_matrix
c = confusion_matrix(outcome, model2predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(model2_accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(model2_accuracy-0.5))
confusion

Accuracy: 0.8140000000000001
Sensitivity: 0.94
Specificity: 0.688
Improvement over baseline: 0.31400000000000006


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,344,156
Pos,30,470


### Initial Assessment

This one does better overall and much better in sensitivity. The specificity actually declined a bit with this model, though.

In [51]:
# Set up the outcome variable and model vars
yelp_vars = yelp[pos_unique + neg_unique]
yelp_actual = yelp['positive']

# Store predictions, add as a df column
yelp_predictions = bernoulli.predict(yelp_vars)

# Add array for whether or not prediction was accurate for said observation
model2results =  yelp_predictions == yelp_actual
model2_accuracy = 1- (model2results.value_counts(dropna=False).loc[False] / len(model2results))

from sklearn.metrics import confusion_matrix
c = confusion_matrix(yelp_actual, yelp_predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(model2_accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(model2_accuracy-0.5))
confusion

Accuracy: 0.639
Sensitivity: 0.698
Specificity: 0.58
Improvement over baseline: 0.139


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,290,210
Pos,151,349


The above model is less accurate, but similar to when we tested results on the Amazon data, it is more sensitive than it is specific. It may be overfitting, so let's try to see what happens if we use a holdout group.

In [52]:
#set up traindata, testdata
from sklearn.model_selection import train_test_split
train, test = train_test_split(amazon,test_size=0.3) # Fine tune this
traindata, testdata = train[pos_unique + neg_unique], test[pos_unique + neg_unique]

# Set up the outcome variable and model vars
variables = train[pos_unique + neg_unique]
outcome = train['positive']

# Import model, instantiate, and train
from sklearn.naive_bayes import BernoulliNB
bernoulli = BernoulliNB()
bernoulli.fit(variables, outcome)

# Store predictions
predictions = bernoulli.predict(testdata)

# Add array for whether or not prediction was accurate for said observation
results =  predictions == test['positive']
accuracy = 1- (results.value_counts(dropna=False).loc[False] / len(results))

from sklearn.metrics import confusion_matrix

c = confusion_matrix(test['positive'], predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(accuracy-0.5))
confusion

Accuracy: 0.7166666666666667
Sensitivity: 0.577922077922078
Specificity: 0.863013698630137
Improvement over baseline: 0.21666666666666667


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,126,20
Pos,65,89


Accuracy does go down a little bit, so perhaps it was overfit. Let's try this on Yelp data now to see if it works better.

In [26]:
# Set up the outcome variable and model vars
yelp_vars = yelp[pos_unique + neg_unique]
yelp_actual = yelp['positive']

# Store predictions, add as a df column
yelp_predictions = bernoulli.predict(yelp_vars)

# Add array for whether or not prediction was accurate for said observation
results =  yelp_predictions == yelp_actual
accuracy = 1- (results.value_counts(dropna=False).loc[False] / len(results))

from sklearn.metrics import confusion_matrix
c = confusion_matrix(yelp_actual, yelp_predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(accuracy-0.5))
confusion

Accuracy: 0.562
Sensitivity: 0.318
Specificity: 0.806
Improvement over baseline: 0.062000000000000055


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,403,97
Pos,341,159


As we suspected, it doesn't perform so well. Let's try a different type of model.

# Most Correlated Words as Features

In [38]:
amazon = pd.read_table("amazon.txt",header=None)
yelp = pd.read_table("yelp.txt",header=None)

amazon.columns = ['text','positive']
yelp.columns = ['text','positive']

# Make outcome column boolean
amazon['positive'] = amazon['positive'] == 1
yelp['positive'] = yelp['positive'] == 1

# Converting all text columns into a list of words
amazon['text'] = np.array(amazon['text'].str.lower().str.split(" |,|\.|!|\?"))

In [39]:
all_words = sum(amazon['text'], [])
wordset = set(all_words)
wordsdf=pd.DataFrame()

# Create df with all of the correlations to positive, by word
for word in wordset:
    wordsdf[word] = amazon['text'].apply(lambda x: word in x)

wordsdf['positive'] = amazon['positive']

#calculate correlations - for loop could be more efficient
correlations = wordsdf.corr().filter(['positive']).drop(['positive'])
pos_correlations = correlations.abs().sort_values('positive',ascending=False)

#old, inefficient way
#correlations = wordsdf.corr()
#pos_correlations = correlations.abs().sort_values('positive',ascending=False).loc[:,'positive']

#let's take the top 200
features = pos_correlations[:199].index.values

In [40]:
# Add these words as features to the df
for word in features:
    amazon[word] = amazon['text'].apply(lambda x: word in x)
    yelp[word] = yelp['text'].apply(lambda x: word in x) #Creating the necessary feature for Yelp to use down the road
    
# Set up the outcome variable and model vars
variables = amazon[features]
outcome = amazon['positive']

# Import model, instantiate, and train
bernoulli.fit(variables, outcome)

# Store predictions, add as a df column
predictions = bernoulli.predict(variables)

# Add array for whether or not prediction was accurate for said observation
results =  predictions == amazon['positive']
accuracy = 1- (results.value_counts(dropna=False).loc[False] / len(results))

from sklearn.metrics import confusion_matrix
c = confusion_matrix(amazon['positive'], predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(accuracy-0.5))
confusion

Accuracy: 0.881
Sensitivity: 0.88
Specificity: 0.882
Improvement over baseline: 0.381


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,441,59
Pos,60,440


This is the best model and methodology we've come up with so far. Now let's see how well it did with Yelp.

In [41]:
# Set up the outcome variable and model vars
yelp_vars = yelp[features]
yelp_actual = yelp['positive']

# Store predictions, add as a df column
yelp_predictions = bernoulli.predict(yelp_vars)

# Add array for whether or not prediction was accurate for said observation
results =  yelp_predictions == yelp_actual
accuracy = 1- (results.value_counts(dropna=False).loc[False] / len(results))

from sklearn.metrics import confusion_matrix
c = confusion_matrix(yelp_actual, yelp_predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(accuracy-0.5))
confusion

Accuracy: 0.6779999999999999
Sensitivity: 0.51
Specificity: 0.846
Improvement over baseline: 0.17799999999999994


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,423,77
Pos,245,255


Looks like we may be overfitting again. Let's reduce the number of features and see how we do.

# Model 4 - Optimized Features Based on Holdouts

Let's try some optimization of the number of features to try to improve results. We'll try a bunch of different variations of the number of features and see how we do. More specifically, let's try steps of 5 between 0 and 200 (that's about the upper limit of what my computer seems to be able to handle in a reasonable amount of time).

In [63]:
amazon = pd.read_table("amazon.txt",header=None)
yelp = pd.read_table("yelp.txt",header=None)

amazon.columns = ['text','positive']
yelp.columns = ['text','positive']

# Make outcome column boolean
amazon['positive'] = amazon['positive'] == 1
yelp['positive'] = yelp['positive'] == 1

# Converting all text columns into a list of words
amazon['text'] = np.array(amazon['text'].str.lower().str.split(" |,|\.|!|\?"))

#set up traindata, testdata
from sklearn.model_selection import train_test_split
train, test = train_test_split(amazon,test_size=0.3) # Fine tune this

all_words = sum(train['text'], [])
wordset = set(all_words)
wordsdf=pd.DataFrame()

# Create df with all of the correlations to positive, by word
for word in wordset:
    wordsdf[word] = train['text'].apply(lambda x: word in x)    

wordsdf['positive'] = train['positive']

#calculate correlations
correlations = wordsdf.corr().filter(['positive']).drop(['positive'])
pos_correlations = correlations.abs().sort_values('positive',ascending=False)

#let's take the top 200
features_tot = pos_correlations[:199].index.values

for num_features in np.arange(4,199,5):
    
    features = features_tot[0:num_features]
    
    # Add these words as features to the df
    for word in features:
        train[word] = train['text'].apply(lambda x: word in x)
        test[word] = test['text'].apply(lambda x: word in x)
        yelp[word] = yelp['text'].apply(lambda x: word in x) #Creating the necessary feature for Yelp to use down the road
    
    traindata, testdata = train[features], test[features]

    # Set up the outcome variable and model vars
    variables = train[features]
    outcome = train['positive']

    # Import model, instantiate, and train
    bernoulli.fit(variables, outcome)

    # Store predictions, add as a df column
    predictions = bernoulli.predict(testdata)

    # Add array for whether or not prediction was accurate for said observation
    results =  predictions == test['positive']
    accuracy = 1- (results.value_counts(dropna=False).loc[False] / len(results))
    baseline = accuracy-0.5

    print('Baseline improvement on test set for {} features: {}'.format(num_features,baseline))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Baseline improvement for 4 features: 0.16999999999999993
Baseline improvement for 9 features: 0.22666666666666668
Baseline improvement for 14 features: 0.2333333333333334
Baseline improvement for 19 features: 0.24
Baseline improvement for 24 features: 0.22999999999999998
Baseline improvement for 29 features: 0.24
Baseline improvement for 34 features: 0.24
Baseline improvement for 39 features: 0.2466666666666666
Baseline improvement for 44 features: 0.2466666666666666
Baseline improvement for 49 features: 0.2433333333333334
Baseline improvement for 54 features: 0.25
Baseline improvement for 59 features: 0.18666666666666665
Baseline improvement for 64 features: 0.19666666666666666
Baseline improvement for 69 features: 0.19999999999999996
Baseline improvement for 74 features: 0.20333333333333337
Baseline improvement for 79 features: 0.20666666666666667
Baseline improvement for 84 features: 0.19999999999999996
Baseline improvement for 89 features: 0.20666666666666667
Baseline improvement f

Looks like we see declining returns in baseline improvement after 55 features, so let's create a model with that and see how that works.

In [66]:
# 55 features 
features = pos_correlations[:54].index.values

# Add these words as features to the df
for word in features:
    amazon[word] = amazon['text'].apply(lambda x: word in x)
    yelp[word] = yelp['text'].apply(lambda x: word in x) #Creating the necessary feature for Yelp to use down the road
    
# Set up the outcome variable and model vars
variables = amazon[features]
outcome = amazon['positive']

# Import model, instantiate, and train
bernoulli.fit(variables, outcome)

# Store predictions, add as a df column
predictions = bernoulli.predict(variables)

# Add array for whether or not prediction was accurate for said observation
results =  predictions == amazon['positive']
accuracy = 1- (results.value_counts(dropna=False).loc[False] / len(results))

from sklearn.metrics import confusion_matrix
c = confusion_matrix(amazon['positive'], predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(accuracy-0.5))
confusion

Accuracy: 0.8
Sensitivity: 0.736
Specificity: 0.864
Improvement over baseline: 0.30000000000000004


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,432,68
Pos,132,368


So this one is slightly less accurate than the one with 200 variables, as we would suspect, but interestingly has a noticeably better sensitivity. Let's see how it does versus the Yelp data.

In [68]:
# Set up the outcome variable and model vars
yelp_vars = yelp[features]
yelp_actual = yelp['positive']

# Store predictions, add as a df column
yelp_predictions = bernoulli.predict(yelp_vars)

# Add array for whether or not prediction was accurate for said observation
results =  yelp_predictions == yelp_actual
accuracy = 1- (results.value_counts(dropna=False).loc[False] / len(results))

from sklearn.metrics import confusion_matrix
c = confusion_matrix(yelp_actual, yelp_predictions)
confusion = pd.DataFrame(c)
confusion = confusion.rename(columns={0:'Guessed Neg',1:'Guessed Pos'},index={0:'Neg',1:'Pos'})
sensitivity = confusion.loc['Pos','Guessed Pos'] / confusion.sum(axis=1).loc['Pos']
specificity = confusion.loc['Neg','Guessed Neg'] / confusion.sum(axis=1).loc['Neg']

print('Accuracy: {}'.format(accuracy))
print('Sensitivity: {}'.format(sensitivity))
print('Specificity: {}'.format(specificity))
print('Improvement over baseline: {}'.format(accuracy-0.5))
confusion

Accuracy: 0.6579999999999999
Sensitivity: 0.49
Specificity: 0.826
Improvement over baseline: 0.15799999999999992


Unnamed: 0,Guessed Neg,Guessed Pos
Neg,413,87
Pos,255,245


It performs about as well on the Yelp data. At this point, I think it's safe to say that our methodology has us stuck in about a 65-70% range for accuracy on the Yelp data, and modifications of the number of features may not be enough to breka out of that. Just for fun, let's try a PCA on those 200 features to see how that performs.