In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

amazon = pd.read_table("amazon.txt",header=None)
yelp = pd.read_table("yelp.txt",header=None)

amazon.columns = ['text','positive']
yelp.columns = ['text','positive']

# Make outcome column boolean
amazon['positive'] = amazon['positive'] == 1
yelp['positive'] = yelp['positive'] == 1

We'll start with the Amazon data and use that to train a naive bayes model for predicting sentiment. The first step is to define which keywords would indicate a positive or negative review. My initial approach was to create lists of terms that I thought would be positive or negative, but I eventually settled on a more programmatic approach. Let's split the data into those with a negative review and those with a positive. Then, let's calculate the most common words in each of those datasets. Next, remove any duplicates, since these are likely common words like "the", or will be indeterminate in whether they indicate a good or bad review. We'll feed these into the model and see how we do.

In [2]:
# Converting all text columns into a list of words
amazon['text'] = np.array(amazon['text'].str.lower().str.split(" |,|\.|!|\?"))

# Creating positive and negative subsets to calculate most popular words for pos & neg
amazon_pos = amazon[amazon['positive'] == 1]
amazon_neg = amazon[amazon['positive'] == 0]

# Calculate most common words in positive & negative reviews
x = sum(amazon_pos['text'], [])
y = sum(amazon_neg['text'], [])

from collections import Counter

for item in x:
    c = Counter(x)
    common_tuples = c.most_common(200)
    top_pos_words = [i[0] for i in common_tuples]

for item in y:
    c = Counter(y)
    common_tuples = c.most_common(200)
    top_neg_words = [i[0] for i in common_tuples]

In [3]:
# Remove overlapping words for each
pos_unique = [x for x in top_pos_words if x not in top_neg_words]
neg_unique = [x for x in top_neg_words if x not in top_pos_words]

In [4]:
print(len(pos_unique),len(neg_unique))

74 74


In [5]:
print(pos_unique)

['works', 'excellent', 'price', 'best', 'nice', 'love', 'easy', 'comfortable', 'happy', 'fine', 'been', 'far', 'clear', 'device', 'cell', 'fits', 'camera', 'got', 'working', 'highly', 'pretty', 'years', 'without', 'everything', 'cool', 'wear', 'lot', 'jabra', 'people', 'found', 'both', 'light', 'perfectly', 'value', 'impressed', 'say', 'priced', 'sturdy', 'gets', 'little', 'tried', 'definitely', 'pleased', 'small', 'voice', 'awesome', 'overall', 'range', 'amazon', 'cases', 'original', 'ears', 'seems', 'keyboard', 'their', 'several', 'most', 'headsets', 'verizon', 'order', 'free', 'shipping', 'pictures', 'leather', 'fast', 'comfortably', 'job', '&', 'glad', 'phones', 'look', 'charm', 'being', 'simple']


In [6]:
print(neg_unique)

['money', 'first', 'then', 'do', 'poor', 'waste', "doesn't", 'bad', 'what', 'could', 'worst', 'will', 'calls', 'off', 'same', 'piece', 'hear', 'charge', 'disappointed', 'enough', 'thing', 'terrible', 'plug', 'volume', 'design', 'horrible', 'customer', 'junk', 'unit', 'by', 'how', "didn't", 'talk', 'broke', 'over', 'useless', 'back', 'however', 'last', 'went', '3', 'days', 'buttons', 'months', 'completely', 'stay', 'company', 'never', 'crap', 'difficult', 'cheap', 'way', 'dropped', 'we', 'big', 'week', 'within', 'down', '1', 'signal', 'put', 'some', 'disappointment', 'return', 'old', 'nokia', 'want', 'anything', 'disappointing', 'picture', 'low', 'anyone', 'none', 'easily']


In [7]:
print(amazon['text'])

0      [so, there, is, no, way, for, me, to, plug, it...
1                     [good, case, , excellent, value, ]
2                           [great, for, the, jawbone, ]
3      [tied, to, charger, for, conversations, lastin...
4                                [the, mic, is, great, ]
5      [i, have, to, jiggle, the, plug, to, get, it, ...
6      [if, you, have, several, dozen, or, several, h...
7      [if, you, are, razr, owner, , , you, must, hav...
8          [needless, to, say, , i, wasted, my, money, ]
9             [what, a, waste, of, money, and, time, , ]
10               [and, the, sound, quality, is, great, ]
11     [he, was, very, impressed, when, going, from, ...
12     [if, the, two, were, seperated, by, a, mere, 5...
13                         [very, good, quality, though]
14     [the, design, is, very, odd, , as, the, ear, "...
15     [highly, recommend, for, any, one, who, has, a...
16          [i, advise, everyone, do, not, be, fooled, ]
17                             

In [8]:
# Creating the features based on our words
for word in pos_unique + neg_unique:
    amazon[word] = amazon['text'].apply(lambda x: word in x)
    yelp[word] = yelp['text'].apply(lambda x: word in x) #Creating the necessary feature for Yelp to use down the road
    
# Set up the outcome variable and model vars
variables = amazon[pos_unique + neg_unique]
outcome = amazon['positive']

# Import model, instantiate, and train
from sklearn.naive_bayes import BernoulliNB
bernoulli = BernoulliNB()
bernoulli.fit(variables, outcome)

# Store predictions, add as a df column
predictions = bernoulli.predict(variables)
amazon['predictions'] = predictions

# Add column for whether or not prediction was accurate for said observation
amazon['accurate'] =  amazon['predictions'] == amazon['positive']
wrong_predictions = amazon['accurate'].value_counts(dropna=False).loc[False]
observations = len(amazon['accurate'])
print('Of {} observations, there were {} incorrect predictions from our model.'.format(observations, wrong_predictions))
print('Or in other words, our model was wrong {:.1f} percent of the time.'.format((wrong_predictions/observations) * 100))

Of 1000 observations, there were 186 incorrect predictions from our model.
Or in other words, our model was wrong 18.6 percent of the time.


This is pretty good! My alternate model, in which I just guessed corresponding good or bad keywords, had an error rate north of 30%.

Next, let's see how well this fits the other data (Yelp).

In [9]:
# Set up the outcome variable and model vars
yelp_vars = yelp[pos_unique + neg_unique]

# Store predictions, add as a df column
yelp_predictions = bernoulli.predict(yelp_vars)
yelp['predictions'] = yelp_predictions

# Add column for whether or not prediction was accurate for said observation
yelp['accurate'] =  yelp['predictions'] == yelp['positive']
wrong_predictions = yelp['accurate'].value_counts(dropna=False).loc[False]
observations = len(yelp['accurate'])
print('Of {} observations, there were {} incorrect predictions from our model.'.format(observations, wrong_predictions))
print('Or in other words, our model was wrong {:.1f} percent of the time.'.format((wrong_predictions/observations) * 100))

Of 1000 observations, there were 361 incorrect predictions from our model.
Or in other words, our model was wrong 36.1 percent of the time.




The above isn't great, but it's not terrible either! I think the flaw might be that I'm bringing in the most common words featured in positive and negative reviews, but not necessarily the values that are __likeliest__ to exist in a positive or negative review. Let's try doing that quickly.

______________________________

In [20]:
amazon = pd.read_table("amazon.txt",header=None)
yelp = pd.read_table("yelp.txt",header=None)

amazon.columns = ['text','positive']
yelp.columns = ['text','positive']

# Make outcome column boolean
amazon['positive'] = amazon['positive'] == 1
yelp['positive'] = yelp['positive'] == 1

# Converting all text columns into a list of words
amazon['text'] = np.array(amazon['text'].str.lower().str.split(" |,|\.|!|\?"))

In [25]:
all_words = sum(amazon['text'], [])
wordset = set(all_words)
wordsdf=pd.DataFrame()

# Create df with all of the correlations to positive, by word
for word in wordset:
    wordsdf[word] = amazon['text'].apply(lambda x: word in x)

wordsdf['positive'] = amazon['positive']

#calculate correlations
correlations = wordsdf.corr().filter(['positive']).drop(['positive'])
pos_correlations = correlations.abs().sort_values('positive',ascending=False)

#old, inefficient way
#correlations = wordsdf.corr()
#pos_correlations = correlations.abs().sort_values('positive',ascending=False).loc[:,'positive']

#let's take the top 50
features = pos_correlations[:100].index.values

In [26]:
# Add these words as features to the df
for word in features:
    amazon[word] = amazon['text'].apply(lambda x: word in x)
    yelp[word] = yelp['text'].apply(lambda x: word in x) #Creating the necessary feature for Yelp to use down the road
    
# Set up the outcome variable and model vars
variables = amazon[features]
outcome = amazon['positive']

# Import model, instantiate, and train
bernoulli.fit(variables, outcome)

# Store predictions, add as a df column
predictions = bernoulli.predict(variables)
amazon['predictions'] = predictions

# Add column for whether or not prediction was accurate for said observation
amazon['accurate'] =  amazon['predictions'] == amazon['positive']
wrong_predictions = amazon['accurate'].value_counts(dropna=False).loc[False]
observations = len(amazon['accurate'])
print('Of {} observations, there were {} incorrect predictions from our model.'.format(observations, wrong_predictions))
print('Or in other words, our model was wrong {:.1f} percent of the time.'.format((wrong_predictions/observations) * 100))

Of 1000 observations, there were 157 incorrect predictions from our model.
Or in other words, our model was wrong 15.7 percent of the time.


Better! Now let's see how well it did with Yelp.

In [27]:
# Set up the outcome variable and model vars
yelp_vars = yelp[features]

# Store predictions, add as a df column
yelp_predictions = bernoulli.predict(yelp_vars)
yelp['predictions'] = yelp_predictions

# Add column for whether or not prediction was accurate for said observation
yelp['accurate'] =  yelp['predictions'] == yelp['positive']
wrong_predictions = yelp['accurate'].value_counts(dropna=False).loc[False]
observations = len(yelp['accurate'])
print('Of {} observations, there were {} incorrect predictions from our model.'.format(observations, wrong_predictions))
print('Or in other words, our model was wrong {:.1f} percent of the time.'.format((wrong_predictions/observations) * 100))

Of 1000 observations, there were 314 incorrect predictions from our model.
Or in other words, our model was wrong 31.4 percent of the time.


Also slightly better. Interesting!