In [66]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

yelp_raw = pd.read_csv('data/yelp_labelled.txt', delimiter= '\t', header=None)

yelp_raw.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
def analyze

In [67]:
# set column names
df = yelp_raw
df.columns = ['message', 'negative']

In [68]:
df['negative'] = (df['negative'] == 0)
# Note that if you run this cell a second time everything will become false.
# So... Don't.

In [69]:
# replace punctuation 
df['message'] = df.message.str.replace(r'[^a-zA-Z\d\s:]', '')
# make lower case
df['message'] = df['message'].str.lower()

In [70]:
# split negative messages and combine into one list
negative_words = df.message[df.negative].str.cat(sep=' ').split()

positive_words = df.message[df.negative == False].str.cat(sep=' ').split()

In [71]:
# Unique Words
print('negative:', len(np.unique(negative_words)), ' positive:', len(np.unique(positive_words)))

negative: 1397  positive: 1246


In [72]:
keywords = pd.Series(negative_words).value_counts().keys().ravel()

keywords

array(['the', 'i', 'and', ..., 'oil', 'hands', 'milk'], dtype=object)

In [73]:
diff = np.setdiff1d(
        ar1=pd.Series(negative_words).value_counts().keys().ravel(), 
                      ar2=pd.Series(positive_words).value_counts().keys().ravel()
                     )
diff

array(['1', '10', '1199', '12', '15', '15lb', '17', '1979', '30', '30s',
       '34ths', '35', '400', '40min', '45', '785', '90', 'accountant',
       'ache', 'acknowledged', 'actual', 'ahead', 'airline', 'ala',
       'albondigas', 'allergy', 'alone', 'although', 'angry',
       'anticipated', 'anymore', 'anytime', 'anyways', 'apart',
       'apologize', 'apology', 'appalling', 'apparently', 'appealing',
       'appetite', 'appetizer', 'apple', 'arent', 'arepas', 'arrives',
       'arriving', 'article', 'asked', 'asking', 'atrocious', 'attached',
       'attack', 'attention', 'attitudes', 'average', 'avocado', 'avoid',
       'avoided', 'awful', 'awkward', 'awkwardly', 'bad', 'ball',
       'bamboo', 'banana', 'bare', 'barely', 'based', 'basically',
       'batch', 'batter', 'beensteppedinandtrackedeverywhere', 'begin',
       'behind', 'bellagio', 'below', 'besides', 'between', 'bigger',
       'biggest', 'binge', 'bird', 'bisque', 'bitches', 'bites', 'blah',
       'blame', 'bland',

In [74]:
new = df
keywords = diff

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    new[str(key)] = new.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [75]:
data = new[keywords]
target = new['negative']

In [76]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}\nAccuracy: {}%".format(
    data.shape[0],
    (target != y_pred).sum(), round((1 - (target != y_pred).sum()/data.shape[0]) * 100, 2)
))

Number of mislabeled points out of a total 1000 points : 285
Accuracy: 71.5%


In [77]:
imdb_raw = pd.read_csv('data/imdb_labelled.txt', delimiter= '\t', header=None)

imdb_raw.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [78]:
# set column names
imdb_raw.columns = ['message', 'negative']

imdb_raw['negative'] = (imdb_raw['negative'] == 0)
# Note that if you run this cell a second time everything will become false.
# So... Don't.


In [79]:
df = imdb_raw
# replace punctuation 
df['message'] = df.message.str.replace(r'[^a-zA-Z\d\s:]', '')
# make lower case
df['message'] = df['message'].str.lower()

In [80]:
# split negative messages and combine into one list
negative_words = df.message[df.negative].str.cat(sep=' ').split()

positive_words = df.message[df.negative == False].str.cat(sep=' ').split()

In [81]:
# Unique Words
print('negative:', len(np.unique(negative_words)), ' positive:', len(np.unique(positive_words)))

negative: 1928  positive: 2014


In [82]:
keywords = pd.Series(negative_words).value_counts().keys().ravel()

keywords

array(['the', 'a', 'of', ..., 'loves', 'barney', 'girlfriendboyfriend'],
      dtype=object)

In [83]:
diff = np.setdiff1d(
        ar1=pd.Series(negative_words).value_counts().keys().ravel(), 
                      ar2=pd.Series(positive_words).value_counts().keys().ravel()
                     )
diff

array(['010', '1948', '1971', ..., 'zillion', 'zombiestudents', 'zombiez'],
      dtype=object)

In [84]:
new = imdb_raw
keywords = diff

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    new[str(key)] = new.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [85]:
data = new[keywords]
target = new['negative']

In [86]:
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}\nAccuracy: {}%".format(
    data.shape[0],
    (target != y_pred).sum(), round((1 - (target != y_pred).sum()/data.shape[0]) * 100, 2)
))

Number of mislabeled points out of a total 748 points : 3
Accuracy: 99.6%


In [107]:
amazon_raw = pd.read_csv('data/amazon_cells_labelled.txt', delimiter= '\t', header=None)

amazon_raw.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [108]:
# set column names
amazon_raw.columns = ['message', 'negative']

amazon_raw['negative'] = (amazon_raw['negative'] == 0)
# Note that if you run this cell a second time everything will become false.
# So... Don't.


In [109]:
df = amazon_raw

# replace punctuation 
df['message'] = df.message.str.replace(r'[^a-zA-Z\d\s:]', '')
# make lower case
df['message'] = df['message'].str.lower()

In [111]:
# split negative messages and combine into one list
negative_words = df.message[df.negative].str.cat(sep=' ').split()

positive_words = df.message[df.negative == False].str.cat(sep=' ').split()

In [112]:
# Unique Words
print('negative:', len(np.unique(negative_words)), ' positive:', len(np.unique(positive_words)))

negative: 1293  positive: 1137


In [113]:
keywords = pd.Series(negative_words).value_counts().keys().ravel()

keywords

array(['the', 'i', 'it', ..., 'row', 'nicer', 'excellent'], dtype=object)

In [114]:
diff = np.setdiff1d(
        ar1=pd.Series(negative_words).value_counts().keys().ravel(), 
                      ar2=pd.Series(positive_words).value_counts().keys().ravel()
                     )
diff

array(['11', '13', '2160', '23', '375', '3o', '45', '4s', '5320', '5of',
       '5year', '6', '744', '8', '8125', '8525', 'abhor', 'ability',
       'abound', 'above', 'abovepretty', 'absolutel', 'accept',
       'acceptable', 'accessory', 'accessoryone', 'accidentally',
       'activate', 'activated', 'ad', 'adapter', 'add', 'addition',
       'adhesive', 'advise', 'aggravating', 'along', 'although', 'amp',
       'angeles', 'angle', 'answer', 'antena', 'anything', 'anyway',
       'apartment', 'apparently', 'appealing', 'appearance', 'area',
       'arguing', 'asia', 'assumed', 'atleast', 'att', 'auto', 'average',
       'avoiding', 'awkward', 'backlight', 'bad', 'balance', 'basement',
       'basically', 'be3', 'bed', 'beep', 'beeping', 'believe', 'bells',
       'bend', 'bethe', 'beware', 'biggest', 'bills', 'bit', 'bland',
       'blew', 'bluetoooth', 'bmw', 'booking', 'boost', 'bother',
       'bottom', 'bottowm', 'bougth', 'break', 'breakage', 'breaking',
       'breaks', 'broke

In [117]:

keywords = diff

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    df[str(key)] = df.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas

In [118]:
data = new[keywords]
target = new['negative']

KeyError: "['messaging' 'messes' 'metal' 'methe' 'microsofts' 'might' 'mind' 'mins'\n 'minute' 'minutesmajor' 'misleading' 'missed' 'mistake' 'mode' 'model'\n 'monkeys' 'morning' 'mostly' 'mother' 'moving' 'mp3' 'mp3s' 'muddy'\n 'muffled' 'multiple' 'music' 'mute' 'nano' 'navigate' 'near' 'needless'\n 'negatively' 'neither' 'ngage' 'nicer' 'night' 'nightmare' 'noises'\n 'none' 'note' 'noted' 'nothing' 'noticed' 'numbers' 'odd' 'offering' 'oh'\n 'ok' 'old' 'once' 'ones' 'onethis' 'onid' 'online' 'onlyi' 'opens'\n 'operates' 'option' 'originally' 'os' 'override' 'pads' 'painful'\n 'particular' 'party' 'pause' 'pay' 'pens' 'performed' 'periodically'\n 'person' 'phonebattery' 'phonemy' 'phones2' 'photo' 'pitiful' 'places'\n 'planning' 'plans' 'plantronincs' 'player' 'please' 'pleather' 'plugged'\n 'point' 'poor' 'poorly' 'port' 'possesed' 'possibility' 'potentially'\n 'practically' 'practice' 'preferably' 'pricing' 'prime' 'problemvery'\n 'procedure' 'procedures' 'produce' 'products' 'promised' 'properly'\n 'pros' 'provide' 'puff' 'pull' 'purcashed' 'purpose' 'push' 'pushed'\n 'quiet' 'randomly' 'rare' 'rated' 'razor' 'reach' 'reaching' 'ready'\n 'reason' 'reccommend' 'receipt' 'receive' 'receiving' 'recently'\n 'recessed' 'recharge' 'recieve' 'recognizes' 'refund' 'refurb' 'refuse'\n 'refused' 'regretted' 'relatively' 'reliability' 'remorse' 'renders'\n 'reoccurebottom' 'replaced' 'replacementr' 'reset' 'resolution' 'respect'\n 'restart' 'restocking' 'return' 'returned' 'returning' 'reverse' 'review'\n 'riingtones' 'ringing' 'rip' 'ripped' 'risk' 'rocketed' 'row'\n 'rubberpetroleum' 'runs' 'saggy' 'samsungcrap' 'save' 'saying' 'says'\n 'scary' 'screens' 'screenthis' 'seat' 'securly' 'seeen' 'sensor' 'sent'\n 'seperated' 'severe' 'share' 'shooters' 'short' 'shouldnt' 'shouting'\n 'sim' 'sins' 'sitting' 'skip' 'slid' 'slow' 'slowly' 'smartphone' 'smell'\n 'smoke' 'smoking' 'smudged' 'snap' 'snug' 'somehow' 'somewhere' 'son'\n 'songs' 'soon' 'sorry' 'soundwise' 'sources' 'soyo' 'speakerphone'\n 'specially' 'speed' 'spring' 'sprint' 'stand' 'standard' 'stari'\n 'startac' 'starter' 'starts' 'stated' 'stay' 'steep' 'steer' 'stop'\n 'stopped' 'stops' 'strange' 'strap' 'strength' 'stress' 'stuck' 'study'\n 'stupid' 'sucked' 'sucks' 'sudden' 'suddenly' 'sunglasses' 'superfast'\n 'support' 'supposedly' 'switch' 'sync' 'takes' 'talking' 'tape' 'tech'\n 'telephone' 'tell' 'terrible' 'texas' 'text' 'theory' 'thereplacement'\n 'thorn' 'threepack' 'threw' 'tick' 'ticking' 'tied' 'tinny' 'today'\n 'toilet' 'told' 'tones' 'top' 'totally' 'touch' 'touches' 'tracfone'\n 'tracking' 'trash' 'tricky' 'truly' 'trust' 'tungsten' 'turned' 'ugly'\n 'unacceptable' 'unacceptableunless' 'unacceptible' 'unbearable'\n 'uncomfortable' 'understand' 'unfortunately' 'unhappy' 'unintelligible'\n 'unit' 'units' 'unknown' 'unless' 'unreliable' 'unsatisfactory' 'until'\n 'unusable' 'update' 'upgrade' 'upload' 'upstairs' 'useless' 'utter'\n 'utterly' 'v115g' 'v265' 'v3c' 'v3i' 'vehicle' 'verizons' 'videos'\n 'visor' 'vx9900' 'waaay' 'wake' 'walked' 'warning' 'warranty' 'waste'\n 'wasted' 'wasting' 'weak' 'wearing' 'website' 'week' 'weird' 'whatsoever'\n 'whine' 'whistles' 'wifi' 'wind' 'window' 'windows' 'wiping' 'wired'\n 'wirefly' 'wires' 'wish' 'wit' 'wobbly' 'wonder' 'wooden' 'word' 'worst'\n 'worthless' 'wrong' 'wrongfirst' 'yell' 'zero'] not in index"

In [119]:
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}\nAccuracy: {}%".format(
    data.shape[0],
    (target != y_pred).sum(), round((1 - (target != y_pred).sum()/data.shape[0]) * 100, 2)
))

Number of mislabeled points out of a total 748 points : 3
Accuracy: 99.6%
