In [136]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [137]:
from sklearn.naive_bayes import BernoulliNB

# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named by hand: the label first, then the message text.
sms_raw = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words treated as spam indicators; each one becomes a boolean feature column.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# A keyword only counts when it has a space on both sides (case-insensitive).
# NOTE: this means a keyword at the very start or end of a message, or one
# directly next to punctuation, is not flagged.
for key in keywords:
    spaced = ' {} '.format(key)
    sms_raw[key] = sms_raw['message'].str.contains(spaced, case=False)

# Extra feature: True when every cased character in the message is upper case.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Recode the label column from the strings in the file to a boolean (True = spam).
sms_raw['spam'] = sms_raw['spam'].eq('spam')

# Feature matrix and target vector used by every later cell.
feature_columns = keywords + ['allcaps']
data = sms_raw[feature_columns]
target = sms_raw['spam']

# Fit on the full data set and predict the same rows, so `y_pred` reflects
# in-sample performance only.
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [138]:
from sklearn.metrics import confusion_matrix

# Rows are the actual class, columns the predicted class (False first, then
# True). Left as the last expression so the notebook displays the array.
confusion_matrix(y_true=target, y_pred=y_pred)

array([[4770,   55],
       [ 549,  198]])

In [139]:
# In-sample accuracy: number of rows where the prediction equals the label,
# divided by the number of (non-null) labels.
matches = target.eq(y_pred).sum()
targeted = target.count()

accuracy = matches / targeted
print(accuracy)

0.8916008614501076


In [147]:
# Cross-tabulate actual labels (rows) against predictions (columns).
confusion = pd.crosstab(index=target, columns=y_pred)

In [164]:
# `confusion` is indexed by actual class (rows) and predicted class (columns),
# both labelled with the booleans False/True. Look each cell up by those
# labels as .loc[actual, predicted]. Using integer keys such as
# confusion[0][1] only works because 0 == False, and it relies on chained
# indexing with a positional fallback that pandas is phasing out.
TN = confusion.loc[False, False]   # actual ham,  predicted ham
FN = confusion.loc[True, False]    # actual spam, predicted ham
FP = confusion.loc[False, True]    # actual ham,  predicted spam
TP = confusion.loc[True, True]     # actual spam, predicted spam

# Sensitivity: share of actual spam that was caught.
sensitivity = TP / (TP + FN)
# Specificity: share of actual ham that was correctly left alone.
specificity = TN / (TN + FP)

print(confusion)
print(sensitivity)
print(specificity)

col_0  False  True 
spam               
False   4770     55
True     549    198
0.26506024096385544
0.9886010362694301


In [168]:
# Base rate of the positive class: number of messages labelled spam divided
# by the total number of rows. `actual_spam` is defined here because no
# other cell in the notebook assigns it, so a fresh kernel would otherwise
# fail with NameError.
actual_spam = target.sum()
actual_spam / data.shape[0]

0.13406317300789664

In [169]:
# Compare accuracy on a held-out 20% of the data with accuracy measured on
# the very rows the model was trained on.

from sklearn.model_selection import train_test_split

# Reserve 20% of the rows for testing; the fixed random_state makes the
# split (and therefore the score) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=20,
)

# Train on the 80% split, score on the unseen 20%.
holdout_score = bnb.fit(X_train, y_train).score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_score))

# Refit on everything and score on the same rows. This also leaves `bnb`
# fitted on the full data set for later cells.
sample_score = bnb.fit(data, target).score(data, target)
print('Testing on Sample: ' + str(sample_score))

With 20% Holdout: 0.884304932735426
Testing on Sample: 0.8916008614501076


In [172]:
from sklearn.model_selection import cross_val_score

# Ten-fold cross-validation of the same classifier, scored with the
# estimator's default metric. Left as the last expression so the notebook
# displays the array of per-fold scores.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.89784946, 0.89426523, 0.89426523, 0.890681  , 0.89605735,
       0.89048474, 0.88150808, 0.89028777, 0.88489209, 0.89568345])