In [283]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [284]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

In [285]:
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent', 'XXX', 'win','cash','contract','mobile','CASH','WINNER!!','URGENT!','urgent','Urgent']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    sms_raw[str(key)] = sms_raw.message.str.contains(
        str(key) + ' ',
        case=False
    )

In [286]:
sms_spam = sms_raw.query('spam == True')
sms_spam.head(50)

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,XXX,win,contract,mobile,CASH,WINNER!!,URGENT!,Urgent


In [287]:
sms_raw['allcaps'] = sms_raw.message.str.isupper()

In [288]:
sms_raw.head()

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,XXX,win,contract,mobile,CASH,WINNER!!,URGENT!,Urgent,allcaps
0,ham,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,ham,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False
3,ham,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [289]:
#Before we go further, let's turn the spam column into a boolean so we can easily do some statistics to prepare for modeling.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
# Note that if you run this cell a second time everything will become false.
# So... Don't.

In [290]:
sms_raw.head()

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,XXX,win,contract,mobile,CASH,WINNER!!,URGENT!,Urgent,allcaps
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,True,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False
3,False,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [291]:
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

In [292]:
data.head()

Unnamed: 0,click,offer,winner,buy,free,cash,urgent,XXX,win,cash.1,contract,mobile,CASH,WINNER!!,URGENT!,urgent.1,Urgent,allcaps
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [293]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 5572 points : 467


In [316]:
print(1-467/5572)

0.9161880832735104


In [294]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

                # SPAM
#Predicter     True         False
#True           HIT        False Neg
#False         False Pos      HIT

array([[4748,   77],
       [ 390,  357]])

In [295]:
# translation - there are 4748 + 77 = ... HAM messages, I've only mis-identified 77 of those -- FALSE POSITIVE, type 1 error
# there are 390 + 357 = ... SPAM messages, I've mis-identified over half of those (390) - miss, type 2 error, false negative

#390 of my 467 errors are failing to identify spam.
#Sensitivity is percentage of positives correctly identified
print(357/(390+357))

0.4779116465863454


In [296]:
# Specificity__is the percentage of negatives correctly identified, 4748/4825 or,
print(4748/4825)
# Note that I did worse here than the example but much better on "sensitivity" which makes sense

0.9840414507772021


In [297]:
# add the y_pred to my dataframe
sms_raw['y_pred'] = y_pred

In [298]:
sms_raw.columns

Index(['spam', 'message', 'click', 'offer', 'winner', 'buy', 'free', 'cash',
       'urgent', 'XXX', 'win', 'contract', 'mobile', 'CASH', 'WINNER!!',
       'URGENT!', 'Urgent', 'allcaps', 'y_pred'],
      dtype='object')

In [299]:
sms_compare= sms_raw[['spam','y_pred']]


In [300]:
spamct = sms_compare.query('spam == True & y_pred == True')
print('Number of actual spam, categorized as spam')
print(spamct.count())
spamctmiss = sms_compare.query('spam == True & y_pred == False')
print('Number of actual spam, NOT categorized as spam -- False Negative')
print(spamctmiss.count())
#Sensitivity is percentage of positives correctly identified
# So sensitivity for me is:
print(spamct.count()/(spamct.count()+spamctmiss.count()))


hamct = sms_compare.query('spam == False & y_pred == False')
print('Number of actual ham, categorized as ham')
print(hamct.count())
hamctmiss = sms_compare.query('spam == False & y_pred == True')
print('Number of actual ham, categorized as spam -- False Positive')
print(hamctmiss.count())
#Sensitivity is percentage of positives correctly identified
# So sensitivity for me is:
print(hamct.count()/(hamct.count()+hamctmiss.count()))

Number of actual spam, categorized as spam
spam      357
y_pred    357
dtype: int64
Number of actual spam, NOT categorized as spam -- False Negative
spam      390
y_pred    390
dtype: int64
spam      0.477912
y_pred    0.477912
dtype: float64
Number of actual ham, categorized as ham
spam      4748
y_pred    4748
dtype: int64
Number of actual ham, categorized as spam -- False Positive
spam      77
y_pred    77
dtype: int64
spam      0.984041
y_pred    0.984041
dtype: float64


### Build training and test samples

In [301]:
sms_raw.head(1)

Unnamed: 0,spam,message,click,offer,winner,buy,free,cash,urgent,XXX,win,contract,mobile,CASH,WINNER!!,URGENT!,Urgent,allcaps,y_pred
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [302]:
#dataframe = sms_raw

msk = np.random.rand(len(sms_raw)) < 0.8
train = sms_raw[msk]
test = sms_raw[~msk]

In [310]:
print(len(train))
print(len(test))

4427
1145


In [314]:
data = train[keywords]
data2 = test[keywords]
target = train['spam']
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data2)

# Display our results.
print("Number of mislabeled points out of a total {} test points : {}".format(
    data2.shape[0],
    (test['spam'] != y_pred).sum()
))


Number of mislabeled points out of a total 1145 test points : 108


In [315]:
print(1-108/1145)

0.9056768558951965


So when I trained and tested on the whole dataset, I got 91.6%; on the train/test cross-validation, I got 90.6%

### Cross-validation

In [336]:
#Make 5 folds
val1, val2, val3, val4, val5 = np.split(sms_raw.sample(frac=1), [int(.2*len(sms_raw)), int(.4*len(sms_raw)),int(.6*len(sms_raw)),int(.8*len(sms_raw))])


In [352]:
test = val1
rest = [val2,val3,val4,val5]
train = pd.concat(rest)

In [353]:
data = train[keywords]
data2 = test[keywords]
target = train['spam']
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data2)

# Display our results.
print("Number of mislabeled points out of a total {} test points in {} : {}".format(
    data2.shape[0],'val1',
    (test['spam'] != y_pred).sum()
))


Number of mislabeled points out of a total 1114 test points in val1 : 95


In [356]:
test = val2
rest = [val1,val3,val4,val5]
train = pd.concat(rest)

data = train[keywords]
data2 = test[keywords]
target = train['spam']
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data2)

# Display our results.
print("Number of mislabeled points out of a total {} test points in {} : {}".format(
    data2.shape[0],'val2',
    (test['spam'] != y_pred).sum()
))

Number of mislabeled points out of a total 1114 test points in val2 : 90


In [357]:
test = val3
rest = [val1,val2,val4,val5]
train = pd.concat(rest)

data = train[keywords]
data2 = test[keywords]
target = train['spam']
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data2)

# Display our results.
print("Number of mislabeled points out of a total {} test points in {} : {}".format(
    data2.shape[0],'val3',
    (test['spam'] != y_pred).sum()
))

Number of mislabeled points out of a total 1115 test points in val3 : 103


In [358]:
test = val4
rest = [val1,val2,val3,val5]
train = pd.concat(rest)

data = train[keywords]
data2 = test[keywords]
target = train['spam']
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data2)

# Display our results.
print("Number of mislabeled points out of a total {} test points in {} : {}".format(
    data2.shape[0],'val4',
    (test['spam'] != y_pred).sum()
))

Number of mislabeled points out of a total 1114 test points in val4 : 97


In [359]:
test = val5
rest = [val1,val2,val3,val4]
train = pd.concat(rest)

data = train[keywords]
data2 = test[keywords]
target = train['spam']
# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data2)

# Display our results.
print("Number of mislabeled points out of a total {} test points in {} : {}".format(
    data2.shape[0],'val5',
    (test['spam'] != y_pred).sum()
))

Number of mislabeled points out of a total 1115 test points in val5 : 86


In [372]:
#So my results for my 5 folds were: 95,90, 103,  97,  86
err = [95,90, 103,  97,  86]
avg_error = np.mean(err)
print(avg_error)
sd_error = np.std(err)
print(sd_error)

94.2
5.84465567848
