In [1]:
from io import StringIO
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20; prefer the new location and
# fall back to the old one only on legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline

Go to the UCI Machine Learning repository and download the Spambase dataset. Make sure you read the documentation for the data. This explains what the attributes are in the data file. Load in the data file, doing any cleaning necessary to get usable data.
https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.DOCUMENTATION

Subsample the data set so 60% is training data and 40% is test data. You can subsample however you like, including splitting the original file. Just make sure that you have a representative data set. (The original is about 60% not-spam and 40% spam.)

Then, write code to classify the data into spam and not-spam, training with your training data and testing on your test data.

In [2]:
# Location of the raw Spambase data file in the UCI Machine Learning Repository.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "spambase/spambase.data"
)

In [3]:
# Download the raw data file. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops here on an HTTP error
# instead of letting an error page be parsed as if it were CSV data.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Column names are supplied explicitly: 48 word frequencies, 6 character
# frequencies, three capital-run-length statistics, and finally the `spam`
# label column.
names = (['word_freq_' + str(i+1) for i in range(48)]
         + ['char_freq_' + str(j+1) for j in range(6)]
         + ['capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'spam'])
spam = pd.read_csv(StringIO(r.text), names=names)
spam.head()

Unnamed: 0,word_freq_1,word_freq_2,word_freq_3,word_freq_4,word_freq_5,word_freq_6,word_freq_7,word_freq_8,word_freq_9,word_freq_10,...,char_freq_1,char_freq_2,char_freq_3,char_freq_4,char_freq_5,char_freq_6,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Check for any null values

In [4]:
# Count missing entries per column, then total them across the whole frame.
null_counts_per_column = spam.isnull().sum()
null_counts_per_column.sum()

0

Subsample the data set so 60% is training data and 40% is test data. You can subsample however you like, including splitting the original file. Just make sure that you have a representative data set. (The original is about 60% not-spam and 40% spam.)

In [5]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
features = spam.drop('spam', axis=1)
labels = spam['spam']
train_x, test_x, train_y, test_y = train_test_split(
    features, labels, test_size=0.4, random_state=6
)

# Share of spam in the test split (displayed as the cell output), to check
# that the split is representative of the full data set.
test_y.sum() / test_y.size

0.39978272677892451

Then, write code to classify the data into spam and not-spam, training with your training data and testing on your test data.

In [6]:
# Train a multinomial naive Bayes classifier on every feature column.
model = MultinomialNB()
model.fit(train_x, train_y)

In [7]:
# Label every held-out message, then display the share flagged as spam.
predictions = model.predict(test_x)
predictions.sum() / predictions.size

0.40901683867463334

In [8]:
# Fraction of test messages whose predicted label matches the true label.
accuracy_rate = accuracy_score(y_true=test_y, y_pred=predictions)

From UCI Data:

*~7% misclassification error. False positives (marking good mail as spam) are very undesirable. If we insist on zero false positives in the training/testing set, 20-25% of the spam passed through the filter.*

In [9]:
# Pair each true label with the model's prediction for the same message.
test_results = pd.DataFrame({'test_y': test_y, 'predictions': predictions})
predicted = test_results['predictions']
actual = test_results['test_y']

# False positive ("marking good mail as spam"): the prediction exceeds the truth.
test_results['False Positive'] = predicted.gt(actual)
# False negative: the prediction is below the truth.
test_results['False Negative'] = predicted.lt(actual)
test_results['Correct'] = predicted.eq(actual)

In [10]:
# Both figures are shares of the *whole* test set (errors / all test rows),
# not per-class rates, so together with the accuracy they add up to 100%.
n_test = len(test_results)
false_pos_rate = test_results['False Positive'].sum() / n_test
false_neg_rate = test_results['False Negative'].sum() / n_test

report_template = """
Results:
Accuracy: {:.2f}%
False Positive: {:.2f}%
False Negative: {:.2f}%
"""
print(report_template.format(accuracy_rate * 100,
                             false_pos_rate * 100,
                             false_neg_rate * 100))


Results:
Accuracy: 81.04%
False Positive: 9.94%
False Negative: 9.02%



## Advanced Mode

In addition to the normal mode requirements, try reducing or changing your features in order to get better results.

Find another source of spam/not-spam data, break it down into features, and perform the same exercise as above. How well does your algorithm perform on the new data?

In [11]:
# Abandoned experiment, intentionally left disabled: exhaustively score every
# non-empty subset of the 57 feature columns. That is 2**57 - 1 (about 1.4e17)
# model fits, so the loop cannot finish in any practical amount of time.
# NOTE(review): if this is ever revived, the second positional argument of
# pd.DataFrame is `index`, so the last line would use the accuracies as row
# labels rather than as a column.
# combo_accs = []
# combo_names = []
# for i in range(1,58):
#     for combo in combinations(train_x.columns, i):
#         combo = list(combo)
#         test_x_combo = test_x[combo]
#         train_x_combo = train_x[combo]
#         model_combo = MultinomialNB().fit(train_x_combo, train_y)
#         pred_combo = model_combo.predict(test_x_combo)
#         pred_acc = accuracy_score(test_y, pred_combo)
        
#         combo_accs.append(pred_acc)
#         combo_names.append(combo)

# test_comb_accs = pd.DataFrame(combo_names, combo_accs)

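I tried scoring every combination of features above, but with 57 features there are 2^57 − 1 (roughly 1.4 × 10^17) possible subsets, so this was taking longer than I had patience for...
So now I'll investigate looking only at words, or at words and characters.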

In [12]:
# Restrict both splits to the first 48 columns, i.e. the word-frequency features.
train_x_words = train_x.iloc[:, :48]
test_x_words = test_x.iloc[:, :48]

# Refit naive Bayes on the reduced feature set and score it on the test split.
model_words = MultinomialNB()
model_words.fit(train_x_words, train_y)
pred_words = model_words.predict(test_x_words)
pred_words_acc = accuracy_score(y_true=test_y, y_pred=pred_words)

In [13]:
# Display the test-set accuracy of the model trained on the word-frequency columns only.
pred_words_acc

0.86800651819663222

In [14]:
# First 54 columns: the 48 word-frequency and 6 character-frequency features,
# leaving out the three capital-run-length columns.
train_x_words_chars = train_x.iloc[:, :54]
test_x_words_chars = test_x.iloc[:, :54]

# Fit on the reduced training set, predict the test set, and show the accuracy.
model_words_chars = MultinomialNB()
model_words_chars.fit(train_x_words_chars, train_y)
pred_words_chars = model_words_chars.predict(test_x_words_chars)
pred_words_chars_acc = accuracy_score(y_true=test_y, y_pred=pred_words_chars)
pred_words_chars_acc

0.87506789788158612

We get a much higher accuracy score of 87.5% (up from 81.0% with all 57 features) when we exclude the three capital run length metrics and look only at words and characters.

Next, we check what happens when each of the capital run length metrics is added back in, one at a time.

In [15]:
# First 55 columns: the word and character frequencies plus
# capital_run_length_average, the first of the three capital-run-length metrics.
# NOTE: this reuses, and therefore overwrites, the *_words_chars names that the
# 54-column cell defined.
train_x_words_chars = train_x.iloc[:, :55]
test_x_words_chars = test_x.iloc[:, :55]

model_words_chars = MultinomialNB()
model_words_chars.fit(train_x_words_chars, train_y)
pred_words_chars = model_words_chars.predict(test_x_words_chars)
pred_words_chars_acc = accuracy_score(y_true=test_y, y_pred=pred_words_chars)
pred_words_chars_acc

0.87235198261814229

In [16]:
# Feature set: the 54 word/character frequency columns plus
# capital_run_length_longest. The list is built once so that both splits are
# guaranteed to use exactly the same columns.
feature_cols = list(train_x.columns[:54]) + ['capital_run_length_longest']
train_x_words_chars = train_x[feature_cols]
test_x_words_chars = test_x[feature_cols]

# NOTE: reuses, and therefore overwrites, the *_words_chars names set by earlier cells.
model_words_chars = MultinomialNB().fit(train_x_words_chars, train_y)
pred_words_chars = model_words_chars.predict(test_x_words_chars)
pred_words_chars_acc = accuracy_score(test_y, pred_words_chars)
pred_words_chars_acc

0.8359587180879956

In [18]:
# Feature set: the 54 word/character frequency columns plus
# capital_run_length_total. The list is built once so that both splits are
# guaranteed to use exactly the same columns.
feature_cols = list(train_x.columns[:54]) + ['capital_run_length_total']
train_x_words_chars = train_x[feature_cols]
test_x_words_chars = test_x[feature_cols]

# NOTE: reuses, and therefore overwrites, the *_words_chars names set by earlier cells.
model_words_chars = MultinomialNB().fit(train_x_words_chars, train_y)
pred_words_chars = model_words_chars.predict(test_x_words_chars)
pred_words_chars_acc = accuracy_score(test_y, pred_words_chars)
pred_words_chars_acc

0.85279739272134714

As each of these lowered the accuracy score (to 87.2%, 83.6%, and 85.3% respectively, versus 87.5% without them), we will assume that excluding the three capital run length features will lead to a more accurate model.