In [2]:
# Spam filtering tutorial
import os  # this library enables file system operations

import numpy as np  # needed by the data-loading cells, so import it before they run

In [110]:
# Read the training data: collect the body text and the class label of each email.
# The body of an email is the third line of its file (line index 2).
# Files whose names begin with 'spm' are spam and get label 1; all others get label 0.
# Once this cell has run, training_data (list of body strings) and train_labels
# (float array of 0/1) line up index-for-index and are ready for training a model.
train_dir = 'Data/Spam/train-mails/'
email_files = [os.path.join(train_dir, f) for f in os.listdir(train_dir)]
training_data = []
spam_flags = []
for mail in email_files:
    body = None
    with open(mail) as m:
        for i, line in enumerate(m):
            if i == 2:  # body of the email is the 3rd line of the text file
                body = line
                break  # nothing after the body line is needed
    if body is None:
        # The file has fewer than three lines. Skip it completely so that the
        # texts and the labels can never drift out of step with each other.
        continue
    training_data.append(body)
    # Test the bare file name instead of a fixed position in the split path,
    # so the labelling still works if train_dir is moved to another depth.
    spam_flags.append(os.path.basename(mail).startswith('spm'))
train_labels = np.array(spam_flags, dtype=float)

In [112]:
# Import functions to convert text to a numerical matrix representation
# Each row of the matrix will represent an email
# Each column represents a word. The words are generated from the full set of emails.
# The no_features variable is used to decide how many words to include in the model
# This should be experimented with to find the best results
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 200  # handed to TfidfVectorizer as max_features in the next step

In [117]:
# In this case we are using the TFIDF representation of the words.
# max_df / min_df drop words that appear in more than 95% of the emails or in
# fewer than 2 emails; max_features keeps at most no_features words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')

# Learn the vocabulary from the training emails and convert them to a TFIDF matrix
train_tfidf = tfidf_vectorizer.fit_transform(training_data)

# List of the unique words used as features. Particularly useful if max_features
# has been used because you can see which words were retained.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns an array, converted here to keep a plain list.
tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())

In [106]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC

In [None]:
# Build two classifiers so their results can be compared side by side:
#   model1 - Multinomial Naive Bayes, a probabilistic approach
#   model2 - LinearSVC, a Support Vector Machine classifier
model1, model2 = MultinomialNB(), LinearSVC()

# Train both on the same TFIDF matrix and labels
for classifier in (model1, model2):
    classifier.fit(train_tfidf, train_labels)

In [119]:
# Read the test data: collect the body text and the class label of each email.
# The body of an email is the third line of its file (line index 2).
# Files whose names begin with 'spm' are spam and get label 1; all others get label 0.
# test_data (list of body strings) and test_labels (float array of 0/1) line up
# index-for-index.
test_dir = 'Data/Spam/test-mails/'
email_files = [os.path.join(test_dir, f) for f in os.listdir(test_dir)]
test_data = []
spam_flags = []
for mail in email_files:
    body = None
    with open(mail) as m:
        for i, line in enumerate(m):
            if i == 2:  # body of the email is the 3rd line of the text file
                body = line
                break  # nothing after the body line is needed
    if body is None:
        # The file has fewer than three lines. Skip it completely so that the
        # texts and the labels can never drift out of step with each other.
        continue
    test_data.append(body)
    # Test the bare file name instead of a fixed position in the split path,
    # so the labelling still works if test_dir is moved to another depth.
    spam_flags.append(os.path.basename(mail).startswith('spm'))
test_labels = np.array(spam_flags, dtype=float)

In [128]:
# A confusion matrix is a handy way to check how well a classifier performs.
# For binary classification, with matrix C indexed as C[true label, predicted label]:
#     C[0,0] = true negatives      C[0,1] = false positives
#     C[1,0] = false negatives     C[1,1] = true positives
from sklearn.metrics import confusion_matrix

# Convert the test emails with the vocabulary learned from the training emails
test_tfidf = tfidf_vectorizer.transform(test_data)

# Predict with each trained model, Naive Bayes first and then the SVM
result1, result2 = (model.predict(test_tfidf) for model in (model1, model2))

nb_confusion = confusion_matrix(test_labels, result1)
svm_confusion = confusion_matrix(test_labels, result2)
print("NaiveBayes Results:\n", nb_confusion, "\n")
print("Support Vector Machine Results:\n", svm_confusion)

NaiveBayes Results:
 [[128   2]
 [  8 122]] 

Support Vector Machine Results:
 [[129   1]
 [  4 126]]


In [122]:
# The Pipeline class saves a lot of typing by chaining the vectorizer and the
# classifier into a single object that is fitted and used in one go.
# Doing the steps separately above was useful for learning purposes.
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and prediction can be chained.
# The predictions use the same numeric labels as train_labels: 1 = spam, 0 = ham.
predictions = pipeline.fit(training_data, train_labels).predict(test_data)
print("Pipeline Results:\n", confusion_matrix(test_labels, predictions))

[[129   1]
 [  6 124]]


In [138]:
# Cross validation is an important technique for avoiding overfitting a trained model.
# Overfitting means that the model works very well for a training set because it has been
# tuned very finely to get good results for it, but performs badly when tried on new data.
# Cross validation runs the pipeline several times with different training and validation sets.

# KFold lives in sklearn.model_selection; the old sklearn.cross_validation module
# was removed in scikit-learn 0.20.
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

# Folds are taken in the order of training_data (KFold does not shuffle by default)
k_fold = KFold(n_splits=6)
scores = []
confusion = np.zeros((2, 2), dtype=int)
for train_indices, test_indices in k_fold.split(training_data):
    train_text = [training_data[i] for i in train_indices]
    train_y = train_labels[train_indices]

    test_text = [training_data[i] for i in test_indices]
    test_y = train_labels[test_indices]

    # The pipeline is refitted on every fold, so after this cell it holds the
    # model trained on the last fold's training split only.
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    # labels=[0, 1] forces a 2x2 matrix even when a fold contains a single
    # class; a smaller matrix would be broadcast into every cell of the total.
    confusion += confusion_matrix(test_y, predictions, labels=[0, 1])
    score = f1_score(test_y, predictions, pos_label=1)
    scores.append(score)

print('Total emails classified:', len(training_data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:\n', confusion)


Total emails classified: 702
Score: 0.986266897092
Confusion matrix:
 [[344   7]
 [  3 348]]
