In [77]:
import nltk
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from spamassassin_client import SpamAssassin
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickhansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
# get all of the formatted data
enron_df, ling_df, sacorp_df = pd.read_csv('data/formattedData/enronFormatted.csv'), pd.read_csv('data/formattedData/lingFormatted.csv'), pd.read_csv('data/formattedData/SAcorpusFormatted.csv')

enron_spam = enron_df[enron_df['label'] == 1]
enron_ham = enron_df[enron_df['label'] == 0]

ling_spam = ling_df[ling_df['label'] == 1]
ling_ham = ling_df[ling_df['label'] == 0]

sacorp_spam = sacorp_df[sacorp_df['label'] == 1]
sacorp_ham = sacorp_df[sacorp_df['label'] == 0]

(enron_spam.shape[0], enron_ham.shape[0]), (ling_spam.shape[0], ling_ham.shape[0]), (sacorp_spam.shape[0], sacorp_ham.shape[0])

((5000, 5000), (433, 2172), (347, 1891))

In [79]:
# take 300 ham and 300 spam from each dataset
sampleSize = 347

enron_spam_sample = enron_spam.sample(5000)
enron_ham_sample = enron_ham.sample(5000)

ling_spam_sample = ling_spam.sample(433)
ling_ham_sample = ling_ham.sample(2172)

sacorp_spam_sample = sacorp_spam.sample(347)
sacorp_ham_sample = sacorp_ham.sample(1891)

# combine the datasets
data = pd.concat([enron_spam_sample, enron_ham_sample, ling_spam_sample, ling_ham_sample, sacorp_spam_sample, sacorp_ham_sample])
data = data.sample(frac=1).reset_index(drop=True) # shuffle the data
data.shape

(14843, 2)

In [80]:
def remove_header(text):
    return text.split('\n\n', 1)[1]

In [81]:
data_tuples = list(zip(data['text'], data['label']))
data_tuples = [(remove_header(text), label) for (text, label) in data_tuples]

In [82]:
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))

vocab = set()
for text, label in data_tuples:
    for word in nltk.word_tokenize(text):
        w_l = word.lower()
        if w_l.isalpha() and word not in STOP_WORDS: # TODO: should also check if in stopwords?
            vocab.add(w_l)

In [None]:
def get_features(text):
    rec_words = [w.lower() for w in nltk.word_tokenize(text)]
    features = {}
    for w in rec_words:
        if w in vocab:
            features[w] = True
    return features

In [None]:
feature_tuples = [(get_features(text), label) for (text, label) in data_tuples]

In [None]:
splitPoint = len(feature_tuples) // 3
train, test = feature_tuples[splitPoint:], feature_tuples[:splitPoint]
len(train), len(test)

(1388, 694)

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train)
# now, it is tested on the test set and the accuracy reported
print("Accuracy: ", nltk.classify.accuracy(classifier, test)) #nltk.classify

Accuracy:  0.9265129682997119


In [None]:
classifier.show_most_informative_features(10)

Most Informative Features
                language = True                0 : 1      =     53.3 : 1.0
                mailings = True                1 : 0      =     44.1 : 1.0
                 amazing = True                1 : 0      =     28.8 : 1.0
                  topics = True                0 : 1      =     27.2 : 1.0
                   wrote = True                0 : 1      =     26.0 : 1.0
                evidence = True                0 : 1      =     24.5 : 1.0
                     ect = True                0 : 1      =     23.8 : 1.0
                 science = True                0 : 1      =     23.8 : 1.0
                     sep = True                0 : 1      =     23.1 : 1.0
                   fresh = True                1 : 0      =     22.9 : 1.0


In [None]:
def show_most_informative_features_in_list(classifier, n=10):
    """
    Return a nested list of the "most informative" features 
    used by the classifier along with it's predominant labels
    """
    cpdist = classifier._feature_probdist       # probability distribution for feature values given labels
    feature_list = []
    for (fname, fval) in classifier.most_informative_features(n):
        def labelprob(l):
            return cpdist[l, fname].prob(fval)
        labels = sorted([l for l in classifier._labels if fval in cpdist[l, fname].samples()], 
                        key=labelprob)
        feature_list.append([fname, labels[-1]])
    return feature_list

In [None]:
spam_features = [x for x in show_most_informative_features_in_list(classifier, n=500) if x[1] == 1]
len(spam_features)

254

In [None]:
spammy_words = [x[0] for x in spam_features]
spammy_words[:10]

['mailings',
 'amazing',
 'fresh',
 'toll',
 'postal',
 'instructions',
 'proven',
 'advertise',
 'removal',
 'engines']