In [89]:
import nltk
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from spamassassin_client import SpamAssassin
from nltk.tokenize.treebank import TreebankWordDetokenizer
# nltk.word_tokenize (used in later cells) needs the Punkt tokenizer models in
# addition to the stopword list. Newer NLTK releases look for them under
# 'punkt_tab' and older ones under 'punkt', so request both; a resource that
# is already present is simply reported as up-to-date.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
# Seed NumPy's global RNG so the sampling done in later cells is repeatable on
# a fresh top-to-bottom run.
np.random.seed(42)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickhansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [90]:
# Load the three pre-formatted corpora, one CSV per corpus.
enron_df = pd.read_csv('data/formattedData/enronFormatted.csv')
ling_df = pd.read_csv('data/formattedData/lingFormatted.csv')
sacorp_df = pd.read_csv('data/formattedData/SAcorpusFormatted.csv')

# Partition every corpus on its 'label' column: rows labelled 1 go to the
# *_spam frame, rows labelled 0 go to the *_ham frame.
enron_spam, enron_ham = enron_df[enron_df['label'].eq(1)], enron_df[enron_df['label'].eq(0)]
ling_spam, ling_ham = ling_df[ling_df['label'].eq(1)], ling_df[ling_df['label'].eq(0)]
sacorp_spam, sacorp_ham = sacorp_df[sacorp_df['label'].eq(1)], sacorp_df[sacorp_df['label'].eq(0)]

# Cell output: (spam rows, ham rows) for each corpus.
(len(enron_spam), len(enron_ham)), (len(ling_spam), len(ling_ham)), (len(sacorp_spam), len(sacorp_ham))

((5000, 5000), (433, 2172), (347, 1891))

In [91]:
sampleSize = 347  # per-class draw used for the SpamAssassin corpus below

# Draw the same number of spam and ham rows from each corpus so that every
# corpus contributes a class-balanced subset.
enron_spam_sample = enron_spam.sample(1000)
enron_ham_sample = enron_ham.sample(1000)

ling_spam_sample = ling_spam.sample(433)
ling_ham_sample = ling_ham.sample(433)

sacorp_spam_sample = sacorp_spam.sample(sampleSize)
sacorp_ham_sample = sacorp_ham.sample(sampleSize)

# combine the datasets and shuffle the rows
data = pd.concat([enron_spam_sample, enron_ham_sample, ling_spam_sample, ling_ham_sample, sacorp_spam_sample, sacorp_ham_sample]).sample(frac=1).reset_index(drop=True)
print(f'Original Dataset Size: {data.shape}')
# n equals the full row count, so this is a second shuffle rather than a
# subsample. Deriving it from the frame keeps the call valid whenever the
# per-corpus draw sizes above change; lower n here to work on a smaller set.
data = data.sample(n=len(data)).reset_index(drop=True)
print(f'Sampled Size: {data.shape}')

Original Dataset Size: (3560, 2)
Sampled Size: (3560, 2)


In [92]:
def remove_header(text):
    """Strip the e-mail header, keeping only the subject text and the body.

    The header is everything before the first blank line ("\\n\\n"). Only the
    first header line is inspected for a subject: whatever follows the first
    "Subject: " marker on that line is kept, so the heading's semantic
    information survives. All other header lines are discarded.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        ``subject + "\\n" + body`` with both parts stripped of surrounding
        whitespace. A message with no blank-line separator has an empty body,
        and a first line without "Subject: " yields an empty subject, instead
        of raising ``IndexError``.
    """
    # partition() always returns three parts, so a missing separator simply
    # leaves the body empty rather than producing a too-short list.
    header, _, body = text.partition("\n\n")

    first_header_line = header.split("\n", 1)[0]
    # When the marker is absent the third element is "", i.e. an empty subject.
    subj_content = first_header_line.partition("Subject: ")[2].strip()
    body_content = body.strip()

    return subj_content + "\n" + body_content

In [93]:
# Pair every message (header stripped, subject kept) with its label.
data_tuples = [
    (remove_header(raw_text), raw_label)
    for raw_text, raw_label in zip(data['text'], data['label'])
]

In [94]:
def get_one_to_n_grams(words, n=2):
    """Return every k-gram of ``words`` for k = 1 .. n.

    Parameters
    ----------
    words : iterable
        Token sequence. It is copied into a list first, so a one-shot iterable
        (generator, iterator) yields grams of every order rather than being
        exhausted by the unigram pass.
    n : int, default 2
        Largest gram order to produce. ``n < 1`` gives an empty list.

    Returns
    -------
    list of tuple
        All unigrams in order, then all bigrams, and so on up to ``n``; each
        gram is a tuple of consecutive tokens.
    """
    tokens = list(words)
    n_grams = []
    for size in range(1, n + 1):
        # Zipping `size` staggered slices yields each run of `size`
        # consecutive tokens; zip stops at the shortest slice, so no
        # incomplete gram is produced at the tail.
        n_grams.extend(zip(*(tokens[offset:] for offset in range(size))))
    return n_grams

def is_alpha_ngram(ngram):
    """Return True when every token in ``ngram`` passes ``str.isalpha()``.

    An empty n-gram is vacuously alphabetic.
    """
    return all(token.isalpha() for token in ngram)

In [95]:
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))

# Vocabulary of every purely alphabetic unigram and bigram (as lower-cased
# token tuples) that occurs anywhere in the data set.
gram_vocab = set()
for text, label in data_tuples:
    tokens = [tok.lower() for tok in nltk.word_tokenize(text)]
    gram_vocab.update(filter(is_alpha_ngram, get_one_to_n_grams(tokens, n=2)))

In [96]:
def get_features(text, n=1):
    """Build the presence-feature dict for one message.

    The text is tokenized with ``nltk.word_tokenize`` and lower-cased, and
    every 1..n-gram that also appears in the module-level ``gram_vocab`` is
    turned into a ``{gram: True}`` entry.

    Parameters
    ----------
    text : str
        Message text.
    n : int, default 1
        Largest gram order to emit as a feature. The default keeps the
        unigram-only behaviour; pass ``n=2`` to also use the bigrams that
        ``gram_vocab`` contains.

    Returns
    -------
    dict
        Maps each in-vocabulary gram (a tuple of tokens) to ``True``.
    """
    words = [w.lower() for w in nltk.word_tokenize(text)]
    grams = get_one_to_n_grams(words, n=n)
    return {g: True for g in grams if g in gram_vocab}

In [97]:
# Turn each (text, label) pair into a (feature dict, label) pair for NLTK.
feature_tuples = []
for email_text, email_label in data_tuples:
    feature_tuples.append((get_features(email_text), email_label))

In [98]:
# Hold out the first third of the examples for testing and train on the rest.
splitPoint = len(feature_tuples) // 3
test = feature_tuples[:splitPoint]
train = feature_tuples[splitPoint:]
len(train), len(test)

(2374, 1186)

In [99]:
# Fit a Naive Bayes model on the training split ...
classifier = nltk.NaiveBayesClassifier.train(train)
# ... then score it on the held-out split and report the accuracy.
test_accuracy = nltk.classify.accuracy(classifier, test)
print("Accuracy: ", test_accuracy)

Accuracy:  0.9595278246205734


In [100]:
# Print the classifier's 10 most informative features.
classifier.show_most_informative_features(n=10)

Most Informative Features
        ('linguistics',) = True                0 : 1      =     95.3 : 1.0
                ('ect',) = True                0 : 1      =     82.5 : 1.0
          ('forwarded',) = True                0 : 1      =     46.7 : 1.0
              ('vince',) = True                0 : 1      =     45.5 : 1.0
      ('advertisement',) = True                1 : 0      =     37.3 : 1.0
             ('syntax',) = True                0 : 1      =     36.7 : 1.0
            ('grammar',) = True                0 : 1      =     35.4 : 1.0
           ('deadline',) = True                0 : 1      =     33.3 : 1.0
           ('abstract',) = True                0 : 1      =     32.7 : 1.0
           ('mailings',) = True                1 : 0      =     32.3 : 1.0


In [101]:
def show_most_informative_features_in_list(classifier, n=10):
    """
    Return the classifier's ``n`` most informative features as a list of
    ``[feature_name, predominant_label]`` pairs.

    For each (name, value) pair reported by
    ``classifier.most_informative_features(n)``, the predominant label is the
    one under which that value is most probable, considering only labels
    whose distribution has actually seen the value.
    """
    # Per-(label, feature name) probability distributions over feature values.
    prob_by_label_and_feature = classifier._feature_probdist
    pairs = []
    for feat_name, feat_value in classifier.most_informative_features(n):
        # Labels whose distribution contains this feature value.
        candidates = [
            lbl for lbl in classifier._labels
            if feat_value in prob_by_label_and_feature[lbl, feat_name].samples()
        ]
        # Ascending by P(value | label); the last entry is the predominant label.
        candidates.sort(key=lambda lbl: prob_by_label_and_feature[lbl, feat_name].prob(feat_value))
        pairs.append([feat_name, candidates[-1]])
    return pairs

In [102]:
# Of the 500 most informative features, keep those whose predominant label is 1.
top_features = show_most_informative_features_in_list(classifier, n=500)
spam_features = [pair for pair in top_features if pair[1] == 1]
len(spam_features)

232

In [103]:
# Drop the label and keep just the gram (feature name) of each pair.
spammy_grams = [feature for feature, _label in spam_features]
spammy_grams[:10]

[('advertisement',),
 ('mailings',),
 ('viagra',),
 ('earning',),
 ('mlm',),
 ('advertising',),
 ('featured',),
 ('php',),
 ('tips',),
 ('websites',)]

In [104]:
# write the spammy words to a file, one gram per line with its tokens joined
# by commas. The encoding is pinned to UTF-8 because the vocabulary filter
# (str.isalpha) admits non-ASCII letters, and the platform default encoding
# may be unable to represent them.
with open('spammy_grams.txt', 'w', encoding='utf-8') as f:
    f.writelines(','.join(gram) + '\n' for gram in spammy_grams)