In [1]:
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from nltk import tokenize

We do a little processing on the dataset to get it into a more suitable form:
* Convert all text to lowercase
* Strip all stop words
* Strip all words with <3 characters (possibly unnecessary given the previous step?)
* Strip all punctuation (maybe also numerals?)
* Throw out all forward and reply emails (spam is not generally interacted with, so these aren't interesting for us)

The first thing we need to do is load all the data:

In [2]:
# Work from inside the dataset directory; every later relative path is resolved against it.
# NOTE: the path is relative, so this cell is only valid once per kernel session.
DATASET_DIR = "data/Enron-Spam"
os.chdir(DATASET_DIR)

In [3]:
def load_labelled_emails(folders):
    """Read every ham and spam email found under the given folders.

    Each folder is expected to contain a ``ham`` and/or a ``spam``
    sub-directory holding one email per file. Entries that are not laid out
    that way (stray files, folders without those sub-directories) are skipped
    instead of aborting the whole load.

    Parameters
    ----------
    folders : iterable of str
        Paths of the folders to scan, visited in the order given.

    Returns
    -------
    (list of str, list of bool)
        The email texts and, in the same order, whether each one is spam.
        Within a folder all ham comes before all spam, and files are read in
        sorted filename order so the result is reproducible.
    """
    texts = []
    labels = []
    for folder in folders:
        for subdir, spam_flag in (('ham', False), ('spam', True)):
            directory = os.path.join(folder, subdir)
            if not os.path.isdir(directory):
                continue  # not a dataset folder, or this class is absent
            # os.listdir returns names in arbitrary order; sort for determinism
            for filename in sorted(os.listdir(directory)):
                path = os.path.join(directory, filename)
                if not os.path.isfile(path):
                    continue
                # errors='ignore' drops bytes that do not decode in the default encoding
                with open(path, errors='ignore') as handle:
                    texts.append(handle.read())
                    labels.append(spam_flag)
    return texts, labels


folders = sorted(os.listdir())
emails, isspam = load_labelled_emails(folders)

Now we put this data into a data frame:

In [4]:
# One row per email: the raw text plus its spam/ham label
data = pd.DataFrame(dict(Email=emails, Spam=isspam))

It is now time to process the data. The first thing we do is to throw out forwards and replies so as to avoid having to process these entries.

In [5]:
# Subject prefixes that mark replies ("re :") and forwards ("fw :").
# The leading \b keeps them from matching inside longer words such as "here :" or "software :".
_REPLY_OR_FORWARD_PREFIX = re.compile(r"\b(?:re|fw) :")
# Banner that introduces the second kind of forward found in this dataset
_FORWARD_BANNER = "- - - - - - - - - - - - - - - - - - - - - - forwarded"

def replyorforward(email):
    """Return True if the email text looks like a reply or a forward.

    Forwards come in two different forms in this dataset, so both are
    recognised: the "fw :" prefix and the dashed "forwarded" banner.
    Replies are recognised by the "re :" prefix.

    Parameters
    ----------
    email : str
        The full text of one email.

    Returns
    -------
    bool
        True when any reply/forward marker is present, False otherwise.
    """
    if _REPLY_OR_FORWARD_PREFIX.search(email) is not None:
        return True
    return _FORWARD_BANNER in email

# Flag every reply/forward, then remove the flagged rows from `data` in place
is_reply_or_forward = [replyorforward(text) for text in data.Email]
data.drop(index=data[is_reply_or_forward].index, inplace=True)

Now we clean up and tokenize the remaining entries:

In [6]:
# English stop words supplied by the nltk library, held in a set for fast membership tests
stop = set(stopwords.words('english'))
# Both stages are lazy iterators: nothing is lower-cased or tokenized until they are consumed
loweremail = (text.lower() for text in data.Email)
tokenemail = (tokenize.word_tokenize(text) for text in loweremail)

def strip(email, min_length=3, stop_words=None):
    """Remove stop words, short tokens and pure-punctuation tokens from a tokenized email.

    The length filter alone removes single punctuation marks, but longer
    runs such as "..." or "---" would pass it, so tokens without any
    alphanumeric character are dropped explicitly.

    Parameters
    ----------
    email : iterable of str
        The tokens of one email.
    min_length : int, optional
        Shortest token length that is kept (default 3).
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``stop`` set.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stop
    return [
        word
        for word in email
        if word not in stop_words
        and len(word) >= min_length
        and any(char.isalnum() for char in word)
    ]

# Strip each tokenized email and discard its leading token, which is always 'subject'
strippedemail = (strip(tokens)[1:] for tokens in tokenemail)

# Consuming the generator here runs the whole lower-case -> tokenize -> strip pipeline
data['Email'] = list(strippedemail)

Finally, we reset the indices and write the processed data out to a CSV file:

In [None]:
# Renumber the rows 0..n-1 (dropping rows earlier left gaps in the index), then save.
# The output path is relative to the current working directory.
OUTPUT_CSV = "../data.csv"
data.reset_index(inplace=True, drop=True)
data.to_csv(OUTPUT_CSV)