In [1]:
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

Before running this notebook, the pre-processed Enron-Spam dataset must be downloaded from http://www2.aueb.gr/users/ion/data/enron-spam/
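Each downloaded archive must be extracted into the data/Enron-Spam folder in the project directory, which should then contain six folders named enron1 through enron6 (each with a ham and a spam subfolder). The NLTK resources used below (the English stop-word list, the Punkt tokenizer models, the perceptron part-of-speech tagger and WordNet) must also be installed, e.g. via `nltk.download`.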

We do a little processing on the dataset to get it into a more suitable form:
* Convert all text to lowercase
* Strip all stop words
* Strip all words with <3 characters (possibly unnecessary given the previous step?)
* Strip all punctuation (maybe also numerals?)
* Throw out all forward and reply emails (spam is not generally interacted with, so these aren't interesting for us)

The first thing we need to do is load all the data:

In [2]:
# Remember the directory the notebook was started in, so that re-running this cell
# in the same kernel does not apply the relative path a second time (which would
# point at a non-existent folder and raise FileNotFoundError).
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
# All later relative paths (the dataset folders and "../data.csv") resolve from here.
os.chdir(os.path.join(NOTEBOOK_DIR, "../data/Enron-Spam"))

In [None]:
# Raw email texts and their labels, kept as two parallel lists (same length, same order).
emails = []
isspam = []

# Only keep entries of the working directory that really are dataset folders, i.e. that
# contain both a ham/ and a spam/ subdirectory. os.listdir() also returns stray files
# (downloaded archives, hidden files), which would otherwise crash the loop below.
# Sorting makes the row order reproducible, since os.listdir() order is arbitrary.
folders = sorted(
    entry for entry in os.listdir()
    if os.path.isdir(os.path.join(entry, 'ham')) and os.path.isdir(os.path.join(entry, 'spam'))
)

for folder in folders:
    # Within each folder, all ham emails are read first, then all spam emails.
    for label, spamflag in (('ham', False), ('spam', True)):
        labeldir = os.path.join(folder, label)
        for filename in sorted(os.listdir(labeldir)):
            # The encoding is pinned so decoding does not depend on the platform default;
            # bytes that are not valid UTF-8 are dropped (errors='ignore').
            with open(os.path.join(labeldir, filename), encoding='utf-8', errors='ignore') as emailfile:
                emails.append(emailfile.read())
                isspam.append(spamflag)

Now we put this data into a data frame:

In [None]:
# One row per email: the raw text in 'Email' and its boolean label in 'Spam'.
data = pd.DataFrame(
    data={
        'Email': emails,
        'Spam': isspam,
    }
)

It is now time to process the data. The first thing we do is to throw out forwards and replies so as to avoid having to process these entries.

In [None]:
# "re :" / "fw :" prefixes. The leading \b keeps the pattern from matching the tail of a
# longer word, e.g. "click here :" or "more :", which a plain substring test would accept.
_REPLY_OR_FW_PREFIX = re.compile(r"\b(?:re|fw) :")
# Banner line that introduces the second form of forwarded message in this dataset.
_FORWARD_BANNER = "- - - - - - - - - - - - - - - - - - - - - - forwarded"

def replyorforward(email): # Note that forwards come in two different forms in this dataset, so we must recognise both
    """Return True if the email text looks like a reply or a forward.

    An email is flagged when it contains "re :" or "fw :" as a standalone word
    followed by " :", or when it contains the dashed "forwarded" banner.

    Parameters
    ----------
    email : str
        Full text of one email.

    Returns
    -------
    bool
        True for replies/forwards, False otherwise.
    """
    return (_REPLY_OR_FW_PREFIX.search(email) is not None) or (_FORWARD_BANNER in email)

# Flag each row whose text is a reply or forward, then remove the flagged rows from
# `data` in place (the remaining rows keep their original index labels).
dropmask = [replyorforward(text) for text in data.Email]
data.drop(data.loc[dropmask].index, inplace=True)

Now we clean up and tokenize the remaining entries:

In [None]:
# We use the list of stop words provided by the nltk library
stop = set(stopwords.words('english'))

# The three stages below are lazy iterators: nothing is computed until they are consumed.
loweremail = (text.lower() for text in data.Email)
tokenemail = (word_tokenize(text) for text in loweremail)
# Adds part-of-speech tags to each word - these aren't needed yet, but they will be later
taggedemail = (pos_tag(tokens) for tokens in tokenemail)

def strip(email): # Takes a tokenized email and strips it of stopwords and tokens with length < 3 (which will include punctuation for free)
    """Filter a POS-tagged email, keeping only informative tokens.

    `email` is a sequence of items whose first element is the word text.
    An item is kept when its word is not in `stop` and is longer than two
    characters. Kept items are returned unchanged, in their original order.
    """
    def keep(token):
        text = token[0]
        return (text not in stop) and (len(text) > 2)

    return list(filter(keep, email))

# we also drop the first token, since this is always 'subject'
strippedemail = [strip(tagged)[1:] for tagged in taggedemail]

# Replace the raw text in the Email column with the list of surviving words
# (the first element of each tagged token); the tags stay available in strippedemail.
data.Email = [[token[0] for token in tokens] for tokens in strippedemail]

Now we apply lemmatization.

In [None]:
wnl = WordNetLemmatizer()

# First letter of the part-of-speech tag -> part-of-speech code passed to the lemmatizer.
_TAG_INITIAL_TO_POS = {
    'N': 'n',  # noun
    'V': 'v',  # verb
    'J': 'a',  # adjective
    'R': 'r',  # adverb
}

def lemmatize(taggedword):
    """Return the lemma of a (word, tag) pair.

    The first character of the tag selects the part of speech handed to the
    lemmatizer. Words whose tag starts with any other character are returned
    unchanged.
    """
    word, tag = taggedword
    pos = _TAG_INITIAL_TO_POS.get(tag[0])
    if pos is None: # word is not lemmatizable
        return word
    return wnl.lemmatize(word, pos)
# Lazily lemmatize every tagged token of every stripped email (one list of lemmas per email).
lemmatizedemail = ([lemmatize(taggedword) for taggedword in email] for email in strippedemail)

def stripnotag(email):
    """Filter a list of plain words (no tags), keeping only informative ones.

    A word is kept when it is not in `stop` and is longer than two characters.
    Order is preserved.
    """
    kept = []
    for word in email:
        if (word not in stop) and (len(word) > 2):
            kept.append(word)
    return kept

# we strip again, to remove short/stop word lemmas
strippedlemmatized = (stripnotag(lemmas) for lemmas in lemmatizedemail)

# Materialize the lazy pipeline and store it as a new second column, next to Email.
data.insert(loc=1, column='Lemmatized', value=list(strippedlemmatized))

Finally, we reset the indices (rows were dropped earlier, so the index has gaps) and write the processed data out to a CSV file:

In [None]:
# Renumber the rows 0..n-1 in place, discarding the old index labels.
data.reset_index(inplace=True, drop=True)
# The path is relative to the current working directory. The Email and Lemmatized
# columns hold Python lists, so each cell is written as the list's string form.
data.to_csv(path_or_buf="../data.csv")