In [18]:
import codecs
import os
import random

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

First we need to retrieve emails. Let's store file contents in a list of strings:

In [19]:
def get_training_files(dir):
    """Collect the paths of all spam/ham data files below a folder.

    The folder is walked recursively with ``os.walk``. A file is kept when
    its name (not its directory path) contains the substring ``'spam'`` or
    ``'ham'``.

    Args:
        dir: Root folder to search.

    Returns:
        A list of paths, each built by joining the containing folder with
        the file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(dir)
        for filename in filenames
        if 'spam' in filename or 'ham' in filename
    ]

all_files = 'data'
emails = []
for f in get_training_files(all_files):
    # Built-in open() replaces the legacy codecs.open(). newline='' keeps line
    # endings untranslated, so the replacements below see the same raw
    # '\r' / '\n' characters as before. Undecodable bytes are dropped
    # (errors='ignore') instead of raising.
    with open(f, 'r', encoding='utf-8', errors='ignore', newline='') as fdata:
        # Flatten each email onto one line: newlines become spaces and
        # carriage returns are removed.
        emails.append(fdata.read().replace('\n', ' ').replace('\r', ''))

# Show the first two emails as a quick sanity check of the loaded data.
print(emails[:2])

['Subject: christmas tree farm pictures ', 'Subject: vastar resources , inc . gary , production from the high island larger block a - 1 # 2 commenced on saturday at 2 : 00 p . m . at about 6 , 500 gross . carlos expects between 9 , 500 and 10 , 000 gross for tomorrow . vastar owns 68 % of the gross production . george x 3 - 6992 - - - - - - - - - - - - - - - - - - - - - - forwarded by george weissman / hou / ect on 12 / 13 / 99 10 : 16 am - - - - - - - - - - - - - - - - - - - - - - - - - - - daren j farmer 12 / 10 / 99 10 : 38 am to : carlos j rodriguez / hou / ect @ ect cc : george weissman / hou / ect @ ect , melissa graves / hou / ect @ ect subject : vastar resources , inc . carlos , please call linda and get everything set up . i \' m going to estimate 4 , 500 coming up tomorrow , with a 2 , 000 increase each following day based on my conversations with bill fischer at bmar . d . - - - - - - - - - - - - - - - - - - - - - - forwarded by daren j farmer / hou / ect on 12 / 10 / 99 10 

For Named Entity Recognition, we will use **spaCy**. Its NER model was trained on the OntoNotes 5 corpus. The problem with our emails is that they are already all lowercased, which makes certain entities, such as person names, very difficult to find.

In [24]:
nlp = en_core_web_sm.load()
# Seed the RNG so that, given the same file order, a fresh
# Restart & Run All displays the same sample of emails.
random.seed(42)
# NOTE: this shuffles `emails` in place; re-running this cell on its own
# reshuffles the already-shuffled list.
random.shuffle(emails)
for email in emails[:10]:
    doc = nlp(email)
    print('Randomly selected email: ')
    print(email)
    print('\n\n')
    print('Named entities found: ')
    # Each entity is shown as a (text, label) pair.
    print([(ent.text, ent.label_) for ent in doc.ents])
    print('\n\n')

Randomly selected email: 
Subject: now you can diversify the acts in your bedroom ! cialis drug information : an online resource on cialis , a new fda approved impotence drug we all have strength enough to endure the misfortunes of others . blessed are those who can give without remembering , and take without forgetting . doubt is not a pleasant condition , but certainty is absurd . this hath not offended the king .



Named entities found: 
[]



Randomly selected email: 
Subject: next wednesday ' s interviews hi bill , amy asked that i set up three interviews with you for wednesday of next week . please let me know your availablity that day and i will go ahead and get started on the scheduling . thanks ! - grace



Named entities found: 
[('next wednesday', 'DATE'), ('three', 'CARDINAL'), ('wednesday', 'DATE'), ('next week', 'DATE'), ('that day', 'DATE')]



Randomly selected email: 
Subject: actual numbers of staff sorry about the delay i have been waiting for info from new york and