# Extract addresses from the Enron dataset

The [Enron Email Dataset](https://www.cs.cmu.edu/~enron/) was collected and prepared by the CALO Project (A Cognitive Assistant that Learns and Organizes).
It contains data from about 150 users, mostly senior management of Enron, organized into folders.
The corpus contains a total of about 0.5M messages. This data was originally made public, and posted to the web, by the Federal Energy Regulatory Commission during its investigation.

This code uses simple regex to extract possible mailing addresses from the Enron dataset, for further inspection and manual tagging of addresses.

While the simple regex approach would probably have false detection and missed addresses, it can help curate a labeled dataset of texts with and without addresses.

In [4]:
import pandas as pd
import re
import glob
from typing import Tuple, Iterator, Dict
import tqdm

Initial setup and detection logic

In [5]:
enron_path = "../data/maildir/"
email_regex = r"^\S+@\S+\.\S+$"

street_hints = ["st\.","street","blvd","boulevard","rd\.","road","ave","avenue","lane","apt\.","apartment","circle","route","ci\.","ct\.","court","pkwy","parkway","freeway","highway","terrace"]

street_regex = f"(?:^|(?<= ))({'|'.join(street_hints)})(?:(?=[ ?.,!])|$)"

# phrases used to ignore specific cases where the regex detected non-address matches
street_phrases_to_ignore = ["wall street","st. mary","st. peter", "st. john","road conditions",
                            "court to","@","federal court","court of appeals","supreme court",
                            "st. lucia","st.  lucia","st. maarten","st. regis","st. clair",
                            "st. louis","st. thomas","road runner","road show","to route","claims court"]
street_phrases_to_ignore.extend([f"a {word}" for word in street_hints]) # ignore "a street" but keep "street"
street_phrases_to_ignore.extend([f"the {word}" for word in street_hints]) #ignore "the street" but keep "street"
street_phrases_to_ignore

['wall street',
 'st. mary',
 'st. peter',
 'st. john',
 'road conditions',
 'court to',
 '@',
 'federal court',
 'court of appeals',
 'supreme court',
 'st. lucia',
 'st.  lucia',
 'st. maarten',
 'st. regis',
 'st. clair',
 'st. louis',
 'st. thomas',
 'road runner',
 'road show',
 'to route',
 'claims court',
 'a st\\.',
 'a street',
 'a blvd',
 'a boulevard',
 'a rd\\.',
 'a road',
 'a ave',
 'a avenue',
 'a lane',
 'a apt\\.',
 'a apartment',
 'a circle',
 'a route',
 'a ci\\.',
 'a ct\\.',
 'a court',
 'a pkwy',
 'a parkway',
 'a freeway',
 'a highway',
 'a terrace',
 'the st\\.',
 'the street',
 'the blvd',
 'the boulevard',
 'the rd\\.',
 'the road',
 'the ave',
 'the avenue',
 'the lane',
 'the apt\\.',
 'the apartment',
 'the circle',
 'the route',
 'the ci\\.',
 'the ct\\.',
 'the court',
 'the pkwy',
 'the parkway',
 'the freeway',
 'the highway',
 'the terrace']

Function for traversing the dataset

In [8]:
def glob_enron(path: str,
               pattern: str,
               a:int=5,
               b:int=5,
               phrases_to_ignore:Tuple[str]=(),
               verbose:bool=False) -> Iterator[Dict]:
    """
    Traverses the Enron dataset and extract subsets of texts with possible addresses
    :param path: Path to the enron root folder
    :param pattern: The regex pattern to look for
    :param a: Number of lines before the pattern match
    :param b: Number of lines after the pattern match
    :param phrases_to_ignore: List of phrases to scan for each match, and ignore the match if found
    :param verbose: True if the function should print to console
    :return: Generator
    """
    
    for file in tqdm.tqdm(glob.glob(f"{path}/**/[0-9]_",recursive=True)):
        with open(file) as f:
            lines = f.readlines()
            for i, line in enumerate(lines):
                for match in re.finditer(pattern, line, flags=re.I):
                    # adjust a and b if this match is in the first or last rows
                    from_line = i-a
                    to_line = i+b
                    if from_line < 0:
                        from_line = 0
                    if to_line >= len(lines):
                        to_line = len(lines) - 1
                    text = "".join(lines[from_line : to_line]).replace("=09"," ").replace("=20"," ").replace("=\n","")
                    found_phrases = [phrase for phrase in phrases_to_ignore if phrase in lines[i].lower()]
                    to_remove = len(found_phrases) > 0
                    if not to_remove:
                        if verbose:
                            print(f"Found on file file:///{file} on line {i+1} : {match.group()}") 
                            print(text)
                            print(f"from:{from_line},to:{to_line}")
                            print("---------------------\n")
                            
                        yield {"file":file, "match": match.group(), "text":text}
                    else:
                        if verbose:
                            print(f"Match {match.group()} removed due to phrases {found_phrases}")


In [9]:
addresses = []
for match in glob_enron(enron_path,
                        street_regex, 
                        phrases_to_ignore=street_phrases_to_ignore,
                        verbose=False):
    addresses.append(match)
print(f"Found {len(addresses)} matching records")

  0%|          | 80/20723 [00:00<03:04, 111.74it/s]


KeyboardInterrupt: 

In [None]:
to_save = list(set([record['text'] for record in addresses]))
print(f"Saving {len(to_save)} unique records")

for i, address_data in enumerate(to_save):
    with open(f"../data/interim/enron_sentences/{i}.txt","w") as f:
        f.writelines(address_data)

### <<<< Annotated records manually with Doccano >>>>

In [10]:
enron_dataset = "../data/processed/enron/dataset.jsonl"

enron_df = pd.read_json(path_or_buf=enron_dataset, lines=True)
enron_df = enron_df.sample(frac=1).set_index("id")
enron_df = enron_df.drop_duplicates(subset='data', keep="last") #shuffle data and remove duplicates
enron_df.head()

Unnamed: 0_level_0,data,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
759,the spread calculations referenced were tied t...,"[[232, 265, GPE]]"
215,\nConsorcio thereafter appealed through a proc...,[]
349,\n\nAngela Papesch\nHead of Asia-Pacific Offic...,"[[104, 157, GPE]]"
167,"Hi Phillip,\n \nThanks for your winning bid on...","[[397, 441, GPE]]"
262,> should be removable to federal court.\n>\n> ...,[]


Dataset size (with and without labels)

In [11]:
print(f"Dataset size: {len(enron_df)}")
no_labels_df = enron_df[enron_df['label'].apply(len)==0]
labels_df = enron_df[enron_df['label'].apply(len)>0]
print(f"Len of samples with no labels: {len(no_labels_df)}, number of samples with labels: {len(labels_df)}")

Dataset size: 654
Len of samples with no labels: 345, number of samples with labels: 309


In [12]:
def extract_value(text, labels):
    extracted = []
    for label in labels:
        extracted.append(text[label[0]:label[1]])
    return extracted

enron_df['values'] = [extract_value(text=row.data,labels=row.label)
    for row in enron_df.itertuples()]

enron_df.head()

Unnamed: 0_level_0,data,label,values
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
759,the spread calculations referenced were tied t...,"[[232, 265, GPE]]","[1400 Smith St.\nHouston, TX 77002]"
215,\nConsorcio thereafter appealed through a proc...,[],[]
349,\n\nAngela Papesch\nHead of Asia-Pacific Offic...,"[[104, 157, GPE]]",[1 Robinson Road - #18-00 AIA Tower - Singapor...
167,"Hi Phillip,\n \nThanks for your winning bid on...","[[397, 441, GPE]]","[141 McCallie Lane\nLookout Mountain, GA 30750]"
262,> should be removable to federal court.\n>\n> ...,[],[]


All address values

In [13]:
all_values = []
for val in enron_df['values']:
    all_values.extend(val)

all_values

['1400 Smith St.\nHouston, TX  77002',
 '1 Robinson Road - #18-00 AIA Tower - Singapore 048542',
 '141 McCallie Lane\nLookout Mountain, GA 30750',
 '285 Hamilton Avenue\n      Palo Alto, CA 94301',
 '1400 Smith Street, EB3801a\nHouston, TX  77002',
 '1400 Smith Street, EB 3888 \nHouston, TX 77002-7361',
 '290 Washington Ave',
 '450 Miramonte Avenue\nPalo Alto, CA 94306',
 '600 Fifth Avenue - 27th Floor\nRockefeller Center\nNew York, NY 10020',
 '5000 Dominion Blvd.\nGlen Allen, Virginia 23060\nInnsbrook 3rd Floor',
 'Sixty State Park Road, Morro Bay',
 '5959 Topanga Canyon Blvd.  Suite 244\nWoodland Hills, CA 91367',
 '424 Church Street, Suite 2800\nNashville, TN  37219-2386',
 '370 17th Street, Suite 4240\nDenver, CO  80202-1370',
 '550 15th Street, Denver',
 'the corner of Michaux and West Cottage',
 '1907 Holcombe Blvd, 77030',
 '21st Street North',
 'Vernon Blvd',
 'Vernon Blvd',
 'Braodway ',
 'Broadway',
 'Vernon Blvd.',
 '4142, Stevenson Blvd,\nApartment No. 1205,\nFremont, Cali

In [14]:
enron_df.to_pickle("../data/processed/enron.pickle")

In [15]:
pd.read_pickle("../data/processed/enron.pickle")

Unnamed: 0_level_0,data,label,values
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
759,the spread calculations referenced were tied t...,"[[232, 265, GPE]]","[1400 Smith St.\nHouston, TX 77002]"
215,\nConsorcio thereafter appealed through a proc...,[],[]
349,\n\nAngela Papesch\nHead of Asia-Pacific Offic...,"[[104, 157, GPE]]",[1 Robinson Road - #18-00 AIA Tower - Singapor...
167,"Hi Phillip,\n \nThanks for your winning bid on...","[[397, 441, GPE]]","[141 McCallie Lane\nLookout Mountain, GA 30750]"
262,> should be removable to federal court.\n>\n> ...,[],[]
...,...,...,...
276,** - GEL ALERT HIGHLIGHTS: Employee Benefits: ...,[],[]
686,have been applied until the case was removed t...,[],[]
11,"\nJeff Youngflesh\nDirector, Business Developm...","[[86, 131, GPE]]","[333 Clay Street, 11th Floor\nHouston, TX 77002]"
775,X-bcc: \nX-Folder: \Steven_Kean_Dec2000_1\Note...,[],[]
