In [1]:
import numpy as np
import pandas as pd

import os
import re
import sys
import pyprojroot

# Resolve the project root, using the presence of a ".git" entry as the marker.
proj_root = pyprojroot.find_root(pyprojroot.has_file(".git"))
# Make <project root>/code importable so that `utils` (imported just below) can be found.
sys.path.append(os.path.join(proj_root, 'code'))

from utils import parallel_apply

In [2]:
# ! pip install pyprojroot
# ! python -m nltk.downloader -d ../../data/nltk_data all

import nltk
# Add the locally downloaded NLTK data directory to NLTK's search path.
# NOTE(review): this path is relative to the current working directory, whereas the
# parquet path below is built from proj_root - confirm the notebook is always run
# from its own folder.
nltk.data.path.append('../../data/nltk_data/')

Load the emails

In [3]:
# Read the Enron email table from the parquet file under <project root>/data.
# Later cells use its 'Subject', 'Content', 'From' and 'Date' columns.
df_emails_path = os.path.join(proj_root, "data/enron_emails.parquet")
df_emails = pd.read_parquet(df_emails_path)

Trim quoted replies and drop forwarded emails

In [4]:
# Lower-case the subject once, then flag replies ("re:") and forwards ("fw:" / "fwd:").
# The markers are matched anywhere in the subject, not only at its start.
subject_lower = df_emails['Subject'].str.lower()

re_emails = subject_lower.str.contains("re:")
fw_emails_1 = subject_lower.str.contains("fw:")
fw_emails_2 = subject_lower.str.contains("fwd:")
fw_emails = fw_emails_1 | fw_emails_2

In [5]:
# clip forwarded emails and replied emails
# Remove some other random things, like Yahoo! and Blackberry signatures
def trim_reply_emails(email):
    """Cut an email body off at the first sign of quoted or boilerplate text.

    The markers below are regular expressions and are tried one after another.
    Whenever a marker is found, everything from its first match onward is
    discarded and the remainder is stripped of surrounding whitespace before
    the next marker is tried.

    Parameters
    ----------
    email : str
        Raw email body.

    Returns
    -------
    str
        The body with reply/forward headers and Yahoo!/BlackBerry signatures
        removed; unchanged if no marker is present.
    """
    # Order matters: each cut (and strip) is applied before the next search.
    markers = (
        "[- ]*Original Message",
        "[- ]*Forwarded ",
        "From:\t",
        "To:\t",
        "To: ",
        "Do You Yahoo!?",
        "[- ]*Sent from my BlackBerry",
    )
    for marker in markers:
        found = re.search(marker, email)
        if found is None:
            continue
        email = email[:found.start()].strip()
    return email

In [6]:
# Inspect the forwarded-email mask (True where the subject contains "fw:" or "fwd:").
fw_emails

0          True
1         False
2         False
3         False
4         False
          ...  
517396    False
517397    False
517398    False
517399    False
517400    False
Name: Subject, Length: 517401, dtype: bool

In [7]:
%%time

# Run the trimming function over the body of every email that is not a forward.
non_forwarded_content = df_emails.loc[~fw_emails, 'Content']
good_emails = parallel_apply(non_forwarded_content, trim_reply_emails)

# Keep only the emails that still contain text after trimming.
has_text = good_emails.str.len() > 0
good_emails = good_emails[has_text]

Processing:   0%|          | 0/32 [00:00<?, ?it/s]

CPU times: user 882 ms, sys: 1.21 s, total: 2.09 s
Wall time: 9.98 s


In [8]:
# Inspect the trimmed, non-empty email bodies (indexed by the original row label).
good_emails

1         We'll be marketing power from their Kenansvill...
2         Kay,\nI have everything I need with the except...
3         I gave Raimund his number and asked him to fol...
4         \nMany of you have asked whether you should co...
5         I talked with Julie Smith right after lunch ab...
                                ...                        
517396    For West:\nRisk(give these people access to al...
517397                                            done.\nPL
517398                                            done.\nPL
517399    Add Steve South to the Trader list and remove ...
517400    I forwarded this to Scott Palmer, he is the ne...
Length: 445658, dtype: object

In [9]:
# Put the trimmed text in an "Email Trimmed" column and left-join the other columns
# of the loaded table (everything except a column named 'Email') on the row index.
# NOTE(review): the body column read earlier is 'Content' - confirm 'Email' is the
# column that was meant to be excluded here.
other_columns = [col for col in df_emails.columns if col != 'Email']
trimmed_frame = good_emails.rename("Email Trimmed").to_frame()
df_emails = trimmed_frame.join(df_emails[other_columns], how='left')

Get the emails SENT by the POIs and Execs

Note that the emails found in someone's mailbox aren't good enough: a mailbox also contains emails received from other people, not only the emails that person sent. We therefore select emails by their sender address.

In [10]:
def find_possible_email_addresses(df_emails, name):
    """Return the distinct sender addresses that contain ``name``.

    Parameters
    ----------
    df_emails : pandas.DataFrame
        Email table with a 'From' column of sender addresses.
    name : str
        Text to look for (typically a surname). It is lower-cased and matched
        as a literal substring of each 'From' value.

    Returns
    -------
    array-like
        Unique 'From' values containing the lower-cased name, in order of
        first appearance.
    """
    needle = name.lower()
    # regex=False: the name is plain text, so characters such as "." are literal.
    # na=False: rows with a missing sender do not match instead of producing an
    # NA mask, which boolean indexing would reject.
    matches = df_emails['From'].str.contains(needle, regex=False, na=False)
    return df_emails.loc[matches, 'From'].unique()

In [11]:
# Surnames of the persons of interest (POIs).
# NOTE(review): poi_names, exec_names and exec_300_names are not referenced by the
# later cells shown in this notebook; the labels are assigned from the address
# lists instead.
poi_names = [
    'Lay',
    'Skilling',
    'Delainey',
    'Forney'
]

# exec people (salary over $200,000), from the fraud dataset we found
exec_names = [
    'Allen', 
    'Beck', 
    'Buy', 
    'Delainey', 
    'Derrick', 
    'Haedicke', 
    'Kaminski', 
    'Kean', 
    'Kitchen', 
    'Lavorato', 
    'Lay', 
    'Martin',
    'Mcconnell', 
    'Shankman', 
    'Shapiro', 
    'Skilling', 
    'Taylor', 
    'Whalley', 
    'White'
]

# exec people (salary over $300,000), from the fraud dataset we found
exec_300_names = [
    'Buy',  
    'Derrick', 
    'Haedicke', 
    'Kean', 
    'Lavorato', 
    'Martin',
    'Mcconnell', 
    'Shankman', 
    'Whalley', 
    'White'
]

POI email addresses

In [12]:
# Known sender addresses for each POI, one list per person.
# Note that Forney's and Delainey's middle initials are M and W, so the
# 'm..forney' and 'w..delainey' addresses most likely belong to them.
lay_addrs = ['kenneth.lay@enron.com', 
    'ken.lay@enron.com', 
    'ken.lay-@enron.com', 
    'ken.lay-.chairman.of.the.board@enron.com'
]

skill_addrs = ['jeff.skilling@enron.com', 
    'skilling@enron.com', 
    'jeffreyskilling@yahoo.com'
]

delain_addrs = ['david.delainey@enron.com', 
    'w..delainey@enron.com', 
    'delainey@enron.com', 
    'dave.delainey@enron.com'
]

forn_addrs = ['john.forney@enron.com', 
    'm..forney@enron.com', 
    'forney@enron.com'
]

# Every POI address in one flat list, used to flag emails sent by a POI.
poi_addrs = lay_addrs + skill_addrs + delain_addrs + forn_addrs

Exec email addresses

In [13]:
# Known sender addresses for each executive, one list per person.
# The comment above each list is the person's full name in upper case
# (presumably as recorded in the fraud dataset - TODO confirm).

# 'ALLEN PHILLIP K'
allen_addrs = [
    'phillip.allen@enron.com', 
    'pallen70@hotmail.com',
    'k..allen@enron.com',
    'allen@enron.com'
]

# 'BECK SALLY W'
beck_addrs = [
    'sally.beck@enron.com',
    'beck@enron.com',
    'sbeck9@msn.com'
]

# 'BUY RICHARD B'
buy_addrs = [
    'rick.buy@enron.com',
    'buy@enron.com'
]

# 'DERRICK JR. JAMES V'
derrick_addrs = [ 
    'james.derrick@enron.com'
]

# 'HAEDICKE MARK E'
haedicke_addrs = [
    'mark.haedicke@enron.com',
    'e..haedicke@enron.com',
    'mark.e.haedicke@enron.com'
]

# 'KAMINSKI WINCENTY J'
kaminski_addrs = [
    'vince.kaminski@enron.com',
    'j.kaminski@enron.com',
    'vkaminski@aol.com',
    'vkaminski@palm.net',
    'j..kaminski@enron.com',
    'vince.j.kaminski@enron.com',
    'kaminski@enron.com'
]

# 'KEAN STEVEN J'
kean_addrs = [
    'j..kean@enron.com',
    'steven.kean@enron.com'
]

# 'KITCHEN LOUISE'
kitchen_addrs = [
    'louise.kitchen@enron.com',
    'kitchen@enron.com' 
]

# 'LAVORATO JOHN J'
lavorato_addrs = [
    'john.lavorato@enron.com',
    'lavorato@sympatico.ca',
    'john.j.lavorato@enron.com',
    'lavorato@enron.com'
]

# 'MARTIN AMANDA K'
martin_addrs = [
    'martin@enron.com'
]

# 'MCCONNELL MICHAEL S'
mcconnell_addrs = [
    'mike.mcconnell@enron.com'
]

# 'SHANKMAN JEFFREY A'
shankman_addrs = [
    'a..shankman@enron.com',
    'jeffrey.shankman@enron.com',
    'shankman@enron.com'
]

# 'SHAPIRO RICHARD S'
shapiro_addrs = [
    'richard.shapiro@enron.com',
    'shapiro@haas.berkeley.edu',
    'rickshapiro@hotmail.com',
    'shapiro@enron.com'
]

# 'TAYLOR MITCHELL S'
taylor_addrs = [
    'mitchell.taylor@enron.com'
]

# 'WHALLEY LAWRENCE G'
whalley_addrs = [
    'whalley@enron.com'
]

# 'WHITE JR THOMAS E'
# No sender addresses are listed for White.
white_addrs = []

# Flatten the per-person lists into one address list per salary band.
exec_200_addrs = [
    addr
    for person_addrs in (
        allen_addrs, beck_addrs, buy_addrs, derrick_addrs, haedicke_addrs,
        kaminski_addrs, kean_addrs, kitchen_addrs, lavorato_addrs, martin_addrs,
        mcconnell_addrs, shankman_addrs, shapiro_addrs, taylor_addrs,
        whalley_addrs, white_addrs,
    )
    for addr in person_addrs
]

exec_300_addrs = [
    addr
    for person_addrs in (
        buy_addrs, derrick_addrs, haedicke_addrs, kean_addrs, lavorato_addrs,
        martin_addrs, mcconnell_addrs, shankman_addrs, whalley_addrs, white_addrs,
    )
    for addr in person_addrs
]

# Surname -> list of sender addresses, used to fill the 'Sender' column for POIs.
poi_addrs_dict = {
    "Lay": lay_addrs,
    "Skilling": skill_addrs,
    "Delainey": delain_addrs,
    "Forney": forn_addrs
}

# Surname -> list of sender addresses, used to fill the 'Sender' column for executives.
exec_addrs_dict = {
    'Allen': allen_addrs, 
    'Beck': beck_addrs, 
    'Buy': buy_addrs, 
    'Derrick': derrick_addrs, 
    'Haedicke': haedicke_addrs, 
    'Kaminski': kaminski_addrs, 
    'Kean': kean_addrs, 
    'Kitchen': kitchen_addrs, 
    'Lavorato': lavorato_addrs, 
    'Martin': martin_addrs,
    'Mcconnell': mcconnell_addrs,
    'Shankman': shankman_addrs, 
    'Shapiro': shapiro_addrs, 
    'Taylor': taylor_addrs,
}

# Whalley and White have no entry in exec_addrs_dict:
# all of Whalley's emails seem to be from someone named Liz Taylor
# White has no emails
# NOTE(review): whalley_addrs is still part of exec_200_addrs and exec_300_addrs, so
# emails from that address get the Exec flags but no 'Sender' name - confirm intended.

Add info to the DataFrame

In [14]:
# Assign POI labels
df_emails['POI'] = False
df_emails.loc[df_emails['From'].isin(poi_addrs), 'POI'] = True

# Assign Exec 200 labels
df_emails['Exec 200'] = False
df_emails.loc[df_emails['From'].isin(exec_200_addrs), 'Exec 200'] = True

# Assign Exec 300 labels
df_emails['Exec 300'] = False
df_emails.loc[df_emails['From'].isin(exec_300_addrs), 'Exec 300'] = True

In [15]:
# Start with every sender unknown (pd.NA), then write the person's surname wherever
# the From address is one of that person's known addresses. POIs are handled first,
# executives second.
df_emails['Sender'] = pd.NA

for addrs_by_name in (poi_addrs_dict, exec_addrs_dict):
    for name, addrs in addrs_by_name.items():
        sent_by_person = df_emails['From'].isin(addrs)
        df_emails.loc[sent_by_person, 'Sender'] = name

In [16]:
# df_emails.to_csv("emails_checkpoint1.csv", index_label='Original Index')

In [17]:
# Split the table into three independent copies: POIs, executives ($200,000+),
# and everyone who is neither.
is_poi = df_emails['POI']
is_exec = df_emails['Exec 200']

df_poi = df_emails[is_poi].copy()
df_exec = df_emails[is_exec].copy()
df_norm = df_emails[~(is_poi | is_exec)].copy()

In [18]:
# add sender information for the normal people
def get_sender(e):
    """Derive a sender label from an ``@enron.com`` address.

    The part before "@" is split on "." and the second piece is returned with
    its first letter upper-cased and the rest lower-cased, e.g.
    ``'john.smith@enron.com'`` -> ``'Smith'``. Note that this is the second
    piece, not the last one: ``'carol.st.clair@enron.com'`` -> ``'St'`` and
    ``'k..allen@enron.com'`` -> ``''``.

    Parameters
    ----------
    e : str
        Sender address. Missing or non-text values are accepted.

    Returns
    -------
    str or float
        The derived label, or ``np.nan`` when the value is not text, is not an
        ``@enron.com`` address, or has no "." before the "@".
    """
    # A missing sender (NaN/None) cannot be parsed; report it as unknown
    # instead of raising on the substring test below.
    if not isinstance(e, str) or '@enron.com' not in e:
        return np.nan
    local_part = e.split("@")[0]
    if "." not in local_part:
        return np.nan
    return local_part.split(".")[1].capitalize()

# Replace the 'Sender' column of the non-POI, non-exec frame with the label derived
# from each From address.
df_norm['Sender'] = df_norm['From'].apply(get_sender)

In [19]:
# Inspect the distinct trimmed email bodies sent by POIs (display only; df_poi is not modified).
df_poi['Email Trimmed'].drop_duplicates()

844       I was wondering if you were there.  Hope the w...
4760      I hope this is the last thing.  Here's the lat...
4761       <<2RZ501!.DOC>>  <<2R6LRED.DOC>>  <<2RZ601!.D...
4762      ATTORNEY-CLIENT PRIVILEGED - DO NOT PRODUCE\n\...
4763      Hi again,\n\nI accidentally deleted the emails...
                                ...                        
515291                 What the hell are you talking about?
515358    **********************************************...
515982    FYI, it may be wise to run a few of your macro...
516397    Where are these being transferred from?  I onl...
516516              Here is the most current orig file.\nPL
Name: Email Trimmed, Length: 3380, dtype: object

In [20]:
import codecs

def to_ascii(s):
    """Return ``s`` as a regular string with every non-ASCII character removed."""
    # Characters that cannot be encoded as ASCII are silently dropped.
    ascii_bytes = codecs.encode(s, encoding='ascii', errors='ignore')
    return ascii_bytes.decode()

# Drop emails that mention Rosalee ("Rosie") Fleming, Lay's secretary
def lay_processing(messages):
    """Return the messages that contain neither "Rosie" nor "Rosalee".

    ``messages`` is a pandas Series of email texts; matching is case-sensitive.
    The surviving entries keep their original index labels.
    """
    def mentions_secretary(text):
        return ("Rosie" in text) or ("Rosalee" in text)

    flagged = messages.apply(mentions_secretary)
    return messages[~flagged]

# Drop emails that mention Sherri Sera, Joannie Williamson or "SRS"; Skilling's secretaries
def skilling_processing(messages):
    """Return the messages that contain none of "Sherri", "Joannie" or "SRS".

    ``messages`` is a pandas Series of email texts; matching is case-sensitive.
    The surviving entries keep their original index labels.
    """
    secretary_markers = ("Sherri", "Joannie", "SRS")

    def mentions_secretary(text):
        return any(marker in text for marker in secretary_markers)

    flagged = messages.apply(mentions_secretary)
    return messages[~flagged]

In [21]:
# Strip non-ASCII characters from the trimmed POI emails.
df_poi['Email Trimmed'] = df_poi['Email Trimmed'].apply(to_ascii)

In [22]:
# sample of non-exec, non-poi people
# These labels are compared against the 'Sender' values produced by get_sender().
# NOTE(review): get_sender() ends with str.capitalize(), which lower-cases everything
# after the first letter, so 'ClairFossum' can never equal a derived label and
# selects no emails - confirm which sender(s) this entry was meant to cover.
norm_names = ['Mann',
 'Davis',
 'Dasovich',
 'Shackleton',
 'Germany',
 'Jones',
 'Symes',
 'Bass',
 'Lenhart',
 'Perlingiere',
 'Scott',
 'ClairFossum',
 'Nemec',
 'Rogers'
]

In [23]:
%%time
from nltk.tokenize import word_tokenize, sent_tokenize

# The frame whose senders are sampled in this notebook.
df = df_norm

# Optional per-sender clean-up, applied to that sender's emails before collection.
add_processing = {
    "Lay": lay_processing,
    "Skilling": skilling_processing,
}

# Collect the trimmed emails of each sampled sender, one Series per sender.
# (To process every sender instead, iterate over df['Sender'].unique().)
df_n = []
for name in norm_names:
    sender_emails = df.loc[df['Sender'] == name, 'Email Trimmed']
    extra_step = add_processing.get(name)
    if extra_step is not None:
        # Keep the result in a loop-local name so the notebook-level
        # `good_emails` series is not overwritten as a side effect.
        sender_emails = extra_step(sender_emails)
    df_n.append(sender_emails)
df_processed = pd.concat(df_n)

# drop emails that tokenize to fewer than 5 tokens
df_processed = df_processed[df_processed.apply(lambda x: len(word_tokenize(x)) >= 5)]

CPU times: user 1min 34s, sys: 56.9 ms, total: 1min 34s
Wall time: 1min 34s


In [24]:
# Re-attach the remaining columns to the selected emails, keep the columns needed
# downstream, drop exact duplicate rows, rename the text column to "Email" and
# order the rows by their original index.
kept_columns = ['Email Trimmed', 'Sender', 'POI', 'Exec 200', 'Exec 300', 'Date']

d = (
    df_processed.to_frame()
    .join(df.drop(columns='Email Trimmed'), how='left')
    .loc[:, kept_columns]
    .drop_duplicates()
    .rename(columns={"Email Trimmed": "Email"})
    .sort_index()
)

In [25]:
# want to remove all emails sent before 1999 (only about 100 emails were sent before then)

In [26]:
import datetime

# Keep only the text before the first "-" in each Date value, then parse it.
# NOTE(review): presumably this drops a trailing UTC offset such as "-0700", which
# would make 'Datetime' the sender's local time - confirm against the raw Date format.
d['Date'] = d['Date'].apply(lambda x: x.split("-")[0])
d['Datetime'] = pd.to_datetime(d['Date'])

# Keep emails dated 1 January 1999 or later. The bound is inclusive at the start of
# 1999 so that nothing dated in 1998, including 31 December, is kept.
# Rows without a parsed timestamp are dropped as well.
d = d[(d['Datetime'] >= datetime.datetime(1999, 1, 1)) & (~d['Datetime'].isna())]

# Display the remaining timestamps in chronological order.
d['Datetime'].sort_values()

69948    1999-04-30 06:33:00
60933    1999-04-30 06:33:00
68069    1999-05-03 03:37:00
72893    1999-05-03 03:37:00
66113    1999-05-03 09:07:00
                 ...        
102234   2002-06-24 11:47:25
102088   2002-06-24 15:33:08
102519   2002-06-25 10:48:44
102619   2002-06-25 11:04:28
243864   2002-09-22 09:42:25
Name: Datetime, Length: 82738, dtype: datetime64[ns]

In [27]:
# Write the result to normal_emails.csv in the current working directory; the index
# column is labelled 'Original Index'.
d.to_csv("normal_emails.csv", index_label='Original Index')