In [2]:
import numpy as np
import pandas as pd

import re

Load the emails

In [2]:
df_emails_orig = pd.read_csv("../data/enron_emails.csv")

In [3]:
def split_preambles(messages):
    preambles = []
    for initial_message in messages:
        initial_split = initial_message.split("\n\n")
        preamble = initial_split[0]
        message = " ".join(initial_split[1:])

        preamble_split = preamble.split("\n")

        preamble_dict = dict()
        preamble_dict['Email'] = message.strip()
        for field in preamble_split:
            field_key_value = field.split(": ", maxsplit=1)
            try:
                field_key = field_key_value[0]
                field_value = field_key_value[1]
                preamble_dict[field_key] = field_value
                last_key = field_key
            except(IndexError):
                preamble_dict[last_key] = preamble_dict[last_key] + " " + field_key_value[0].strip()
        preambles.append(preamble_dict)
    return pd.DataFrame(preambles, index=messages.index)

In [4]:
df_emails = split_preambles(df_emails_orig['message'])

In [5]:
df_emails.shape

(517401, 128)

The parse function isn't perfect and creates too many columns (some of which are garbage), so only keep good columns

In [6]:
df_emails = df_emails[
    ['Email',
    'Message-ID',
    'Date',
    'From',
    'To',
    'Subject',
    'Mime-Version',
    'Content-Type',
    'Content-Transfer-Encoding',
    'X-From',
    'X-To',
    'X-cc',
    'X-bcc',
    'X-Folder',
    'X-Origin',
    'X-FileName',
    'Cc',
    'Bcc']
]

Remove replies and forwarded emails

In [7]:
re_emails = df_emails['Subject'].str.lower().str.contains("re:")

fw_emails_1 = df_emails['Subject'].str.lower().str.contains("fw:")
fw_emails_2 = df_emails['Subject'].str.lower().str.contains("fwd:")
fw_emails = (fw_emails_1 | fw_emails_2)

In [8]:
# clip forwarded emails and replied emails
# Remove some other random things, like Yahoo! and Blackberry signatures
def trim_reply_emails(email):
    x = re.search("[- ]*Original Message", email)
    if x is not None:
        email = email[:x.start()].strip()
    x = re.search("[- ]*Forwarded ", email)
    if x is not None:
        email = email[:x.start()].strip()
    x = re.search("From:\t", email)
    if x is not None:
        email = email[:x.start()].strip()
    x = re.search("To:\t", email)
    if x is not None:
        email = email[:x.start()].strip()
    x = re.search("To: ", email)
    if x is not None:
        email = email[:x.start()].strip()
    x = re.search("Do You Yahoo!?", email)
    if x is not None:
        email = email[:x.start()].strip()
    x = re.search("[- ]*Sent from my BlackBerry", email)
    if x is not None:
        email = email[:x.start()].strip() 
    return email

In [9]:
# trim the emails and ignore the forwards
good_emails = df_emails[~fw_emails]['Email'].apply(trim_reply_emails)

# drop any null emails
good_emails = good_emails[~(good_emails == "")]

In [20]:
df_emails = good_emails.rename("Email Trimmed").to_frame().join(df_emails[[col for col in df_emails if col != 'Email']], how='left')

Get the emails SENT by the POIs and Execs

Note that emails in someone's mailbox aren't good enough because they contain emails from other people and not necessarily emails sent by someone

In [12]:
def find_possible_email_addresses(df_emails, name):
    return df_emails[df_emails['From'].str.contains(name.lower())]['From'].unique()

In [13]:
poi_names = [
    'Lay',
    'Skilling',
    'Delainey',
    'Forney'
]

# exec people (salary over $200,000), from the fraud dataset we found
exec_names = [
    'Allen', 
    'Beck', 
    'Buy', 
    'Delainey', 
    'Derrick', 
    'Haedicke', 
    'Kaminski', 
    'Kean', 
    'Kitchen', 
    'Lavorato', 
    'Lay', 
    'Martin',
    'Mcconnell', 
    'Shankman', 
    'Shapiro', 
    'Skilling', 
    'Taylor', 
    'Whalley', 
    'White'
]

# exec people (salary over $300,000), from the fraud dataset we found
exec_300_names = [
    'Buy',  
    'Derrick', 
    'Haedicke', 
    'Kean', 
    'Lavorato', 
    'Martin',
    'Mcconnell', 
    'Shankman', 
    'Whalley', 
    'White'
]

POI email addresses

In [14]:
# note that forney and delainey's middle initials are M and W. I'm pretty sure those emails correspond to them.
lay_addrs = ['kenneth.lay@enron.com', 
    'ken.lay@enron.com', 
    'ken.lay-@enron.com', 
    'ken.lay-.chairman.of.the.board@enron.com'
]

skill_addrs = ['jeff.skilling@enron.com', 
    'skilling@enron.com', 
    'jeffreyskilling@yahoo.com'
]

delain_addrs = ['david.delainey@enron.com', 
    'w..delainey@enron.com', 
    'delainey@enron.com', 
    'dave.delainey@enron.com'
]

forn_addrs = ['john.forney@enron.com', 
    'm..forney@enron.com', 
    'forney@enron.com'
]

poi_addrs = lay_addrs + skill_addrs + delain_addrs + forn_addrs

Exec email addresses

In [15]:
# 'ALLEN PHILLIP K'
allen_addrs = [
    'phillip.allen@enron.com', 
    'pallen70@hotmail.com',
    'k..allen@enron.com',
    'allen@enron.com'
]

# 'BECK SALLY W'
beck_addrs = [
    'sally.beck@enron.com',
    'beck@enron.com',
    'sbeck9@msn.com'
]

# 'BUY RICHARD B'
buy_addrs = [
    'rick.buy@enron.com',
    'buy@enron.com'
]

# 'DERRICK JR. JAMES V'
derrick_addrs = [ 
    'james.derrick@enron.com'
]

# 'HAEDICKE MARK E'
haedicke_addrs = [
    'mark.haedicke@enron.com',
    'e..haedicke@enron.com',
    'mark.e.haedicke@enron.com'
]

# 'KAMINSKI WINCENTY J'
kaminski_addrs = [
    'vince.kaminski@enron.com',
    'j.kaminski@enron.com',
    'vkaminski@aol.com',
    'vkaminski@palm.net',
    'j..kaminski@enron.com',
    'vince.j.kaminski@enron.com',
    'kaminski@enron.com'
]

# 'KEAN STEVEN J'
kean_addrs = [
    'j..kean@enron.com',
    'steven.kean@enron.com'
]

# 'KITCHEN LOUISE'
kitchen_addrs = [
    'louise.kitchen@enron.com',
    'kitchen@enron.com' 
]

# 'LAVORATO JOHN J'
lavorato_addrs = [
    'john.lavorato@enron.com',
    'lavorato@sympatico.ca',
    'john.j.lavorato@enron.com',
    'lavorato@enron.com'
]

# 'MARTIN AMANDA K'
martin_addrs = [
    'martin@enron.com'
]

# 'MCCONNELL MICHAEL S'
mcconnell_addrs = [
    'mike.mcconnell@enron.com'
]

# 'SHANKMAN JEFFREY A'
shankman_addrs = [
    'a..shankman@enron.com',
    'jeffrey.shankman@enron.com',
    'shankman@enron.com'
]

# 'SHAPIRO RICHARD S'
shapiro_addrs = [
    'richard.shapiro@enron.com',
    'shapiro@haas.berkeley.edu',
    'rickshapiro@hotmail.com',
    'shapiro@enron.com'
]

# 'TAYLOR MITCHELL S'
taylor_addrs = [
    'mitchell.taylor@enron.com'
]

# 'WHALLEY LAWRENCE G'
whalley_addrs = [
    'whalley@enron.com'
]

# 'WHITE JR THOMAS E'
white_addrs = []

exec_200_addrs = allen_addrs + beck_addrs + buy_addrs + derrick_addrs + haedicke_addrs + kaminski_addrs + kean_addrs + kitchen_addrs + lavorato_addrs + martin_addrs + mcconnell_addrs + shankman_addrs + shapiro_addrs + taylor_addrs + whalley_addrs + white_addrs

exec_300_addrs = buy_addrs + derrick_addrs + haedicke_addrs + kean_addrs + lavorato_addrs + martin_addrs + mcconnell_addrs + shankman_addrs + whalley_addrs + white_addrs

In [23]:
poi_addrs_dict = {
    "Lay": lay_addrs,
    "Skilling": skill_addrs,
    "Delainey": delain_addrs,
    "Forney": forn_addrs
}

exec_addrs_dict = {
    'Allen': allen_addrs, 
    'Beck': beck_addrs, 
    'Buy': buy_addrs, 
    'Derrick': derrick_addrs, 
    'Haedicke': haedicke_addrs, 
    'Kaminski': kaminski_addrs, 
    'Kean': kean_addrs, 
    'Kitchen': kitchen_addrs, 
    'Lavorato': lavorato_addrs, 
    'Martin': martin_addrs,
    'Mcconnell': mcconnell_addrs,
    'Shankman': shankman_addrs, 
    'Shapiro': shapiro_addrs, 
    'Taylor': taylor_addrs,
}

# all of Whalley's emails seem to be from someone named Liz Taylor
# White has no emails

Add info to the DataFrame

In [41]:
# Assign POI labels
df_emails['POI'] = False
df_emails.loc[df_emails['From'].isin(poi_addrs), 'POI'] = True

# Assign Exec 200 labels
df_emails['Exec 200'] = False
df_emails.loc[df_emails['From'].isin(exec_200_addrs), 'Exec 200'] = True

# Assign Exec 300 labels
df_emails['Exec 300'] = False
df_emails.loc[df_emails['From'].isin(exec_300_addrs), 'Exec 300'] = True

In [42]:
# add the Sender information to more easily identify the emails sent by that person
df_emails.loc[df_emails.index, 'Sender'] = np.nan

for name in poi_addrs_dict:
    df_emails.loc[df_emails['From'].isin(poi_addrs_dict[name]), 'Sender'] = name
    
for name in exec_addrs_dict:
    df_emails.loc[df_emails['From'].isin(exec_addrs_dict[name]), 'Sender'] = name

In [55]:
# df_emails.to_csv("emails_checkpoint1.csv", index_label='Original Index')

In [61]:
# separate POI, Execs, and Normal people
df_poi = df_emails[df_emails['POI']]
df_exec = df_emails[df_emails['Exec 200']]
df_norm = df_emails[(~df_emails['POI']) & (~df_emails['Exec 200'])]

In [74]:
# add sender information for the normal people
def get_sender(e):
    if '@enron.com' in e:
        i = e.split("@")[0]
        if "." in i:
            return i.split(".")[1].capitalize()
    return np.nan

df_norm['Sender'] = df_norm['From'].apply(lambda x: get_sender(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_norm['Sender'] = df_norm['From'].apply(lambda x: get_sender(x))


In [131]:
df_poi['Email Trimmed'].drop_duplicates()

18020     I have purchased from Duke   100 mw's at $51  ...
19066     We have four direct lines to the Ercot ISO tha...
19817     I have purchased from Duke   100 mw's at $51  ...
21617     Guys, attached you will find a final cut on th...
21729     Guys, attached you will find a preliminary age...
                                ...                        
499934    Kevin, given the track record of mis-behaviour...
500742    fyi, on a different note, given legislative ap...
516194    HA HA HA YOU STUPID, ARROGANT FUCK ___________...
516196    cocksucker ___________________________________...
516229    Fuck you, you piece of shit.  I can't wait to ...
Name: Email Trimmed, Length: 1220, dtype: object

In [133]:
import codecs

def to_ascii(s):
    return codecs.encode(s, 'ascii', 'ignore').decode()      # returns a regular string

# remove Rosalee Fleming emails; Lay's secretary 
def lay_processing(messages):
    return messages[~messages.apply(lambda s: ("Rosie" in s) | ("Rosalee" in s))]

# remove Sherri Sera, Joannie Williamson, SRS; Skilling's secretary 
def skilling_processing(messages):
    return messages[~messages.apply(lambda s: ("Sherri" in s) | ("Joannie" in s)| ("SRS" in s))]

In [137]:
# convert to ascii
df_poi['Email Trimmed'] = df_poi['Email Trimmed'].apply(lambda x: to_ascii(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_poi['Email Trimmed'] = df_poi['Email Trimmed'].apply(lambda x: to_ascii(x))


In [247]:
# sample of non-exec, non-poi people
norm_names = ['Mann',
 'Davis',
 'Dasovich',
 'Shackleton',
 'Germany',
 'Jones',
 'Symes',
 'Bass',
 'Lenhart',
 'Perlingiere',
 'Scott',
 'ClairFossum',
 'Nemec',
 'Rogers'
]

In [248]:
from nltk.tokenize import word_tokenize, sent_tokenize

df_n = []
df = df_norm
add_processing = {
    "Lay": lay_processing,
    "Skilling": skilling_processing,
}         

# for name in df['Sender'].unique():
for name in norm_names:
    if name in add_processing:
        good_emails = add_processing[name](df.loc[df['Sender'] == name, 'Email Trimmed'])
        df_n.append(good_emails)
    else:
        df_n.append(df.loc[df['Sender'] == name, 'Email Trimmed'])
df_processed = pd.concat(df_n)
        
# drop emails less than 5 words long
df_processed = df_processed[df_processed.apply(lambda x: len(word_tokenize(x)) >= 5)]

In [249]:
d = df_processed.to_frame().join(df[[col for col in df.columns if col != 'Email Trimmed']], how='left')

In [250]:
d = d[['Email Trimmed', 'Sender', 'POI', 'Exec 200', 'Exec 300', 'Date']].drop_duplicates()

In [251]:
d = d.rename(columns={"Email Trimmed": "Email"})

In [256]:
d = d.sort_index()

In [257]:
d.to_csv("normal_emails.csv", index_label='Original Index')

In [None]:
# want to remove all emails sent before 1999 (only about 100 emails were sent before then)

In [None]:
import datetime
dk['Date'] = dk['Date'].apply(lambda x: x.split("-")[0])
dk['Datetime'] = pd.to_datetime(dk['Date'])

In [176]:
df = df[(df['Datetime'] > datetime.datetime(1998, 12, 31)) & (~df['Datetime'].isna())]

In [178]:
df['Datetime'].sort_values()

Original Index
227687   1999-03-10 10:03:00
151575   1999-05-11 13:10:00
152107   1999-05-14 06:39:00
152484   1999-05-14 07:01:00
152595   1999-05-14 07:11:00
                 ...        
261792   2002-02-06 13:59:15
260771   2002-02-06 14:47:02
261259   2002-02-06 16:52:19
261152   2002-02-06 16:53:04
260876   2002-02-06 17:55:56
Name: Datetime, Length: 11919, dtype: datetime64[ns]