In [1]:
import numpy as np
import pandas as pd

import re

In [2]:
# Load the raw email table and the fraud table from the shared data directory.
df_emails = pd.read_csv("../data/emails.csv")
# The fraud CSV's column 0 comes in as 'Unnamed: 0'; rename it to "person" on load.
df_fraud = pd.read_csv("../data/enron_fraud.csv").rename(columns={'Unnamed: 0': "person"})

In [3]:
# The author name is the text before the first "/" of each `file` entry.
author_list = [path.partition("/")[0] for path in df_emails['file']]
# Distinct author names (sorted by np.unique) and how many emails each one has.
author_names_email, counts_email = np.unique(author_list, return_counts=True)

In [4]:
# Show every distinct author name on a single comma-separated line.
print(*author_names_email, sep=', ')

allen-p, arnold-j, arora-h, badeer-r, bailey-s, bass-e, baughman-d, beck-s, benson-r, blair-l, brawner-s, buy-r, campbell-l, carson-m, cash-m, causholli-m, corman-s, crandell-s, cuilla-m, dasovich-j, davis-d, dean-c, delainey-d, derrick-j, dickson-s, donoho-l, donohoe-t, dorland-c, ermis-f, farmer-d, fischer-m, forney-j, fossum-d, gang-l, gay-r, geaccone-t, germany-c, gilbertsmith-d, giron-d, griffith-j, grigsby-m, guzman-m, haedicke-m, hain-m, harris-s, hayslett-r, heard-m, hendrickson-s, hernandez-j, hodge-j, holst-k, horton-s, hyatt-k, hyvl-d, jones-t, kaminski-v, kean-s, keavey-p, keiser-k, king-j, kitchen-l, kuykendall-t, lavorato-j, lay-k, lenhart-m, lewis-a, linder-e, lokay-m, lokey-t, love-p, lucci-p, maggi-m, mann-k, martin-t, may-l, mccarty-d, mcconnell-m, mckay-b, mckay-j, mclaughlin-e, merriss-s, meyers-a, mims-thurston-p, motley-m, neal-s, nemec-g, panus-s, parks-j, pereira-s, perlingiere-d, phanis-s, pimenov-v, platter-p, presto-k, quenet-j, quigley-d, rapp-b, reitmeyer-j

In [5]:
# Reduce both name lists to a comparable key, since the two datasets format names differently:
#   fraud dataset -> lowercased first whitespace-separated token of `person`
#   email dataset -> text before the first "-" of the author name
last_names_fraud = df_fraud['person'].map(lambda full_name: full_name.lower().split()[0])
last_names_email = [author.partition("-")[0] for author in author_names_email]

In [6]:
# Display the simplified fraud-dataset names (lowercased first token of each `person` value).
last_names_fraud

0           allen
1           badum
2      bannantine
3          baxter
4             bay
          ...    
141       winokur
142      wodraska
143        wrobel
144        yeager
145          yeap
Name: person, Length: 146, dtype: object

In [7]:
# Get the name overlap between the two datasets.
# Order (and any repeats) follow `last_names_fraud`; only the lookup structure changes.
#
# NOTE(review): the match is on last name only, so different people who share a
# last name are treated as one (the email authors include both mckay-b and
# mckay-j). Confirm matches against first initials before relying on them.

# Build the lookup set once so each membership test is O(1) rather than a list scan.
email_last_name_set = set(last_names_email)
overlapping_people = [name for name in last_names_fraud if name in email_last_name_set]
overlapping_people

['allen',
 'beck',
 'buy',
 'delainey',
 'derrick',
 'haedicke',
 'hayslett',
 'horton',
 'kaminski',
 'kean',
 'kitchen',
 'lavorato',
 'lay',
 'lewis',
 'martin',
 'mccarty',
 'mcconnell',
 'pereira',
 'shankman',
 'shapiro',
 'skilling',
 'taylor',
 'whalley',
 'white']

In [8]:
# Add a short name key per person: the lowercased first token of `person`.
df_fraud["person_short"] = df_fraud["person"].map(
    lambda full_name: full_name.lower().split()[0]
)

In [9]:
# Fraud-dataset rows whose short name also appears among the email authors.
# Despite the variable name, this is not restricted to rows with poi == True.
overlapping_poi = df_fraud.loc[df_fraud["person_short"].isin(overlapping_people)]

In [10]:
# Fraud-dataset rows whose short name does NOT appear among the email authors.
# Despite the variable name, the `poi` column is not consulted here.
not_poi = df_fraud.loc[~df_fraud["person_short"].isin(overlapping_people)]

In [11]:
# Show each overlapping person alongside their person-of-interest flag.
overlapping_poi.loc[:, ["person", "poi"]]

Unnamed: 0,person,poi
0,ALLEN PHILLIP K,False
6,BECK SALLY W,False
19,BUY RICHARD B,False
31,DELAINEY DAVID W,True
32,DERRICK JR. JAMES V,False
58,HAEDICKE MARK E,False
62,HAYSLETT RODERICK J,False
66,HORTON STANLEY C,False
72,KAMINSKI WINCENTY J,False
73,KEAN STEVEN J,False


In [12]:
# Compact per-author table: author name, number of emails, and the short name
# (text before the first "-" of the author name).
df_authors = pd.DataFrame(
    {"authors": author_names_email, "email counts": counts_email}
).assign(
    name_short=lambda frame: frame["authors"].map(lambda author: author.partition("-")[0])
)

# Authors whose short name also appears in the fraud dataset.
df_authors.loc[df_authors["name_short"].isin(overlapping_people)]

Unnamed: 0,authors,email counts,name_short
0,allen-p,3034,allen
7,beck-s,11830,beck
11,buy-r,2429,buy
22,delainey-d,3566,delainey
23,derrick-j,1766,derrick
42,haedicke-m,5246,haedicke
45,hayslett-r,2554,hayslett
51,horton-s,2470,horton
55,kaminski-v,28465,kaminski
56,kean-s,25351,kean


In [13]:
# Approximate length (in whitespace-separated words) of each email.
# This is very hacky: everything after the first blank line ("\n\n") is counted,
# with no further cleanup of the message text.
# TODO: clean up the emails before measuring them.
df_emails['approx_message_length'] = df_emails['message'].map(
    lambda raw_message: len(raw_message.partition("\n\n")[2].split())
)

In [14]:
# Summary statistics of the approximate per-email word counts.
df_emails.loc[:, "approx_message_length"].describe()

count    517401.000000
mean        262.045742
std         822.519148
min           1.000000
25%          45.000000
50%         114.000000
75%         256.000000
max       64024.000000
Name: approx_message_length, dtype: float64

In [15]:
# Number of emails from Delainey, Lay, and Skilling.
df_authors.loc[
    df_authors["name_short"].isin(['lay', 'skilling', 'delainey'])
]

Unnamed: 0,authors,email counts,name_short
22,delainey-d,3566,delainey
63,lay-k,5937,lay
118,skilling-j,4139,skilling


In [16]:
# Authors with strictly more than 3500 and strictly fewer than 6000 emails.
# This range includes Delainey, Lay, and Skilling.
df_authors.loc[
    df_authors['email counts'].gt(3500) & df_authors['email counts'].lt(6000)
]

Unnamed: 0,authors,email counts,name_short
1,arnold-j,4898,arnold
22,delainey-d,3566,delainey
32,fossum-d,4796,fossum
38,giron-d,4220,giron
42,haedicke-m,5246,haedicke
43,hain-m,3820,hain
60,kitchen-l,5546,kitchen
62,lavorato-j,4685,lavorato
63,lay-k,5937,lay
64,lenhart-m,5920,lenhart
