## Dataset Splitting

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import os
import re
import sys
import string
import pyprojroot

# Locate the project root with pyprojroot, using a ".git" entry as the marker.
# NOTE(review): `.git` is normally a directory; confirm that `has_file` matches it
# in the installed pyprojroot version (otherwise `has_dir` may be what is intended).
proj_root = pyprojroot.find_root(pyprojroot.has_file(".git"))
# Put <project root>/code on sys.path so modules stored there can be imported.
sys.path.append(os.path.join(proj_root, 'code'))

### 1) Split and prepare datasets

In [None]:
# Load the three email tables, each indexed by its 'Original Index' column.
# The paths are relative, so the files are resolved against the current working
# directory of the kernel.
# Presumably: persons of interest, executives, and all remaining senders —
# verify against whatever produced these CSVs.
df_poi = pd.read_csv("poi_emails.csv", index_col='Original Index')
df_exec = pd.read_csv("exec_emails.csv", index_col='Original Index')
df_norm = pd.read_csv("normal_emails.csv", index_col='Original Index')

In [None]:
# Sanity check: is any row of the "normal" table flagged as POI?
df_norm['POI'].any()

In [2]:
def get_email_samples(df, email_cap=4000, max_sample_size=700, random_state=None):
    """Sample emails sender by sender until a total email budget is exceeded.

    Senders are visited in order of first appearance in ``df['Sender']``. From
    each sender at most ``max_sample_size`` rows are drawn without replacement.
    The cap is checked *after* a sender's rows have been added, so the result
    may overshoot ``email_cap`` by up to one sender's sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Sender'`` column.
    email_cap : int
        Stop visiting further senders once the running total exceeds this value.
    max_sample_size : int
        Upper bound on the number of rows taken from a single sender.
    random_state : int, numpy.random.RandomState or None
        Seed (or generator) for the row sampling. ``None`` (the default) leaves
        the sampling unseeded, as before; pass an int for reproducible samples.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with their original index labels. If no sender was
        visited (e.g. ``df`` is empty) an empty frame with ``df``'s columns.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        # One generator shared by every sender: a single seed makes the whole
        # selection reproducible without reusing the same draw for each sender.
        rng = np.random.RandomState(random_state)

    sampled_frames = []
    total_emails = 0
    # sort=False keeps senders in first-appearance order.
    for _, sender_rows in df.groupby('Sender', sort=False):
        sample_size = min(max_sample_size, len(sender_rows))
        sampled_frames.append(sender_rows.sample(n=sample_size, random_state=rng))
        total_emails += sample_size
        if total_emails > email_cap:
            break

    if not sampled_frames:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        return df.iloc[0:0]
    return pd.concat(sampled_frames)

# Per-sender sample limit, plus the overall email budgets for the executive and
# normal groups.
email_sample_size = 700
num_exec_emails = 3000
num_norm_emails = 9000

df_exec_samples = get_email_samples(
    df_exec,
    email_cap=num_exec_emails,
    max_sample_size=email_sample_size,
)
df_norms_samples = get_email_samples(
    df_norm,
    email_cap=num_norm_emails,
    max_sample_size=email_sample_size,
)

NameError: name 'df_exec' is not defined

In [379]:
# Stack every POI email with the sampled executive and normal emails, then order
# the rows by the index ('Original Index' from the CSVs).
df = pd.concat([df_poi, df_exec_samples, df_norms_samples]).sort_index()

In [380]:
# Class balance of the combined frame. The three groups are disjoint:
# POI, non-POI executives ('Exec 200'), and everyone else ("normal").
num_emails = df.shape[0]
num_poi = df[df['POI']].shape[0]
num_execs = df[(~df['POI']) & df['Exec 200']].shape[0]
num_norms = df[(~df['POI']) & (~df['Exec 200'])].shape[0]

print(f"Number of Emails = {num_emails}")
print(f"Fraction of POI = {num_poi/num_emails}")
print(f"Fraction of Execs = {num_execs/num_emails}")
# This line reports the non-POI, non-executive share, so it is labelled "Normal".
print(f"Fraction of Normal = {num_norms/num_emails}")

Number of Emails = 13240
Fraction of POI = 0.08376132930513595
Fraction of Execs = 0.2289274924471299
Fraction of POI = 0.6873111782477341


In [381]:
# Number of emails per sender, ordered by POI flag and then executive flag.
(
    df[['Sender', 'POI', 'Exec 200']]
    .value_counts()
    .sort_index(level=["POI", "Exec 200"])
)

Sender       POI    Exec 200
Bass         False  False       700
Dasovich     False  False       700
Davis        False  False       700
Germany      False  False       700
Jones        False  False       700
Lenhart      False  False       700
Mann         False  False       700
Nemec        False  False       700
Perlingiere  False  False       700
Rogers       False  False       700
Scott        False  False       700
Shackleton   False  False       700
Symes        False  False       700
Allen        False  True        614
Kitchen      False  True        700
Lavorato     False  True        700
Shankman     False  True        700
Shapiro      False  True        317
Delainey     True   True        688
Forney       True   True        392
Lay          True   True         12
Skilling     True   True         17
Name: count, dtype: int64

In [382]:
# Absolute row counts passed as `test_size` to the two train_test_split calls below.
# With the 13240 emails reported above, these are 15% (validation) and 25% (test)
# of the data; they must be revisited if the sampled total changes.
valid_size = 1986
test_size = 3310

In [383]:
# get Datetime objects
def date_standardizer(s):
    """Zero-pad a single-digit first number in a date string.

    The string is stripped, then its first run of digits is located. If that run
    is a single digit, every occurrence of it surrounded by spaces is replaced by
    its zero-padded form (e.g. ``"Wed, 8 Oct 2000 03:00:00"`` becomes
    ``"Wed, 08 Oct 2000 03:00:00"``).

    Parameters
    ----------
    s : str
        Raw date string.

    Returns
    -------
    str
        The stripped string, zero-padded where applicable. Strings that contain
        no digits are returned stripped but otherwise unchanged.
    """
    s = s.strip()
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    m = re.search(r"\d+", s)
    if m is None:
        # Nothing to pad; avoid calling .group() on None.
        return s
    first_number = m.group()
    if len(first_number) == 1:
        return re.sub(f" {first_number} ", f" 0{first_number} ", s)
    return s
    
# Normalise the raw date strings, then parse them into a datetime column.
df['Date'] = df['Date'].apply(date_standardizer)
df['Datetime'] = pd.to_datetime(df['Date'], format="%a, %d %b %Y %H:%M:%S")

In [384]:
# Display the combined frame, now including the parsed 'Datetime' column.
df

Unnamed: 0_level_0,Email,Sender,POI,Exec 200,Exec 300,Date,Classify Email,Datetime
Original Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,test successful. way to go!!!,Allen,False,True,False,"Wed, 18 Oct 2000 03:00:00",test successful. way to go!!!,2000-10-18 03:00:00
3,"Randy, Can you send me a schedule of the sala...",Allen,False,True,False,"Mon, 23 Oct 2000 06:13:00","Steve, Can you send me a schedule of the salar...",2000-10-23 06:13:00
4,Let's shoot for Tuesday at 11:45.,Allen,False,True,False,"Thu, 31 Aug 2000 05:07:00",Let's shoot for Tuesday at 11:45.,2000-08-31 05:07:00
5,"Greg, How about either next Tuesday or Thursd...",Allen,False,True,False,"Thu, 31 Aug 2000 04:17:00","Greg, How about either next Tuesday or Thursda...",2000-08-31 04:17:00
6,Please cc the following distribution list with...,Allen,False,True,False,"Tue, 22 Aug 2000 07:44:00",Please cc the following distribution list with...,2000-08-22 07:44:00
...,...,...,...,...,...,...,...,...
516118,"Hi Andy, I assume by now the gas to power opti...",Shankman,False,True,True,"Thu, 07 Jun 2001 10:38:37","Hi Steve, I assume by now the gas to power opt...",2001-06-07 10:38:37
516194,"HA HA HA YOU STUPID, ARROGANT FUCK ___________...",Skilling,True,True,True,"Fri, 30 Nov 2001 15:34:35","HA HA HA YOU STUPID, ARROGANT FUCK ___________...",2001-11-30 15:34:35
516229,"Fuck you, you piece of shit. I can't wait to ...",Skilling,True,True,True,"Wed, 21 Nov 2001 12:13:05","Fuck you, you piece of shit. I can't wait to s...",2001-11-21 12:13:05
516296,Well done on getting this far with ICE - uphil...,Kitchen,False,True,False,"Tue, 05 Jun 2001 10:41:38",Well done on getting this far with ICE - uphil...,2001-06-05 10:41:38


In [385]:
# Separate the POI label (target) from all remaining columns (features).
y_full = df['POI']
X_full = df.drop(columns='POI')

In [398]:
# Hold out the test set. Stratify on y_full, the label vector being split, so the
# POI fraction is preserved; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=test_size, stratify=y_full, random_state=42
)

In [399]:
# Carve the validation set out of the remaining training rows, again stratified
# on the label and seeded for reproducibility.
# Note: this rebinds X_train / y_train, so re-running this cell on its own splits
# the already-reduced training set a second time.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=valid_size, stratify=y_train, random_state=42
)

In [400]:
# Report the number of rows in each of the three splits.
print(f"Training: {X_train.shape[0]}, Validation: {X_valid.shape[0]}, Test: {X_test.shape[0]}")

Training: 7944, Validation: 1986, Test: 3310


In [401]:
# Emails per sender in the test split, ordered by executive flag.
(
    X_test[['Sender', 'Exec 200']]
    .value_counts()
    .sort_index(level=["Exec 200"])
)

Sender       Exec 200
Bass         False       176
Dasovich     False       170
Davis        False       162
Germany      False       167
Jones        False       175
Lenhart      False       156
Mann         False       170
Nemec        False       167
Perlingiere  False       174
Rogers       False       192
Scott        False       189
Shackleton   False       176
Symes        False       190
Allen        True        150
Delainey     True        172
Forney       True         92
Kitchen      True        180
Lavorato     True        186
Lay          True          4
Shankman     True        177
Shapiro      True         81
Skilling     True          4
Name: count, dtype: int64

In [402]:
# Emails per sender in the validation split, ordered by executive flag.
(
    X_valid[['Sender', 'Exec 200']]
    .value_counts()
    .sort_index(level=["Exec 200"])
)

Sender       Exec 200
Bass         False       100
Dasovich     False       114
Davis        False        95
Germany      False       112
Jones        False       102
Lenhart      False       113
Mann         False        90
Nemec        False       124
Perlingiere  False       106
Rogers       False       106
Scott        False        99
Shackleton   False       105
Symes        False       103
Allen        True         97
Delainey     True         96
Forney       True         67
Kitchen      True         91
Lavorato     True        128
Lay          True          1
Shankman     True        101
Shapiro      True         33
Skilling     True          3
Name: count, dtype: int64

In [403]:
# Emails per sender in the training split, ordered by executive flag.
(
    X_train[['Sender', 'Exec 200']]
    .value_counts()
    .sort_index(level=["Exec 200"])
)

Sender       Exec 200
Bass         False       424
Dasovich     False       416
Davis        False       443
Germany      False       421
Jones        False       423
Lenhart      False       431
Mann         False       440
Nemec        False       409
Perlingiere  False       420
Rogers       False       402
Scott        False       412
Shackleton   False       419
Symes        False       407
Allen        True        367
Delainey     True        420
Forney       True        233
Kitchen      True        429
Lavorato     True        386
Lay          True          7
Shankman     True        422
Shapiro      True        203
Skilling     True         10
Name: count, dtype: int64

In [404]:
# Put the POI label back next to the features of each split (aligned on index)
# so every exported table is self-contained.
X_train = X_train.assign(POI=y_train)
X_valid = X_valid.assign(POI=y_valid)
X_test = X_test.assign(POI=y_test)

In [405]:
# Write each split to CSV without the derived 'Datetime' column
# (presumably because it can be rebuilt from 'Date' — confirm with consumers).
for _split_frame, _csv_name in (
    (X_train, "train_set.csv"),
    (X_valid, "valid_set.csv"),
    (X_test, "test_set.csv"),
):
    # errors="ignore" mirrors the original column filter, which did not require
    # 'Datetime' to be present.
    _split_frame.drop(columns='Datetime', errors='ignore').to_csv(_csv_name)