In [74]:
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model
from sklearn import ensemble
from sklearn import model_selection
from sklearn import metrics

In [3]:
data = pd.read_csv("train_data.csv")
data.head()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,RICE KENNETH D,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,...,1617011.0,174839.0,True,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,SKILLING JEFFREY K,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,...,1920000.0,22122.0,True,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,SHELBY REX,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,...,,1573324.0,True,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,KOPPER MICHAEL J,800000.0,,,,michael.kopper@enron.com,,118134.0,,,...,602671.0,907502.0,True,985032.0,,224305.0,,,2652612.0,985032.0
4,CALGER CHRISTOPHER F,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,...,375304.0,486.0,True,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [94]:
data.name.nunique()

113

In [4]:
def split(data):
    feature = data.copy()
    label = feature.pop("poi").astype(int)
    return feature, label

In [9]:
def na_check(data):
    data_na = (data.isnull().sum() / len(data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.style.background_gradient(cmap='Blues'))
na_check(feature)

Unnamed: 0,Missing Ratio
loan_advances,98.230088
restricted_stock_deferred,91.150442
director_fees,88.495575
deferral_payments,75.221239
deferred_income,69.911504
long_term_incentive,56.637168
bonus,46.017699
from_messages,42.477876
from_poi_to_this_person,42.477876
from_this_person_to_poi,42.477876


In [61]:
def process_data(data):
    data = data.query('name!="THE TRAVEL AGENCY IN THE PARK"')
    data = data.query('name!="LOCKHART EUGENE E"')
    drop_feature = ["email_address", "name", "loan_advances", "restricted_stock_deferred", "director_fees", "deferral_payments"]
    data = data.drop(labels=drop_feature, axis=1)
    col = data.columns.tolist()
    imputer = SimpleImputer(missing_values=np.nan, copy=False, strategy="mean")
    imputer = imputer.fit(data)
    data = imputer.transform(data)
    data = pd.DataFrame(data, columns=col)
    print(f"Shape of data: {data.shape}")
    return data

In [62]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_features.csv")
train_data = process_data(train_data)
test_data = process_data(test_data)

Shape of data: (111, 16)
Shape of data: (33, 15)


In [65]:
train_data.loc[:, "kfold"] = -1
train_data = train_data.sample(frac=1).reset_index(drop=True)
skf = model_selection.StratifiedKFold(n_splits=5)

for f, (t_, v_) in enumerate(skf.split(X=train_data, y=train_data.poi)):
    train_data.loc[v_, "kfold"] = f

In [88]:
def run_training(folds=5):
    dfs = []
    for fold in range(folds):
        df_train = train_data[train_data.kfold != fold].reset_index(drop=True)
        df_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
        x_train, y_train = split(df_train)
        x_valid, y_valid = split(df_valid)

        clf = ensemble.RandomForestClassifier(n_estimators=200)
        clf.fit(x_train, y_train)
        pred = clf.predict_proba(x_valid)[:, 1]

        auc = metrics.roc_auc_score(y_valid, pred)
        print(f"Fold {fold}: {auc}")
        df_valid.loc[:, "rf_pred"] = pred
        dfs.append(df_valid)
    final_df = pd.concat(dfs)
    return final_df

In [89]:
final_df = run_training(folds=5)

Fold 0: 0.9
Fold 1: 0.825
Fold 2: 0.875
Fold 3: 0.6929824561403508
Fold 4: 0.7105263157894737


In [87]:
def expand_features(data):
    data.loc[:, "salary_p"] = data.loc[:, "salary"]/data.loc[:, "total_payments"]
    data.loc[:, "deferral_payments_p"] = data.loc[:, "deferral_payments"]/data.loc[:, "total_payments"]
    data.loc[:, "loan_advances_p"] = data.loc[:, "loan_advances"]/data.loc[:, "total_payments"]
    data.loc[:, "bonus_p"] = data.loc[:, "bonus"]/data.loc[:, "total_payments"]
    data.loc[:, "deferred_income_p"] = data.loc[:, "deferred_income"]/data.loc[:, "total_payments"]
    data.loc[:, "expenses_p"] = data.loc[:, "expenses"]/data.loc[:, "total_payments"]
    data.loc[:, "other_p"] = data.loc[:, "other"]/data.loc[:, "total_payments"]
    data.loc[:, "long_term_incentive_p"] = data.loc[:, "long_term_incentive"]/data.loc[:, "total_payments"]
    data.loc[:, "director_fees_p"] = data.loc[:, "director_fees"]/data.loc[:, "total_payments"]
    data.loc[:, "restricted_stock_deferred_p"] = data.loc[:, "restricted_stock_deferred"]/data.loc[:, "total_stock_value"]
    data.loc[:, "exercised_stock_options_p"] = data.loc[:, "exercised_stock_options"]/data.loc[:, "total_stock_value"]
    data.loc[:, "restricted_stock_p"] = data.loc[:, "restricted_stock"]/data.loc[:, "total_stock_value"]
    data.loc[:, "from_poi_to_this_person_p"] = data.loc[:, "from_poi_to_this_person"]/data.loc[:, "to_messages"]
    data.loc[:, "shared_receipt_with_poi_p"] = data.loc[:, "shared_receipt_with_poi"]/data.loc[:, "to_messages"]
    data.loc[:, "from_this_person_to_poi_p"] = data.loc[:, "from_this_person_to_poi"]/data.loc[:, "from_messages"]
    data.loc[:, "long_term_incentive_p"] = data.loc[:, "long_term_incentive"]/data.loc[:, "total_payments"]
    data.loc[:, "restricted_stock_deferred_p"] = data.loc[:, "restricted_stock_deferred"]/data.loc[:, "total_stock_value"]
    data.loc[:, "from_this_person_to_poi_p"] = data.loc[:, "from_this_person_to_poi"]/data.loc[:, "from_messages"]
    data.drop("long_term_incentive", axis=1)
    data.drop("restricted_stock_deferred", axis=1)
    data.drop("from_this_person_to_poi", axis=1)
    return data