## Identifying Fraud from Enron Email and Financial Data

In [1]:
import sys
sys.path.append('./tools/')
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier



In [2]:
import pickle
import numpy as np
import pandas as pd

In [3]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer, f1_score



### Understanding the Dataset and Question

In [4]:
# Initial feature list. 'poi' (the label) must stay first; it is followed by
# the payment features, the stock features and finally the email features.
features_list = [
    'poi',
    # payment features
    'salary', 'bonus', 'long_term_incentive',
    'deferred_income', 'deferral_payments', 'loan_advances',
    'other', 'expenses', 'director_fees',
    'total_payments',
    # stock features
    'exercised_stock_options', 'restricted_stock',
    'restricted_stock_deferred', 'total_stock_value',
    # email features
    'to_messages', 'from_messages',
    'from_this_person_to_poi', 'from_poi_to_this_person',
    'shared_receipt_with_poi',
]
print("There are {} features in the list.".format(len(features_list)))

There are 20 features in the list.


#### Data exploration and wrangling

In [5]:
# load the dictionary containing the dataset
# Pickle streams are binary data, so the file is opened in "rb" mode; text
# mode only works by accident on some platforms/interpreters.
# NOTE(review): pickle.load can execute arbitrary code - only load this file
# from a trusted source.
with open("./data/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# transform the dictionary to dataframe (one row per dictionary key);
# the builtin float is the same type the np.float alias pointed to
df = pd.DataFrame.from_dict(data_dict, orient='index', dtype=float)

# reorder dataframe so the columns follow features_list ('poi' first)
df = df[features_list]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 20 columns):
poi                          146 non-null float64
salary                       95 non-null float64
bonus                        82 non-null float64
long_term_incentive          66 non-null float64
deferred_income              49 non-null float64
deferral_payments            39 non-null float64
loan_advances                4 non-null float64
other                        93 non-null float64
expenses                     95 non-null float64
director_fees                17 non-null float64
total_payments               125 non-null float64
exercised_stock_options      102 non-null float64
restricted_stock             110 non-null float64
restricted_stock_deferred    18 non-null float64
total_stock_value            126 non-null float64
to_messages                  86 non-null float64
from_messages                86 non-null float64
from_this_person_to_poi      86 non-null fl

In [6]:
# overall size of the frame, then the split between POI and non-POI rows
n_points, n_features = df.shape
print("No. of data points: '{}' and features: '{}'".format(n_points, n_features))

poi_rows = df[df['poi'] == 1]
non_poi_rows = df[df['poi'] == 0]
print("No. of POIs: {}".format(len(poi_rows)))
print("No. of non POIs: {}".format(len(non_poi_rows)))

No. of data points: '146' and features: '20'
No. of POIs: 18
No. of non POIs: 128


##### missing values

In [7]:
# missing values by feature
# turn any literal 'NaN' strings into real NaN so isnull() can count them
df = df.replace('NaN', np.nan)
missing_per_feature = df.isnull().sum()
print("Top 5 features by missing values:\n" +
      str(missing_per_feature.sort_values(ascending=False).head()))

Top 5 features by missing values:
loan_advances                142
director_fees                129
restricted_stock_deferred    128
deferral_payments            107
deferred_income               97
dtype: int64


In [8]:
# missing values by record
# count the NaN cells across each row (one count per person/record)
nan_value_row = df.isnull().sum(axis='columns')
worst_records = nan_value_row.sort_values(ascending=False).head()
print("Top 5 records by missing values::\n" + str(worst_records))

Top 5 records by missing values::
LOCKHART EUGENE E                19
GRAMM WENDY L                    17
WROBEL BRUCE                     17
WODRASKA JOHN                    17
THE TRAVEL AGENCY IN THE PARK    17
dtype: int64


In [9]:
# inspect the record with the most missing values
# (label-based lookup with .loc; .ix is deprecated and removed in pandas 1.0)
df.loc['LOCKHART EUGENE E']

poi                          0.0
salary                       NaN
bonus                        NaN
long_term_incentive          NaN
deferred_income              NaN
deferral_payments            NaN
loan_advances                NaN
other                        NaN
expenses                     NaN
director_fees                NaN
total_payments               NaN
exercised_stock_options      NaN
restricted_stock             NaN
restricted_stock_deferred    NaN
total_stock_value            NaN
to_messages                  NaN
from_messages                NaN
from_this_person_to_poi      NaN
from_poi_to_this_person      NaN
shared_receipt_with_poi      NaN
Name: LOCKHART EUGENE E, dtype: float64

Conclusion: There is no useful information in data point `LOCKHART EUGENE E`. I'm going to remove it from the data set.

In [10]:
# inspect the record whose name does not look like a person
# (label-based lookup with .loc; .ix is deprecated and removed in pandas 1.0)
df.loc['THE TRAVEL AGENCY IN THE PARK']

poi                               0.0
salary                            NaN
bonus                             NaN
long_term_incentive               NaN
deferred_income                   NaN
deferral_payments                 NaN
loan_advances                     NaN
other                        362096.0
expenses                          NaN
director_fees                     NaN
total_payments               362096.0
exercised_stock_options           NaN
restricted_stock                  NaN
restricted_stock_deferred         NaN
total_stock_value                 NaN
to_messages                       NaN
from_messages                     NaN
from_this_person_to_poi           NaN
from_poi_to_this_person           NaN
shared_receipt_with_poi           NaN
Name: THE TRAVEL AGENCY IN THE PARK, dtype: float64

Conclusion: `THE TRAVEL AGENCY IN THE PARK` is clearly not a person and is not helpful to identify a POI. 

In [11]:
# remove data points: 'LOCKHART EUGENE E' & 'THE TRAVEL AGENCY IN THE PARK'
# axis is passed by keyword: the positional form is deprecated in newer pandas
df = df.drop(['LOCKHART EUGENE E', 'THE TRAVEL AGENCY IN THE PARK'], axis=0)

In [12]:
# fill na for email features
# Missing email counts are replaced by the column median, computed separately
# for the POI rows and for the non-POI rows.
# NOTE(review): splitting on 'poi' means the label is used during imputation,
# which may leak label information into these features - confirm intended.
email_features = [
    'to_messages',
    'from_messages',
    'from_this_person_to_poi',
    'from_poi_to_this_person',
    'shared_receipt_with_poi',
]
imp = Imputer(missing_values='NaN', strategy='median', axis=0)

# POI rows first, then non-POI rows; the imputer is re-fitted for each group
for poi_flag in (1, 0):
    in_group = df.poi == poi_flag
    df.loc[in_group, email_features] = \
        imp.fit_transform(df.loc[in_group, email_features])

In [13]:
# fill na with 0 for financial features
# The first 15 columns are selected by position, so .iloc is used instead of
# the deprecated .ix (removed in pandas 1.0), which silently fell back to
# positional slicing here.
df.iloc[:, :15] = df.iloc[:, :15].fillna(0)

##### data errors

In [14]:
# validate data by comparing the sum of all payment features with 'total_payments'
payment_features = [
    'salary', 'bonus', 'long_term_incentive',
    'deferred_income', 'deferral_payments', 'loan_advances',
    'other', 'expenses', 'director_fees',
]
print("Records with payment_features error:")
# rows whose individual payments do not add up to the reported total
payment_mismatch = df[payment_features].sum(axis=1) != df['total_payments']
df[payment_mismatch]

Records with payment_features error:


Unnamed: 0,poi,salary,bonus,long_term_incentive,deferred_income,deferral_payments,loan_advances,other,expenses,director_fees,total_payments,exercised_stock_options,restricted_stock,restricted_stock_deferred,total_stock_value,to_messages,from_messages,from_this_person_to_poi,from_poi_to_this_person,shared_receipt_with_poi
BELFER ROBERT,0.0,0.0,0.0,0.0,0.0,-102500.0,0.0,0.0,0.0,3285.0,102500.0,3285.0,0.0,44093.0,-44093.0,944.0,41.0,6.0,26.5,594.0
BHATNAGAR SANJAY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,137864.0,0.0,137864.0,15456290.0,2604490.0,-2604490.0,15456290.0,0.0,523.0,29.0,1.0,0.0,463.0


In [15]:
# validate data by comparing the sum of all stock features with 'total_stock_value'
stock_features = [
    'exercised_stock_options',
    'restricted_stock',
    'restricted_stock_deferred',
]
print("Records with stock_features error:")
# rows whose individual stock items do not add up to the reported total
stock_mismatch = df[stock_features].sum(axis=1) != df['total_stock_value']
df[stock_mismatch]

Records with stock_features error:


Unnamed: 0,poi,salary,bonus,long_term_incentive,deferred_income,deferral_payments,loan_advances,other,expenses,director_fees,total_payments,exercised_stock_options,restricted_stock,restricted_stock_deferred,total_stock_value,to_messages,from_messages,from_this_person_to_poi,from_poi_to_this_person,shared_receipt_with_poi
BELFER ROBERT,0.0,0.0,0.0,0.0,0.0,-102500.0,0.0,0.0,0.0,3285.0,102500.0,3285.0,0.0,44093.0,-44093.0,944.0,41.0,6.0,26.5,594.0
BHATNAGAR SANJAY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,137864.0,0.0,137864.0,15456290.0,2604490.0,-2604490.0,15456290.0,0.0,523.0,29.0,1.0,0.0,463.0


It seems that the numbers were entered in the wrong columns. We'll try to fix `BELFER ROBERT` by shifting them 1 column to the left.

In [16]:
# Correct the misplaced values of 'BELFER ROBERT', then re-count the records
# whose components do not add up to their reported totals.
# (.loc replaces the deprecated .ix for these label-based assignments)

# payment_features
df.loc['BELFER ROBERT', 'deferred_income'] = -102500.0
df.loc['BELFER ROBERT', 'deferral_payments'] = 0.0
df.loc['BELFER ROBERT', 'expenses'] = 3285.0
df.loc['BELFER ROBERT', 'director_fees'] = 102500.0
df.loc['BELFER ROBERT', 'total_payments'] = 3285.0
# stock_features
df.loc['BELFER ROBERT', 'exercised_stock_options'] = 0.0
df.loc['BELFER ROBERT', 'restricted_stock'] = 44093.0
df.loc['BELFER ROBERT', 'restricted_stock_deferred'] = -44093.0
df.loc['BELFER ROBERT', 'total_stock_value'] = 0.0

n_payment_errors = len(
    df[df[payment_features].sum(axis=1) != df.total_payments])
# The stock check must compare the stock features with 'total_stock_value';
# it previously repeated the payment check by mistake.
n_stock_errors = len(
    df[df[stock_features].sum(axis=1) != df.total_stock_value])

# label and count are joined by one space, as the two-argument print did
print("{} {}".format("No. of records with payment_features error: ",
                     n_payment_errors))
print("{} {}".format("No. of records with stock_features error: ",
                     n_stock_errors))

No. of records with payment_features error:  1
No. of records with stock_features error:  1


It works. Let's fix `BHATNAGAR SANJAY` by shifting numbers 1 column to the right.

In [17]:
# Correct the misplaced values of 'BHATNAGAR SANJAY', then re-count the
# records whose components do not add up to their reported totals.
# (.loc replaces the deprecated .ix for these label-based assignments)

# payment_features
df.loc['BHATNAGAR SANJAY', 'other'] = 0.0
df.loc['BHATNAGAR SANJAY', 'expenses'] = 137864.0
df.loc['BHATNAGAR SANJAY', 'director_fees'] = 0.0
df.loc['BHATNAGAR SANJAY', 'total_payments'] = 137864.0
# stock_features
df.loc['BHATNAGAR SANJAY', 'exercised_stock_options'] = 15456290.0
df.loc['BHATNAGAR SANJAY', 'restricted_stock'] = 2604490.0
df.loc['BHATNAGAR SANJAY', 'restricted_stock_deferred'] = -2604490.0
df.loc['BHATNAGAR SANJAY', 'total_stock_value'] = 15456290.0

n_payment_errors = len(
    df[df[payment_features].sum(axis=1) != df.total_payments])
# The stock check must compare the stock features with 'total_stock_value';
# it previously repeated the payment check by mistake.
n_stock_errors = len(
    df[df[stock_features].sum(axis=1) != df.total_stock_value])

# label and count are joined by one space, as the two-argument print did
print("{} {}".format("No. of records with payment_features error: ",
                     n_payment_errors))
print("{} {}".format("No. of records with stock_features error: ",
                     n_stock_errors))

No. of records with payment_features error:  0
No. of records with stock_features error:  0


##### outliers

In [18]:
# sort data points by the number of features whose value exceeds the threshold
# NOTE(review): the threshold is median + 1.5 * IQR; the usual Tukey upper
# fence is Q3 + 1.5 * IQR - confirm which one is intended.
outliers = df.quantile(.5) + 1.5 * (df.quantile(.75) - df.quantile(.25))

# Exclude the 'poi' label column from the comparison. The previous df[1:]
# sliced ROWS (dropping the first record) rather than the first column.
feature_values = df.drop('poi', axis=1)
feature_thresholds = outliers.drop('poi')

# compare column-wise on identically labelled columns, then count per record
outlier_counts = feature_values.gt(feature_thresholds, axis='columns').sum(axis=1)
outlier_counts.to_frame('No. of outliers').\
    sort_values('No. of outliers', ascending=False).head()

Unnamed: 0,No. of outliers
LAY KENNETH L,13
FREVERT MARK A,13
SKILLING JEFFREY K,12
WHALLEY LAWRENCE G,12
TOTAL,12


The results reflect multiple records with values outside of the quantile boundaries. Based on the facts of the case, this made sense for some of the finance data points and key individuals within Enron. Therefore, the records for actual people will not be removed. However, the TOTAL record is a spreadsheet calculation quirk from the finance data. So this outlier will be removed from the data set.

In [19]:
# remove outliers: 'TOTAL'
# drop the row labelled 'TOTAL' from the index
df = df.drop('TOTAL', axis=0)

### Optimize Feature Selection/Engineering

#### create new features

1. financial_total
2. fraction_from_poi
3. fraction_to_poi

In [20]:
# create 'financial_total' feature
# element-wise sum of the two reported totals for each record
df['financial_total'] = df['total_payments'].add(df['total_stock_value'])
df['financial_total'].sort_values(ascending=False).head()

LAY KENNETH L         152669871.0
SKILLING JEFFREY K     34776388.0
FREVERT MARK A         31874715.0
HIRKO JOSEPH           30857157.0
PAI LOU L              26941313.0
Name: financial_total, dtype: float64

In [21]:
# create 'fraction_from_poi' feature:
# share of the messages received by a person that came from a POI
df['fraction_from_poi'] = df['from_poi_to_this_person'] / df['to_messages']
# A zero 'to_messages' gives +/-inf (x/0) or NaN (0/0). Replace the float
# infinities - the previous replace('inf', 0) looked for the *string* 'inf'
# and never matched - and set undefined ratios to 0.
df['fraction_from_poi'] = df['fraction_from_poi'].\
    replace([np.inf, -np.inf], np.nan).fillna(0)
df.fraction_from_poi.sort_values(ascending=False).head()

DONAHUE JR JEFFREY M    0.217341
DEFFNER JOSEPH M        0.161064
COLWELL WESLEY          0.136519
REDMOND BRIAN L         0.122083
DIETRICH JANET R        0.118585
Name: fraction_from_poi, dtype: float64

In [22]:
# create 'fraction_to_poi' feature:
# share of the messages sent by a person that went to a POI
df['fraction_to_poi'] = df['from_this_person_to_poi'] / df['from_messages']
# A zero 'from_messages' gives +/-inf (x/0) or NaN (0/0). Replace the float
# infinities - the previous replace('inf', 0) looked for the *string* 'inf'
# and never matched - and set undefined ratios to 0.
df['fraction_to_poi'] = df['fraction_to_poi'].\
    replace([np.inf, -np.inf], np.nan).fillna(0)
df.fraction_to_poi.sort_values(ascending=False).head()

HUMPHREY GENE E       1.000000
SHERRICK JEFFREY B    0.720000
HANNON KEVIN P        0.656250
GARLAND C KEVIN       0.613636
RIEKER PAULA H        0.585366
Name: fraction_to_poi, dtype: float64

In [23]:
# store dataframe with new features_list
new_features = ['financial_total', 'fraction_from_poi', 'fraction_to_poi']
# Only append names that are not already listed, so re-running this cell
# does not add duplicate entries (and duplicate columns in df).
features_list = features_list + [name for name in new_features
                                 if name not in features_list]
df = df[features_list]

#### intelligently select features 

In [24]:
# get new feature list according to feature importances with decision tree
# The first column ('poi') is the target; all remaining columns are inputs.
# .iloc replaces the deprecated .ix, and the target is passed as a 1-D Series
# instead of a one-column frame.
# NOTE(review): the tree is fitted on the whole dataset, so this selection
# sees the records later used for evaluation - confirm this is acceptable.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(df.iloc[:, 1:], df.iloc[:, 0])

# keep only features the tree actually used, as [name, importance] pairs
features_importance = [
    [name, importance]
    for name, importance in zip(df.columns[1:], dt.feature_importances_)
    if importance > 0
]
features_importance.sort(key=lambda pair: pair[1], reverse=True)
for f_i in features_importance:
    print(f_i)

# 'poi' must stay first, followed by the features in decreasing importance
features_list = ['poi'] + [pair[0] for pair in features_importance]

['fraction_to_poi', 0.34628571428571447]
['shared_receipt_with_poi', 0.22013095238095232]
['expenses', 0.16820105820105813]
['other', 0.15642989417989422]
['fraction_from_poi', 0.06128571428571427]
['deferral_payments', 0.047666666666666663]


In [25]:
# display the current contents of features_list
features_list

['poi',
 'fraction_to_poi',
 'shared_receipt_with_poi',
 'expenses',
 'other',
 'fraction_from_poi',
 'deferral_payments']

In [26]:
# extract features and labels from dataset for local testing
# convert the frame back to a dict keyed by row label, then hand it to the
# project helpers imported from ./tools/
my_dataset = df.to_dict(orient='index')
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Pick and Tune an Algorithm

#### try a variety of classifiers

* Naive Bayes
* Decision Tree
* Ada Boost
* Random Forest
* K-means

In [27]:
# try Naive Bayes
# pipeline: standardise -> PCA (default settings) -> Gaussian Naive Bayes
nb = GaussianNB()
nb_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', nb),
]
pipe_nb = Pipeline(steps=nb_steps)

# evaluate with the project's tester helper
test_classifier(pipe_nb, my_dataset, features_list)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', GaussianNB(priors=None))])
	Accuracy: 0.79267	Precision: 0.12551	Recall: 0.09300	F1: 0.10684	F2: 0.09808
	Total predictions: 15000	True positives:  186	False positives: 1296	False negatives: 1814	True negatives: 11704



In [28]:
# try Decision Tree
# pipeline: standardise -> PCA (default settings) -> entropy-based tree
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', dt),
]
pipe_dt = Pipeline(steps=dt_steps)

# evaluate with the project's tester helper
test_classifier(pipe_dt, my_dataset, features_list)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
      ...        min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))])
	Accuracy: 0.83360	Precision: 0.38302	Recall: 0.40600	F1: 0.39417	F2: 0.40119
	Total predictions: 15000	True positives:  812	False positives: 1308	False negatives: 1188	True negatives: 11692



In [29]:
# try Ada Boost
# boosted ensemble built on an entropy-based decision tree
ada_base_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
ada = AdaBoostClassifier(base_estimator=ada_base_tree, random_state=42)
ada_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', ada),
]
pipe_ada = Pipeline(steps=ada_steps)

# evaluate with the project's tester helper
test_classifier(pipe_ada, my_dataset, features_list)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifi...e=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=42))])
	Accuracy: 0.83493	Precision: 0.38763	Recall: 0.41050	F1: 0.39874	F2: 0.40571
	Total predictions: 15000	True positives:  821	False positives: 1297	False negatives: 1179	True negatives: 11703



In [30]:
# try Random Forest
# pipeline: standardise -> PCA (default settings) -> random forest
rf = RandomForestClassifier(random_state=42)
rf_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', rf),
]
pipe_rf = Pipeline(steps=rf_steps)

# evaluate with the project's tester helper
test_classifier(pipe_rf, my_dataset, features_list)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
         ...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])
	Accuracy: 0.87207	Precision: 0.53990	Recall: 0.27400	F1: 0.36352	F2: 0.30394
	Total predictions: 15000	True positives:  548	False positives:  467	False negatives: 1452	True negatives: 12533



In [31]:
# try K-means
# pipeline: standardise -> PCA (default settings) -> 2-cluster K-means
# NOTE(review): K-means is unsupervised, so its cluster ids 0/1 are presumably
# not tied to the POI label - confirm the reported metrics are meaningful.
km = KMeans(n_clusters=2, random_state=42)
km_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', km),
]
pipe_km = Pipeline(steps=km_steps)

# evaluate with the project's tester helper
test_classifier(pipe_km, my_dataset, features_list)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0))])
	Accuracy: 0.75233	Precision: 0.16699	Recall: 0.21500	F1: 0.18798	F2: 0.20331
	Total predictions: 15000	True positives:  430	False positives: 2145	False negatives: 1570	True negatives: 10855



### Tune the algorithm

In [32]:
# base estimator whose hyper-parameters are tuned below
dt = DecisionTreeClassifier(random_state=42)

# create a grid search: every combination of split criterion,
# min_samples_split (2..6) and min_samples_leaf (1..6)
grid = dict(
    criterion=('gini', 'entropy'),
    min_samples_split=range(2, 7),
    min_samples_leaf=range(1, 7),
)
f1_scorer = make_scorer(f1_score)
search = GridSearchCV(estimator=dt,
                      param_grid=grid,
                      scoring=f1_scorer,
                      cv=3)

# fit the search (3-fold cross-validation, candidates ranked by F1)
search.fit(features, labels)

# assign clf with best estimator
clf = search.best_estimator_

# use included tester function to assess performance using cross validation
test_classifier(clf, my_dataset, features_list)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
	Accuracy: 0.90920	Precision: 0.68002	Recall: 0.60250	F1: 0.63892	F2: 0.61656
	Total predictions: 15000	True positives: 1205	False positives:  567	False negatives:  795	True negatives: 12433



### References

* [ENRON - The Smartest Guys in the Room](https://www.youtube.com/watch?v=dbg5J_AjIto)
* [A look at those involved in the Enron scandal](http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm)