# Enron Fraud POI Identifier
Udacity Intro to Machine Learning - Data Analyst Nanodegree Project 5

Author: Max Edwards

Date: January 12, 2015

## Enron Dataset Exploration

In [134]:
import pickle
import pprint
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load dataset files that come from a trusted source.
# The `with` block guarantees the file handle is closed once the data is read.
# NOTE(review): the file is still opened in text mode ("r") as before; Python 3
# needs "rb" here — confirm the dataset's line endings before switching.
with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
    enron_data = pickle.load(dataset_file)

The code chunk above loads the pickled Enron dataset into the dictionary `enron_data`.

In [135]:
# Every top-level key of enron_data is one entry in the dataset.
n_people = len(enron_data)
print("The number of people in the dataset {:.0f}".format(n_people))

The number of people in the dataset 146


In [136]:
# Use one record ('METTS MARK') as a sample to count the fields.
sample_record = enron_data['METTS MARK']
print("The number of data fields for each person are {:.0f}".format(len(sample_record)))

The number of data fields for each person are 21


In [137]:
# List the field names found in the sample record.
field_names = enron_data['METTS MARK'].keys()
print("The keys in the data set are: {}".format(field_names))

The keys in the data set are: ['salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'email_address', 'from_poi_to_this_person']


In [138]:
# Tally the records whose 'poi' flag is set.
count = sum(1 for person in enron_data if enron_data[person]["poi"] == 1)
print("The number of pois (positive labels) is {}".format(count))

The number of pois (positive labels) is 18


In [139]:
# Salary: how many records hold a real value vs. the 'NaN' placeholder
salary_known = [enron_data[name]['salary'] != 'NaN' for name in enron_data]
count_non_nans = sum(salary_known)
count_nans = len(salary_known) - count_non_nans
print("The amount of people with a known salary are {}".format(count_non_nans))
print("The amount of people with an unknown salary are {}".format(count_nans))

# Email address: same tally
email_known = [enron_data[name]['email_address'] != 'NaN' for name in enron_data]
count_non_nans = sum(email_known)
count_nans = len(email_known) - count_non_nans
print("")
print("The amount of people with a known email are {}".format(count_non_nans))
print("The amount of people with an unknown email are {}".format(count_nans))

The amount of people with a known salary are 95
The amount of people with an unknown salary are 51

The amount of people with a known email are 111
The amount of people with an unknown email are 35


In [140]:
# Share of people whose total payments value is the 'NaN' placeholder
payments = [record['total_payments'] for record in enron_data.values()]
count_nans = payments.count('NaN')
count_non_nans = len(payments) - count_nans
print("The amount of people with a value for total payments are {}".format(count_non_nans))
print("The amount of people with a **NaN** value for total payments are {}".format(count_nans))
# The value is printed as a fraction of all records (it is not scaled by 100).
print("Percentage {:.2f}".format(float(count_nans) / float(count_non_nans + count_nans)))

The amount of people with a value for total payments are 125
The amount of people with a **NaN** value for total payments are 21
Percentage 0.14


In [141]:
# Share of POIs whose total payments value is the 'NaN' placeholder
poi_payments = [
    record['total_payments']
    for record in enron_data.values()
    if record['poi'] == 1
]
count_nans = poi_payments.count('NaN')
count_non_nans = len(poi_payments) - count_nans
print("The amount of POIs with a value for total payments are {}".format(count_non_nans))
print("The amount of POIs with a **NaN** value for total payments are {}".format(count_nans))
# The value is printed as a fraction of all POIs (it is not scaled by 100).
print("Percentage {:.2f}".format(float(count_nans) / float(count_non_nans + count_nans)))

The amount of POIs with a value for total payments are 18
The amount of POIs with a **NaN** value for total payments are 0
Percentage 0.00


The two code chunks above show that every POI has a `total_payments` value. Therefore, if I tried to collect additional POIs without having a `total_payments` value for them, it would be counterproductive.

In [142]:
# Outlier salary / bonus
# Salaries above this amount are listed for manual review.
SALARY_OUTLIER_THRESHOLD = 1000000
for key in enron_data.keys():
    salary = enron_data[key]['salary']
    # Rule out the 'NaN' placeholder *before* the numeric comparison: comparing
    # a str with an int only "works" through Python 2's arbitrary cross-type
    # ordering and raises TypeError on Python 3.
    if salary != 'NaN' and salary > SALARY_OUTLIER_THRESHOLD:
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("name: " + key)
        print("salary: {}".format(salary))
        print("bonus:  {}".format(enron_data[key]['bonus']))

name: LAY KENNETH L
salary: 1072321
bonus:  7000000
name: SKILLING JEFFREY K
salary: 1111258
bonus:  5600000
name: TOTAL
salary: 26704229
bonus:  97343619
name: FREVERT MARK A
salary: 1060932
bonus:  2000000


Prior to using the data to build predictive models, it's prudent to check for outliers. The above output identifies several individuals with very large salaries. It turns out there is a key for "TOTAL". This will need to be removed.

In [143]:
import operator
# Count the 'NaN' placeholders per person to see who is missing a lot of data
# (this is how 'THE TRAVEL AGENCY IN THE PARK', with 18 NaNs, was found).
# Each tally is computed from scratch, so the result does not depend on a
# `count` value left behind by an earlier cell.
count_dict = {
    name: sum(1 for value in enron_data[name].values() if value == 'NaN')
    for name in enron_data.keys()
}
# Sort ascending by NaN count, so the most complete records come first.
sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
pprint.pprint(sorted_dict)

[('HAEDICKE MARK E', 2),
 ('LAY KENNETH L', 2),
 ('ALLEN PHILLIP K', 2),
 ('FREVERT MARK A', 2),
 ('PIPER GREGORY F', 3),
 ('DERRICK JR. JAMES V', 3),
 ('HANNON KEVIN P', 4),
 ('MULLER MARK S', 4),
 ('RIEKER PAULA H', 4),
 ('OLSON CINDY K', 4),
 ('BELDEN TIMOTHY N', 4),
 ('RICE KENNETH D', 4),
 ('WASAFF GEORGE', 4),
 ('SHARP VICTORIA T', 4),
 ('BUY RICHARD B', 4),
 ('MCMAHON JEFFREY', 5),
 ('BLACHMAN JEREMY M', 5),
 ('COLWELL WESLEY', 5),
 ('WALLS JR ROBERT H', 5),
 ('SHANKMAN JEFFREY A', 5),
 ('BERGSIEKER RICHARD P', 5),
 ('BIBI PHILIPPE A', 5),
 ('LAVORATO JOHN J', 5),
 ('BANNANTINE JAMES M', 5),
 ('SHELBY REX', 5),
 ('DEFFNER JOSEPH M', 5),
 ('WHALLEY LAWRENCE G', 5),
 ('MCCONNELL MICHAEL S', 5),
 ('DELAINEY DAVID W', 5),
 ('FALLON JAMES B', 5),
 ('BOWEN JR RAYMOND M', 5),
 ('FITZGERALD JAY L', 5),
 ('DURAN WILLIAM D', 5),
 ('THORN TERENCE H', 5),
 ('CALGER CHRISTOPHER F', 5),
 ('KAMINSKI WINCENTY J', 5),
 ('COX DAVID', 5),
 ('SKILLING JEFFREY K', 5),
 ('KEAN STEVEN J', 5),
 ('MCCLE

`LOCKHART EUGENE E` has all 'NaN' values. Also, `THE TRAVEL AGENCY IN THE PARK` is not a person. Therefore I will remove these from the dataset prior to modeling.

## Data Cleaning/Preparation

In [144]:
### Clean dataset. Remove outliers Function and other cleaning
def cleanEnron(d, keys_to_remove=('TOTAL',
                                  'THE TRAVEL AGENCY IN THE PARK',
                                  'LOCKHART EUGENE E')):
    """
    Remove outliers and other unwanted entries from the Enron dataset.

    Args:
        d: dataset dictionary keyed by name. It is modified in place.
        keys_to_remove: names of the entries to drop. Defaults to the three
            entries singled out during data exploration.

    Returns:
        The same dictionary object `d`, without the removed entries.
    """
    ## Delete unwanted keys
    for key in keys_to_remove:
        # pop() with a default tolerates a missing key, so re-running the
        # cleaning on an already cleaned dataset does not raise KeyError.
        d.pop(key, None)

    return d
# cleanEnron edits the dict it is given and returns it, so `data` and
# `enron_data` refer to the same (now cleaned) dictionary.
data = cleanEnron(enron_data)

The keys removed above were identified during data exploration as entries that should not be in the dataset.

In [145]:
# Create additional features
# NOTE(review): computeFraction is said to come from poi_clean.py, but no
# import of it is visible in this part of the notebook — confirm it is in scope.
for record in data.values():
    # Store the result of computeFraction(messages sent to POIs, messages sent)
    # on the record itself.
    record['from_to_poi_ratio'] = computeFraction(
        record['from_this_person_to_poi'],
        record['from_messages'],
    )

In [146]:
# Confirm from_to_poi_ratio was added - METTS MARK is shown here, but any
# record could have been chosen to view all values
data['METTS MARK']

{'to_messages': 807, 'deferral_payments': 'NaN', 'expenses': 94299, 'poi': False, 'deferred_income': 'NaN', 'email_address': 'mark.metts@enron.com', 'long_term_incentive': 'NaN', 'from_to_poi_ratio': 0.034482758620689655, 'restricted_stock_deferred': 'NaN', 'shared_receipt_with_poi': 702, 'loan_advances': 'NaN', 'from_messages': 29, 'other': 1740, 'director_fees': 'NaN', 'bonus': 600000, 'total_stock_value': 585062, 'from_poi_to_this_person': 38, 'from_this_person_to_poi': 1, 'restricted_stock': 585062, 'salary': 365788, 'total_payments': 1061827, 'exercised_stock_options': 'NaN'}

In [147]:
# NOTE(review): `pd` here is the DataFrame class, not the pandas module that
# the `pd` alias conventionally names, so a later `pd.read_csv(...)`-style call
# would fail. Consider `import pandas as pd` with `pd.DataFrame.from_dict(...)`.
from pandas import DataFrame as pd

# orient='index' turns each top-level key of `data` into a row; the field
# names of the inner dicts become the columns.
df = pd.from_dict(data, orient='index')

In [148]:
# Preview the first rows of the new frame.
df.head()

Unnamed: 0,to_messages,deferral_payments,expenses,poi,deferred_income,email_address,long_term_incentive,from_to_poi_ratio,restricted_stock_deferred,shared_receipt_with_poi,...,other,director_fees,bonus,total_stock_value,from_poi_to_this_person,from_this_person_to_poi,restricted_stock,salary,total_payments,exercised_stock_options
ALLEN PHILLIP K,2902.0,2869717.0,13868,False,-3081055.0,phillip.allen@enron.com,304805.0,0.029613,-126027.0,1407.0,...,152.0,,4175000.0,1729541,47.0,65.0,126027.0,201955.0,4484442,1729541.0
BADUM JAMES P,,178980.0,3486,False,,,,0.0,,,...,,,,257817,,,,,182466,257817.0
BANNANTINE JAMES M,566.0,,56301,False,-5104.0,james.bannantine@enron.com,,0.0,-560222.0,465.0,...,864523.0,,,5243487,39.0,0.0,1757552.0,477.0,916197,4046157.0
BAXTER JOHN C,,1295738.0,11200,False,-1386055.0,,1586055.0,0.0,,,...,2660303.0,,1200000.0,10623258,,,3942714.0,267102.0,5634343,6680544.0
BAY FRANKLIN R,,260455.0,129142,False,-201641.0,frank.bay@enron.com,,0.0,-82782.0,,...,69.0,,400000.0,63014,,,145796.0,239671.0,827696,


In [149]:
# Dimensions as (rows, columns)
df.shape

(143, 22)

In [150]:
# Take the 'poi' column out of the frame (pop removes it from df and returns it).
poi = df.pop('poi')

In [151]:
# Put 'poi' back as column 0. Presumably targetFeatureSplit expects the label
# in the first position — TODO confirm against tools/feature_format.py.
df.insert(0, 'poi', poi)

In [152]:
# Check that poi is now the first column after the name (the index)
df.head()

Unnamed: 0,poi,to_messages,deferral_payments,expenses,deferred_income,email_address,long_term_incentive,from_to_poi_ratio,restricted_stock_deferred,shared_receipt_with_poi,...,other,director_fees,bonus,total_stock_value,from_poi_to_this_person,from_this_person_to_poi,restricted_stock,salary,total_payments,exercised_stock_options
ALLEN PHILLIP K,False,2902.0,2869717.0,13868,-3081055.0,phillip.allen@enron.com,304805.0,0.029613,-126027.0,1407.0,...,152.0,,4175000.0,1729541,47.0,65.0,126027.0,201955.0,4484442,1729541.0
BADUM JAMES P,False,,178980.0,3486,,,,0.0,,,...,,,,257817,,,,,182466,257817.0
BANNANTINE JAMES M,False,566.0,,56301,-5104.0,james.bannantine@enron.com,,0.0,-560222.0,465.0,...,864523.0,,,5243487,39.0,0.0,1757552.0,477.0,916197,4046157.0
BAXTER JOHN C,False,,1295738.0,11200,-1386055.0,,1586055.0,0.0,,,...,2660303.0,,1200000.0,10623258,,,3942714.0,267102.0,5634343,6680544.0
BAY FRANKLIN R,False,,260455.0,129142,-201641.0,frank.bay@enron.com,,0.0,-82782.0,,...,69.0,,400000.0,63014,,,145796.0,239671.0,827696,


In [153]:
import os
import sys

# Build the path to the helper modules with os.path.join so it resolves on
# every platform; a hard-coded "..\\tools\\" is only a valid path on Windows.
sys.path.append(os.path.join("..", "tools"))
from feature_format import featureFormat, targetFeatureSplit

# Split the raw frame values into labels and features.
# NOTE(review): the recorded `features` output still contains e-mail address
# strings and 'NaN' placeholders, so non-numeric columns are not filtered
# out before this call — confirm that is intended.
labels, features = targetFeatureSplit(df.values)

In [154]:
# Display the labels produced by targetFeatureSplit above.
labels

[False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, True, False, True, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, True, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, False, False, True, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False]

In [155]:
# Display the features produced by targetFeatureSplit above.
features

[array([2902, 2869717, 13868, -3081055, 'phillip.allen@enron.com', 304805,
       0.029612756264236904, -126027, 1407, 'NaN', 2195, 152, 'NaN',
       4175000, 1729541, 47, 65, 126027, 201955, 4484442, 1729541], dtype=object), array(['NaN', 178980, 3486, 'NaN', 'NaN', 'NaN', 0.0, 'NaN', 'NaN', 'NaN',
       'NaN', 'NaN', 'NaN', 'NaN', 257817, 'NaN', 'NaN', 'NaN', 'NaN',
       182466, 257817], dtype=object), array([566, 'NaN', 56301, -5104, 'james.bannantine@enron.com', 'NaN', 0.0,
       -560222, 465, 'NaN', 29, 864523, 'NaN', 'NaN', 5243487, 39, 0,
       1757552, 477, 916197, 4046157], dtype=object), array(['NaN', 1295738, 11200, -1386055, 'NaN', 1586055, 0.0, 'NaN', 'NaN',
       'NaN', 'NaN', 2660303, 'NaN', 1200000, 10623258, 'NaN', 'NaN',
       3942714, 267102, 5634343, 6680544], dtype=object), array(['NaN', 260455, 129142, -201641, 'frank.bay@enron.com', 'NaN', 0.0,
       -82782, 'NaN', 'NaN', 'NaN', 69, 'NaN', 400000, 63014, 'NaN', 'NaN',
       145796, 239671, 827696, 'NaN'

## Use MinMaxScaler on Features

In [157]:
# scale features
# NOTE(review): the output recorded for this cell is
# "NameError: name 'scaleFeatures' is not defined" — confirm that
# poi_feature_engineering really provides scaleFeatures, then re-run on a
# fresh kernel.
from poi_feature_engineering import scaleFeatures
# Rebinds `features`, so re-running this cell would pass the already-processed
# values through scaleFeatures again.
features = scaleFeatures(features)

NameError: name 'scaleFeatures' is not defined