In [1]:
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append("../tools/")

# suppress scientific notations
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



## Task 1: Select what features you'll use

In [3]:
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 
'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 
'long_term_incentive', 'restricted_stock', 'director_fees', 'to_messages',
'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 
'shared_receipt_with_poi']

selected_features = ['poi', 'salary', 'total_payments', 'bonus', 
'total_stock_value', 'exercised_stock_options', 'long_term_incentive', 
'restricted_stock', 'to_messages', 'shared_receipt_with_poi']

In [4]:
# Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [5]:
# create pandas dataframe from dictionary and 
# transpose the axes for easier manipulation 
df = pd.DataFrame.from_dict(data_dict).transpose()

# replace string 'NaN' with nan value
df = df.replace('NaN', np.nan)

In [6]:
# compute the missing values per feature
missing = df.isnull().sum()

# transform to dataframe
missing = pd.DataFrame(missing, columns = ['total'])

# add column for # of missing values for pois & non-pois
missing['poi'] = df[df.poi == True].isnull().sum()
missing['non_poi'] = df[df.poi == False].isnull().sum()

# calculate percentage of missing values
missing['%_missing_poi'] = (missing['poi'] / 18) * 100
missing['%_missing_non_poi'] = (missing['non_poi'] / 130) * 100

# sorting dataframe by # of total missing values 
missing.sort_values(by ='total', ascending = False, inplace = True)

In [7]:
# missing email features
missing.ix[[7,8,9,10,11,17],]

Unnamed: 0,total,poi,non_poi,%_missing_poi,%_missing_non_poi
from_messages,60,4,56,22.22,43.08
from_poi_to_this_person,60,4,56,22.22,43.08
from_this_person_to_poi,60,4,56,22.22,43.08
shared_receipt_with_poi,60,4,56,22.22,43.08
to_messages,60,4,56,22.22,43.08
email_address,35,0,35,0.0,26.92


In [8]:
# missing financial features
missing.ix[[0,1,2,3,4,5,6,12,13,14,15,16,18,19],]

# export table to csv for report
#missing.to_csv('missing_all.csv')

Unnamed: 0,total,poi,non_poi,%_missing_poi,%_missing_non_poi
loan_advances,142,17,125,94.44,96.15
director_fees,129,18,111,100.0,85.38
restricted_stock_deferred,128,18,110,100.0,84.62
deferral_payments,107,13,94,72.22,72.31
deferred_income,97,7,90,38.89,69.23
long_term_incentive,80,6,74,33.33,56.92
bonus,64,2,62,11.11,47.69
other,53,0,53,0.0,40.77
expenses,51,0,51,0.0,39.23
salary,51,1,50,5.56,38.46


In [9]:
# find missing values per row
empty_rows = df.isnull().sum(axis=1)
empty_rows.sort(ascending = False)
print empty_rows.head(3)

# drop LOCKHART for missing all values (except POI labeling)
df = df.drop(['LOCKHART EUGENE E'])

LOCKHART EUGENE E    20
GRAMM WENDY L        18
WROBEL BRUCE         18
dtype: int64


In [10]:
# remove email address column
df = df.drop('email_address', axis = 1)

## Task 2: Remove outliers

In [11]:
# plot email features for POIs and non-POIs (1)
df[['poi', 'to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
    'shared_receipt_with_poi']].boxplot(figsize=(15,10), sym='b.', by = 'poi')
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation = 'vertical')
plt.show()
#plt.savefig('emails.png')

In [12]:
# plot financial features for POIs and non-POIs (2)
df[['poi', 'salary', 'total_payments', 'loan_advances', 'bonus', 'total_stock_value', 
    'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 
    'director_fees']].boxplot(figsize=(15,15), sym='b.', by = 'poi')
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation = 'vertical')
plt.show()
#plt.savefig('financials.png')

In [13]:
# drop TOTAL and THE TRAVEL AGENCY IN THE PARK rows 
df = df.drop(['TOTAL','THE TRAVEL AGENCY IN THE PARK'])

In [14]:
# replace missing values with 0 
df = df.fillna(0)

# calculate the total payments
df['total_p'] = df[['bonus', 'deferral_payments', 'deferred_income', 
					'director_fees', 'expenses', 'loan_advances', 
                    'long_term_incentive', 'other', 'salary']].sum(axis = 1)

# calculate difference between own calculation and column total_payments
df['diff'] = df['total_p'] - df['total_payments']

# show names for who the calculated and supplied total payments are not equal
names = df.index[df['diff'] != 0].tolist()
print names

# remove created columns from df
df = df.drop(['total_p', 'diff'], axis = 1)

# drop two employees (BELFER and BHATNAGAR) with incorrect values
df = df.drop(names)

# create new_df with the correct values from enron61702insiderpay.pdf
values = [[0, 0, 0, 0, 15456290, 137864, 29, 0, 1, 0, 0, 0, False, 2604490, 
		  -2604490, 0, 463, 523, 137864, 15456290], 
          [0, 0, -102500, 102500, 0, 3285, 0, 0, 0, 0, 0, 0, False, 44093, 
          -44093, 0, 0, 0, 3285, 0]]

new_df = pd.DataFrame(values, columns = list(df), 
	index = ['BHATNAGAR SANJAY', 'BELFER ROBERT'])

# append new_df to df 
df = df.append(new_df)

# sort index so the names are sorted alfabetically again
df = df.sort_index(axis = 0)

# find number of employees
print "Number of employees", len(df)

['BELFER ROBERT', 'BHATNAGAR SANJAY']
Number of employees 143


## Task 3: Create new feature(s) 

In [15]:
# create bonus to salary ratio
df['bonus_salary_ratio'] = df['bonus'] / df['salary']

# create bonus to total payments ratio
df['bonus_tp_ratio'] = df['bonus'] / df['total_payments']

# replace nan values with 0 
df = df.fillna(0)

# select features 
def select_features(list_features, new):
	features = list_features
	if new == True:
		features += ['bonus_tp_ratio','bonus_salary_ratio']
	return features 

features = select_features(selected_features, True)

# create dataframe for selected features
df = df.ix[:, features] 

# create new features list
new_features_list = list(df)

# store dictionary to my_dataset
my_dataset = df.to_dict('index')

# extract features and labels from dataset for testing  
data = featureFormat(my_dataset, new_features_list, sort_keys = True) 
labels, features = targetFeatureSplit(data)

## Task 4: Try a variety of classifiers

In [16]:
# import packages
import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split

# create scaler
scaler = StandardScaler()

# create feature selection method
select = SelectKBest()

# create different classifiers
dt = DecisionTreeClassifier()
gnb = GaussianNB()
knn = KNeighborsClassifier()

# create pipeline for gnb & dt
pipeline = Pipeline([('feature_selection', select), ('classifier', dt)])

# create pipeline for knn
#pipeline = Pipeline([('scaler', scaler), ('feature_selection', select), ('classifier', kn)])

# create parameters to explore in grid search
parameters = dict(
	feature_selection__k = range(2,10),
    # dt
    classifier__splitter = ['best', 'random'],
    classifier__criterion = ['entropy', 'gini'], 
    classifier__min_samples_split = [4,6],
    classifier__class_weight = ['balanced', None],
    classifier__min_samples_leaf = [12,15],
    # knn
    #classifier__n_neighbors = [2,3,4,5],
    #classifier__weights = ['uniform', 'distance'],
    #classifier__algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #classifier__p = [1,2,3]
    classifier__random_state = [42]
    )

## Task 5: Tune your classifier to achieve better than .3 p & r scores

In [17]:
# import package
from sklearn.metrics import classification_report

# create training and test data 
features_train, features_test, labels_train, labels_test = \
train_test_split(features, labels, test_size = 0.3, random_state = 42)

# create stratified shuffle split 
sss = StratifiedShuffleSplit(n_splits=100, test_size = 0.3, random_state = 42)

# create grid search
gs = GridSearchCV(pipeline, param_grid = parameters, cv = sss, scoring = 'f1', n_jobs = 10)

# fit to total dataset due to size and imbalances
gs.fit(features,labels)

# find parameter values
gs.best_params_

# assign best estimator model to clf
clf = gs.best_estimator_
print 'Best model found by grid search:'
print clf
print '\n'

# create classification report for best estimator model
predictions = gs.predict(features_test)
names = ['non-POI', 'POI']
report = classification_report(labels_test, predictions, target_names = names)
print report

Best model found by grid search:
Pipeline(memory=None,
     steps=[('feature_selection', SelectKBest(k=9, score_func=<function f_classif at 0x11e9fb8c0>)), ('classifier', DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=15, min_samples_split=4,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='random'))])


             precision    recall  f1-score   support

    non-POI       0.97      0.76      0.85        38
        POI       0.31      0.80      0.44         5

avg / total       0.89      0.77      0.81        43



In [18]:
# find indices of the features that are selected
list_feat = clf.named_steps['feature_selection'].get_support(indices = True)
    
# get scores for features
scores = clf.named_steps['feature_selection'].scores_

# get importances for features
importances = clf.named_steps['classifier'].feature_importances_

# sort indices with highest importance first
indices = np.argsort(importances)[::-1]

# create list with names & scores of selected features
selected_features_list = ['poi']
scores_list = []
for i in list_feat:
	selected_features_list.append(new_features_list[i + 1])
	scores_list.append(scores[i])

# create overview of selected features and their importance and scores    
print 'Feature Ranking: (importance - score)'
for i in indices:
	print "{} ({} - {})".format(
		selected_features_list[i+1],
		round(importances[i],3), 
		round(scores_list[i],3))

Feature Ranking: (importance - score)
salary (0.388 - 18.29)
exercised_stock_options (0.307 - 22.349)
bonus (0.305 - 20.792)
bonus_salary_ratio (0.0 - 10.784)
bonus_tp_ratio (0.0 - 20.716)
restricted_stock (0.0 - 8.825)
long_term_incentive (0.0 - 9.922)
total_stock_value (0.0 - 22.511)
total_payments (0.0 - 9.284)


## Task 6: Dump your classifier, dataset, and features_list

In [19]:
dump_classifier_and_data(clf, my_dataset, new_features_list)