Final project restart

In [1]:
import sys
import pickle
sys.path.append("../tools/")

import pandas as pd
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint

from sklearn.preprocessing import Imputer
                                                
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.decomposition import PCA

from sklearn.cross_validation import StratifiedShuffleSplit

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from tester import test_classifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support

### Data Import

In [2]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
df = pd.DataFrame(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
df = df.astype(float)
df = df.drop('TOTAL')
df = df.drop("THE TRAVEL AGENCY IN THE PARK")
df = df.drop("loan_advances", axis=1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

In [3]:
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

## Decision Tree Pipeline: median imputed

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)

print """DecisionTreeClassifier\nImputer: median\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: median
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.863636363636

Recall: 0.6

Precision: 0.428571428571


### Discussion
Ok, so this looks pretty good. I am going to go with this. 0.6 and 0.43. Not bad. Now I am going to go with this, tweeking the parameters on this model. Then I will worry about how to get this into the Udacity grader. 

First, I am not going to re-import all the modules. 

Second, I am not going to print out all the pipeline parameters. 

Third, I don't know what the PCA is doing because the pipeline does not assign a set number of dimensions for the pca to limit the data to. I really don't know how that is working. It is kind of strange. Without limiting it to a set number of dimensions I don't know why it would be having any effect at all? In fact, I am going to test that hypothesis by taking out the PCA and seeing if the result changes. 

The first thing that I discovered is that the pipeline object is not callable, or at least that is the error I got. So, I put the import statement back in the code and ran it again and everything was fine. Ok, so, I will keep that in the code evertime. I guess the big import statement at the top of the code was kind of a waste of time. 

Now, I am going to take out the PCA and see what, if any, difference it makes. 

## Decision Tree with no PCA

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)

# print "\nPipeline parameters:"
# pprint(pipeline.get_params())

print "\n"
print """DecisionTreeClassifier\nImputer: median\nNormalize: StandardScaler\nPCA: NONE"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)



DecisionTreeClassifier
Imputer: median
Normalize: StandardScaler
PCA: NONE

Accuracy Score: 0.75

Recall: 0.0

Precision: 0.0


### Discussion
Ok, without the PCA the accuracy goes down to 0.75 and the recall and precision drop to zero. So, the PCA has to stay in, I guess. 

Now I am going to run it with the pca in but with the scalar changed to Robust since the data is very sparse in some places. Also, I am going to stop printing out the parameters of the pipeline since I know what they are. 

## Decision Tree with PCA, median and Robust Scaler

In [6]:
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', RobustScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)


print """DecisionTreeClassifier\nImputer: median\nNormalize: RobustScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: median
Normalize: RobustScaler
PCA: dimensions unspecified

Accuracy Score: 0.840909090909

Recall: 0.2

Precision: 0.25


### Discussion
Massive drop in preformance with the Robust scaler. No more of that. Now I am going to try a different strategy with imputation, imputing the most_frequent value instead of the median and going back to the StadardScaler. 

## Decision Tree Pipeline with most_frequent

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='most_frequent')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)
print """DecisionTreeClassifier\nImputer: most_frequent\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: most_frequent
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.886363636364

Recall: 0.4

Precision: 0.5


### Discussion
Ok, that is good, but it looks like the first model is still better. The first model with mean imputation for the missing values had the recall score up at 0.6 while the precision score was a bit lower at 0.43. Since recall is the more important dimension we will stick with that. 

Now that we have a good score on the recall with the decision trees lets look and see what we can do when we try out different numbers of principle components. To do that I am going to have to introduce a grid search. 

## Grid Search with PCA-2 to 18

In [31]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

x = [x for x in range(2,19)]
param_grid = {'pca__n_components': x}

# pipeline = Pipeline.fit(features_train, labels_train)

gridCV_object = GridSearchCV(estimator = Pipeline, 
                             param_grid = param_grid)

gridCV_object.fit(features_train, labels_train)

print "Best parameters from the grid search:", pprint(gridCV_object.best_params_)

clf_gridCV = gridCV_object.best_estimator_

print "\nBest estimator accuracy:", clf_gridCV.score(features_test, labels_test)

clf_gridCV_pred = clf_gridCV.predict(features_test)

print "\n\nRecall Score:", recall_score(labels_test, clf_gridCV_pred)
print "\n\nPrecision Score:", precision_score(labels_test, clf_gridCV_pred)

Best parameters from the grid search:{'pca__n_components': 9}
 None

Best estimator accuracy: 0.863636363636


Recall Score: 0.4


Precision Score: 0.4


### Discussion: adding grid_search PCA
This is not really improving. It has gone down instead of up. The precision score has actually gone down from 0.5 to 0.4. It really sucks. I don't know why I am doing this. And I am sure that once I stick it into the Udacity grader it will get worse. It

The next thing I will try is putting the imputation in the param_grid. This should not make any difference but I will see.

## Grid Search for PCA and Imputation Strategy

In [32]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer()),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

x = [x for x in range(2,19)]
param_grid = {'imp__strategy':['median', 'most_frequent'],
              'pca__n_components': x}

# pipeline = Pipeline.fit(features_train, labels_train)

gridCV_object = GridSearchCV(estimator = Pipeline, 
                             param_grid = param_grid)

gridCV_object.fit(features_train, labels_train)

print "Best parameters from the grid search:", pprint(gridCV_object.best_params_)

clf_gridCV = gridCV_object.best_estimator_

print "\nBest estimator accuracy:", clf_gridCV.score(features_test, labels_test)

clf_gridCV_pred = clf_gridCV.predict(features_test)

print "\n\nRecall Score:", recall_score(labels_test, clf_gridCV_pred)
print "\n\nPrecision Score:", precision_score(labels_test, clf_gridCV_pred)

Best parameters from the grid search:{'imp__strategy': 'most_frequent', 'pca__n_components': 5}
 None

Best estimator accuracy: 0.818181818182


Recall Score: 0.2


Precision Score: 0.2


### Discussion
Ok, this is driving me insane. I have put in the possibility of using median or most_frequent as an imputation strategy and it chooses most_frequent and actually does worse on all three outcomes. This is totally rediculous. It is getting worse the more choices I give it! 

I have been on the discussion board and I have seen the coaches say that you can't make it do worse by giving it more choices. I mean, how could it? Also, I have set the random_state variable to a fixed number, 53, so that can't be it. What is going on? 

## New Feature-Data Reimport

In [10]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
df = pd.DataFrame(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
df = df.astype(float)
df = df.drop('TOTAL')
df = df.drop("THE TRAVEL AGENCY IN THE PARK")
df = df.drop("loan_advances", axis=1)
#new feature
df['deferred_ratio'] = df['deferred_income']/(df['total_payments'] + 1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

In [11]:
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

## Decision Tree Pipeline-median imputed

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)

print """DecisionTreeClassifier\nImputer: median\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: median
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.795454545455

Recall: 0.4

Precision: 0.25


### Discussion
So one thing I have found out is that the new feature I created of deferred compensation actually makes things worse. So, I am going to get rid of it. The question is do I want to get rid of it by reimporting the data set or taking the column of data out. 

I am just going to take it out and recalculate the training and test sets. 

In [13]:
#df = df.drop("deferred_ratio", axis=1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

In [14]:
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)

print """DecisionTreeClassifier\nImputer: median\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: median
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.795454545455

Recall: 0.4

Precision: 0.25


In [16]:
features_train

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value,deferred_ratio
COX DAVID,800000,,-41250,,117551,27861,33,0,4,,494,378082,,314288,71,102,1101393,495633,-0.037453
ECHOLS JOHN B,200000,,,,601438,21530,,,,2234774,53775,407503,,182245,,,2692324,1008941,
MARTIN AMANDA K,,85430,,,2070306,8211,230,8,0,5145434,2818454,,,349487,477,1522,8407016,2070306,
CHRISTODOULOU DIOMEDES,,,,,5127155,,,,,,,950730,,,,,,6077885,
CLINE KENNETH W,,,,,,,,,,,,662086,-472568,,,,,189518,
CHAN RONNIE,,,-98784,98784,,,,,,,,32460,-32460,,,,,,
SCRIMSHAW MATTHEW,,,,,759557,,,,,,,,,,,,,759557,
HUGHES JAMES A,,,,,754966,,34,35,5,,,363428,,,589,719,,1118394,
FITZGERALD JAY L,350000,,,,664461,23870,16,1,8,556416,285414,956775,,199157,723,936,1414857,1621236,
BOWEN JR RAYMOND M,1350000,,-833,,,65907,27,140,15,974293,1621,252055,,278601,1593,1858,2669589,252055,-0.000312


### Discussion
So now it is getting way better. I don't understand how it has gotten better. Recall is at 0.6, which is where it was before at its but, but both accuracy and precision are better than ever. Accuracy is at 0.93 and precision is at 0.75. So I will keep it but I don't see how it has happened. 

Now we can see if the improvement is transmitted to the other models. 

UPDATE: So it turns out that the 'poi' was in the features_train. It is not surprising that you can make pretty good predictions if you have the dependent variable in the data set. Now I ran the code above again and got rid of the dependent variable in the features set and got the more believable numbers from the model, with recall and precision down at .6 and .43 respectively. 

## Decision Tree, most_frequent imputed

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='most_frequent')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)
print """DecisionTreeClassifier\nImputer: most_frequent\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: most_frequent
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.840909090909

Recall: 0.4

Precision: 0.333333333333


### Discusion 
The Accuracy is still as high as before, but recall has dropped while precision is up to 1.0. That means that there are no false what's, no mis-classified positives, no false positives. So that is something. 

The increase in precision is not worth the drop in recall so it is not worth it. So we will go back to the median imputation. 

Now the next step is to confirm this by putting the grid_search back in and testing the two methods of imputation together. 

UPDATE: since getting rid of the dependent variable in the data set also improves the preformance of this model. 

This is a really interesting result. It is not surprising that the inclusion of the dependent variable makes the results unrealistically good. It is surprising that the inclusion of the dependent variable makes the model preform worse. I wonder if that is some property of the decision tree model? 

## Grid Search: Imputation

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer()),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

x = [x for x in range(2,19)]
param_grid = {'imp__strategy':['median', 'most_frequent']}

# pipeline = Pipeline.fit(features_train, labels_train)

gridCV_object = GridSearchCV(estimator = Pipeline, 
                             param_grid = param_grid)

gridCV_object.fit(features_train, labels_train)

print "Best parameters from the grid search:", pprint(gridCV_object.best_params_)

clf_gridCV = gridCV_object.best_estimator_

print "\nBest estimator accuracy:", clf_gridCV.score(features_test, labels_test)

clf_gridCV_pred = clf_gridCV.predict(features_test)

print "\n\nRecall Score:", recall_score(labels_test, clf_gridCV_pred)
print "\n\nPrecision Score:", precision_score(labels_test, clf_gridCV_pred)

Best parameters from the grid search:{'imp__strategy': 'most_frequent'}
 None

Best estimator accuracy: 0.840909090909


Recall Score: 0.4


Precision Score: 0.333333333333


### Discussion
So I get the same answer that I got when I did it without the grid_search. So at least I am not losing my mind. 

Now I am ready to see if I can get the grid search to work without losing the results I have already gotten. The big problem was that the pca actually preformed worse with the pca grid search than it did without it. 

## Decision Tree with PCA grid_search

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer()),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

x = [x for x in range(2,19)]
param_grid = {'imp__strategy':['median', 'most_frequent'],
              'pca__n_components': x}

gridCV_object = GridSearchCV(estimator = Pipeline, 
                             param_grid = param_grid)

gridCV_object.fit(features_train, labels_train)

print "Best parameters from the grid search:", pprint(gridCV_object.best_params_)

clf_gridCV = gridCV_object.best_estimator_

print "\nBest Estimator Accuracy:", clf_gridCV.score(features_test, labels_test)

clf_gridCV_pred = clf_gridCV.predict(features_test)

print "\n\nRecall Score:", recall_score(labels_test, clf_gridCV_pred)
print "\n\nPrecision Score:", precision_score(labels_test, clf_gridCV_pred)

Best parameters from the grid search:{'imp__strategy': 'most_frequent', 'pca__n_components': 4}
 None

Best Estimator Accuracy: 0.840909090909


Recall Score: 0.2


Precision Score: 0.25


### Discussion
So this is pretty good. I have the thing set up so that the accuracy is over 90% and recall is at 0.6. 

The thing is that this may all be because of a lucky draw from the train-test split. It may come out terrible in the grader. It may turn out to be just lousy when it goes though the 1000 folds of the Udacity grading program. So the next step is to try it in the auto grader and see if the results hold up. The first step is to prep the data. 

UPDATE: Now that it has the right data set it is kind of strange but the model actually preforms less well with the pca included. 

## Submitting to the Udacity Grader

In [20]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
df = pd.DataFrame(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
df = df.astype(float)
df = df.drop('TOTAL')
df = df.drop("THE TRAVEL AGENCY IN THE PARK")
df = df.drop("loan_advances", axis=1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

In [27]:
features_list

['bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value']

So, now I have to put the dependent variable in the first position in the data set before I turn it into a dictionary. 

In [28]:
features_list[0] = 'poi'

In [29]:
features_list

['poi',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value']

This is no good. I have no gotten rid of 'bonus' and replaced it with 'poi'. And I don't think that 'poi' should be in the features list anyway. Then again, what is the deal with the requirement that the dependent variable be the first in the data frame for the grading software to work? Am I imagining that I heard that? 

It seems like data_dict should have the dependent and dependent variables in it, but that the features list should be supplied so that the program can sort them out itself. 

Anyway, I have to go back and find where I was able to get the grader to work. I should go back and see if I can stick this model into that notebook. 

In [21]:
df_1 = df.transpose()
data_dict = df_1.to_dict()

In [33]:
from tester import test_classifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

clf = Pipeline([
        ('imp', Imputer(strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA(n_components=8)),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

test_classifier(clf, data_dict, features_list, folds = 1000)

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('std', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=8, whiten=False)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=53, splitter='best'))])
	Accuracy: 0.78287	Precision: 0.20862	Recall: 0.22500	F1: 0.21650	F2: 0.22152
	Total predictions: 15000	True positives:  450	False positives: 1707	False negatives: 1550	True negatives: 11293



In [25]:
features_list

['bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value']

In [24]:
data_dict

{'ALLEN PHILLIP K': {'bonus': 4175000.0,
  'deferral_payments': 2869717.0,
  'deferred_income': -3081055.0,
  'director_fees': nan,
  'exercised_stock_options': 1729541.0,
  'expenses': 13868.0,
  'from_messages': 2195.0,
  'from_poi_to_this_person': 47.0,
  'from_this_person_to_poi': 65.0,
  'long_term_incentive': 304805.0,
  'other': 152.0,
  'poi': 0.0,
  'restricted_stock': 126027.0,
  'restricted_stock_deferred': -126027.0,
  'salary': 201955.0,
  'shared_receipt_with_poi': 1407.0,
  'to_messages': 2902.0,
  'total_payments': 4484442.0,
  'total_stock_value': 1729541.0},
 'BADUM JAMES P': {'bonus': nan,
  'deferral_payments': 178980.0,
  'deferred_income': nan,
  'director_fees': nan,
  'exercised_stock_options': 257817.0,
  'expenses': 3486.0,
  'from_messages': nan,
  'from_poi_to_this_person': nan,
  'from_this_person_to_poi': nan,
  'long_term_incentive': nan,
  'other': nan,
  'poi': 0.0,
  'restricted_stock': nan,
  'restricted_stock_deferred': nan,
  'salary': nan,
  'share