In [1]:
import pandas as pd
import pickle as pk
import sys
import numpy as np
import pdb

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 1000

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Understanding the Dataset

Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]


> financial features: ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees'] (all units are in US dollars)

> email features: ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi'] (units are generally number of emails messages; notable exception is ‘email_address’, which is a text string)

> POI label: [‘poi’] (boolean, represented as integer)

### Data Exploration

In [2]:
## Load dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pk.load(data_file)
    
my_dataset = data_dict
features_list = ['poi','salary']

#data = featureFormat(my_dataset, features_list, sort_keys = True)

In [3]:
## Load the data dictionary into a dataframe
data = pd.DataFrame.from_dict(my_dataset,orient='index')
data.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442,1729541.0,4175000.0,126027.0,1407.0,-126027.0,1729541,13868,,2195.0,152.0,65.0,False,,-3081055.0,304805.0,phillip.allen@enron.com,47.0
BADUM JAMES P,,,178980.0,182466,257817.0,,,,,257817,3486,,,,,False,,,,,
BANNANTINE JAMES M,477.0,566.0,,916197,4046157.0,,1757552.0,465.0,-560222.0,5243487,56301,,29.0,864523.0,0.0,False,,-5104.0,,james.bannantine@enron.com,39.0
BAXTER JOHN C,267102.0,,1295738.0,5634343,6680544.0,1200000.0,3942714.0,,,10623258,11200,,,2660303.0,,False,,-1386055.0,1586055.0,,
BAY FRANKLIN R,239671.0,,260455.0,827696,,400000.0,145796.0,,-82782.0,63014,129142,,,69.0,,False,,-201641.0,,frank.bay@enron.com,


In [4]:
## Total Number of Data Points
data.size

3066

In [5]:
## Number of Available Features
len(data.columns)

21

In [6]:
## Available Features
sorted(list(data.columns))

['bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'email_address',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'loan_advances',
 'long_term_incentive',
 'other',
 'poi',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value']

In [7]:
data.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442,1729541.0,4175000.0,126027.0,1407.0,-126027.0,1729541,13868,,2195.0,152.0,65.0,False,,-3081055.0,304805.0,phillip.allen@enron.com,47.0
BADUM JAMES P,,,178980.0,182466,257817.0,,,,,257817,3486,,,,,False,,,,,
BANNANTINE JAMES M,477.0,566.0,,916197,4046157.0,,1757552.0,465.0,-560222.0,5243487,56301,,29.0,864523.0,0.0,False,,-5104.0,,james.bannantine@enron.com,39.0
BAXTER JOHN C,267102.0,,1295738.0,5634343,6680544.0,1200000.0,3942714.0,,,10623258,11200,,,2660303.0,,False,,-1386055.0,1586055.0,,
BAY FRANKLIN R,239671.0,,260455.0,827696,,400000.0,145796.0,,-82782.0,63014,129142,,,69.0,,False,,-201641.0,,frank.bay@enron.com,


In [8]:
## Allocation across classes (POI/non-POI)
data['poi'].value_counts()

False    128
True      18
Name: poi, dtype: int64

In [9]:
## Features with missing values
data = data.replace('NaN',np.nan)
data.isnull().sum().sort_values(ascending = False)

loan_advances                142
director_fees                129
restricted_stock_deferred    128
deferral_payments            107
deferred_income               97
long_term_incentive           80
bonus                         64
from_poi_to_this_person       60
shared_receipt_with_poi       60
to_messages                   60
from_this_person_to_poi       60
from_messages                 60
other                         53
expenses                      51
salary                        51
exercised_stock_options       44
restricted_stock              36
email_address                 35
total_payments                21
total_stock_value             20
poi                            0
dtype: int64

In [10]:
data = data.replace(np.nan,0.0)

In [11]:
data.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442.0,1729541.0,4175000.0,126027.0,1407.0,-126027.0,1729541.0,13868.0,0.0,2195.0,152.0,65.0,False,0.0,-3081055.0,304805.0,phillip.allen@enron.com,47.0
BADUM JAMES P,0.0,0.0,178980.0,182466.0,257817.0,0.0,0.0,0.0,0.0,257817.0,3486.0,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0,0.0
BANNANTINE JAMES M,477.0,566.0,0.0,916197.0,4046157.0,0.0,1757552.0,465.0,-560222.0,5243487.0,56301.0,0.0,29.0,864523.0,0.0,False,0.0,-5104.0,0.0,james.bannantine@enron.com,39.0
BAXTER JOHN C,267102.0,0.0,1295738.0,5634343.0,6680544.0,1200000.0,3942714.0,0.0,0.0,10623258.0,11200.0,0.0,0.0,2660303.0,0.0,False,0.0,-1386055.0,1586055.0,0,0.0
BAY FRANKLIN R,239671.0,0.0,260455.0,827696.0,0.0,400000.0,145796.0,0.0,-82782.0,63014.0,129142.0,0.0,0.0,69.0,0.0,False,0.0,-201641.0,0.0,frank.bay@enron.com,0.0


### Outliers

In [12]:
## Removing the 'TOTAL' row
data[data['salary'] == data['salary'].max()]

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
TOTAL,26704229.0,0.0,32083396.0,309886585.0,311764000.0,97343619.0,130322299.0,0.0,-7576788.0,434509511.0,5235198.0,83925000.0,0.0,42667589.0,0.0,False,1398517.0,-27992891.0,48521928.0,0,0.0


In [13]:
data = data.drop('TOTAL')

In [14]:
fin_columns = ['salary','deferral_payments','total_payments','exercised_stock_options','bonus','restricted_stock','restricted_stock_deferred','total_stock_value','expenses','deferred_income','long_term_incentive']

In [15]:
#for col in fin_columns:
#     plt.scatter(data[fin_columns]['salary'],data[fin_columns][col])
#     plt.suptitle('{}'.format(col))
#     plt.show()
#     plt.close()

In [16]:
from scipy.stats import percentileofscore

def outlierCleaner(preds, feature_vals):
    """
        Clean away the 10% of points that have the largest
        residual errors (difference between the prediction
        and the actual net worth).

        Return a list of tuples named cleaned_data where
        each tuple is of the form (age, net_worth, error).
    """

    cleaned_data = []

    diffs = (np.array(feature_vals) - np.array(preds)) ** 2
    diffs = diffs.flatten()
    feature_vals = feature_vals.flatten()
    
    data = zip(feature_vals, diffs)
    
    percentiles = [percentileofscore(diffs, i) for i in diffs]
    
    
    cleaned_data = [ val if percentileofscore(diffs, diff) < 98 else 0.0 for (val,diff) in data]
    #cleaned_data = sorted(data, key=lambda tup: tup[1])[:130]

    return cleaned_data

In [18]:
rel_cols = ['bonus','deferral_payments',
'deferred_income',
 'director_fees',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'loan_advances',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value']

In [21]:
from sklearn.linear_model import LinearRegression

for f in rel_cols:
    curr_feat = np.reshape( np.array(data[f]), (len(data[f]), 1))
    poi = np.reshape( np.array(data.poi), (len(data.poi), 1))

    reg = LinearRegression()
    reg.fit(curr_feat, poi)

    pred = reg.predict(curr_feat)
    
    cleaned_data = outlierCleaner(pred,curr_feat)
    
    data[f] = cleaned_data
    
    #output.append([f, [item if cleaned_data[i] != item else '' for (i,item) in enumerate(curr_feat)]])

# Optimize Feature Selection/Engineering

What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “intelligently select features”, “properly scale features”]

### Create new features 

In [22]:
### Use PCA to create a new features of financial features?

### Intelligently select features 

In [23]:
data.head()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442.0,1729541.0,4175000.0,126027.0,1407.0,-126027.0,1729541.0,13868.0,0.0,2195.0,152.0,65.0,False,0.0,-3081055.0,304805.0,phillip.allen@enron.com,47.0
BADUM JAMES P,0.0,0.0,178980.0,182466.0,257817.0,0.0,0.0,0.0,0.0,257817.0,3486.0,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0,0.0
BANNANTINE JAMES M,477.0,566.0,0.0,916197.0,4046157.0,0.0,1757552.0,465.0,0.0,5243487.0,56301.0,0.0,29.0,864523.0,0.0,False,0.0,-5104.0,0.0,james.bannantine@enron.com,39.0
BAXTER JOHN C,267102.0,0.0,1295738.0,5634343.0,6680544.0,1200000.0,3942714.0,0.0,0.0,10623258.0,11200.0,0.0,0.0,2660303.0,0.0,False,0.0,-1386055.0,1586055.0,0,0.0
BAY FRANKLIN R,239671.0,0.0,260455.0,827696.0,0.0,400000.0,145796.0,0.0,-82782.0,63014.0,129142.0,0.0,0.0,69.0,0.0,False,0.0,-201641.0,0.0,frank.bay@enron.com,0.0


In [24]:
# from sklearn.decomposition import PCA
# pca = PCA(n_components = 2)
# pca.fit(data[rel_cols])
# test = pca.transform(data[rel_cols])
# plt.plot(test[0],test[1])

In [397]:
from sklearn.feature_selection import SelectPercentile, f_classif

selector = SelectPercentile(f_classif,percentile = 15)
selector.fit(data[rel_cols],data['poi'])
features_transformed = selector.transform(data[rel_cols])

In [398]:
features_transformed

array([[  1.38680000e+04,   4.70000000e+01,   3.04805000e+05],
       [  3.48600000e+03,   0.00000000e+00,   0.00000000e+00],
       [  5.63010000e+04,   3.90000000e+01,   0.00000000e+00],
       [  1.12000000e+04,   0.00000000e+00,   1.58605500e+06],
       [  1.29142000e+05,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   9.37500000e+04],
       [  3.71720000e+04,   1.44000000e+02,   0.00000000e+00],
       [  1.73550000e+04,   2.28000000e+02,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.18920000e+04,   0.00000000e+00,   0.00000000e+00],
       [  5.91750000e+04,   4.00000000e+00,   1.80250000e+05],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.85590000e+04,   2.30000000e+01,   3.69721000e+05],
       [  8.42080000e+04,   2.50000000e+01,   8.31809000e+05],
       [  1.27900000e+03,   0.00000000e+00,   0.00000000e+00],
       [  6.59070000e+04,   1.40000000e+02,   9.7429300

In [399]:
from sklearn.linear_model import Lasso

reg = Lasso()
reg.fit()

TypeError: fit() takes at least 3 arguments (1 given)

### Properly scale features 

In [400]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
rescaled_weight = scaler.fit_transform(features_transformed)

# Pick and Tune an Algorithm

### Pick an algorithm

What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

In [401]:
classifiers = []

In [402]:
from sklearn import cross_validation

features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(rescaled_weight,data['poi'], test_size=0.2, random_state=42)

In [403]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(features_train,labels_train)

classifiers.append(clf)

clf.score(features_test,labels_test)

0.86206896551724133

In [404]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(features_train,labels_train)

classifiers.append(clf)

clf.score(features_test,labels_test)

0.89655172413793105

In [405]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(features_train,labels_train)

classifiers.append(clf)

clf.score(features_test,labels_test)

0.93103448275862066

In [406]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(features_train,labels_train)

classifiers.append(clf)

clf.score(features_test,labels_test)

0.89655172413793105

In [407]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(features_train,labels_train)

classifiers.append(clf)

clf.score(features_test,labels_test)

0.89655172413793105

In [408]:
from sklearn.ensemble import AdaBoostClassifier#, SAMME

clf = AdaBoostClassifier(base_estimator=SVC(),algorithm='SAMME',learning_rate=0.1, n_estimators=20)
clf.fit(features_train,labels_train)

classifiers.append(clf)

clf.score(features_test,labels_test)

0.89655172413793105

In [409]:
import os
os.environ['PATH'] = os.environ['PATH'] + ';C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'

from xgboost import XGBClassifier

clf = XGBClassifier()
clf.fit(features_train,labels_train)

classifiers.append(clf)

clf.score(features_test,labels_test)

0.89655172413793105

In [425]:
from sklearn.grid_search import GridSearchCV

test = DecisionTreeClassifier(class_weight={False: 3, True: 7}, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

param_grid = {

         'n_estimators': [10,20,50,100,250,500],
          'learning_rate': [0.1,0.5,1],
          }
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(AdaBoostClassifier(base_estimator=test,algorithm='SAMME'), param_grid)
clf = clf.fit(rescaled_weight,data['poi'])

print "Best estimator found by grid search:"
print clf.best_estimator_

Best estimator found by grid search:
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight={False: 3, True: 7}, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1, n_estimators=20, random_state=None)


In [426]:
classifiers.append(clf.best_estimator_)

In [411]:
# param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
# clf = GridSearchCV(SVC(), param_grid)
# clf = clf.fit(rescaled_weight,data['poi'])

# print "Best estimator found by grid search:"
# print clf.best_estimator_

In [412]:
clf = SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

clf.fit(features_train,labels_train)

clf.score(features_test,labels_test)

0.89655172413793105

In [413]:
# param_grid = {"max_depth": [3, None],
#               "max_features": [1, 3],
#               "min_samples_split": [1, 3, 10],
#               "min_samples_leaf": [1, 3, 10],
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]}
# # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
# clf = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1')
# clf = clf.fit(rescaled_weight,data['poi'])



# print "Best estimator found by grid search:"
# print clf.best_estimator_

# clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
#             max_depth=3, max_features=3, max_leaf_nodes=None,
#             min_samples_leaf=1, min_samples_split=1,
#             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False)

# classifiers.append(clf)

In [417]:
param_grid = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              'class_weight':[None,{True:7,False:3}]
              }
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='f1')
clf = clf.fit(rescaled_weight,data['poi'])



print "Best estimator found by grid search:"
print 

clf = clf.best_estimator_

classifiers.append(clf)

Best estimator found by grid search:



In [418]:
clf

DecisionTreeClassifier(class_weight={False: 3, True: 7}, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [419]:
param_grid = {'n_neighbors': [1,5],
             'algorithm':['auto','ball_tree', 'kd_tree'],
             'leaf_size':[30],
             'n_jobs':[1,5],
             'p':[2],
             'weights':['uniform','distance']}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(KNeighborsClassifier(), param_grid, scoring='f1')
clf = clf.fit(rescaled_weight,data['poi'])



print "Best estimator found by grid search:"
print clf.best_estimator_

Best estimator found by grid search:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


In [420]:
clf = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

classifiers.append(clf)

### Parameter tuning

What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? What parameters did you tune? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric items: “discuss parameter tuning”, “tune the algorithm”]

### Tune the algorithm

# Validate and Evaluate

### Usage of Evaluation Metrics

Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]

In [433]:
## WINNER

winner = DecisionTreeClassifier(class_weight={False: 3, True: 7}, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
# 0.6
# 1.0
# 0.75

## RUNNER UP

runner_up = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
# 0.666666666667
# 0.666666666667
# 0.666666666667

In [444]:
for num in [0.15,0.20,0.30,0.50]:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(rescaled_weight,data['poi'], test_size=num, random_state=42)
    winner.fit(X_train,y_train)

    y_guess = winner.predict(X_test)
    print metrics.f1_score(y_test,y_guess)

0.5
0.75
0.333333333333
0.235294117647


In [445]:
for num in [0.15,0.20,0.30,0.50]:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(rescaled_weight,data['poi'], test_size=num, random_state=42)
    runner_up.fit(X_train,y_train)

    y_guess = runner_up.predict(X_test)
    print metrics.f1_score(y_test,y_guess)

0.666666666667
0.666666666667
0.5
0.181818181818


In [427]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(rescaled_weight,data['poi'], test_size=0.20, random_state=42)

#clf = AdaBoostClassifier(base_estimator=GaussianNB(),algorithm='SAMME',learning_rate=0.1, n_estimators=20)#GaussianNB()
#clf.fit(X_train,y_train)

for idx, clf in enumerate(classifiers):
    curr = clf

    curr.fit(X_train,y_train)

    y_guess = curr.predict(X_test)

    print idx, clf
    from sklearn import metrics
    print metrics.precision_score(y_test,y_guess)
    print metrics.recall_score(y_test,y_guess)
    print metrics.f1_score(y_test,y_guess)

0 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
0.4
0.666666666667
0.5
1 GaussianNB()
0.5
0.666666666667
0.571428571429
2 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.666666666667
0.666666666667
0.666666666667
3 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.0
0.0
0.0
4 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
  

In [428]:
curr = classifiers[8]

curr.fit(X_train,y_train)

y_guess = curr.predict(X_test)


from sklearn import metrics
print metrics.precision_score(y_test,y_guess)
print metrics.recall_score(y_test,y_guess)
print metrics.f1_score(y_test,y_guess)

0.6
1.0
0.75


In [429]:
output = pd.DataFrame(y_test)
output['guess'] = y_guess

In [430]:
# TRUE POSITIVES
output[(output.poi) & (output.guess)]

Unnamed: 0,poi,guess
RICE KENNETH D,True,True
DELAINEY DAVID W,True,True
KOENIG MARK E,True,True


In [431]:
# FALSE NEGATIVES
output[(output.poi == True) & (output.guess == False)]

Unnamed: 0,poi,guess


In [432]:
# FALSE POSITIVES
output[(output.poi == False) & (output.guess == True)]

Unnamed: 0,poi,guess
IZZO LAWRENCE L,False,True
SHAPIRO RICHARD S,False,True


In [225]:
data.loc[['KOENIG MARK E','RICE KENNETH D','DELAINEY DAVID W'],:]

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
KOENIG MARK E,309946.0,2374.0,0.0,1587421.0,671737.0,700000.0,1248318.0,2271.0,0.0,1920055.0,127017.0,0.0,61.0,150458.0,15.0,True,0.0,0.0,300000.0,mark.koenig@enron.com,53.0
RICE KENNETH D,420636.0,905.0,0.0,505050.0,0.0,1750000.0,2748364.0,864.0,0.0,22542539.0,46950.0,0.0,18.0,174839.0,4.0,True,0.0,0.0,1617011.0,ken.rice@enron.com,42.0
DELAINEY DAVID W,365163.0,3093.0,0.0,4747979.0,2291113.0,3000000.0,1323148.0,2097.0,0.0,3614261.0,86174.0,0.0,3069.0,1661.0,0.0,True,0.0,0.0,1294981.0,david.delainey@enron.com,66.0


### Validation Strategy

What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric items: “discuss validation”, “validation strategy”]

### Algorithm Performance