# Final project restart
Import the modules, assign variables and define functions.

In [207]:
import sys
import pickle
sys.path.append("../tools/")

import pandas as pd
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint

from sklearn.preprocessing import Imputer
                                                
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.decomposition import PCA

from sklearn.cross_validation import StratifiedShuffleSplit

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from tester import test_classifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support

### Data Import

In [208]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
df = pd.DataFrame(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
df = df.astype(float)
df = df.drop('TOTAL')
df = df.drop("THE TRAVEL AGENCY IN THE PARK")
df = df.drop("loan_advances", axis=1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

In [209]:
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

## Decision Tree Pipeline: median imputed

In [210]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)

print """DecisionTreeClassifier\nImputer: median\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: median
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.863636363636

Recall: 0.6

Precision: 0.428571428571


## New Feature-Data Reimport

In [211]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
df = pd.DataFrame(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
df = df.astype(float)
df = df.drop('TOTAL')
df = df.drop("THE TRAVEL AGENCY IN THE PARK")
df = df.drop("loan_advances", axis=1)
#new feature
df['deferred_ratio'] = df['deferred_income']/(df['total_payments'] + 1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

In [212]:
data = "final_project_dataset.pkl"

In [213]:
def import_data(data):
    '''This are the things I will do to import the data everytime, 
    regardless of what variables I make.'''
    with open(data, "r") as data_file:
        data_dict = pickle.load(data_file)
    df = pd.DataFrame(data_dict)
    df = df.transpose()
    df = df.drop('email_address', axis=1)
    df = df.astype(float)
    df = df.drop('TOTAL')
    df = df.drop("THE TRAVEL AGENCY IN THE PARK")
    df = df.drop("loan_advances", axis=1)
    return df

In [214]:
df = import_data(data)

In [223]:
def get_features_labels(df):
    '''This is where the features and labels are extracted to use as arguments
    for sklearn\'s cross_validation function. It is also where I will do add
    any new variables.'''
    df['deferred_ratio'] = df['deferred_income']/(df['total_payments'] + 1)
    df['pct_from_poi'] = df['from_poi_to_this_person']/(df['from_messages'] + 1)
    df['pct_to_poi'] = df['from_this_person_to_poi']/(df['from_messages'] + 1)
    df['to_from'] = df['pct_from_poi']*df['pct_from_poi']
    features_list = list(df.columns)
    features_list.remove('poi')
    features = df[features_list]
    labels = df['poi']
    return features, labels

In [224]:
features, labels = get_features_labels(df)

In [229]:
def get_outcomes(grid_object):
    '''Gets the print out of all the outcomes from the grid_search. It prints out the 
    best parameters found by the model and the outcomes of the test of the model on 
    the test set.'''
    print "Best parameters from the grid search:", pprint(gridCV_object.best_params_)
    clf_gridCV = gridCV_object.best_estimator_
    print "\nBest Estimator Accuracy:", clf_gridCV.score(features_test, labels_test)
    clf_gridCV_pred = clf_gridCV.predict(features_test)
    print "\n\nRecall Score:", recall_score(labels_test, clf_gridCV_pred)
    print "\n\nPrecision Score:", precision_score(labels_test, clf_gridCV_pred)

In [225]:
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

In [232]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

x = [x for x in range(2,19)]
d = [d for d in range(6,12)]
param_grid = {'pca__n_components': x,
              'clf__max_depth': d,
              'imp__strategy':['most_frequent','median']}

gridCV_object = GridSearchCV(estimator = Pipeline, 
                             param_grid = param_grid,
                             scoring = "f1",
                             cv = StratifiedShuffleSplit(labels_train, n_iter=1000))

gridCV_object.fit(features_train, labels_train)

# print all the outcomes of interest
get_outcomes(gridCV_object)

Best parameters from the grid search:{'clf__max_depth': 7, 'imp__strategy': 'most_frequent', 'pca__n_components': 8}
 None

Best Estimator Accuracy: 0.886363636364


Recall Score: 0.2


Precision Score: 0.5


In [230]:
get_outcomes(gridCV_object)

Best parameters from the grid search:{'clf__max_depth': 8, 'pca__n_components': 7}
 None

Best Estimator Accuracy: 0.863636363636


Recall Score: 0.2


Precision Score: 0.333333333333


## Results with 1000, 10 and 100 iterations

### 1000 iterations
So the first outcome from the model is pretty good. It is StratifiedShuffleSplit with 1000 iterations. The outcome was really good but it took forever. 

```
Best parameters from the grid search:{'clf__max_depth': 9, 'pca__n_components': 10}
 None

Best Estimator Accuracy: 0.886363636364


Recall Score: 0.6


Precision Score: 0.5
```
### 10 iterations
This is great but I just can't use this for testing things out. It will take forever. So, I am going to try it with 10 iterations and see what happens. 

```

Best parameters from the grid search:{'clf__max_depth': 9, 'pca__n_components': 12}
 None

Best Estimator Accuracy: 0.863636363636


Recall Score: 0.4


Precision Score: 0.4
```
### 100
Now we get the same depth but 12 instead of 10 principle components. The scores have gone down though. It is worth trying it at a higher number of iterations. I am going to try 100. Here is the outcome: 
```
Best Estimator Accuracy: 0.818181818182


Recall Score: 0.4


Precision Score: 0.285714285714
```
Ok, so 100 iterations is worse than either 1000 or 10 iterations. That is kind of distressing. It would be nice if the behavior of the model was 'monotonic', that is, the more of one thing you do the more of something you are looking for you get. I was thinking that 100 iterations could serve as a good way to explore possibilities and narrow the search space while reserving 1000 iteration runs to make the final cut. Now I am not quite sure what to do. 

And here is another thing. I just ran the model again with 1000 iterations with the single addition of the clf's criterion being 'gini' or 'entropy'. It came back with gini as the better criterion. And the model had performance that was just as good as before on the 1000 iterations. In fact, the three scores of interest--accuracy, precision and recall--were exactly the same. But it found a max depth of 7 instead of 9. 
```
Best parameters from the grid search:{'clf__criterion': 'gini', 'clf__max_depth': 7, 'pca__n_components': 10}
 None

Best Estimator Accuracy: 0.886363636364


Recall Score: 0.6


Precision Score: 0.5
```
The reason that is so puzzling to me is that gini is the default scoring criterion for the model. So it was using gini before when it found the best preformance was at max depth of 9 instead of 7. How does that happen?

### Adding variables

In this iteration I try adding some new variables and the choice between the median and most_frequent imputation for the missing values. I also limited the search space for 'max_depth' and the number of components to 6-8 and 8-12 respectively. That may have been a mistake because I had found that the most effective max depth was 9. I don't know why I did that. 
```

Best parameters from the grid search:{'clf__max_depth': 7, 'imp__strategy': 'most_frequent', 'pca__n_components': 8}
 None

Best Estimator Accuracy: 0.886363636364


Recall Score: 0.2


Precision Score: 0.5
```
The results are strange in that adding more variables decreased the preformance, but I can't be sure that it wasn't because of my odd choice of cutting off the max_depth at 8 instead of 9. 

Also, I don't know why the max_depth is a useful parameter since the default is None. How could limiting the parameter improve preformance? 

## New Variables with modified parameter search space

Just to satisfy my curiosity I am going to increase the max_depth parameter to 10 and raise the maximum number of principle components to 16 to account for the four newly added variables.

#### Civil Libertarian Improvement

So this turns out to have been a good idea. I have had a slight loss in recall but big gains in accuracy and precision. Given that I am a civil libertarian I personally feel this is an improvement. 

```

Best parameters from the grid search:{'clf__max_depth': 8, 'imp__strategy': 'most_frequent', 'pca__n_components': 8}
 None

Best Estimator Accuracy: 0.909090909091


Recall Score: 0.4


Precision Score: 0.666666666667
```
Now I have one problem with these results in that the number of principle components chosen was at the bottom of the range I had specified. So now I have to face the possibility that there was better preforming model with a smaller dimensional principle component space that was over looked by the paramters to which I limited the grid search. So, as a double check, I am going to let the space searced go down. I am also going to keep the 'most_frequent' strategy and not search that space anymore. 

### Fewer dimensions: Insanity

Ok, this is the kind of insane behavior that is driving me insane. 
```

Best parameters from the grid search:{'clf__max_depth': 8, 'pca__n_components': 7}
 None

Best Estimator Accuracy: 0.863636363636


Recall Score: 0.2


Precision Score: 0.333333333333
```
Now, all I did was specify 'most_frequent' as the method of imputation and offered it the choice of finding few dimensions in the principle component analysis. Everything else was the same. So how could the model get worse? It had the choice of keeping the model that had preformed better in the last specificaiton of the model, so how could it get worse? 
## Showdown
Ok, I am just going to test everything. I am going to have a model of 200 different possible parameter configurations, but I am going to know once and for all what's what. Since I have never had entropy come up as the better scoring criterion I am going to leave that out but I am going to try everything else I have tried with the broades possible ranges. 

I am doing this essentially because I have to know once and for all whether more variables improve things or not. 

In [156]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
df = pd.DataFrame(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
df = df.astype(float)
df = df.drop('TOTAL')
df = df.drop("THE TRAVEL AGENCY IN THE PARK")
df = df.drop("loan_advances", axis=1)
#new feature
# df['deferred_ratio'] = df['deferred_income']/(df['total_payments'] + 1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

In [157]:
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

In [158]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)

print """DecisionTreeClassifier\nImputer: median\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)
print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)

DecisionTreeClassifier
Imputer: median
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.863636363636

Recall: 0.6

Precision: 0.428571428571


Ok, so now I can get the thing to work. The best thing so far is using pca with no actual reduction in the number of dimensions and no new features added. Now I am going to try to submit that to the udacity grader. 

## Submitting to the Udacity Grader

In [174]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
df = pd.DataFrame(data_dict)
df = df.transpose()
df = df.drop('email_address', axis=1)
df = df.astype(float)
df = df.drop('TOTAL')
df = df.drop("THE TRAVEL AGENCY IN THE PARK")
df = df.drop("loan_advances", axis=1)
features_list = list(df.columns)
features_list.remove('poi')
features = df[features_list]
labels = df['poi']

from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42)

In [175]:
from tester import test_classifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

pipeline = Pipeline.fit(features_train, labels_train)

print """DecisionTreeClassifier\nImputer: median\nNormalize: StandardScaler\nPCA: dimensions unspecified"""
print "\nAccuracy Score:", pipeline.score(features_test, labels_test)

clf_pipeline_pred = pipeline.predict(features_test)

print "\nRecall:", recall_score(labels_test, clf_pipeline_pred)
print "\nPrecision:", precision_score(labels_test, clf_pipeline_pred)


DecisionTreeClassifier
Imputer: median
Normalize: StandardScaler
PCA: dimensions unspecified

Accuracy Score: 0.863636363636

Recall: 0.6

Precision: 0.428571428571


#### Extra data prep to give to the grader. 
Put 'poi' at the top of the features_list and put the whole pandas data frame into a dictionary called 'data_dict'. 

In [176]:
features_list[0] = 'poi'
df_1 = df.transpose()
data_dict = df_1.to_dict()

#### Same model submitted to the grader: 

In [177]:
from tester import test_classifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

clf = Pipeline([
        ('imp', Imputer(missing_values='NaN', strategy='median')),
        ('std', StandardScaler()),
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier(random_state = 53))
    ])

print "\n\nAnd these are the results going through the test classifier:\n"
test_classifier(clf, data_dict, features_list, folds = 1000)



And these are the results going through the test classifier:

Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('std', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=N...plit=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=53, splitter='best'))])
	Accuracy: 0.79173	Precision: 0.27005	Recall: 0.33000	F1: 0.29703	F2: 0.31597
	Total predictions: 15000	True positives:  660	False positives: 1784	False negatives: 1340	True negatives: 11216

