In [1]:
import sys
import numpy as np
import pickle
import pandas as pd

sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data


def mkRatio(a, b):
    if a == 'NaN':
        a = 0
    if b == 'NaN' or b == 0:
        return('NaN')
    return(a/(b*1.0))

In [2]:
### Task 1: Select what features you'll use.
features_list = ['poi'] 

# Ignore email address since it's text
ignore_list = ['email_address'] 

### Load the dictionary containing the dataset and my processed text data
data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
wordData = pickle.load(open("p5_analyzedEmailData.pkl", "r") )

# Populate all the features to the Features list to start
for p in data_dict:
    for f in data_dict[p]:
        if f not in features_list and f not in ignore_list:
            features_list.append(f)
            
print(features_list)

['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']


In [3]:
### Task 2: Remove outliers
data_dict.pop('TOTAL')


{'bonus': 97343619,
 'deferral_payments': 32083396,
 'deferred_income': -27992891,
 'director_fees': 1398517,
 'email_address': 'NaN',
 'exercised_stock_options': 311764000,
 'expenses': 5235198,
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 83925000,
 'long_term_incentive': 48521928,
 'other': 42667589,
 'poi': False,
 'restricted_stock': 130322299,
 'restricted_stock_deferred': -7576788,
 'salary': 26704229,
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 309886585,
 'total_stock_value': 434509511}

In [4]:
### Task 3: Create new feature(s)

for p in data_dict:
    mailA = data_dict[p]['email_address'] 
    if mailA in wordData:
        data_dict[p]['flaggedMail'] = wordData[mailA]['flaggedMails']
    else:
        data_dict[p]['flaggedMail'] = 'NaN'
        
for p in data_dict:
    data_dict[p]['flaggedRatio'] = mkRatio(data_dict[p]['flaggedMail'],
            data_dict[p]['from_messages'])
    if 'flaggedRatio' not in features_list:
        features_list.append('flaggedRatio')   

    data_dict[p]['fromRatio'] = mkRatio(data_dict[p]['from_this_person_to_poi'],
            data_dict[p]['from_messages'])
    if 'fromRatio' not in features_list:
        features_list.append('fromRatio')    
    data_dict[p]['toRatio'] = mkRatio(data_dict[p]['from_poi_to_this_person'],
            data_dict[p]['to_messages'])
    if 'toRatio' not in features_list:
        features_list.append('toRatio')
    data_dict[p]['salToBonus'] = mkRatio(data_dict[p]['salary'],
            (data_dict[p]['bonus']))
    if 'salToBonus' not in features_list:
        features_list.append('salToBonus')

In [5]:
# Normalize to facilitate PCA and Feature Selection
fts = {}
for f in features_list:
    if f not in ignore_list and f != 'poi':
        fts[f] = {'min':0,'max':0}
        for person in data_dict:
            if data_dict[person][f] < fts[f]['min'] and data_dict[person][f] != 'NaN':
                fts[f]['min'] =  data_dict[person][f]

            if data_dict[person][f] > fts[f]['max'] and data_dict[person][f] != 'NaN':
                fts[f]['max'] =  data_dict[person][f]

for f in features_list:
    if f not in ignore_list and f != 'poi':
        for person in data_dict:
            if data_dict[person][f] != 'NaN':
                data_dict[person][f] = data_dict[person][f] / (fts[f]['max']-fts[f]['min'])

In [6]:
data = featureFormat(data_dict, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [7]:
### Feature Selection
from sklearn.feature_selection import SelectKBest
skb = SelectKBest(k=3)
skb = skb.fit(features, labels)
features = skb.transform(features)
np.shape(features)
sup = skb.get_support()
print('Retained Features:')
for f in range(0,len(features_list)-1):
    if sup[f]:
        print(features_list[f], skb.scores_[f])


Retained Features:
('other', 6.6847457627118638)
('from_poi_to_this_person', 646.21187958248311)
('flaggedRatio', 15.03681557335176)


()


In [9]:
print('Retained Features:')
features_list
print('Features', np.shape(features))
print('Labels', np.shape(labels))

Retained Features:
('Features', (118, 3))
('Labels', (118,))


In [21]:
# Set up the cross validateion, run the selected model and print results.

validation_scores = {}
counter = 0

from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
kf = KFold(len(labels), 14)

for tr_ind, tst_ind in kf:
    counter+=1    
    features_train = [features[ii] for ii in tr_ind]
    features_test = [features[ii] for ii in tst_ind]
    labels_train = [labels[ii] for ii in tr_ind]
    labels_test = [labels[ii] for ii in tst_ind]
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, 
                             min_samples_leaf=5)
    
    clf.fit(features_train, labels_train)
    
    predictions = clf.predict(features_test)
    acc = accuracy_score(predictions, labels_test)
    rec = recall_score(predictions, labels_test)
    prec = precision_score(predictions, labels_test)
    validation_scores[counter] = {'Acc':acc, 'Rec':rec,'Prec':prec}    

prec, rec, acc = 0,0,0
cnt = 0
for s in validation_scores:
    prec = prec + validation_scores[s]['Prec']
    rec = rec + validation_scores[s]['Rec']
    acc = acc + validation_scores[s]['Acc']
    cnt += 1

print('Accuracy:',acc/cnt, 'Precision:',prec/cnt, 'Recall:',rec/cnt)

# Now print the Udacity test results
test_classifier(clf, data_dict, features_list)

('Accuracy:', 0.98313492063492069, 'Precision:', 0.54761904761904767, 'Recall:', 0.5714285714285714)
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
	Accuracy:, 0.97817,	Precision:, 1.00000\,tRecall:, 0.86900	,F1:,0.92991	,F2: 0.89238
	Total predictions:, 12000	True positives: 1738,	False positives:,    0,	False negatives:,  262,	True negatives:, 10000

