In [1]:
#!/usr/bin/python

import sys
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from sklearn.feature_selection import SelectKBest
#sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
#features_list = ['poi','salary','bonus','total_stock_value','exercised_stock_options','total_payments','long_term_incentive'] # You will need to use more features
financial_features_list = ['poi','salary','bonus','deferral_payments','deferred_income','director_fees','exercised_stock_options','expenses','loan_advances','long_term_incentive','restricted_stock','restricted_stock_deferred','total_payments','total_stock_value']
email_feature_list = ['poi','to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
print 'number of people in data set = ', len(data_dict)
print 'features for every person in the data set:\n', data_dict['LAY KENNETH L'].keys()

number of people in data set =  146
features for every person in the data set:
['salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'email_address', 'from_poi_to_this_person']




In [2]:
### Task 2: Remove outliers
''' as seen in the outliers section, we saw that there was a "Total" entry in the data set that was a clear outlier.
thus we are removing it
'''
print "length of data set before removing outlier = ", len(data_dict)
data_dict.pop("TOTAL",0)
print "length of data set after removing outlier = ", len(data_dict)

length of data set before removing outlier =  146
length of data set after removing outlier =  145


In [3]:
### Task 3: Create new feature(s)
# No new features are created in this notebook; the dataset is used as loaded
# (after the outlier removal above).

### Store to my_dataset for easy export below.
# Note: this binds a second name to the same dict object, it does not copy it,
# so any later change to data_dict is also visible through my_dataset.
my_dataset = data_dict


In the next cell, I extract the features defined in the lists above and use SelectKBest to pick the top 4 financial features and the top 4 email features to feed my classifier.

In [4]:
### Extract features and labels from dataset for local testing
from sklearn.feature_selection import SelectKBest
import numpy
# extract the 4 strongest features from the financial data with SelectKBest
financial_data = featureFormat(my_dataset, financial_features_list, sort_keys = True)
financial_labels, financial_features = targetFeatureSplit(financial_data)

# select 4 best features using SelectKBest
test1 = SelectKBest(k=4)
fit1 = test1.fit(financial_features,financial_labels)

#numpy.set_printoptions(precision=10)
print "printing financial_features k scores: \n", (fit1.scores_)

feature = fit1.transform(financial_features)
# summarize selected features
#print(feature)

#print features
#print labels
#print financial_features[3]

#extract the 4 strongest features from email data with SelectKBest
email_data = featureFormat(my_dataset, email_feature_list, sort_keys = True)
email_labels, email_features = targetFeatureSplit(email_data)

test2 = SelectKBest(k=4)
fit2 = test2.fit(email_features,email_labels)

print "printing email_features k scores: \n", (fit2.scores_)

printing financial_features k scores: 
[ 18.57570327  21.06000171   0.21705893  11.59554766   2.10765594
  25.09754153   6.23420114   7.2427304   10.07245453   9.34670079
   0.06498431   8.86672154  24.46765405]
printing email_features k scores: 
[ 0.29296869  2.43137891  0.46640016  1.0853069   4.61945732]


After running SelectKBest separately on the financial data and the email data, the following features were identified as the top 4 from each group:
* Financial Data:
    - 'exercised_stock_options'	25.09754153
    - 'total_stock_value'	24.46765405
    - 'bonus'	21.06000171
    - 'salary'	18.57570327



* Email Data:
    - 'shared_receipt_with_poi'	4.61945732
    - 'from_poi_to_this_person'	2.43137891
    - 'from_this_person_to_poi'	1.0853069
    - 'from_messages'	0.46640016




Now I am going to combine these features and extract the top overall features from the combined set.

In [6]:
financial_email_features = ['poi','exercised_stock_options','total_stock_value','bonus','salary','shared_receipt_with_poi','from_poi_to_this_person','from_this_person_to_poi','from_messages']
financial_email_data = featureFormat(my_dataset, financial_email_features, sort_keys = True)
labels, financial_email_features = targetFeatureSplit(financial_email_data)

# select 4 best features using SelectKBest
test3 = SelectKBest(k=4)
fit3 = test3.fit(financial_email_features,labels)

#numpy.set_printoptions(precision=10)
print "printing financial_email_features k scores: \n", (fit3.scores_)


printing financial_email_features k scores: 
[ 21.71552656  21.05899501  17.8573624   15.14904119   6.8822438
   4.1460684    1.90840396   0.24111688]


After running SelectKBest on the top 4 features from each group combined, I observed the following K scores:

Feature 	               Score
- 'exercised_stock_options'	21.71552656
- 'total_stock_value'	21.05899501
- 'bonus'	17.8573624
- 'salary'	15.14904119
- 'shared_receipt_with_poi'	6.8822438
- 'from_poi_to_this_person'	4.1460684
- 'from_this_person_to_poi'	1.90840396
- 'from_messages'	0.24111688


Features from the financial data seem to be the best ones to feed into our classifier. I'm going to compare this result with running SelectKBest on all features, and also with SelectPercentile.

In [9]:
full_feature_list = ['poi','salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees','to_messages','from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
full_data = featureFormat(my_dataset, full_feature_list, sort_keys = True)
labels, full_features = targetFeatureSplit(full_data)

# select 4 best features using SelectKBest
test4 = SelectKBest(k=4)
fit4 = test4.fit(full_features,labels)

#numpy.set_printoptions(precision=10)
print "printing full_features k scores: \n", (fit4.scores_)

feature = fit4.transform(full_features)

#print full_features

printing full_features k scores: 
[ 18.57570327   0.21705893   8.86672154   7.2427304   21.06000171
   0.06498431  11.59554766  24.46765405   6.23420114  25.09754153
   4.20497086  10.07245453   9.34670079   2.10765594   1.69882435
   5.34494152   0.1641645    2.42650813   8.74648553]


After running SelectKBest on the entire feature list, excluding email address, I observed the following K scores:
Feature 	Score
- 'exercised_stock_options'	25.09754153
- 'total_stock_value'	24.46765405
- 'bonus'	21.06000171
- 'salary'	18.57570327
- 'deferred_income'	11.59554766
- 'long_term_incentive'	10.07245453
- 'restricted_stock'	9.34670079
- 'total_payments'	8.86672154
- 'shared_receipt_with_poi'	8.74648553
- 'loan_advances'	7.2427304
- 'expenses'	6.23420114
- 'from_poi_to_this_person'	5.34494152
- 'other'	4.20497086
- 'from_this_person_to_poi'	2.42650813
- 'director_fees'	2.10765594
- 'to_messages'	1.69882435
- 'deferral_payments'	0.21705893
- 'from_messages'	0.1641645
- 'restricted_stock_deferred'	0.06498431


As seen, the top 4 features from this list are the same as when I picked the top 4 features from each group and ran SelectKBest on the combined list. Let's see if this is the same when using SelectPercentile.


In [10]:
from sklearn.feature_selection import SelectPercentile, f_classif
test5 =  SelectPercentile(f_classif, percentile=10)
fit5 = test5.fit(full_features,labels)

#numpy.set_printoptions(precision=10)
print "printing full_features k scores: \n", (fit5.scores_)


printing full_features k scores: 
[ 18.57570327   0.21705893   8.86672154   7.2427304   21.06000171
   0.06498431  11.59554766  24.46765405   6.23420114  25.09754153
   4.20497086  10.07245453   9.34670079   2.10765594   1.69882435
   5.34494152   0.1641645    2.42650813   8.74648553]


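After running SelectPercentile, I get the same scores as I did with SelectKBest. This is expected: both selectors score the features with the same function (f_classif, which is also SelectKBest's default), so they differ only in how many of the top-scoring features they keep. I think I have found the features I want to use with my classifier.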

In [None]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# The classifier is only constructed here with default settings; it is not
# fitted in this cell.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html




In [None]:
# Example starting point. Try investigating other evaluation techniques!
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the new
# location first and fall back for older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# The original call referenced `features`, which is never defined in this
# notebook. `feature` is the matrix of the 4 columns SelectKBest kept from
# full_features, and it is row-aligned with `labels` from the same cell.
features_train, features_test, labels_train, labels_test = \
    train_test_split(feature, labels, test_size=0.3, random_state=42)

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# features_list was never defined in this notebook (only a commented-out
# draft exists), so the dump call raised a NameError. It is set here to 'poi'
# (the label, which must come first) followed by the 4 top-scoring features
# chosen by SelectKBest above, in the same column order as full_feature_list.
features_list = ['poi', 'salary', 'bonus', 'total_stock_value', 'exercised_stock_options']

dump_classifier_and_data(clf, my_dataset, features_list)