In [2]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



### Initial data exploration and feature selection

In [3]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features

### Load the dictionary containing the dataset.
### The file is opened in binary mode ("rb"): pickle data is a byte stream,
### and text mode can alter it on platforms that translate line endings.
### NOTE(security): pickle.load can execute arbitrary code while
### deserializing -- only load a dataset file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)



In [18]:
# Report how many individuals (top-level keys) the dataset holds
print("There are %d individuals in the dataset." % len(data_dict))

There are 146 individuals in the dataset.


In [17]:
# Count the individuals flagged as POI.
# The flag is compared against True explicitly, so only values equal to True
# are counted (a placeholder string such as "NaN" would not be).
count = sum(1 for person in data_dict.values() if person['poi'] == True)

# Share of POIs among all individuals, as a percentage rounded to one decimal
poi_percentage = round(float(count * 100) / len(data_dict), 1)

print("Number of POIs = %d" % count)
print(str(poi_percentage) + "% of all individuals are POIs")

Number of POIs = 18
12.3% of all individuals are POIs


In [19]:
# Number of features per individual, measured on the first record in the dict
first_record = data_dict.values()[0]
print("There are %d features for each individual." % len(first_record))

There are 21 features for each individual.


In [10]:
# Display one (name, record) pair to inspect the feature names and value types
sample_name, sample_record = data_dict.items()[0]
(sample_name, sample_record)

('METTS MARK',
 {'bonus': 600000,
  'deferral_payments': 'NaN',
  'deferred_income': 'NaN',
  'director_fees': 'NaN',
  'email_address': 'mark.metts@enron.com',
  'exercised_stock_options': 'NaN',
  'expenses': 94299,
  'from_messages': 29,
  'from_poi_to_this_person': 38,
  'from_this_person_to_poi': 1,
  'loan_advances': 'NaN',
  'long_term_incentive': 'NaN',
  'other': 1740,
  'poi': False,
  'restricted_stock': 585062,
  'restricted_stock_deferred': 'NaN',
  'salary': 365788,
  'shared_receipt_with_poi': 702,
  'to_messages': 807,
  'total_payments': 1061827,
  'total_stock_value': 585062})

In [35]:
# For every feature, tally how many individuals carry the placeholder "NaN".
# The feature names are taken from the first record in the dataset.
list_of_all_features = data_dict.values()[0].keys()

# One (feature name, number of "NaN" entries) tuple per feature
features_nan_list = [
    (feat, sum(1 for record in data_dict.values() if record[feat] == "NaN"))
    for feat in list_of_all_features
]

In [37]:
# Rank the features from most to fewest "NaN" entries; the sort is stable,
# so features with equal counts keep their existing relative order.
features_nan_list = sorted(
    features_nan_list,
    key=lambda feature_and_count: feature_and_count[1],
    reverse=True
)
features_nan_list

[('loan_advances', 142),
 ('director_fees', 129),
 ('restricted_stock_deferred', 128),
 ('deferral_payments', 107),
 ('deferred_income', 97),
 ('long_term_incentive', 80),
 ('bonus', 64),
 ('to_messages', 60),
 ('shared_receipt_with_poi', 60),
 ('from_messages', 60),
 ('from_this_person_to_poi', 60),
 ('from_poi_to_this_person', 60),
 ('other', 53),
 ('salary', 51),
 ('expenses', 51),
 ('exercised_stock_options', 44),
 ('restricted_stock', 36),
 ('email_address', 35),
 ('total_payments', 21),
 ('total_stock_value', 20),
 ('poi', 0)]

"loan_advances" and "director_fees" are the features with the highest number of "NaN" entries, while "poi" and "total_stock_value" have the fewest.

This indicates that most people on this list did not take loan advances from Enron. However, the features with the second- to fourth-highest "NaN" counts seem to indicate that only a restricted few (perhaps including all POIs) received "additional" compensation. This provided a clue as to which features to use initially.

In [None]:
### Task 2: Remove outliers
### Task 3: Create new feature(s)
# Nothing is removed or added in this cell, so the dataset kept for export
# below is the loaded dictionary itself under a second name.
my_dataset = data_dict

# For local testing: extract the features named in features_list ("poi" first)
# from the dataset, then separate the labels from the features.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)



In [None]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# The classifier is only instantiated here; it is not fitted in this cell.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()



In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer its replacement, sklearn.model_selection, and fall back to the
# old module only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split the same from run to run.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)



In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# NOTE(review): clf is passed here exactly as created in Task 4; no cell in
# this notebook fits it first. Presumably the tester script fits it itself --
# TODO confirm against tester.py.
dump_classifier_and_data(clf, my_dataset, features_list)

#### Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]

The goal of this project is to identify persons of interest (POIs) involved in the Enron fraud case by examining the email corpus with machine learning and natural language processing.

#### What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “properly scale features”, “intelligently select feature”]

#### What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

#### What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric item: “tune the algorithm”]

#### What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric item: “validation strategy”]


#### Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]
