# Enron Dataset Exploration and Experimentation Log

First, I got the dimensions of the data, the column names, the count of missing values in each column, and the number of POIs and non-POIs.

In [1]:
#!/usr/bin/python

import sys
import pickle
import pandas as pd
import numpy as np
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
# Since the dataset is so small, start with all features that have at least
# fifty non-NaN values, except for email_address.
# The raw message count fields are replaced by prop_messages_with_poi
# (created below).
# NOTE(review): per the missing-value counts printed by this cell,
# loan_advances is missing for 142 of 146 rows, so it does not meet the
# fifty-non-NaN rule stated above -- confirm whether it should stay.
features_list = [
    'poi',
    'salary',
    'prop_messages_with_poi',
    'total_payments',
    'exercised_stock_options',
    'bonus',
    'restricted_stock',
    'shared_receipt_with_poi',
    'total_stock_value',
    'expenses',
    'loan_advances',
    'other',
    'long_term_incentive',
]

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

# explore dataset
data_df = pd.DataFrame.from_dict(data_dict, orient='index')
print data_df.shape
# print data_df.columns
data_df = data_df.replace('NaN', np.nan)
print data_df.isnull().sum()
print data_df['poi'].value_counts()

(146, 21)
salary                        51
to_messages                   60
deferral_payments            107
total_payments                21
exercised_stock_options       44
bonus                         64
restricted_stock              36
shared_receipt_with_poi       60
restricted_stock_deferred    128
total_stock_value             20
expenses                      51
loan_advances                142
from_messages                 60
other                         53
from_this_person_to_poi       60
poi                            0
director_fees                129
deferred_income               97
long_term_incentive           80
email_address                 35
from_poi_to_this_person       60
dtype: int64
False    128
True      18
Name: poi, dtype: int64




I know from the mini-projects that 'TOTAL' should be removed since it's not a real data point. If I find more outliers later on, I'll remove them here as well.

In [2]:
### Task 2: Remove outliers
data_dict.pop('TOTAL', '')
print len(data_dict.keys())

145


I created three new columns: 
* total_messages, the sum of from_messages and to_messages
* total_poi_messages, the sum of from_this_person_to_poi and from_poi_to_this_person
* prop_messages_with_poi, the proportion of total_poi_messages to total_messages

In [3]:
### Task 3: Create new feature(s)
for person in data_dict:
    if data_dict[person]['to_messages'] != 'Nan' and data_dict[person]['from_messages'] != 'Nan':
        data_dict[person]['total_messages'] = data_dict[person]['to_messages'] + data_dict[person]['from_messages']
    else:
        data_dict[person]['total_messages'] = 'NaN'
    
    if data_dict[person]['from_poi_to_this_person'] != 'NaN' and data_dict[person]['from_this_person_to_poi'] != 'NaN':
        data_dict[person]['total_poi_messages'] = data_dict[person]['from_this_person_to_poi'] + data_dict[person]['from_poi_to_this_person']
    else:
        data_dict[person]['total_poi_messages'] = 'NaN'
    
    if data_dict[person]['total_messages'] != 'Nan' and data_dict[person]['total_poi_messages'] != 'NaN':
        data_dict[person]['prop_messages_with_poi'] = float(data_dict[person]['total_poi_messages']) / float(data_dict[person]['total_messages'])
    else:
        data_dict[person]['prop_messages_with_poi'] = 'NaN'
    
print data_dict['SKILLING JEFFREY K'].keys()

### Store to my_dataset for easy export below.
my_dataset = data_dict

['to_messages', 'deferral_payments', 'expenses', 'poi', 'deferred_income', 'email_address', 'long_term_incentive', 'total_poi_messages', 'prop_messages_with_poi', 'restricted_stock_deferred', 'shared_receipt_with_poi', 'loan_advances', 'from_messages', 'other', 'director_fees', 'total_messages', 'bonus', 'total_stock_value', 'from_poi_to_this_person', 'from_this_person_to_poi', 'restricted_stock', 'salary', 'total_payments', 'exercised_stock_options']


First, I'll do feature scaling since I'm going to be trying a few different algorithms, some of which rely on the distance between points in the feature space.

Then, I'll do a simple train/test split and try out some classifiers with all of these features. I'll look at the accuracy score, precision, and recall to get an idea of which performs best.

In [4]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Feature Scaling
# Rescale every feature with MinMaxScaler so the distance-based classifiers
# tried below are not dominated by the features with the largest raw values.
# NOTE(review): the scaler is fit on all rows before the train/test split in
# the next cell, so the held-out rows influence the scaling -- consider
# fitting on the training split only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(features)
scaled_features = scaler.transform(features)

In [5]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(scaled_features, labels, test_size=0.3, random_state=42)
    
### PCA
from sklearn.decomposition import RandomizedPCA
pca = RandomizedPCA(n_components=8, whiten=True).fit(features_train)
pca.fit(features_train)
pca_features_train = pca.transform(features_train)
pca_features_test = pca.transform(features_test)
print pca.explained_variance_ratio_



[ 0.43625744  0.18139418  0.13326897  0.09829136  0.05426126  0.03976898
  0.01998366  0.0182827 ]


In [6]:
from sklearn.metrics import precision_score, recall_score, classification_report

### Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
nb_clf.fit(pca_features_train, labels_train)
pred = nb_clf.predict(pca_features_test)
print "accuracy: ", nb_clf.score(pca_features_test, labels_test)

print "precision: ", precision_score(labels_test, pred)
print "recall: ", recall_score(labels_test, pred)

print classification_report(labels_test, pred)

accuracy:  0.909090909091
precision:  0.6
recall:  0.6
             precision    recall  f1-score   support

        0.0       0.95      0.95      0.95        39
        1.0       0.60      0.60      0.60         5

avg / total       0.91      0.91      0.91        44



In [7]:
### Support Vector Machine
from sklearn.svm import SVC
svm_clf = SVC(kernel='rbf')
svm_clf.fit(pca_features_train, labels_train)
pred = svm_clf.predict(pca_features_test)
print "accuracy: ", svm_clf.score(pca_features_test, labels_test)

print "precision: ", precision_score(labels_test, pred)
print "recall: ", recall_score(labels_test, pred)

print classification_report(labels_test, pred)

accuracy:  0.886363636364
precision:  0.0
recall:  0.0
             precision    recall  f1-score   support

        0.0       0.89      1.00      0.94        39
        1.0       0.00      0.00      0.00         5

avg / total       0.79      0.89      0.83        44



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [8]:
### Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(pca_features_train, labels_train)
pred = dt_clf.predict(pca_features_test)
print "accuracy: ", dt_clf.score(pca_features_test, labels_test)

print "precision: ", precision_score(labels_test, pred)
print "recall: ", recall_score(labels_test, pred)

print classification_report(labels_test, pred)

accuracy:  0.931818181818
precision:  0.75
recall:  0.6
             precision    recall  f1-score   support

        0.0       0.95      0.97      0.96        39
        1.0       0.75      0.60      0.67         5

avg / total       0.93      0.93      0.93        44



In [9]:
### Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=100)
rf_clf.fit(pca_features_train, labels_train)
pred = rf_clf.predict(pca_features_test)
print "accuracy: ", rf_clf.score(pca_features_test, labels_test)

print "precision: ", precision_score(labels_test, pred)
print "recall: ", recall_score(labels_test, pred)

print classification_report(labels_test, pred)

accuracy:  0.909090909091
precision:  0.666666666667
recall:  0.4
             precision    recall  f1-score   support

        0.0       0.93      0.97      0.95        39
        1.0       0.67      0.40      0.50         5

avg / total       0.90      0.91      0.90        44



In [10]:
from sklearn.ensemble import AdaBoostClassifier
ab_clf = AdaBoostClassifier()
ab_clf.fit(pca_features_train, labels_train)
pred = ab_clf.predict(pca_features_test)
print "accuracy: ", ab_clf.score(pca_features_test, labels_test)

print "precision: ", precision_score(labels_test, pred)
print "recall: ", recall_score(labels_test, pred)

print classification_report(labels_test, pred)

accuracy:  0.909090909091
precision:  0.6
recall:  0.6
             precision    recall  f1-score   support

        0.0       0.95      0.95      0.95        39
        1.0       0.60      0.60      0.60         5

avg / total       0.91      0.91      0.91        44



In [11]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(pca_features_train, labels_train)
pred = knn_clf.predict(pca_features_test)
print "accuracy: ", knn_clf.score(pca_features_test, labels_test)

print "precision: ", precision_score(labels_test, pred)
print "recall: ", recall_score(labels_test, pred)

print classification_report(labels_test, pred)

accuracy:  0.840909090909
precision:  0.0
recall:  0.0
             precision    recall  f1-score   support

        0.0       0.88      0.95      0.91        39
        1.0       0.00      0.00      0.00         5

avg / total       0.78      0.84      0.81        44



In my initial run (before adding feature scaling and PCA), the best accuracy performers among these basic supervised classifiers were SVM, Random Forest, and K Nearest Neighbors, each with an accuracy of around 88.6%. Random Forest and Naive Bayes gave the best precision/recall scores.

After going back and adding feature scaling and PCA (8 components gives the best result), the precision and recall scores have improved. The classifiers currently performing best are Naive Bayes, Random Forest, and AdaBoost.

Naive Bayes doesn't have parameters to tune, so I'll tune the other two and choose the best from there.

In [15]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
# Tune Random Forest rf_clf
# With default parameters:
# accuracy:  0.909090909091
# precision:  0.666666666667
# recall:  0.4
#              precision    recall  f1-score   support

#         0.0       0.93      0.97      0.95        39
#         1.0       0.67      0.40      0.50         5

# avg / total       0.90      0.91      0.90        44

parameters = {'n_estimators': (6, 8, 10, 12, 14, 20, 50),
              'criterion': ('gini', 'entropy'),
              'min_samples_split': (2, 4, 6)}
sss = StratifiedShuffleSplit()

clf = GridSearchCV(RandomForestClassifier(random_state=100), parameters, scoring="f1", cv=sss)
clf.fit(pca_features_train, labels_train)

print clf.best_params_
print "F1: ", clf.best_score_

{'min_samples_split': 2, 'n_estimators': 12, 'criterion': 'gini'}
F1:  0.3


In [19]:
# Tune AdaBoost ab_clf
parameters = {'n_estimators': (10, 20, 50, 70, 100)}
sss = StratifiedShuffleSplit()

clf = GridSearchCV(AdaBoostClassifier(), parameters, scoring="f1", cv=sss)
clf.fit(pca_features_train, labels_train)

print clf.best_params_
print "F1: ", clf.best_score_

{'n_estimators': 10}
F1:  0.233333333333


In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# At this point `clf` is whichever grid search ran last (AdaBoost, the lower
# F1 of the two), and it was fitted on scaled + PCA-transformed features,
# while features_list names the raw features. Export a self-contained
# pipeline instead, so scaling and PCA travel with the classifier.
# NOTE(review): presumably tester.py builds its inputs from features_list and
# fits clf itself -- verify against the tester script.
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('scaler', MinMaxScaler()),
    ('pca', RandomizedPCA(n_components=8, whiten=True, random_state=42)),
    # Best parameters reported by the Random Forest grid search above
    # (F1 0.3 vs. 0.233 for AdaBoost); update them if the search is re-run.
    ('rf', RandomForestClassifier(n_estimators=12, criterion='gini',
                                  min_samples_split=2, random_state=100)),
])

dump_classifier_and_data(clf, my_dataset, features_list)