In [43]:
from copy import copy, deepcopy
import pickle
import sys

import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# the course helper modules live outside the notebook directory
sys.path.append("../tools/")

from feature_format import featureFormat
from feature_format import targetFeatureSplit

import enron

In [44]:
# Names of the features to pull out of the dataset.
# "poi" is the label and has to be the first entry of features_list,
# because the first column is later split off as the target.
poi = ["poi"]

financial_features_list = [
    "bonus",
    "deferral_payments",
    "deferred_income",
    "director_fees",
    "exercised_stock_options",
    "expenses",
    "loan_advances",
    "long_term_incentive",
    "other",
    "restricted_stock",
    "restricted_stock_deferred",
    "salary",
    "total_payments",
    "total_stock_value",
]

email_features_list = [
    "from_messages",
    "from_poi_to_this_person",
    "from_this_person_to_poi",
    "shared_receipt_with_poi",
    "to_messages",
]

# label first, then the financial features, then the email features
features_list = list(poi)
features_list.extend(financial_features_list)
features_list.extend(email_features_list)

In [45]:
# Load the dictionary containing the dataset.
# The file is opened in binary mode ("rb"): pickle streams are byte data, and
# text mode can corrupt them on platforms that translate line endings.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

In [46]:
# Instantiate working copies of the dataset and the feature list (kept under
# separate names for grading purposes).
# deepcopy is used for the dataset: copy() only duplicates the outer
# container, so any nested values would still be shared with data_dict and
# in-place edits made through my_dataset would leak back into the loaded data.
# NOTE(review): later cells hand my_dataset to enron.add_* helpers, which
# presumably modify it in place - confirm.
my_dataset = deepcopy(data_dict)
# features_list is a flat list of strings, so a shallow copy is sufficient
my_feature_list = copy(features_list)

In [47]:
# Pick the K best features and rebuild the working feature list from them,
# keeping the "poi" label in first position.
num_features = 10 # 10 for logistic regression, 8 for k-means clustering
best_features = enron.get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = list(poi)
my_feature_list.extend(best_features.keys())

10 best features: ['to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'director_fees', 'loan_advances', 'from_this_person_to_poi', 'deferred_income', 'shared_receipt_with_poi', 'from_poi_to_this_person']



In [48]:
# Add the two engineered features; each helper receives the dataset and the
# feature list. The order of the calls is kept: financial aggregate first,
# then POI interaction.
for add_feature in (enron.add_financial_aggregate, enron.add_poi_interaction):
    add_feature(my_dataset, my_feature_list)

In [49]:
# Report the selected features; the leading "poi" label entry is excluded
# from both the count and the listing.
selected_features = my_feature_list[1:]
print("{0} selected features: {1}\n".format(len(my_feature_list) - 1, selected_features))

12 selected features: ['to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'director_fees', 'loan_advances', 'from_this_person_to_poi', 'deferred_income', 'shared_receipt_with_poi', 'from_poi_to_this_person', 'financial_aggregate', 'poi_interaction']



In [50]:
# Extract the features named in my_feature_list from my_dataset.
# NOTE(review): presumably featureFormat returns a numeric array with one row
# per person and the columns in my_feature_list order - confirm against
# feature_format.featureFormat.
data = featureFormat(my_dataset, my_feature_list)

In [51]:
# Split the extracted data into labels and features. This assumes that the
# first feature in the array is the label, which is why "poi" must always
# be first in the features list.
labels, features = targetFeatureSplit(data)

In [52]:
# Min-max scale the features: fit the scaler on the feature matrix, then
# replace the features with their rescaled values.
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
scaler.fit(features)
features = scaler.transform(features)

In [53]:
# parameter optimization- not using right now
from sklearn.grid_search import GridSearchCV


In [54]:
# Logistic regression with a very large C and a very small stopping tolerance.
# NOTE(review): in scikit-learn C is the inverse regularisation strength, so
# C=10**18 presumably means "effectively unregularised" - confirm this is
# the intent.
logreg_params = {"C": 10**18, "tol": 10**-21}
clf = LogisticRegression(**logreg_params)

In [55]:
# Score the classifier on the scaled features and labels. Per the recorded
# output, the call prints precision and recall and returns them as a tuple,
# which is displayed as the cell result.
# `evaluate` is not imported by the notebook's import cell, so on a fresh
# kernel this cell raised NameError; it is imported here, next to its only use.
# NOTE(review): assumes `evaluate` is a project-local module importable the
# same way as `enron` - confirm the module location.
import evaluate

evaluate.evaluate_clf(clf, features, labels)

LogisticRegression(C=1000000000000000000L, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=1e-21, verbose=0)

Processing....................................................................................................done.

precision: 0.305536674437
recall:    0.214472546898


(0.30553667443667443, 0.21447254689754691)

In [56]:
# Persist the classifier, the dataset and the feature list.
# Each file is opened in binary mode inside a `with` block so the handle is
# always flushed and closed; the bare open(...) calls used before were never
# closed explicitly.
for obj_to_save, out_path in (
    (clf, "my_classifier.pkl"),
    (my_dataset, "my_dataset.pkl"),
    (my_feature_list, "my_feature_list.pkl"),
):
    with open(out_path, "wb") as out_file:
        pickle.dump(obj_to_save, out_file)