## Data Exploration



In [1]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message printed below). A later cell calls `np.where`
# before any explicit `import numpy as np`, so it presumably relies on this magic.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import sys
from time import time
import matplotlib as pl
import matplotlib.pyplot as plt
import pickle

In [3]:
# Root directory of the course checkout; every path below is built from it.
# NOTE(review): this is an absolute, machine-specific path.
dataPath = '/Users/omojumiller/mycode/MachineLearningNanoDegree/IntroToMachineLearning/'

# Extend the module search path so the project-local modules imported
# below can be found.
sys.path.extend([dataPath + 'tools/', dataPath + 'final_project/'])

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from helper_files import compareTwoFeatures, computeFraction

In [4]:
### Load the dictionary containing the dataset

# Pickle streams are byte streams, so the file is opened in binary mode ("rb")
# rather than text mode.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load dataset files that come from a trusted source.
with open(dataPath+'final_project/final_project_dataset.pkl', "rb") as data_file:
    data_dict = pickle.load(data_file)

In [5]:
# Number of top-level entries (keys) in the loaded dictionary.
len(data_dict)

146

If there are outliers, remove them.

This is an iterative process: I need to repeat it for each combination of features I want to use.


In [None]:
# Run the project helper on the 'salary' and 'bonus' fields of data_dict.
# NOTE(review): later cells index the result like a 2-D array (data[57][1]) —
# presumably one row per record with columns (salary, bonus); confirm in helper_files.
data = compareTwoFeatures('salary', 'bonus', data_dict, "SALARY versus BONUS")

1. I need to find out where that outlier is
2. Find out who it is

In [None]:
# Positions in `data` whose value exceeds 80 million: this is where the outlier
# sits. The remaining step is to find out who it belongs to.
np.where(data > 8e7)


In [None]:
data[57] # Row 57 — presumably the index reported by np.where above. So whose bonus is 97343619?
# What's the name of the dictionary key of this data point?

In [None]:
for key, value in data_dict.iteritems():
    if (value['bonus'] >= int(data[57][1]) and 
        value['bonus'] != "NaN" and
        value['salary'] != "NaN"):
        print "{:20}{:12}${:<12,.2f}{:12}${:<12,.2f}".format(key, 'salary is ', value['salary'],
                                                   ' bonus ', value['bonus'])
        
    if (value['restricted_stock'] < 0):
        print key



Found the source of the outlier. It was the `TOTAL` row that was mistakenly read into the data dict

In [None]:
# Remove the source of the outlier
# pop() is given a default so the cell can be re-run safely: once the keys are
# gone, a second execution no longer raises KeyError. Using a loop also keeps
# the popped record from being echoed as the cell's output.
for outlier_key in ('TOTAL', 'BHATNAGAR SANJAY'):
    data_dict.pop(outlier_key, None)

# We can now go back and rerun the regression to see what the data really looks like.

In [None]:
# Re-run the salary/bonus comparison on data_dict as it stands after the
# preceding cell popped the outlier keys; `data` is rebound to the new result.
data = compareTwoFeatures('salary', 'bonus', data_dict, "SALARY versus BONUS cleansed of outliers")

In [None]:
features_list = ['restricted_stock', 'exercised_stock_options'] #
data = featureFormat( data_dict, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit( data )


### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, 
                                            target, test_size=0.3, random_state=42)


import numpy as np
target_test = np.asarray(target_test).reshape(-1,1)

from sklearn import linear_model
reg = linear_model.LinearRegression()
    
# Reshape data using X.reshape(-1, 1) since data has a single feature or X.
clf = reg.fit(feature_train, target_train)


print "slope of regression is %.2f" % reg.coef_
print "intercepts of regression is %.2f" % reg.intercept_
print "\n ********stats on dataset********\n"
print "r-squared score on testing data: ", reg.score(feature_test, target_test)
print "r-squared score on training data: ", reg.score(feature_train, target_train)

plt.clf()
plt.scatter(feature_train, target_train, color="c", label="train data", s=80, marker = 'o', alpha = 0.28)
plt.scatter(feature_test, target_test, color="r", label="test data", s=80, marker = 'o', alpha = 0.28)

plt.plot(target_test, reg.predict(target_test), color="k")
plt.legend(loc='upper center', shadow=True, fontsize='medium')
plt.ylabel(features_list[0])
plt.xlabel(features_list[1])
plt.title('Regression on the '+features_list[0]+' against the '+features_list[1], y=1.08)
plt.show()

## Feature Selection

In [6]:
# Every candidate column, with 'poi' in the leading position.
feature_list = [
    'poi',
    'salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
    'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
    'restricted_stock', 'director_fees',
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi',
]

# The same columns without the leading 'poi' entry, in the same order.
# Slicing yields an independent list, so rebinding or editing feature_list
# later does not affect `names`.
names = feature_list[1:]

In [7]:
# Convert the dictionary into an array over every column in feature_list,
# then separate it into labels and features.
# Note: my_dataset is another name for data_dict, not a copy.
my_dataset = data_dict
data = featureFormat(my_dataset, feature_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

In [8]:
# Recursive Feature Elimination
from sklearn.linear_model import RandomizedLasso


rlasso = RandomizedLasso()
rlasso.fit(features, labels)
 
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), 
                 names), reverse=True)




Features sorted by their score:
[(1.0, 'shared_receipt_with_poi'), (0.965, 'to_messages'), (0.95, 'deferral_payments'), (0.945, 'from_this_person_to_poi'), (0.89, 'from_messages'), (0.875, 'deferred_income'), (0.815, 'director_fees'), (0.76, 'loan_advances'), (0.735, 'from_poi_to_this_person'), (0.54, 'restricted_stock_deferred'), (0.535, 'exercised_stock_options'), (0.295, 'total_stock_value'), (0.285, 'other'), (0.205, 'expenses'), (0.145, 'salary'), (0.13, 'bonus'), (0.12, 'restricted_stock'), (0.115, 'long_term_incentive'), (0.04, 'total_payments')]


In [9]:
# Rebind feature_list to the reduced subset ('poi' still first) and rebuild
# data, labels and features from it.
feature_list = [
    'poi',
    'deferral_payments',
    'deferred_income',
    'restricted_stock_deferred',
    'expenses',
    'from_this_person_to_poi',
]

# my_dataset is another name for data_dict, not a copy.
my_dataset = data_dict

data = featureFormat(my_dataset, feature_list, sort_keys=True)
labels, features = targetFeatureSplit(data)


## Task 6: Export data 
Dump the cleaned dataset to a `.pkl` file.

In [None]:
# Output file for the dataset dictionary; a relative name, so it is written to
# the current working directory.
DATASET_PICKLE_FILENAME = "cleaned_dataset.pkl"

# Pickle streams are byte streams, so the file is opened in binary mode ("wb")
# rather than text mode.
with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
    pickle.dump(data_dict, dataset_outfile)
   