In [73]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# zscore -  to identify th eoutliers
from scipy.stats import zscore
# select effective features
from sklearn.feature_selection import SelectKBest, f_classif 
## GridSearchCV - to find the best parameters
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import tree, ensemble, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans
## evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### loading data

In [74]:
enron_data_pkl = pickle.load(open('data\enron.pkl', "rb"))
df_enron = pd.DataFrame(enron_data_pkl).transpose() ## index - POI

In [75]:
df_enron.head()

Unnamed: 0,to_messages,deferral_payments,expenses,poi,long_term_incentive,email_address,from_poi_to_this_person,deferred_income,fraction_from_poi,restricted_stock_deferred,...,other,director_fees,bonus,total_stock_value,from_this_person_to_poi,restricted_stock,salary,total_payments,fraction_to_poi,exercised_stock_options
METTS MARK,807.0,0.0,94299.0,0.0,0.0,mark.metts@enron.com,38.0,0.0,0.047088,0.0,...,1740.0,0.0,600000.0,585062.0,1.0,585062.0,365788.0,1061827.0,0.034483,0.0
BAXTER JOHN C,0.0,1295738.0,11200.0,0.0,1586055.0,0.0,0.0,-1386055.0,0.0,0.0,...,2660303.0,0.0,1200000.0,10623258.0,0.0,3942714.0,267102.0,5634343.0,0.0,6680544.0
ELLIOTT STEVEN,0.0,0.0,78552.0,0.0,0.0,steven.elliott@enron.com,0.0,-400729.0,0.0,0.0,...,12961.0,0.0,350000.0,6678735.0,0.0,1788391.0,170941.0,211725.0,0.0,4890344.0
CORDES WILLIAM R,764.0,0.0,0.0,0.0,0.0,bill.cordes@enron.com,10.0,0.0,0.013089,0.0,...,0.0,0.0,0.0,1038185.0,0.0,386335.0,0.0,0.0,0.0,651850.0
HANNON KEVIN P,1045.0,0.0,34039.0,1.0,1617011.0,kevin.hannon@enron.com,32.0,-3117011.0,0.030622,0.0,...,11350.0,0.0,1500000.0,6391065.0,21.0,853064.0,243293.0,288682.0,0.65625,5538001.0


#### data cleaning

In [76]:
# data clening
df_enron = df_enron.replace("NaN", np.nan)

#### feature engineering

In [77]:
## replacing 0, for avoiding divided by 0 error
df_enron["to_messages"].replace(0, np.nan, inplace = True)
df_enron["from_messages"].replace(0, np.nan, inplace = True)
df_enron["salary"].replace(0, np.nan, inplace = True)
df_enron["total_payments"].replace(0, np.nan, inplace = True)
df_enron["total_stock_value"].replace(0, np.nan, inplace = True)

# fraction of emails received from POI
df_enron["from_poi_ratio"] = df_enron["from_poi_to_this_person"] / df_enron["to_messages"]
# fraction of email sent to POI
df_enron["to_poi_ratio"] = df_enron["from_this_person_to_poi"] / df_enron["from_messages"]
# ratio of bonous to salary for each individual or POI
df_enron["bonus_to_salary_ratio"] = df_enron["bonus"] / df_enron["salary"]
# ratio of deferral payment to total payment for each POI 
df_enron["deferral_payments_ratio"] = df_enron["deferral_payments"] / df_enron["total_payments"]
# ratio of deferral income to total payment for each POI
df_enron["deferred_income_ratio"] = df_enron["deferred_income"] / df_enron["total_payments"]
# ratio of director fees received by the individual to total payment received by the individuals
df_enron["director_fees_ratio"] = df_enron["director_fees"] / df_enron["total_payments"]
# ratio of the value of stock option excercised by the individual to total individual stock value
df_enron["exercised_stock_options_ratio"] = df_enron["exercised_stock_options"] / df_enron["total_stock_value"]
# ratio of expense by the individuals to sallary of individuals
df_enron["expenses_ratio"] = df_enron["expenses"] / df_enron["salary"]
# ratio of Long-term incentives awarded to the individual to total payment received by teh individuals
df_enron["long_term_incentive_ratio"] = df_enron["long_term_incentive"] / df_enron["total_payments"]
# ratio of other compenset and benifits received by the individuals  to the individuals salary
df_enron["other_ratio"] = df_enron["other"] / df_enron["salary"]
# ratio of total restricted stock owned by the individuals to total values of individual stock 
df_enron["restricted_stock_ratio"] = df_enron["restricted_stock"] / df_enron["total_stock_value"]
# ratio of POI salary to the total payments of POI
df_enron["salary_ratio"] = df_enron["salary"] / df_enron["total_payments"]
# ratio of shared receipt with poi to the number of emails received by the poi
df_enron["shared_receipt_with_poi_ratio"] = df_enron["shared_receipt_with_poi"] / df_enron["to_messages"]
# ratio of total payment received by the individuals to the total value of teh individual stock
df_enron["total_payments_to_stock_ratio"] = df_enron["total_payments"] / df_enron["total_stock_value"]

df_enron["from_poi_to_this_person"].fillna(0, inplace=True)
df_enron["from_this_person_to_poi"].fillna(0, inplace=True)
df_enron["bonus_to_salary_ratio"].fillna(0, inplace=True)
df_enron["deferral_payments_ratio"].fillna(0, inplace=True)
df_enron["deferred_income_ratio"].fillna(0, inplace=True)
df_enron["director_fees_ratio"].fillna(0, inplace=True)
df_enron["exercised_stock_options_ratio"].fillna(0, inplace=True)
df_enron["expenses_ratio"].fillna(0, inplace=True)
df_enron["long_term_incentive_ratio"].fillna(0, inplace=True)
df_enron["other_ratio"].fillna(0, inplace=True)
df_enron["restricted_stock_ratio"].fillna(0, inplace=True)
df_enron["salary_ratio"].fillna(0, inplace=True)
df_enron["shared_receipt_with_poi_ratio"].fillna(0, inplace=True)
df_enron["total_payments_to_stock_ratio"].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_enron["to_messages"].replace(0, np.nan, inplace = True)
  df_enron["to_messages"].replace(0, np.nan, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_enron["from_messages"].replace(0, np.nan, inplace = True)
  df_enron["from_messages"].replace(0, np.nan, inpl

#### data cleaning

In [78]:
## fill nan with 0
# df_enron.replace('Nan', 0, inplace = True) # --- not working
df_enron = df_enron.fillna(0)

  df_enron = df_enron.fillna(0)


#### Select effective features
- SelectKBest

In [79]:
## see the list of features
features_list = ['poi']
features_list = features_list + [col for col in df_enron.columns if col not in ['poi', 'email_address']]
features_list

['poi',
 'to_messages',
 'deferral_payments',
 'expenses',
 'long_term_incentive',
 'from_poi_to_this_person',
 'deferred_income',
 'fraction_from_poi',
 'restricted_stock_deferred',
 'shared_receipt_with_poi',
 'loan_advances',
 'from_messages',
 'other',
 'director_fees',
 'bonus',
 'total_stock_value',
 'from_this_person_to_poi',
 'restricted_stock',
 'salary',
 'total_payments',
 'fraction_to_poi',
 'exercised_stock_options',
 'from_poi_ratio',
 'to_poi_ratio',
 'bonus_to_salary_ratio',
 'deferral_payments_ratio',
 'deferred_income_ratio',
 'director_fees_ratio',
 'exercised_stock_options_ratio',
 'expenses_ratio',
 'long_term_incentive_ratio',
 'other_ratio',
 'restricted_stock_ratio',
 'salary_ratio',
 'shared_receipt_with_poi_ratio',
 'total_payments_to_stock_ratio']

In [80]:
X = df_enron.drop(['poi', 'email_address'], axis = 1)
y = df_enron['poi']

selector = SelectKBest(score_func = f_classif, k = 10)
X_new = selector.fit_transform(X, y)
mask = selector.get_support() 
selected_features = X.columns[mask]

In [81]:
scores = zip(features_list[1:], selector.scores_)
scores_list = sorted(scores, key=lambda x: x[1], reverse=True)
scores_list

[('exercised_stock_options', np.float64(24.815079733218194)),
 ('total_stock_value', np.float64(24.18289867856688)),
 ('bonus', np.float64(20.792252047181535)),
 ('salary', np.float64(18.289684043404513)),
 ('fraction_to_poi', np.float64(16.4097125480358)),
 ('to_poi_ratio', np.float64(16.4097125480358)),
 ('long_term_incentive_ratio', np.float64(13.85086841716765)),
 ('deferred_income', np.float64(11.458476579280369)),
 ('bonus_to_salary_ratio', np.float64(10.783584708160838)),
 ('long_term_incentive', np.float64(9.922186013189823)),
 ('restricted_stock', np.float64(9.2128106219771)),
 ('shared_receipt_with_poi_ratio', np.float64(9.10126873919354)),
 ('total_payments', np.float64(8.77277773009168)),
 ('shared_receipt_with_poi', np.float64(8.589420731682381)),
 ('loan_advances', np.float64(7.184055658288725)),
 ('expenses', np.float64(6.094173310638945)),
 ('from_poi_to_this_person', np.float64(5.243449713374958)),
 ('other', np.float64(4.187477506995375)),
 ('fraction_from_poi', np.fl

#### Find the best number of features
- DecisionTreeClassifier

In [82]:
n_features = np.arange(1, len(features_list))
n_features

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35])

In [83]:
## create a piplne -- feature selecteion and then classification
pipe = Pipeline([
    ('select_features', SelectKBest(score_func = f_classif)),
    ('calssify', tree.DecisionTreeClassifier())
])

param_grid = [
    {
        'select_features__k': n_features
    }
]

In [84]:
## use gridsearch to automate the process of finding the optimal number of features
tree_clf = GridSearchCV(pipe, param_grid = param_grid, scoring = 'f1', cv = 5)
tree_clf.fit(X, y)

tree_clf.best_params_

{'select_features__k': np.int64(25)}

###### F1 score for different list of selected features

In [85]:
tree_best_params = tree_clf.best_params_
tree_best_score =  tree_clf.best_score_
print("Best Parameters: ", tree_best_params)
print("Best Scores: ", tree_best_score)

cv_results = tree_clf.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(f"Mean F1 score: {mean_score:.3f} for parameters: {params}")

Best Parameters:  {'select_features__k': np.int64(25)}
Best Scores:  0.33095238095238094
Mean F1 score: 0.206 for parameters: {'select_features__k': np.int64(1)}
Mean F1 score: 0.147 for parameters: {'select_features__k': np.int64(2)}
Mean F1 score: 0.266 for parameters: {'select_features__k': np.int64(3)}
Mean F1 score: 0.227 for parameters: {'select_features__k': np.int64(4)}
Mean F1 score: 0.213 for parameters: {'select_features__k': np.int64(5)}
Mean F1 score: 0.233 for parameters: {'select_features__k': np.int64(6)}
Mean F1 score: 0.233 for parameters: {'select_features__k': np.int64(7)}
Mean F1 score: 0.244 for parameters: {'select_features__k': np.int64(8)}
Mean F1 score: 0.180 for parameters: {'select_features__k': np.int64(9)}
Mean F1 score: 0.133 for parameters: {'select_features__k': np.int64(10)}
Mean F1 score: 0.300 for parameters: {'select_features__k': np.int64(11)}
Mean F1 score: 0.260 for parameters: {'select_features__k': np.int64(12)}
Mean F1 score: 0.196 for paramet

#### Find the best number of features
- RandomForestClassifier

In [86]:
pipe = Pipeline([
    ('select_features', SelectKBest(score_func = f_classif)),
    ('calssify', ensemble.RandomForestClassifier(max_depth = None,
                                            min_samples_split = 2))
])

param_grid = [
    {
        'select_features__k': n_features
    }
]

In [87]:
forest_clf = GridSearchCV(pipe, param_grid = param_grid, scoring = 'f1', cv = 5)
forest_clf.fit(X, y)

forest_clf.best_params_

{'select_features__k': np.int64(17)}

###### F1 score for different list of selected features

In [88]:
forest_best_params = forest_clf.best_params_
forest_best_score =  forest_clf.best_score_
print("Best Parameters: ", forest_best_params)
print("Best Scores: ", forest_best_score)

cv_results = forest_clf.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(f"Mean F1 score: {mean_score:.3f} for parameters: {params}")

Best Parameters:  {'select_features__k': np.int64(17)}
Best Scores:  0.39333333333333337
Mean F1 score: 0.196 for parameters: {'select_features__k': np.int64(1)}
Mean F1 score: 0.167 for parameters: {'select_features__k': np.int64(2)}
Mean F1 score: 0.348 for parameters: {'select_features__k': np.int64(3)}
Mean F1 score: 0.274 for parameters: {'select_features__k': np.int64(4)}
Mean F1 score: 0.227 for parameters: {'select_features__k': np.int64(5)}
Mean F1 score: 0.327 for parameters: {'select_features__k': np.int64(6)}
Mean F1 score: 0.247 for parameters: {'select_features__k': np.int64(7)}
Mean F1 score: 0.313 for parameters: {'select_features__k': np.int64(8)}
Mean F1 score: 0.180 for parameters: {'select_features__k': np.int64(9)}
Mean F1 score: 0.313 for parameters: {'select_features__k': np.int64(10)}
Mean F1 score: 0.233 for parameters: {'select_features__k': np.int64(11)}
Mean F1 score: 0.147 for parameters: {'select_features__k': np.int64(12)}
Mean F1 score: 0.260 for paramet

###### select the number of features based on best score of both 
- forest_best_score
- tree_best_score

In [89]:
if forest_best_score > tree_best_score:
    new_feature_list = ['poi'] + [score[0] for score in scores_list][:int(forest_best_params['select_features__k'])]
else:
     new_feature_list = ['poi'] + [score[0] for score in scores_list][:int(tree_best_params['select_features__k'])]

In [90]:
new_feature_list

['poi',
 'exercised_stock_options',
 'total_stock_value',
 'bonus',
 'salary',
 'fraction_to_poi',
 'to_poi_ratio',
 'long_term_incentive_ratio',
 'deferred_income',
 'bonus_to_salary_ratio',
 'long_term_incentive',
 'restricted_stock',
 'shared_receipt_with_poi_ratio',
 'total_payments',
 'shared_receipt_with_poi',
 'loan_advances',
 'expenses',
 'from_poi_to_this_person']

###### Extract features and labels for model
- converting the dataset into the dictionary
- formating the values of feature into list --> numpy array
- split the label and features

In [91]:
data_dict = df_enron.T.to_dict()
my_dataset = data_dict

In [92]:
#my_dataset

In [93]:
## extract features and lebels for testing
def feature_format(my_dataset, new_feature_list, sort_keys = True):
    return_list = []
    keys = my_dataset.keys() ## getting the name of pOI's
    
    if sort_keys:
        keys = sorted(keys)
        
    for key in keys:
        temp_list = []
        for feature in new_feature_list:
            try:
                value = my_dataset[key][feature]
                #if value == "NaN" an
                temp_list.append(float(value))
            except KeyError:
                print(f"Error: key {feature} not present")
                return
        return_list.append(np.array(temp_list))

    return np.array(return_list)

In [94]:
def lebel_feature_split(data):
    labels = []
    features = []

    for item in data:
        labels.append(item[1:])
        features.append(item[0])
    return labels, features

In [95]:
data = feature_format(my_dataset, new_feature_list, sort_keys = True)
features, labels = lebel_feature_split(data)

In [96]:
#features

In [97]:
#labels

#### Transform features by scaling each features to a given range

In [98]:
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)

#### Try different models and test the accuracy score
- GausNB
- RandomForestClassifier
- DecisionTreeClassifier
- LogisticRegression
- KMeans
- XGBClassifier

In [99]:
df = pd.DataFrame()
n_features = len(new_feature_list) - 1
n_features

17

In [100]:
def cross_validate(classifier, model):
    accuracy_score = cross_val_score(classifier, features, labels, cv = 5, scoring = 'accuracy')

    precision_score = cross_val_score(classifier, features, labels, cv = 5, scoring = 'precision')

    recall_score = cross_val_score(classifier, features, labels, cv = 5, scoring = 'recall')

    f1_score = cross_val_score(classifier, features, labels, cv = 5, scoring = 'f1')

    accuracy = '%0.2f (+/- %0.2f)' % (accuracy_score.mean(),accuracy_score.std() * 2)

    precision = '%0.2f (+/- %0.2f)' % (precision_score.mean(),precision_score.std() * 2)

    recall = '%0.2f (+/- %0.2f)' % (recall_score.mean(),recall_score.std() * 2)

    f1 = '%0.2f (+/- %0.2f)' % (f1_score.mean(),recall_score.std() * 2)

    return pd.DataFrame(index=[model], data={'Accuracy': [accuracy],'Precision': [precision],'Recall': [recall], 'F1': [f1]})
  

In [101]:
# GaussianNB
classifier1 = GaussianNB()
df1 = cross_validate(classifier1, 'GaussianNB')
df1

Unnamed: 0,Accuracy,Precision,Recall,F1
GaussianNB,0.84 (+/- 0.07),0.26 (+/- 0.43),0.30 (+/- 0.53),0.27 (+/- 0.53)


In [102]:
# RandomForestClassifier
classifier1 = RandomForestClassifier()
df2 = cross_validate(classifier1, 'RandomForestClassifier')
df2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
RandomForestClassifier,0.87 (+/- 0.07),0.40 (+/- 0.75),0.25 (+/- 0.49),0.28 (+/- 0.49)


In [103]:
# DecisionTreeClassifier
classifier1 = DecisionTreeClassifier()
df3 = cross_validate(classifier1, 'DecisionTreeClassifier')
df3

Unnamed: 0,Accuracy,Precision,Recall,F1
DecisionTreeClassifier,0.83 (+/- 0.06),0.13 (+/- 0.22),0.27 (+/- 0.45),0.22 (+/- 0.45)


In [104]:
# LogisticRegression
classifier1 = LogisticRegression()
df4 = cross_validate(classifier1, 'LogisticRegression')
df4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
LogisticRegression,0.87 (+/- 0.07),0.30 (+/- 0.80),0.12 (+/- 0.29),0.17 (+/- 0.29)


In [105]:
# KMeans
classifier1 = KMeans()
df5 = cross_validate(classifier1, 'KMeans')
df5

Traceback (most recent call last):
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\utils\_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\metrics\_classification.py", line 2190, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\utils\_param_validation.py", line 186, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\looka\OneDrive\Documen

Unnamed: 0,Accuracy,Precision,Recall,F1
KMeans,0.28 (+/- 0.39),nan (+/- nan),nan (+/- nan),nan (+/- nan)


In [106]:
## XGBClassifier
classifier1 = XGBClassifier()
df6 = cross_validate(classifier1, 'XGBClassifier')
df6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
XGBClassifier,0.86 (+/- 0.11),0.51 (+/- 0.83),0.23 (+/- 0.24),0.29 (+/- 0.24)


#### Get the best hyperparameters for dicisiontree and RnadomForest after tuning with the hyperparameter, check the score again

##### DECISIONTREE

In [107]:
classifier = tree.DecisionTreeClassifier()
## define the configuration of parameters to test  with the DecisionTreeClassifier
param_grid = {'criterion': ['gini', 'entropy'],
              'min_samples_split': [2, 4, 6, 8, 10, 20],
              'max_depth': [None, 5, 10, 15, 20],
              'max_features': [None, 'sqrt', 'log2', 'auto']}

In [108]:
## use gridsearchCV for the optimal hyperparameters for the decision tree
tree_clf = GridSearchCV(classifier, param_grid = param_grid, scoring = 'f1', cv = 5)
tree_clf.fit(features, labels)
# get the best algorithm hyperparameters for the decision tree
tree_clf.best_params_

300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\looka\OneDrive\Documents\ML\enron_model\enron_env\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\looka\OneDrive\Documents\ML\enron_

{'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_split': 10}

In [109]:
tree_clf.best_estimator_

In [110]:
# A Dataframe to use in the reporting
df_final = cross_validate(tree_clf.best_estimator_, 'Tuned DecisionTreeClassifier')
df_final

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1
Tuned DecisionTreeClassifier,0.88 (+/- 0.09),0.14 (+/- 0.39),0.28 (+/- 0.08),0.30 (+/- 0.08)


###### RANDOMFOREST

In [111]:
clf3 = RandomForestClassifier(max_depth = None, min_samples_split = 2)

In [112]:
param_grid = {"n_estimators": [9,18,27,36],
             "max_depth": [None, 1, 5, 10, 15],
             "min_samples_leaf": [1, 2, 4, 6]}

In [113]:
forest_clf = GridSearchCV(clf3, param_grid = param_grid, scoring = 'f1', cv = 5)
forest_clf.fit(features, labels)

In [114]:
forest_clf.best_params_

{'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 9}

In [115]:
# A Dataframe to use in the reporting
df_final = cross_validate(forest_clf.best_estimator_, 'Tuned RandomForestClassifier')
df_final

Unnamed: 0,Accuracy,Precision,Recall,F1
Tuned RandomForestClassifier,0.90 (+/- 0.04),0.47 (+/- 0.65),0.23 (+/- 0.24),0.41 (+/- 0.24)


#### Using RandomForestClassifier for final model

In [116]:
final_classifier = forest_clf.best_estimator_
final_classifier

#### make dataset with the selected features
- Split
- test
- evaluate

In [117]:
enron_selected_features = df_enron[new_feature_list]

In [118]:
X = enron_selected_features.drop('poi', axis = 1)
y = enron_selected_features['poi']

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [120]:
final_classifier.fit(X_train, y_train)

In [121]:
y_pred = final_classifier.predict(X_test)

In [122]:
y_pred

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])

###### evaluate

In [123]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Detailed classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94        24
         1.0       0.75      0.60      0.67         5

    accuracy                           0.90        29
   macro avg       0.83      0.78      0.80        29
weighted avg       0.89      0.90      0.89        29

Confusion Matrix:
[[23  1]
 [ 2  3]]


In [126]:
sample_input = np.array([[100000, 50000, 1000, 2000, 10000, 300, 10000, 400, 300, 100, 100, 100, 100, 100, 100, 100, 100]])
prediction = final_classifier.predict(sample_input)
prediction



array([0.])

In [127]:
with open('pipeline.pickle','wb') as f:
    pickle.dump(pipe, f)
    
with open('pipeline.pickle', 'rb') as f:
    loaded_pipe = pickle.load(f)

In [None]:
['poi',
 'exercised_stock_options',
 'total_stock_value',
 'bonus',
 'salary',
 'fraction_to_poi',
 'to_poi_ratio',
 'long_term_incentive_ratio',
 'deferred_income',
 'bonus_to_salary_ratio',
 'long_term_incentive',
 'restricted_stock',
 'shared_receipt_with_poi_ratio',
 'total_payments',
 'shared_receipt_with_poi',
 'loan_advances',
 'expenses',
 'from_poi_to_this_person']