# Importing necessary libs

In [3]:
#!pip install mixed-naive-bayes

In [1]:
# basic libs
import pandas as pd
import numpy as np

# show all available columns
pd.set_option('display.max_columns', 200)
# show all available rows
pd.set_option('display.max_rows', 200)

# Data Prep
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from category_encoders import TargetEncoder

# import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB

# ensemble models 
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, StackingClassifier, ExtraTreesClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

# neural network model
from sklearn.neural_network import MLPClassifier

# otimization features
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

# Validation and metrics
from sklearn.model_selection import cross_val_predict, learning_curve, KFold, LeaveOneOut, cross_validate, validation_curve
from sklearn.metrics import precision_recall_curve, log_loss, make_scorer
from sklearn.metrics import auc, confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, f1_score, fbeta_score

# statistics
from scipy.stats import loguniform, uniform

# models
import statsmodels.formula.api as smf
import statsmodels.api as sm

# warnig treatments
import warnings
warnings.filterwarnings('ignore')

# Charging and dividing DataFrame 

In [2]:
# Charging dataframe
PATH = 'D:\DataScience\Python\Jupyter\Desafio05\Data\Processed'
FILE = '\Default_Credit_Card_processed_toModel.csv'

df_credit_card = pd.read_csv(PATH + FILE)

In [3]:
# Defining Target constraint
TARGET = 'Default'

# Sharing training and testing data
df_train, df_test = train_test_split(df_credit_card, stratify=df_credit_card[TARGET], test_size=0.2, random_state=42)

# Isolating target variable
X_train = df_train.drop(TARGET, axis=1)
y_train = df_train[TARGET]

X_test = df_test.drop(TARGET, axis=1)
y_test = df_test[TARGET]

# Defining the pipeline strategy of variables 

In [4]:
df_credit_card.head(2)

Unnamed: 0,Default,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,score
0,0,160000,2,2,2,33,2,2,3,2,0,0,161771,172632,168541,164310,15000,0,0,6100,12300,6100,-0.50401
1,0,150000,2,1,2,34,1,-1,-1,-2,-2,-2,0,53,0,0,53,0,0,0,0,0,-0.381528


In [5]:
df_credit_card['score'].unique()

array([-0.50401004, -0.38152765, -0.3779966 , ..., -0.46404359,
       -0.39293201, -0.43378021])

In [54]:
for col in df_credit_card[['SEX', 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]:
    print(df_credit_card[col].unique())

[2 1]
[2 1 3 4]
[ 2  1  0 -1  3 -2  4  6  5  8  7]
[ 2 -1  0 -2  3  1  4  5  7  6  8]
[ 3 -1  0  2 -2  4  7  6  5  1  8]
[ 2 -2  0 -1  3  7  4  5  1  8  6]
[ 0 -2 -1  2  7  5  4  3  6  8]
[ 0 -2  2 -1  6  4  3  7  5  8]


# Data preparation

In [7]:
# changing objects X_Train
X_train['SEX'] = X_train['SEX'].astype('object')
X_train['MARRIAGE'] = X_train['MARRIAGE'].astype('object')
X_train['EDUCATION'] = X_train['EDUCATION'].astype('object')
X_train['PAY_0'] = X_train['PAY_0'].astype('object')
X_train['PAY_2'] = X_train['PAY_2'].astype('object')
X_train['PAY_3'] = X_train['PAY_3'].astype('object')
X_train['PAY_4'] = X_train['PAY_4'].astype('object')
X_train['PAY_5'] = X_train['PAY_5'].astype('object')
X_train['PAY_6'] = X_train['PAY_6'].astype('object')

In [8]:
# changing objects X_test
X_test['SEX'] = X_test['SEX'].astype('object')
X_test['MARRIAGE'] = X_test['MARRIAGE'].astype('object')
X_test['EDUCATION'] = X_test['EDUCATION'].astype('object')
X_test['PAY_0'] = X_test['PAY_0'].astype('object')
X_test['PAY_2'] = X_test['PAY_2'].astype('object')
X_test['PAY_3'] = X_test['PAY_3'].astype('object')
X_test['PAY_4'] = X_test['PAY_4'].astype('object')
X_test['PAY_5'] = X_test['PAY_5'].astype('object')
X_test['PAY_6'] = X_test['PAY_5'].astype('object')

In [9]:
# Defining pipes
vars_minmax = ['AGE']
vars_encs = ['SEX', 'MARRIAGE']
vars_cats = ['EDUCATION']
vars_targ = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
vars_stds = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4',
             'PAY_AMT5', 'PAY_AMT6', 'score']

# Charging pipelines
pipe_num_min_max = ('min_max_scaler', MinMaxScaler(), vars_minmax)
pipe_cat_one_hot = ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), vars_encs)
pipe_cat_ordinal = ('ordinal_encoder', OrdinalEncoder(), vars_cats)
pipe_cat_target = ('target_encoder', TargetEncoder(), vars_targ)
pipe_num_stds = ('standard_scaler', StandardScaler(), vars_stds)

In [10]:
# Elaborating transformer flow 
transformers = [pipe_num_min_max,
                pipe_cat_one_hot,
                pipe_cat_ordinal,
                pipe_cat_target,
                pipe_num_stds]
pre_processador = ColumnTransformer(transformers)

# Evaluating Models

In [36]:
import sys
sys.path.append('../../src/modeling')

# import lib Model_Evaluation that has execute model and ranking model
# excute model is a function that runs the basic models in order to future decision
# ranking model elaborates a dataframe that has an ordernation from the best model to the worst one
import Model_Evaluation as me

In [52]:
# executing the basic models 

# executing logistic regression
model_reglog = me.execute_model(LogisticRegression(random_state=123), \
                             pre_processador, 'Logistic_Regression', X_train, y_train, X_test, y_test)

# executing Support Vector Machine
model_svm = me.execute_model(SVC(kernel='rbf', probability=True, random_state=123), pre_processador, 'Support_Vector_Mac', \
                             X_train, y_train, X_test, y_test)

# executing Mixed Gaussian Naive Bayes
model_gnb = me.execute_model(MixedNB(), pre_processador, 'Gaussian_NB', X_train, y_train, X_test, y_test)

# executing Decision Tree
model_dt = me.execute_model(DecisionTreeClassifier(random_state=123), pre_processador, 'Decision_Tree', \
                            X_train, y_train, X_test, y_test)

# executing Random Forest
model_ranfor = me.execute_model(RandomForestClassifier(random_state=123), pre_processador, 'Random_Forest',\
                            X_train, y_train, X_test, y_test)

# executing AdaBoost
model_adb = me.execute_model(AdaBoostClassifier(n_estimators=50, algorithm='SAMME.R', learning_rate=0.8, random_state=123), \
                             pre_processador, 'AdaBoost', X_train, y_train, X_test, y_test)

# executing Extra Trees
model_xtr = me.execute_model(ExtraTreesClassifier(n_estimators=50, random_state=123), pre_processador, 'Extra_Trees',\
                             X_train, y_train, X_test, y_test)

# executing Multi layer perceptron
model_MLPerp = me.execute_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1), \
                          pre_processador, 'MLPerceptron', X_train, y_train, X_test, y_test)

# executing XGBoost
model_XGBoos = me.execute_model(XGBClassifier(), pre_processador, 'XGBoost', X_train, y_train, X_test, y_test)

# executing CatBoost
model_cast = me.execute_model(CatBoostClassifier(iterations=50,
                             learning_rate=0.02,
                             depth=3,
                             eval_metric='AUC',
                             random_seed = 123,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             od_wait=100), pre_processador, 'CatBoost', X_train, y_train, X_test, y_test)

# executing Light GBM Boos
model_lgb = me.execute_model(lgb.LGBMClassifier(num_iterations=50,
                                             num_leaves=5,
                                             min_data_in_leaf=1,
                                             max_depth=3,
                                             bagging_fraction=0.2,
                                             max_bin=3,
                                             random_state=123), pre_processador, 'lgbBoost', 
                             X_train, y_train, X_test, y_test ) 


[]
0:	total: 5.29ms	remaining: 259ms
1:	total: 10.7ms	remaining: 256ms
2:	total: 15.9ms	remaining: 249ms
3:	total: 21.9ms	remaining: 251ms
4:	total: 27ms	remaining: 243ms
5:	total: 32.3ms	remaining: 237ms
6:	total: 37.2ms	remaining: 228ms
7:	total: 42ms	remaining: 221ms
8:	total: 46.8ms	remaining: 213ms
9:	total: 51.8ms	remaining: 207ms
10:	total: 57.4ms	remaining: 204ms
11:	total: 63ms	remaining: 199ms
12:	total: 68ms	remaining: 194ms
13:	total: 73.4ms	remaining: 189ms
14:	total: 78.6ms	remaining: 183ms
15:	total: 83.8ms	remaining: 178ms
16:	total: 89ms	remaining: 173ms
17:	total: 94.3ms	remaining: 168ms
18:	total: 99.6ms	remaining: 163ms
19:	total: 105ms	remaining: 157ms
20:	total: 110ms	remaining: 152ms
21:	total: 116ms	remaining: 147ms
22:	total: 121ms	remaining: 142ms
23:	total: 126ms	remaining: 137ms
24:	total: 131ms	remaining: 131ms
25:	total: 136ms	remaining: 126ms
26:	total: 141ms	remaining: 120ms
27:	total: 146ms	remaining: 115ms
28:	total: 152ms	remaining: 110ms
29:	total: 1

0:	total: 6.59ms	remaining: 323ms
1:	total: 11.7ms	remaining: 280ms
2:	total: 16.4ms	remaining: 258ms
3:	total: 22.2ms	remaining: 255ms
4:	total: 27.2ms	remaining: 245ms
5:	total: 33ms	remaining: 242ms
6:	total: 38.6ms	remaining: 237ms
7:	total: 44.1ms	remaining: 232ms
8:	total: 49.4ms	remaining: 225ms
9:	total: 54.4ms	remaining: 218ms
10:	total: 59.7ms	remaining: 212ms
11:	total: 65.2ms	remaining: 206ms
12:	total: 70.8ms	remaining: 201ms
13:	total: 76.5ms	remaining: 197ms
14:	total: 82.3ms	remaining: 192ms
15:	total: 87.5ms	remaining: 186ms
16:	total: 93ms	remaining: 181ms
17:	total: 99.8ms	remaining: 177ms
18:	total: 106ms	remaining: 173ms
19:	total: 112ms	remaining: 168ms
20:	total: 117ms	remaining: 162ms
21:	total: 123ms	remaining: 157ms
22:	total: 128ms	remaining: 151ms
23:	total: 134ms	remaining: 145ms
24:	total: 139ms	remaining: 139ms
25:	total: 145ms	remaining: 134ms
26:	total: 151ms	remaining: 128ms
27:	total: 156ms	remaining: 123ms
28:	total: 162ms	remaining: 117ms
29:	total:

0:	total: 5.99ms	remaining: 293ms
1:	total: 11.6ms	remaining: 278ms
2:	total: 16.3ms	remaining: 255ms
3:	total: 21.4ms	remaining: 246ms
4:	total: 26.6ms	remaining: 239ms
5:	total: 31.7ms	remaining: 232ms
6:	total: 36.6ms	remaining: 225ms
7:	total: 41.4ms	remaining: 217ms
8:	total: 46.8ms	remaining: 213ms
9:	total: 52.2ms	remaining: 209ms
10:	total: 57.6ms	remaining: 204ms
11:	total: 63.3ms	remaining: 200ms
12:	total: 69.1ms	remaining: 197ms
13:	total: 74.5ms	remaining: 192ms
14:	total: 79.8ms	remaining: 186ms
15:	total: 85.1ms	remaining: 181ms
16:	total: 90.5ms	remaining: 176ms
17:	total: 95.9ms	remaining: 170ms
18:	total: 102ms	remaining: 167ms
19:	total: 108ms	remaining: 162ms
20:	total: 114ms	remaining: 157ms
21:	total: 119ms	remaining: 152ms
22:	total: 125ms	remaining: 146ms
23:	total: 130ms	remaining: 141ms
24:	total: 136ms	remaining: 136ms
25:	total: 141ms	remaining: 130ms
26:	total: 148ms	remaining: 126ms
27:	total: 154ms	remaining: 121ms
28:	total: 160ms	remaining: 116ms
29:	to

In [53]:
models = [model_reglog, model_svm, model_gnb, model_dt, model_ranfor, model_adb, model_xtr, model_MLPerp,
          model_XGBoos, model_cast, model_lgb]
df_model_results = me.ranking_models(models)
df_model_results

Unnamed: 0,Model,Score_Train,Score_Test,Auc_Roc,Log_loss_Train,Log_loss_Test,Mean_Acc_Score,Std_Acc_Score,Model_Run,Diff_log_loss,Status
7,MLPerceptron,82.0,82.18,77.6,0.431045,0.428954,81.93,0.63,(ColumnTransformer(transformers=[('min_max_sca...,-0.002091,Normal
0,Logistic_Regression,82.02,82.15,76.73,0.438033,0.437253,81.98,0.68,(ColumnTransformer(transformers=[('min_max_sca...,-0.00078,Normal
8,XGBoost,88.37,81.42,94.45,0.280217,0.444547,81.39,0.77,(ColumnTransformer(transformers=[('min_max_sca...,0.16433,Overfit
4,Random_Forest,99.94,82.2,100.0,0.113638,0.445196,81.39,0.59,(ColumnTransformer(transformers=[('min_max_sca...,0.331559,Overfit
10,lgbBoost,80.1,80.62,75.24,0.453577,0.446746,80.06,0.52,(ColumnTransformer(transformers=[('min_max_sca...,-0.006831,Normal
1,Support_Vector_Mac,82.01,82.02,75.77,0.447135,0.448496,81.8,0.54,(ColumnTransformer(transformers=[('min_max_sca...,0.001361,Normal
9,CatBoost,81.95,82.0,76.61,0.470352,0.467904,81.92,0.61,(ColumnTransformer(transformers=[('min_max_sca...,-0.002448,Normal
6,Extra_Trees,99.94,81.25,100.0,0.000888,0.565823,80.5,0.74,(ColumnTransformer(transformers=[('min_max_sca...,0.564935,Overfit
5,AdaBoost,81.88,82.07,78.31,0.677516,0.677522,81.73,0.69,(ColumnTransformer(transformers=[('min_max_sca...,6e-06,Normal
2,Gaussian_NB,76.51,77.4,67.0,1.180029,1.611133,,,(ColumnTransformer(transformers=[('min_max_sca...,0.431105,Normal


In [21]:
df_model_results.sort_values(by = ["Log_loss_Train", "Mean_Acc_Score", "Auc_Roc", "Log_loss_Test"],
                ascending = [True, False, False, True])

Unnamed: 0,Model,Score_Train,Score_Test,Auc_Roc,Log_loss_Train,Log_loss_Test,Mean_Acc_Score,Std_Acc_Score,Model_Run,Diff_log_loss,Status
6,Extra_Trees,99.94,81.25,100.0,0.000888,0.565823,80.5,0.74,(ColumnTransformer(transformers=[('min_max_sca...,0.564935,Overfit
3,Decision_Tree,99.94,72.82,100.0,0.000888,9.377971,72.53,0.91,(ColumnTransformer(transformers=[('min_max_sca...,9.377083,Overfit
4,Random_Forest,99.94,82.2,100.0,0.113638,0.445196,81.39,0.59,(ColumnTransformer(transformers=[('min_max_sca...,0.331559,Overfit
8,XGBoost,88.37,81.42,94.45,0.280217,0.444547,81.39,0.77,(ColumnTransformer(transformers=[('min_max_sca...,0.16433,Overfit
7,MLPerceptron,82.0,82.18,77.6,0.431045,0.428954,81.93,0.63,(ColumnTransformer(transformers=[('min_max_sca...,-0.002091,Normal
0,Logistic_Regression,82.02,82.15,76.73,0.438033,0.437253,81.98,0.68,(ColumnTransformer(transformers=[('min_max_sca...,-0.00078,Normal
1,Support_Vector_Mac,82.01,82.02,75.77,0.447135,0.448496,81.8,0.54,(ColumnTransformer(transformers=[('min_max_sca...,0.001361,Normal
10,lgbBoost,80.1,80.62,75.24,0.453577,0.446746,80.06,0.52,(ColumnTransformer(transformers=[('min_max_sca...,-0.006831,Normal
9,CatBoost,81.95,82.0,76.61,0.470352,0.467904,81.92,0.61,(ColumnTransformer(transformers=[('min_max_sca...,-0.002448,Normal
5,AdaBoost,81.88,82.07,78.31,0.677516,0.677522,81.73,0.69,(ColumnTransformer(transformers=[('min_max_sca...,6e-06,Normal


# Chosen models Optimization

In [15]:
# Extra Trees, Random Forest, XGBoost and MLPerceptron
# afterwards: run voting and stacking in order to check better perfomance

In [16]:
# stratified kfold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

In [17]:
# executing Random Forest optimization
model_rf = Pipeline(steps=[('pre_processor', pre_processador), ('model', RandomForestClassifier(random_state=123))])

# defining hyperparameters
criterion = ["gini", "entropy"]
min_samples_leaf = [1, 5, 10,]
min_samples_split = [2, 4, 10,]
n_estimators = [10, 20, 30, 40, 50, 60, 100, 120, 150, 200, 250, 300, 500,]
max_depth = [2, 4, 6, 8, 10, 12, 20,]
param_grid = dict(model__criterion = criterion, \
                  model__min_samples_leaf = min_samples_leaf, \
                  model__min_samples_split = min_samples_split, \
                  model__n_estimators = n_estimators, \
                  model__max_depth = max_depth)

# finding best parameters
grid_rf = RandomizedSearchCV(model_rf, param_grid, n_iter=100, cv=skf, scoring='accuracy', \
                             verbose=1, random_state=123, n_jobs=-1)

# training best parameters
grid_rf.fit(X_train, y_train)

rf_best = grid_rf.best_estimator_ # saving the best hyperparameters definition

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [18]:
# executing Extra Trees optimization
model_extratrees = Pipeline(steps=[('pre_processor', pre_processador), ('model', ExtraTreesClassifier(random_state=123))])

# defining hyperparameters
criterion = ["gini", "entropy"]
min_samples_leaf = [1, 3, 5, 8, 10,]
min_samples_split = [1, 2, 3, ]
n_estimators = [10, 20, 30, 40, 50, 60, 100, ]
max_depth = [2, 4, 6, ]
max_features = ["auto", "sqrt", "log2"]
bootstrap = [True, False]
warm_start = [True, False]
max_samples = [0, 1, 2, 3, 4, 5]
n_iter = [50, 100, 150]
param_grid = dict(model__criterion = criterion, \
                  model__min_samples_leaf = min_samples_leaf, \
                  model__min_samples_split = min_samples_split, \
                  model__n_estimators = n_estimators, \
                  model__max_depth = max_depth, \
                  model__max_features = max_features, \
                  model__bootstrap = bootstrap, \
                  model__warm_start = warm_start, \
                  model__max_samples = max_samples)

# finding best parameters
grid_xtree = RandomizedSearchCV(model_extratrees, param_grid, n_iter=100, cv=skf, scoring='accuracy', \
                             verbose=1, random_state=123, n_jobs=-1)

# training best parameters
grid_xtree.fit(X_train, y_train)

xtree_best = grid_xtree.best_estimator_ # saving the best hyperparameters definition

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [24]:
# executing Multi layer perceptron optimization
model_MLP = Pipeline(steps=[('pre_processor', pre_processador),
                ('model', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) )])

# defining hyperparameters
hidden_layer_sizes = [50, 100, 150, 200, 250, 300, ]
activation = ['tanh']
solver = ['lbfgs']
alpha = loguniform(1e-2, 1)
batch_size = [50, 100, 150, 200, 250, 300, ]
learning_rate=['adaptive']
max_iter = [10, 20, 30, 40, 50, 60, 100, 120, 150, 200, 250, 300, 350, 400, 500,]
max_fun = [1000, 1500, 2000, 2500]
n_iter_no_change = [2, 4, 6, 8, 10, 12, 20,]
param_grid = dict(model__hidden_layer_sizes = hidden_layer_sizes, \
                  model__activation = activation, \
                  model__solver = solver, \
                  model__alpha = alpha, \
                  model__batch_size = batch_size, \
                  model__learning_rate = learning_rate, \
                  model__max_iter = max_iter, \
                  model__max_fun = max_fun, \
                  model__n_iter_no_change = n_iter_no_change)

# finding best parameters
grid_MLP = RandomizedSearchCV(model_MLP, param_grid,  n_iter=10, cv=skf, scoring='accuracy', \
                              verbose=1, random_state=123, n_jobs=-1)

# training best parameters
grid_MLP.fit(X_train, y_train)

MLP_best = grid_MLP.best_estimator_ # saving the best hyperparameters definition

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [28]:
# executing XGBoost optimization
model_xgb = Pipeline(steps=[('pre_processor', pre_processador), ('model', XGBClassifier() )])


# definition of hyper parameters for XGBoost Classifier
learning_rate = [0.1, 0.01, 0.001]
gamma = [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2]
max_depth = [2, 4, 6, 8, 10, 12, 20, 30]
colsample_bytree = [0.3, 0.6, 0.8, 1.0]
subsample = [0.2, 0.4, 0.5, 0.6, 0.7]
reg_alpha = [0, 0.5, 1]
reg_lambda =  [1, 1.5, 2, 3, 4.5]
min_child_weight = [1, 3, 5, 7]
n_estimators = [10, 20, 30, 40, 50, 60, 100, 120, 150, 200, 250, 300, 500]

param_grid = dict(model__learning_rate = learning_rate, \
                  model__gamma = gamma, \
                  model__colsample_bytree = colsample_bytree, \
                  model__n_estimators = n_estimators, \
                  model__max_depth = max_depth, \
                  model__subsample = subsample, \
                  model__reg_alpha = reg_alpha, \
                  model__reg_lambda = reg_lambda, \
                  model__min_child_weight = min_child_weight)

# finding best parameters
grid_xgb = RandomizedSearchCV(model_xgb, param_grid,  n_iter=100, cv=skf, scoring='accuracy', \
                             verbose=1, random_state=123, n_jobs=-1)

# training best parameters
grid_xgb.fit(X_train, y_train)

xgb_best = grid_xgb.best_estimator_ # saving the best hyperparameters definition

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [43]:
# running models with the best hyperparameters and getting measures

# rf_best
rf_best_params = measure_best_model(rf_best, pre_processador, 'RF_Best', X_train, y_train, X_test, y_test)

# MLP_best
MLP_best_params = measure_best_model(MLP_best, pre_processador, 'MLP_Best', X_train, y_train, X_test, y_test)

# Xtree_best
xtree_best_params = measure_best_model(xtree_best, pre_processador, 'XTREE_Best', X_train, y_train, X_test, y_test)

# XGB_best
xgb_best_params = measure_best_model(xgb_best, pre_processador, 'XGB_Best', X_train, y_train, X_test, y_test)

# Ranking peformances of the best models chosen
models = [rf_best_params, MLP_best_params, xtree_best_params, xgb_best_params]
df_model_results_best = me.ranking_models(models)
df_model_results_best



Unnamed: 0,Model,Score_Train,Score_Test,Auc_Roc,Log_loss_Train,Log_loss_Test,Mean_Acc_Score,Std_Acc_Score,Model_Run,Diff_log_loss,Status
0,RF_Best,84.65,82.38,86.13,0.371818,0.422352,82.08,0.7,(ColumnTransformer(transformers=[('min_max_sca...,0.050535,Normal
1,MLP_Best,81.96,82.03,77.11,0.435276,0.433801,81.9,0.67,(ColumnTransformer(transformers=[('min_max_sca...,-0.001475,Normal
2,XTREE_Best,82.01,81.87,77.28,0.437387,0.438278,81.7,0.72,(ColumnTransformer(transformers=[('min_max_sca...,0.000891,Normal
3,XGB_Best,82.98,82.15,79.69,0.643485,0.644737,82.1,0.78,(ColumnTransformer(transformers=[('min_max_sca...,0.001252,Normal


In [47]:
# Combing Models:
models = [ ('MLP', MLP_best), ('rf', rf_best), ('xtree', xtree_best), ('xgb', xgb_best) ]

In [48]:
# training voting classifier with the best models optimized
model_vote = VotingClassifier(estimators=models, voting='soft')
model_vote.fit(X_train, y_train)



VotingClassifier(estimators=[('MLP',
                              Pipeline(steps=[('pre_processor',
                                               ColumnTransformer(transformers=[('min_max_scaler',
                                                                                MinMaxScaler(),
                                                                                ['AGE']),
                                                                               ('one_hot_encoder',
                                                                                OneHotEncoder(handle_unknown='ignore'),
                                                                                ['SEX',
                                                                                 'MARRIAGE']),
                                                                               ('ordinal_encoder',
                                                                                OrdinalEncoder(),
                

In [49]:
# training stacking classifier with the best models optimized
model_stack = StackingClassifier(estimators=models)
model_stack.fit(X_train, y_train)



StackingClassifier(estimators=[('MLP',
                                Pipeline(steps=[('pre_processor',
                                                 ColumnTransformer(transformers=[('min_max_scaler',
                                                                                  MinMaxScaler(),
                                                                                  ['AGE']),
                                                                                 ('one_hot_encoder',
                                                                                  OneHotEncoder(handle_unknown='ignore'),
                                                                                  ['SEX',
                                                                                   'MARRIAGE']),
                                                                                 ('ordinal_encoder',
                                                                                  OrdinalEncod

In [50]:
# Ranking peformances of the best models chosen, including voting and stacking


# Vote_parameters
vote_best_params = measure_best_model(model_vote, pre_processador, 'Vote_Best', X_train, y_train, X_test, y_test)

# Stack_parameters
stack_best_params = measure_best_model(model_stack, pre_processador, 'Stack_Best', X_train, y_train, X_test, y_test)

# Ranking peformances of the best models chosen, including voting and stacking
models = [rf_best_params, MLP_best_params, xtree_best_params, xgb_best_params, vote_best_params, stack_best_params]
df_model_results_best = me.ranking_models(models)








Unnamed: 0,Model,Score_Train,Score_Test,Auc_Roc,Log_loss_Train,Log_loss_Test,Mean_Acc_Score,Std_Acc_Score,Model_Run,Diff_log_loss,Status
0,RF_Best,84.65,82.38,86.13,0.371818,0.422352,82.08,0.7,(ColumnTransformer(transformers=[('min_max_sca...,0.050535,Normal
5,Stack_Best,84.45,82.28,85.76,0.378047,0.425592,82.11,0.68,"StackingClassifier(estimators=[('MLP',\n ...",0.047545,Normal
1,MLP_Best,81.96,82.03,77.11,0.435276,0.433801,81.9,0.67,(ColumnTransformer(transformers=[('min_max_sca...,-0.001475,Normal
2,XTREE_Best,82.01,81.87,77.28,0.437387,0.438278,81.7,0.72,(ColumnTransformer(transformers=[('min_max_sca...,0.000891,Normal
4,Vote_Best,83.02,82.25,81.91,0.441866,0.453459,82.07,0.77,"VotingClassifier(estimators=[('MLP',\n ...",0.011594,Normal
3,XGB_Best,82.98,82.15,79.69,0.643485,0.644737,82.1,0.78,(ColumnTransformer(transformers=[('min_max_sca...,0.001252,Normal


In [51]:
df_model_results_best

Unnamed: 0,Model,Score_Train,Score_Test,Auc_Roc,Log_loss_Train,Log_loss_Test,Mean_Acc_Score,Std_Acc_Score,Model_Run,Diff_log_loss,Status
0,RF_Best,84.65,82.38,86.13,0.371818,0.422352,82.08,0.7,(ColumnTransformer(transformers=[('min_max_sca...,0.050535,Normal
5,Stack_Best,84.45,82.28,85.76,0.378047,0.425592,82.11,0.68,"StackingClassifier(estimators=[('MLP',\n ...",0.047545,Normal
1,MLP_Best,81.96,82.03,77.11,0.435276,0.433801,81.9,0.67,(ColumnTransformer(transformers=[('min_max_sca...,-0.001475,Normal
2,XTREE_Best,82.01,81.87,77.28,0.437387,0.438278,81.7,0.72,(ColumnTransformer(transformers=[('min_max_sca...,0.000891,Normal
4,Vote_Best,83.02,82.25,81.91,0.441866,0.453459,82.07,0.77,"VotingClassifier(estimators=[('MLP',\n ...",0.011594,Normal
3,XGB_Best,82.98,82.15,79.69,0.643485,0.644737,82.1,0.78,(ColumnTransformer(transformers=[('min_max_sca...,0.001252,Normal


In [45]:
rf_best

Pipeline(steps=[('pre_processor',
                 ColumnTransformer(transformers=[('min_max_scaler',
                                                  MinMaxScaler(), ['AGE']),
                                                 ('one_hot_encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['SEX', 'MARRIAGE']),
                                                 ('ordinal_encoder',
                                                  OrdinalEncoder(),
                                                  ['EDUCATION']),
                                                 ('target_encoder',
                                                  TargetEncoder(),
                                                  ['PAY_0', 'PAY_2', 'PAY_3',
                                                   'PAY_4', 'PAY_5', 'PAY_6']),
                                                 ('standard_scaler',
                          

>> Conclusion: optimized random forest resulted in the best measurements