In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import average_precision_score

import pycaret
from pycaret.classification import *
from infra_utils import get_raw_data

In [2]:
# Relative location of the credit-card CSV, assembled with the platform's
# own path separator so the notebook is not tied to one operating system.
data_file = os.path.join('dataset', 'creditcard.csv')
data_file

'dataset\\creditcard.csv'

In [3]:
# Load the raw CSV at `data_file` into a DataFrame; nothing is filtered here.
df = pd.read_csv(data_file)

In [4]:
# Derive the training and test frames from the raw data using the project
# helper from infra_utils.
# NOTE(review): the helper's split/sampling rules are not visible in this
# notebook -- confirm them in infra_utils before relying on the metrics below.
df_train, df_test = get_raw_data(df)

In [5]:
%%time

# pycaret.classification library:

clf1 = setup(data=df_train,
             verbose=True,
             target='target',
             log_experiment=True,
             use_gpu=False,
             experiment_name='Tuned_Model',
             fix_imbalance=True,
             # transformation=True, error
             polynomial_features=True,
             # feature_selection=True, error
             #remove_multicollinearity=True,
             #multicollinearity_threshold=0.6,
             pca=False
             
            )


add_metric(id='apc', name='APC', score_func=average_precision_score, target='pred_proba')

best_model = pycaret.classification.compare_models(sort='APC',
                                                   include=['et', 'rf', 'lr', 'gbc'],
                                                   probability_threshold=0.5
                                                  )

Unnamed: 0,Description,Value
0,Session id,6275
1,Target,target
2,Target type,Binary
3,Original data shape,"(2237, 31)"
4,Transformed data shape,"(3352, 496)"
5,Transformed train set shape,"(2680, 496)"
6,Transformed test set shape,"(672, 496)"
7,Numeric features,30
8,Preprocess,True
9,Imputation type,simple


2024/08/03 08:44:24 INFO mlflow.tracking.fluent: Experiment with name 'Tuned_Model' does not exist. Creating a new experiment.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
gbc,Gradient Boosting Classifier,0.97,0.978,0.8581,0.9294,0.8905,0.8732,0.8753,0.9488,16.176
rf,Random Forest Classifier,0.9732,0.9751,0.8538,0.9586,0.9005,0.8851,0.8887,0.9433,1.164
et,Extra Trees Classifier,0.9738,0.9743,0.8494,0.9656,0.9021,0.8871,0.8906,0.9391,1.721
lr,Logistic Regression,0.9597,0.9504,0.8674,0.8588,0.8607,0.8373,0.8388,0.9179,1.337


CPU times: total: 1min
Wall time: 4min 32s


In [6]:
# Display the estimator that compare_models ranked first.
best_model

## Hyperparameter Tuning
### Define the search space for hyperparameter tuning

In [None]:
%%time
params = {'max_depth': np.random.randint(1, 40, 3)}
#          'n_estimetor': np.random.randint(2, 1000, 10)}
params['max_depth']
# Tune Model

tuned_dt = pycaret.classification.tune_model(best_model, custom_grid=params)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 3 candidates, totalling 30 fits


In [20]:
# Display the estimator returned by tune_model.
# NOTE(review): despite the `dt` suffix this is the tuned version of
# `best_model`, which is not necessarily a decision tree.
tuned_dt

In [21]:
# Score the tuned model on the separate test frame; predict_model also
# renders the metric table shown in the output.
result = predict_model(tuned_dt, data=df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
0,Extra Trees Classifier,0.9737,0.9684,0.8538,0.9733,0.9097,0.8943,0.897,0.9376


## Voting Classifier

In [22]:
# Re-run the model comparison without an `include` filter and keep the three
# estimators ranked highest by the custom APC metric (returned as a list).
tops = compare_models(n_select=3, sort='APC')
tops

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC,TT (Sec)
catboost,CatBoost Classifier,0.9738,0.9822,0.8625,0.9512,0.9025,0.8875,0.8903,0.9525,52.726
gbc,Gradient Boosting Classifier,0.9732,0.9762,0.8666,0.9432,0.9011,0.8857,0.8881,0.9464,14.415
et,Extra Trees Classifier,0.9776,0.9775,0.8577,0.9861,0.9151,0.9024,0.9067,0.9458,0.242
lightgbm,Light Gradient Boosting Machine,0.9751,0.9781,0.8536,0.9703,0.9056,0.8915,0.8955,0.9453,1.806
rf,Random Forest Classifier,0.977,0.9772,0.8713,0.9685,0.9146,0.9014,0.9048,0.9425,0.849
ada,Ada Boost Classifier,0.968,0.9718,0.8534,0.9186,0.8836,0.8651,0.8667,0.9351,2.889
lr,Logistic Regression,0.9597,0.9482,0.8753,0.8532,0.8613,0.8378,0.8398,0.9163,1.212
nb,Naive Bayes,0.9329,0.9159,0.5773,0.9325,0.708,0.6729,0.7003,0.8212,0.088
qda,Quadratic Discriminant Analysis,0.9668,0.883,0.7692,1.0,0.8672,0.8488,0.8597,0.8024,0.231
lda,Linear Discriminant Analysis,0.9291,0.8805,0.7609,0.7576,0.7531,0.7121,0.716,0.7365,0.137




[<catboost.core.CatBoostClassifier at 0x215c8f8f4d0>,
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='log_loss', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_samples_leaf=1,
                            min_samples_split=2, min_weight_fraction_leaf=0.0,
                            n_estimators=100, n_iter_no_change=None,
                            random_state=3098, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='sqrt',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_samples_leaf=1,
                      min_samples

In [24]:
# Inspect the second-ranked estimator from the top-3 list.
tops[1]

In [26]:
%%time
bledned_weights = blend_models([tops[0], tops[1], tops[2]], weights=[0.5, 0.3, 0.2])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9745,0.976,0.8261,1.0,0.9048,0.8902,0.8956,0.9404
1,0.9745,0.9721,0.8696,0.9524,0.9091,0.8943,0.8956,0.9522
2,0.9873,0.9974,0.913,1.0,0.9545,0.9472,0.9485,0.9888
3,0.9809,0.988,0.8696,1.0,0.9302,0.9192,0.9222,0.9536
4,0.9682,0.9536,0.8261,0.95,0.8837,0.8654,0.8682,0.9075
5,0.9808,0.9942,0.9545,0.913,0.9333,0.9221,0.9224,0.9788
6,0.9808,0.981,0.9091,0.9524,0.9302,0.9191,0.9194,0.9486
7,0.9679,0.9824,0.8636,0.9048,0.8837,0.8651,0.8655,0.9475
8,0.9936,0.999,0.9545,1.0,0.9767,0.973,0.9734,0.9945
9,0.9487,0.9671,0.6818,0.9375,0.7895,0.7611,0.7736,0.8963




In [27]:
%%time
# Blend the same top-3 estimators without explicit weights, as a baseline to
# compare against the weighted blend (PyCaret's default weighting applies).
blender = blend_models(tops)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9745,0.9796,0.8261,1.0,0.9048,0.8902,0.8956,0.9407
1,0.9745,0.987,0.8696,0.9524,0.9091,0.8943,0.8956,0.9587
2,0.9936,0.9955,0.9565,1.0,0.9778,0.9741,0.9744,0.9835
3,0.9809,0.9893,0.8696,1.0,0.9302,0.9192,0.9222,0.9593
4,0.9682,0.9575,0.8261,0.95,0.8837,0.8654,0.8682,0.9075
5,0.9808,0.9847,0.9545,0.913,0.9333,0.9221,0.9224,0.966
6,0.9808,0.978,0.9091,0.9524,0.9302,0.9191,0.9194,0.9455
7,0.9744,0.9766,0.9091,0.9091,0.9091,0.8942,0.8942,0.945
8,0.9936,0.9986,0.9545,1.0,0.9767,0.973,0.9734,0.993
9,0.9487,0.9495,0.6818,0.9375,0.7895,0.7611,0.7736,0.8725




## Hyperparameter tuning with scikit-optimize on voting model

In [31]:
%%time
# Tune the weighted blend with scikit-optimize for 5 iterations, optimizing
# the custom APC metric.
# NOTE: in the recorded run this took about 1h 14min wall time and PyCaret
# reported that the original model was better, so the untuned blend was
# returned; the displayed fold metrics belong to the tuned candidate.
tuned_vote = tune_model(bledned_weights, search_library='scikit-optimize', optimize='APC', n_iter=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,APC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9745,0.9779,0.8261,1.0,0.9048,0.8902,0.8956,0.9432
1,0.9745,0.9828,0.8696,0.9524,0.9091,0.8943,0.8956,0.9581
2,0.9873,0.9977,0.913,1.0,0.9545,0.9472,0.9485,0.9899
3,0.9745,0.9893,0.8696,0.9524,0.9091,0.8943,0.8956,0.9629
4,0.9618,0.9494,0.7826,0.9474,0.8571,0.8353,0.8404,0.9093
5,0.9936,0.9905,0.9545,1.0,0.9767,0.973,0.9734,0.9745
6,0.9808,0.9783,0.9091,0.9524,0.9302,0.9191,0.9194,0.9457
7,0.9615,0.9813,0.8636,0.8636,0.8636,0.8412,0.8412,0.9455
8,0.9936,0.999,0.9545,1.0,0.9767,0.973,0.9734,0.9945
9,0.9487,0.9579,0.6818,0.9375,0.7895,0.7611,0.7736,0.8898


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).




CPU times: total: 5.03 s
Wall time: 1h 13min 42s


In [None]:
# Score the voting model on the test frame, requesting raw scores as well.
# NOTE: this rebinds `result`, replacing the predictions stored earlier for
# the single tuned model.
result = predict_model(tuned_vote, raw_score=True, data=df_test)

In [None]:
# Persist the final voting model under the name 'tuned_vote'.
# NOTE(review): PyCaret is expected to write this as a pickle file in the
# working directory -- only load it back from a trusted location.
save_model(tuned_vote, 'tuned_vote')