### Improving Data Quality with Training Dynamics of Gradient Boosting Decision Trees
#### Authors: M.A. Ponti, L.A. Oliveira, V.Garcia, M.Esteban, J.M. Román, L. Argerich

## Demo: BrownBoost on the NNAR Breast Cancer dataset


In [1]:
import os
import pandas as pd
import numpy as np
import daal4py as d4p
from metrics import print_metrics

# Print the versions of the imported packages, the Jupyter stack, Python and the OS
# as plain text (html=False) so the run environment is recorded with the results.
import session_info
session_info.show(html=False)


-----
daal4py             NA
metrics             NA
numpy               1.23.2
pandas              1.4.4
session_info        1.0.0
-----
IPython             7.19.0
jupyter_client      6.1.7
jupyter_core        4.6.3
jupyterlab          2.2.6
notebook            6.1.4
-----
Python 3.8.5 (default, Sep  4 2020, 02:22:02) [Clang 10.0.0 ]
macOS-10.16-x86_64-i386-64bit
-----
Session information updated at 2023-02-01 11:58


# Breast Cancer Dataset

In [2]:
from sklearn.model_selection import train_test_split  # NOTE(review): not used in this cell

# Names of the three noisy-label columns stored in the CSV. Judging by the names they
# presumably hold labels with 10%, 20% and 30% NNAR noise -- confirm against the dataset docs.
noise_10_column = "noise_10%_nnar_coliving_neighbors_5_distance_minkowski"
noise_20_column = "noise_20%_nnar_coliving_neighbors_10_distance_minkowski"
noise_30_column = "noise_30%_nnar_coliving_neighbors_15_distance_minkowski"
noise_columns = [noise_10_column, noise_20_column, noise_30_column]

dataset = pd.read_csv("datasets/nnar_breast_cancer.csv")

# Every column that is not an identifier, a split tag or a (clean or noisy) label is a feature.
non_feature_columns = ["id", "subset", "tag", "label"] + noise_columns
data_features = [column for column in dataset.columns if column not in non_feature_columns]

noised_label = noise_30_column

# Sanity check: dropping rows with duplicated feature vectors must not change the shape.
assert dataset.drop_duplicates(subset=data_features).shape == dataset.shape

subset_tag = dataset["subset"]

# Training split: features, clean labels and the three noisy label variants.
train = dataset[subset_tag == "TRAIN"]
x_train = train[data_features]
y_train = train["label"].to_numpy()
y_train_noise1, y_train_noise2, y_train_noise3 = (
    train[column].to_numpy() for column in noise_columns
)

# Validation split, built the same way from the rows tagged VALID.
# NOTE(review): an earlier comment here said the VALID tag was wrong (same instances as
# test) and that a new validation set is created from training; the code nonetheless
# takes the rows tagged VALID as-is -- confirm which behaviour is intended.
valid = dataset[subset_tag == "VALID"]
x_valid = valid[data_features]
y_valid = valid["label"].to_numpy()
y_valid_noise1, y_valid_noise2, y_valid_noise3 = (
    valid[column].to_numpy() for column in noise_columns
)

# Test split: only the clean labels are extracted.
test = dataset[subset_tag == "TEST"]
x_test = test[data_features]
y_test = test["label"].to_numpy()

# Report the size of the full dataset and of each split.
print()
print("Dataset shape: {} - {}".format(dataset.shape, dataset["label"].shape))
print("Train shape: {} - {}".format(x_train.shape, y_train.shape))
print("Valid shape: {} - {}".format(x_valid.shape, y_valid.shape))
print("Test shape: {} - {}".format(x_test.shape, y_test.shape))
print()

print(dataset.columns)
dataset.head()




Dataset shape: (569, 36) - (569,)
Train shape: (381, 30) - (381,)
Valid shape: (94, 30) - (94,)
Test shape: (94, 30) - (94,)

Index(['id', 'label', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave_points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'subset',
       'noise_10%_nnar_coliving_neighbors_5_distance_minkowski',
       'noise_20%_nnar_coliving_neighbors_10_distance_minkowski',
       'noise_30%_nnar_coliving_neighbors_15_distance_minkowski'],
      dtype='object')


Unnamed: 0,id,label,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst,subset,noise_10%_nnar_coliving_neighbors_5_distance_minkowski,noise_20%_nnar_coliving_neighbors_10_distance_minkowski,noise_30%_nnar_coliving_neighbors_15_distance_minkowski
0,8670,1,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,...,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019,TRAIN,1.0,1.0,0.0
1,8913,0,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,...,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915,TRAIN,0.0,0.0,0.0
2,8915,0,14.96,19.1,97.03,687.3,0.08992,0.09823,0.0594,0.04819,...,0.1313,0.303,0.1804,0.1489,0.2962,0.08472,TRAIN,0.0,1.0,1.0
3,9047,0,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,...,0.1172,0.1958,0.181,0.08388,0.3297,0.07834,TRAIN,0.0,0.0,0.0
4,85715,1,13.17,18.66,85.98,534.6,0.1158,0.1231,0.1226,0.0734,...,0.1786,0.4166,0.5006,0.2088,0.39,0.1179,TRAIN,0.0,0.0,0.0


In [3]:
# One entry per experiment. Only the training and validation labels change between
# experiments; the features and the clean test split are the same everywhere.
#   A      : clean training labels, clean validation labels
#   B, C, D: noisy training labels (NOISE1..3), clean validation labels
#   E, F, G: noisy training labels and equally noisy validation labels
experiments = [
    {
        "name": name,
        "training": (x_train, train_labels),
        "validation": (x_valid, valid_labels),
        "test": (x_test, y_test),
    }
    for name, train_labels, valid_labels in [
        ("Experiment A: Train CLEAN & Valid CLEAN", y_train, y_valid),
        ("Experiment B: Train NOISE1 & Valid CLEAN", y_train_noise1, y_valid),
        ("Experiment C: Train NOISE2 & Valid CLEAN", y_train_noise2, y_valid),
        ("Experiment D: Train NOISE3 & Valid CLEAN", y_train_noise3, y_valid),
        ("Experiment E: Train NOISE1 & Valid NOISE1", y_train_noise1, y_valid_noise1),
        ("Experiment F: Train NOISE2 & Valid NOISE2", y_train_noise2, y_valid_noise2),
        ("Experiment G: Train NOISE3 & Valid NOISE3", y_train_noise3, y_valid_noise3),
    ]
]

## Experiments A–G (Train clean & Valid clean through Train noisy & Valid noisy)

In [4]:
import brown_boost


def _evaluate(model, features, labels, title):
    """Predict `features` with `model` and pass the predictions to `print_metrics`.

    Returns whatever `print_metrics` returns for the given labels and title.
    """
    return print_metrics(y_pred=model.predict(features), y=labels, title=title)


banner = "*" * 70

# For every experiment: train a default-parameter model and a tuned model, evaluate each
# on the validation and test splits, and collect the four metric results.
results = []
for experiment in experiments:
    experiment_name = experiment["name"]
    train_x, train_y = experiment["training"]
    valid_x, valid_y = experiment["validation"]
    test_x, test_y = experiment["test"]

    print(banner)
    print(experiment_name)
    print(banner)

    # BASELINE (no optimization): train without passing any params.
    baseline_model = brown_boost.train_binary_brownboost(x_train=train_x, y_train=train_y)
    baseline_valid_metrics = _evaluate(baseline_model, valid_x, valid_y, "BASELINE: validation metrics")
    baseline_test_metrics = _evaluate(baseline_model, test_x, test_y, "BASELINE: test metrics")

    # OPTIMIZED MODEL (optuna, per the original note): search parameters using the
    # validation split, then retrain on the training split with the best trial's params.
    optimization = brown_boost.optimize_binary_brownboost(train_x, train_y, valid_x, valid_y)
    optimized_model = brown_boost.train_binary_brownboost(
        params=optimization.best_trial.params,
        x_train=train_x,
        y_train=train_y,
    )
    optimized_valid_metrics = _evaluate(optimized_model, valid_x, valid_y, "OPTIMIZED: validation metrics")
    optimized_test_metrics = _evaluate(optimized_model, test_x, test_y, "OPTIMIZED: test metrics")

    results.append({
        "name": experiment_name,
        "baseline_valid": baseline_valid_metrics,
        "baseline_test": baseline_test_metrics,
        "optimized_valid": optimized_valid_metrics,
        "optimized_test": optimized_test_metrics,
    })


**********************************************************************
Experiment A: Train CLEAN & Valid CLEAN
**********************************************************************

------------------------------------------------------------
BASELINE: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        59
           1       1.00      0.94      0.97        35

    accuracy                           0.98        94
   macro avg       0.98      0.97      0.98        94
weighted avg       0.98      0.98      0.98        94


PR AUC: 0.9820668693009118

True positives: 33
True negatives: 59
False positives: 0
False negatives: 2
------------------------------------------------------------

------------------------------------------------------------
BASELINE: test metrics
------------------------------------------------------------

              precision  

Best trial: 6. Best value: 0.982067: 100%|██████████| 200/200 [00:11<00:00, 17.01it/s]



Best Trial Precision Score:  0.9820668693009118
Best Trial Params: 
max_iter: 100
acc_thrs: 0.009119387062604737
alg_type: float
newtonrap_thrs: 0.00019018332304217216
newtonrap_max_iter: 300

------------------------------------------------------------
OPTIMIZED: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        59
           1       1.00      0.94      0.97        35

    accuracy                           0.98        94
   macro avg       0.98      0.97      0.98        94
weighted avg       0.98      0.98      0.98        94


PR AUC: 0.9820668693009118

True positives: 33
True negatives: 59
False positives: 0
False negatives: 2
------------------------------------------------------------

------------------------------------------------------------
OPTIMIZED: test metrics
------------------------------------------------------------

            

Best trial: 184. Best value: 0.967361: 100%|██████████| 200/200 [00:15<00:00, 13.24it/s]



Best Trial Precision Score:  0.9673609869479707
Best Trial Params: 
max_iter: 100
acc_thrs: 0.010567140838722105
alg_type: float
newtonrap_thrs: 6.698054064968564e-05
newtonrap_max_iter: 800

------------------------------------------------------------
OPTIMIZED: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.94      0.98      0.96        59
           1       0.97      0.89      0.93        35

    accuracy                           0.95        94
   macro avg       0.95      0.93      0.94        94
weighted avg       0.95      0.95      0.95        94


PR AUC: 0.9485087386018237

True positives: 31
True negatives: 58
False positives: 1
False negatives: 4
------------------------------------------------------------

------------------------------------------------------------
OPTIMIZED: test metrics
------------------------------------------------------------

             

Best trial: 32. Best value: 0.929534: 100%|██████████| 200/200 [00:13<00:00, 14.44it/s]



Best Trial Precision Score:  0.929533941236069
Best Trial Params: 
max_iter: 100
acc_thrs: 0.0037330052823448126
alg_type: double
newtonrap_thrs: 1.0693683182183227e-05
newtonrap_max_iter: 900

------------------------------------------------------------
OPTIMIZED: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.91      0.98      0.94        59
           1       0.97      0.83      0.89        35

    accuracy                           0.93        94
   macro avg       0.94      0.91      0.92        94
weighted avg       0.93      0.93      0.92        94


PR AUC: 0.929533941236069

True positives: 29
True negatives: 58
False positives: 1
False negatives: 6
------------------------------------------------------------

------------------------------------------------------------
OPTIMIZED: test metrics
------------------------------------------------------------

            

Best trial: 17. Best value: 0.825326: 100%|██████████| 200/200 [00:27<00:00,  7.38it/s]



Best Trial Precision Score:  0.8253263007330591
Best Trial Params: 
max_iter: 1000
acc_thrs: 4.547575897560998e-05
alg_type: float
newtonrap_thrs: 0.0001253196635244498
newtonrap_max_iter: 600

------------------------------------------------------------
OPTIMIZED: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        59
           1       0.77      0.77      0.77        35

    accuracy                           0.83        94
   macro avg       0.82      0.82      0.82        94
weighted avg       0.83      0.83      0.83        94


PR AUC: 0.8139817629179331

True positives: 27
True negatives: 51
False positives: 8
False negatives: 8
------------------------------------------------------------

------------------------------------------------------------
OPTIMIZED: test metrics
------------------------------------------------------------

           

Best trial: 91. Best value: 0.934237: 100%|██████████| 200/200 [00:22<00:00,  9.06it/s]



Best Trial Precision Score:  0.9342370167009837
Best Trial Params: 
max_iter: 600
acc_thrs: 0.005680560564556866
alg_type: float
newtonrap_thrs: 2.3127671624201584e-05
newtonrap_max_iter: 800

------------------------------------------------------------
OPTIMIZED: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94        63
         1.0       0.88      0.90      0.89        31

    accuracy                           0.93        94
   macro avg       0.91      0.92      0.92        94
weighted avg       0.93      0.93      0.93        94


PR AUC: 0.9050703500343171

True positives: 28
True negatives: 59
False positives: 4
False negatives: 3
------------------------------------------------------------

------------------------------------------------------------
OPTIMIZED: test metrics
------------------------------------------------------------

            

Best trial: 15. Best value: 0.870774: 100%|██████████| 200/200 [00:23<00:00,  8.51it/s]



Best Trial Precision Score:  0.8707743100626446
Best Trial Params: 
max_iter: 200
acc_thrs: 0.041372353688729525
alg_type: double
newtonrap_thrs: 0.00010221683670064033
newtonrap_max_iter: 600

------------------------------------------------------------
OPTIMIZED: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

         0.0       0.92      0.89      0.91        65
         1.0       0.77      0.83      0.80        29

    accuracy                           0.87        94
   macro avg       0.85      0.86      0.85        94
weighted avg       0.88      0.87      0.87        94


PR AUC: 0.8274856223226753

True positives: 24
True negatives: 58
False positives: 7
False negatives: 5
------------------------------------------------------------

------------------------------------------------------------
OPTIMIZED: test metrics
------------------------------------------------------------

           

Best trial: 11. Best value: 0.767477: 100%|██████████| 200/200 [00:30<00:00,  6.51it/s]



Best Trial Precision Score:  0.7674772036474165
Best Trial Params: 
max_iter: 700
acc_thrs: 1.4761838599871846e-05
alg_type: double
newtonrap_thrs: 7.182514420727993e-05
newtonrap_max_iter: 600

------------------------------------------------------------
OPTIMIZED: validation metrics
------------------------------------------------------------

              precision    recall  f1-score   support

         0.0       0.80      0.81      0.81        59
         1.0       0.68      0.66      0.67        35

    accuracy                           0.76        94
   macro avg       0.74      0.74      0.74        94
weighted avg       0.75      0.76      0.75        94


PR AUC: 0.7306365099231182

True positives: 23
True negatives: 48
False positives: 11
False negatives: 12
------------------------------------------------------------

------------------------------------------------------------
OPTIMIZED: test metrics
------------------------------------------------------------

        

In [5]:
import pickle

results_path = "breast_cancer_results.pickle"

# Pickling: write the collected experiment results to disk.
with open(results_path, "wb") as out_file:
    pickle.dump(results, out_file)

# Unpickling: read the file straight back as a round-trip check. The file was written
# just above, so loading it here does not involve untrusted data.
with open(results_path, "rb") as in_file:
    res = pickle.load(in_file)

# Number of stored entries (one per experiment).
print(len(res))

7
