In [33]:
#pip install binclass-tools

In [34]:
import os
import sys
import inspect

import numpy as np
import pandas as pd

In [35]:
# Print the version string of the Python interpreter running this notebook
print(sys.version)

3.8.13 (default, Mar 28 2022, 11:38:47) 
[GCC 7.5.0]


### Create dataset for classification and train random forest model

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic, imbalanced binary classification problem: 80% zeros and 20% ones
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=14,
    n_redundant=0,
    weights=[0.8, 0.2],
    shuffle=False,
    random_state=12,
)

# Stratified train/test split (20% of the observations go to the test set)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)

# Fit a random forest classifier on the training split
cls = RandomForestClassifier(max_depth=6, oob_score=True, random_state=123)
cls.fit(X_train, y_train)

In [37]:
# Predicted probabilities for class 1 (second column of predict_proba),
# first on the train set, then on the test set
train_predicted_proba = cls.predict_proba(X_train)[:, 1]
test_predicted_proba = cls.predict_proba(X_test)[:, 1]

In [38]:
from sklearn.metrics import brier_score_loss

In [39]:
# Brier score of the predicted class-1 probabilities on the test set
brier_score_loss(y_test,test_predicted_proba)

0.08207331781390739

## Import bctools package

In [40]:
import bctools as bc

### MAIN PARAMETERS
        
- **true_y**: true labels for target class \
  Type: iterable (list, array, series...) 
  
  
- **predicted_proba**: predicted probabilities for class 1 \
  Type: iterable (list, array, series...) \
  (e.g. output from model.predict_proba(data)[:,1]) 
  
  
- **threshold_step**: step between each classification threshold \
  Type: float, default = 0.01
  

- **amounts**: amount associated with each data point \
   Type: iterable (list, array, series...), default = None   
   

- **cost_dict**: dictionary with cost associated to each class in TN, FP, FN, TP \
   Type: dictionary with keys: "TN", "FP", "FN", "TP" and values that can be both lists (with coherent lengths) and/or floats,\
   default = None \
   (output from bc.get_cost_dict()) 
  
  
- **optimize_threshold**: metrics to be used for threshold optimization using GHOST method \
  Type: {'all', 'ROC', 'MCC', 'Kappa', 'Fscore', 'Cost'} or list containing any combination of the allowed values (except 'all'), default = None \
  If 'Cost' is explicitly passed, *cost_dict* must be given and the threshold will be optimized to minimize the total cost \
  'all' is equivalent to ['ROC', 'MCC', 'Kappa', 'Fscore', 'Cost'] if *cost_dict* is given, \
  ['ROC', 'MCC', 'Kappa', 'Fscore'] otherwise


- **N_subsets**: Number of subsets used in the optimization process \
   Type: int, default = 70. Ignored when *optimize_threshold* = None.


- **subsets_size**: Size of the subsets used in the optimization process. If float, it represents the proportion of the dataset, if int, the actual number of instances \
   Type: int or float, default = 0.2. Ignored when *optimize_threshold* = None.


- **with_replacement**: whether subsets used in the optimization process are randomly drawn with replacement or without \
   Type: bool, default = False. Ignored when *optimize_threshold* = None.


- **currency**: currency symbol to be visualized in plots\
  Type: str, default = '€'
  

- **random_state**: controls randomness of threshold optimization bootstrap method \
  Type: int, default = None

### Plot ROC and PR curves, with iso-Fbeta curves, for the test set

In [41]:
# Draw the ROC curve for the test set and keep the value returned by the plotting function
area_under_ROC = bc.curve_ROC_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
)

In [42]:
# Value returned by bc.curve_ROC_plot in the previous cell
area_under_ROC

0.9550544562049395

In [43]:
# beta sets the weight of recall in the combined score and is used to
# compute the Iso-Fbeta curves; any float > 0 is allowed, default is 1
area_under_PR = bc.curve_PR_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    beta=1,
)

In [44]:
# Value returned by bc.curve_PR_plot in the previous cell
area_under_PR

0.9021518156511643

### Interactive probabilities violin plot for the test set

In [45]:
# Threshold step used by the probabilities violin plot for the test set
threshold_step = 0.05

bc.predicted_proba_violin_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    # marker_size=3
)

### Interactive kernel density estimation curve (or normal distribution curve) plot for the test set

The cell below draws both variants: first the kernel density estimation curve, then the normal distribution curve.

In [46]:
# The curve_type parameter can be either 'kde' (default) or 'normal'.
# Each call below reads the curve_type variable assigned right before it,
# so the value shown in the assignment is always the one that is plotted.
threshold_step = 0.05

# Kernel density estimation curve
curve_type = 'kde' #default
bc.predicted_proba_density_curve_plot(true_y = y_test,
                                      predicted_proba = test_predicted_proba,
                                      threshold_step = threshold_step,
                                      curve_type = curve_type)

# Normal distribution curve, with a custom plot title
curve_type = 'normal'
bc.predicted_proba_density_curve_plot(true_y = y_test,
                                      predicted_proba = test_predicted_proba,
                                      threshold_step = threshold_step,
                                      curve_type = curve_type,
                                      title = 'Interactive Probabilities Distribution Plot')

### Confusion matrix and metrics analysis for train and test set

In [47]:
# Parameters for the analysis of the train dataset
threshold_step = 0.05
currency = '$'
optimize_threshold = 'all'
# absolute values of feature column 13, used as the amount of each data point
amounts = np.abs(X_train[:, 13])

In [48]:
# bc.get_cost_dict builds the dictionary of costs.
# For each class (TN, FP, FN, TP) it takes a float or a list of floats;
# lists must have coherent lengths.
train_cost_dict = bc.get_cost_dict(
    TN=0,
    FP=10,
    FN=np.abs(X_train[:, 12]),
    TP=0,
)

In [49]:
# Plot the confusion matrix and collect the three returned dataframes:
# variable metrics, invariant metrics and optimized thresholds.
#
# - cost_dict and amounts default to None when omitted and are then not visualized.
# - optimize_threshold also defaults to None: in that case no threshold is optimized,
#   the third table (Optimized metric - Optimal threshold) is not visualized
#   and the optimized thresholds dataframe returned is None.
#
# WARNING: threshold optimization could take a while
var_metrics_df, invar_metrics_df, opt_thresh_df = bc.confusion_matrix_plot(
    true_y=y_train,
    predicted_proba=train_predicted_proba,
    threshold_step=threshold_step,
    amounts=amounts,
    cost_dict=train_cost_dict,
    optimize_threshold=optimize_threshold,
    # N_subsets=70, subsets_size=0.2,  # default
    # with_replacement=False,          # default
    currency=currency,
    random_state=123,
    title='Interactive Confusion Matrix for the Training Set',
);

In [50]:
# Show the three dataframes returned for the train set:
# variable metrics, invariant metrics and optimized thresholds
display(var_metrics_df, invar_metrics_df, opt_thresh_df)

Unnamed: 0,threshold,accuracy,balanced_accuracy,cohens_kappa,f1_score,matthews_corr_coef,precision,recall
0,0.0,0.2025,0.5,0.0,0.3368,0.0,0.2025,1.0
1,0.05,0.3962,0.6215,0.115,0.4015,0.247,0.2512,1.0
2,0.1,0.7288,0.8299,0.44,0.5989,0.5311,0.4274,1.0
3,0.15,0.8875,0.9295,0.7115,0.7826,0.7431,0.6429,1.0
4,0.2,0.965,0.9781,0.8982,0.9205,0.9029,0.8526,1.0
5,0.25,0.9838,0.9852,0.9507,0.961,0.9513,0.9357,0.9877
6,0.3,0.9862,0.9776,0.9573,0.9659,0.9573,0.9689,0.963
7,0.35,0.9875,0.9714,0.9606,0.9684,0.9611,0.9935,0.9444
8,0.4,0.9762,0.9414,0.9231,0.9377,0.9258,1.0,0.8827
9,0.45,0.9675,0.9198,0.893,0.9128,0.8981,1.0,0.8395


Unnamed: 0,invariant_metric,value
0,roc_auc,0.9992
1,pr_auc,0.9971
2,brier_score,0.0438


Unnamed: 0,optimized_metric,optimal_threshold
0,kappa,0.3
1,mcc,0.25
2,roc,0.25
3,f1_score,0.35
4,f2_score,0.25
5,f05_score,0.35
6,cost,0.35


In [51]:
# The test dataset can be analyzed in the same way.
# Here there is no need to optimize the threshold value for any measure.
threshold_step = 0.05
currency = '$'
optimize_threshold = None
amounts = np.abs(X_test[:, 13])

test_cost_dict = bc.get_cost_dict(
    TN=0,
    FP=10,
    FN=np.abs(X_test[:, 12]),
    TP=0,
)

In [52]:
# Plot the confusion matrix for the test set.
# optimize_threshold is None here, so the third returned value (the optimized
# thresholds dataframe) is None and is bound to a throwaway name.
# The throwaway is deliberately NOT `__`: in IPython/Jupyter `_`, `__` and `___`
# are the output-history variables, and assigning to one of them stops IPython
# from updating them for the rest of the session.
var_metrics_df, invar_metrics_df, _unused_opt_thresh_df = bc.confusion_matrix_plot(
    true_y = y_test,
    predicted_proba = test_predicted_proba,
    threshold_step = threshold_step,
    amounts = amounts,
    cost_dict = test_cost_dict,
    optimize_threshold = optimize_threshold,
    #N_subsets = 70, subsets_size = 0.2, # default
    #with_replacement = False,           # default
    currency = currency,
    random_state = 123,
    title = 'Interactive Confusion Matrix for the Testing Set');

In [53]:
# Show the two dataframes returned for the test set: variable metrics and invariant metrics
display(var_metrics_df, invar_metrics_df)

Unnamed: 0,threshold,accuracy,balanced_accuracy,cohens_kappa,f1_score,matthews_corr_coef,precision,recall
0,0.0,0.205,0.5,0.0,0.3402,0.0,0.205,1.0
1,0.05,0.31,0.566,0.0587,0.3727,0.1739,0.2291,1.0
2,0.1,0.62,0.752,0.3018,0.5128,0.4115,0.3478,0.9756
3,0.15,0.75,0.8337,0.4653,0.6154,0.5422,0.4494,0.9756
4,0.2,0.835,0.8691,0.5932,0.6972,0.6291,0.5588,0.9268
5,0.25,0.88,0.8521,0.6567,0.7333,0.661,0.6735,0.8049
6,0.3,0.92,0.8773,0.7546,0.8049,0.7546,0.8049,0.8049
7,0.35,0.94,0.8718,0.8016,0.8378,0.8087,0.9394,0.7561
8,0.4,0.93,0.8383,0.7591,0.8,0.7758,0.9655,0.6829
9,0.45,0.905,0.7683,0.648,0.6984,0.6923,1.0,0.5366


Unnamed: 0,invariant_metric,value
0,roc_auc,0.9551
1,pr_auc,0.903
2,brier_score,0.0821


In [54]:
# get_invariant_metrics_df (utilities module) returns the invariant
# metric dataframe directly, without drawing any plot
bc.utilities.get_invariant_metrics_df(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
)

Unnamed: 0,invariant_metric,value
0,roc_auc,0.9551
1,pr_auc,0.903
2,brier_score,0.0821


In [55]:
# For one specific threshold, get_confusion_matrix_and_metrics_df (utilities module)
# returns the confusion matrix together with a dataframe listing the metrics shown
# in the first table of the interactive confusion matrix plot.
conf_matrix, metrics_fixed_thresh_df = bc.utilities.get_confusion_matrix_and_metrics_df(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold=0.3,  # default = 0.5
)

display(conf_matrix, metrics_fixed_thresh_df)


array([[151,   8],
       [  8,  33]])

Unnamed: 0,threshold_dependent_metric,value
0,accuracy,0.92
1,balanced_accuracy,0.8773
2,f1_score,0.8049
3,precision,0.8049
4,recall,0.8049
5,cohens_kappa,0.7546
6,matthews_corr_coef,0.7546


In [56]:
# The optimized thresholds dataframe can be obtained directly with
# get_optimized_thresholds_df from the thresholds module.

# That function takes an explicit list of thresholds instead of a step, for example:
threshold_values = np.arange(0.05, 1, 0.05)  # values starting at 0.05 with step 0.05; the stop value 1 is excluded

# In this case thresholds are optimized on the train dataset
# (best practice would be using a validation dataset different from both train and test).

# To optimize for minimal cost, a cost dictionary for the train set is needed
train_cost_dict = bc.get_cost_dict(
    TN=0,
    FP=10,
    FN=np.abs(X_train[:, 12]),
    TP=0,
)

bc.thresholds.get_optimized_thresholds_df(
    optimize_threshold=['Kappa', 'Fscore', 'Cost'],
    threshold_values=threshold_values,
    true_y=y_train,
    predicted_proba=train_predicted_proba,
    cost_dict=train_cost_dict,
    N_subsets=70, subsets_size=0.2, with_replacement=False,  # default
    random_state=120,
)

Unnamed: 0,optimized_metric,optimal_threshold
0,kappa,0.3
1,f1_score,0.25
2,f2_score,0.25
3,f05_score,0.35
4,cost,0.35


In [57]:
# get_optimal_threshold (thresholds module) directly optimizes the threshold
# for one specific metric in {'ROC', 'MCC', 'Kappa', 'Fscore'}.
#
# If ThOpt_metrics = 'Fscore', 3 values are returned
# (optimal threshold for beta = 1, for beta = 2 and for beta = 0.5).
bc.thresholds.get_optimal_threshold(
    y_train,
    train_predicted_proba,
    threshold_values,
    ThOpt_metrics='ROC',  # default = 'Kappa'
    N_subsets=70, subsets_size=0.2, with_replacement=False,  # defaults
    random_seed=120,
)

0.25

In [58]:
# get_cost_optimal_threshold (thresholds module) directly optimizes the
# threshold for minimal cost; cost_dict must be given.
bc.thresholds.get_cost_optimal_threshold(
    y_train,
    train_predicted_proba,
    threshold_values,
    cost_dict=train_cost_dict,
    N_subsets=70, subsets_size=0.2, with_replacement=False,  # defaults
    random_seed=120,
)

0.35000000000000003

In [59]:
# Plot the "Interactive confusion line chart" and get the amount/cost per
# threshold dataframe together with total_amount.
#
# - At least one of cost_dict or amounts must be given.
# - Whichever of cost_dict or amounts is not given is set to None and not visualized.
# - When amounts is not given, the total_amount returned is None.
amount_cost_df, total_amount = bc.confusion_linechart_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    amounts=amounts,
    cost_dict=test_cost_dict,
    currency=currency,
);

In [60]:
# Print total_amount prefixed with the currency symbol, then display the
# amount/cost per threshold dataframe returned by confusion_linechart_plot
print(f'total amount: {currency}{total_amount}')
amount_cost_df 

total amount: $335.85


Unnamed: 0,threshold,amount_TN,amount_FP,amount_FN,amount_TP,cost_TN,cost_FP,cost_FN,cost_TP,total_cost
0,0.0,0.0,290.087727,0.0,45.761465,0.0,1590.0,0.0,0.0,1590.0
1,0.05,29.286441,260.801286,0.0,45.761465,0.0,1380.0,0.0,0.0,1380.0
2,0.1,141.016189,149.071538,0.271689,45.489775,0.0,750.0,2.295028,0.0,752.295028
3,0.15,185.252232,104.835495,0.271689,45.489775,0.0,490.0,2.295028,0.0,492.295028
4,0.2,232.413556,57.674171,1.096405,44.66506,0.0,300.0,4.25104,0.0,304.25104
5,0.25,260.154255,29.933472,7.812413,37.949052,0.0,160.0,9.48321,0.0,169.48321
6,0.3,272.472271,17.615456,7.812413,37.949052,0.0,80.0,9.48321,0.0,89.48321
7,0.35,288.065533,2.022194,9.907729,35.853736,0.0,20.0,13.266683,0.0,33.266683
8,0.4,289.577899,0.509828,12.351725,33.40974,0.0,10.0,21.557577,0.0,31.557577
9,0.45,290.087727,0.0,17.779753,27.981711,0.0,0.0,34.519345,0.0,34.519345


In [61]:
# The amount/cost per threshold dataframe can be obtained directly with
# get_amount_cost_df from the utilities module.

# That function takes an explicit list of thresholds instead of a step, for example:
threshold_values = np.arange(0, 1, 0.05)  # values starting at 0 with step 0.05; the stop value 1 is excluded

# Example without amounts
bc.utilities.get_amount_cost_df(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_values=threshold_values,
    # amounts=amounts,
    cost_dict=test_cost_dict,
)

Unnamed: 0,threshold,cost_TN,cost_FP,cost_FN,cost_TP,total_cost
0,0.0,0.0,1590.0,0.0,0.0,1590.0
1,0.05,0.0,1380.0,0.0,0.0,1380.0
2,0.1,0.0,750.0,2.295028,0.0,752.295028
3,0.15,0.0,490.0,2.295028,0.0,492.295028
4,0.2,0.0,300.0,4.25104,0.0,304.25104
5,0.25,0.0,160.0,9.48321,0.0,169.48321
6,0.3,0.0,80.0,9.48321,0.0,89.48321
7,0.35,0.0,20.0,13.266683,0.0,33.266683
8,0.4,0.0,10.0,21.557577,0.0,31.557577
9,0.45,0.0,0.0,34.519345,0.0,34.519345


### Custom Interactive Amount/Cost line chart

In [62]:
# Plot the "Amount/Cost line chart" and get a dataframe with amount and cost per
# threshold for the selected "confusion classes" (TN, FP, FN, TP) and their total.
#
# - At least one of cost_dict or amounts must be given.
# - Whichever of cost_dict or amounts is not given is set to None and not visualized.
# - amount_classes, if not given, is 'all' when amounts is given, None otherwise.
# - cost_classes, if not given, is 'all' when cost_dict is given, None otherwise.
#
# Example: plot the sum of the amounts of the True Positive and False Positive data
# and the sum of the costs of all the data.
cost_classes = 'all'
amount_classes = ['TP', 'FP']

total_cost_amount_df = bc.total_amount_cost_plot(
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold_step=threshold_step,
    amounts=amounts,
    cost_dict=test_cost_dict,
    amount_classes=amount_classes,
    cost_classes=cost_classes,
    currency=currency,
);

In [63]:
# Dataframe returned by bc.total_amount_cost_plot in the previous cell
total_cost_amount_df

Unnamed: 0,threshold,amount_TP,amount_FP,amount_sum,cost_TN,cost_FP,cost_FN,cost_TP,cost_sum
0,0.0,45.761465,290.087727,335.849192,0.0,1590.0,0.0,0.0,1590.0
1,0.05,45.761465,260.801286,306.562751,0.0,1380.0,0.0,0.0,1380.0
2,0.1,45.489775,149.071538,194.561314,0.0,750.0,2.295028,0.0,752.295028
3,0.15,45.489775,104.835495,150.325271,0.0,490.0,2.295028,0.0,492.295028
4,0.2,44.66506,57.674171,102.339231,0.0,300.0,4.25104,0.0,304.25104
5,0.25,37.949052,29.933472,67.882524,0.0,160.0,9.48321,0.0,169.48321
6,0.3,37.949052,17.615456,55.564508,0.0,80.0,9.48321,0.0,89.48321
7,0.35,35.853736,2.022194,37.87593,0.0,20.0,13.266683,0.0,33.266683
8,0.4,33.40974,0.509828,33.919568,0.0,10.0,21.557577,0.0,31.557577
9,0.45,27.981711,0.0,27.981711,0.0,0.0,34.519345,0.0,34.519345


### Additional useful function

In [64]:
# get_confusion_category_observations_df takes a "confusion class" {'TN', 'FP', 'FN', 'TP'},
# a feature dataset (X), the true labels (y), the predicted probabilities and a threshold,
# and returns the portion of the feature dataset corresponding to the given class.

# For example, to get the True Positive data points with a 0.7 threshold:
confusion_category = 'TP'

bc.get_confusion_category_observations_df(
    confusion_category=confusion_category,
    X_data=X_test,
    true_y=y_test,
    predicted_proba=test_predicted_proba,
    threshold=0.7,  # default = 0.5
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
30,-2.601367,-1.51421,-0.081816,2.257485,-5.195684,-2.953742,3.949413,2.76187,1.651492,0.180683,-1.664504,-1.396264,-3.826065,0.941951,-2.004694,1.038209,-0.183376,1.504055,-0.797956,-0.512469
47,-2.568051,-4.736157,3.401512,0.614939,-0.390128,-3.364416,-3.667949,4.046054,3.568885,1.479944,3.078459,2.142917,1.48118,0.686454,0.416553,1.044883,0.718451,-1.232943,0.280403,1.074427
53,1.466142,2.557351,4.432927,-1.129646,-0.673413,-3.782365,-1.112528,3.371804,4.870778,2.628418,2.974501,3.786003,0.01542,-1.213112,0.290725,1.351958,0.576588,-1.929523,-0.327521,1.16338
100,-0.683903,-1.137473,2.989311,-2.349425,-2.312612,-5.200242,0.138438,3.786642,2.184161,4.529078,2.959609,1.633566,0.520825,1.296019,-0.086999,-0.778458,0.803152,1.031072,-0.212475,-0.237224
149,-3.892485,-0.50545,-1.10924,1.071018,-2.246515,-7.147058,4.757241,-0.231286,-1.42023,-0.60719,-1.24541,0.021053,-0.001838,0.741768,0.08832,1.367268,1.927205,-0.486881,-1.175421,1.039506
162,-3.049729,-3.784003,1.107009,-0.201179,0.873662,-3.947325,-2.886823,-0.819648,5.083153,0.85056,4.091439,0.033962,-2.147115,-2.442134,-0.254247,0.827896,-3.532146,0.291766,-0.181126,0.280283
192,0.62663,2.995032,1.472569,5.170367,-0.489948,-3.800033,-1.06871,0.274598,1.768753,1.10827,4.649526,-2.272895,1.402271,-1.062539,0.290058,0.188573,0.997652,-0.855024,-2.532455,0.466048
