In [None]:
from PyFastBDT import FastBDT
from pytorch_tabnet.tab_model import TabNetClassifier,TabNetRegressor
from sklearn.preprocessing import StandardScaler
import torch
import basf2_mva
import modularAnalysis as ma
import ROOT as root
import numpy as np
import pandas as pd
import sklearn
import math
import uproot
import os
import sys
import root_pandas as rp
import analysis_variables as a_v
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
from sklearn import metrics

# Install a blanket 'ignore' filter: from here on no Python warning of any
# category is shown in this session.
warnings.simplefilter('ignore')

In [None]:
# --- Run switches (0 = skip, 1 = run) -------------------------------------
apply_on_files = 1  # guards the cell that writes BDT scores into parquet files
do_grid_search = 0  # guards the hyper-parameter scan cell

# --- Column names ---------------------------------------------------------
# Classifier input columns (placeholder entries).
cs_vars = ['', '', '', '....']

# Both are one-element lists, so train[target_var] yields a DataFrame.
target_var = ['Signal']
contSupp_var = ['contSupp_BB_lep']

# --- Input samples --------------------------------------------------------
train_path = a_v.input_dir + 'BDT_contSupp_BB_train_2023-12-06_RW_2.parq'
test_path = a_v.input_dir + 'BDT_contSupp_BB_test_2023-12-06.parq'

# Read the training sample first, then the test sample.
train, test = (
    pd.read_parquet(sample_path, engine='pyarrow')
    for sample_path in (train_path, test_path)
)

In [None]:
# Training sample: classifier inputs, labels and per-event weights.
x_train, y_train = train[cs_vars], train[target_var]
x_train_weights = train['contReweight']

# Test sample: classifier inputs and labels (no weights are read for it).
x_test, y_test = test[cs_vars], test[target_var]

In [None]:
if do_grid_search == 1:
    # Scan nTrees x depth x shrinkage and log the train/test AUC of every
    # combination to a text file.
    #
    # stdout is redirected through context managers so that it is always
    # restored (and the log file closed) even if a fit or the AUC computation
    # raises; a bare `sys.stdout = open(...)` would otherwise leave the
    # notebook silently printing into the file.
    from contextlib import redirect_stdout

    with open("grid_search_contSuppBB_lep.txt", "w") as grid_log, redirect_stdout(grid_log):
        for trees in [150,200,250,300]:
            for depth_ in [2,3,4]:
                for shrink in [0.1,0.2,0.3]:
                    grid_search = FastBDT.Classifier(nTrees=trees,depth=depth_,shrinkage=shrink)
                    # NOTE(review): the final classifier is fitted with
                    # weights=x_train_weights, this scan is not - confirm
                    # whether the scan should use the same weights.
                    grid_search.fit(X=x_train, y=y_train)

                    # AUC on the sample the model was fitted on ...
                    y_pred_proba = grid_search.predict(x_train).tolist()
                    auc_train = metrics.roc_auc_score(y_train, y_pred_proba)
                    # ... and on the independent test sample.
                    y_pred_proba = grid_search.predict(x_test).tolist()
                    auc_test = metrics.roc_auc_score(y_test, y_pred_proba)

                    print('nTrees:',trees,'depth:',depth_,'shrinkage:',shrink,'Train AUC:',auc_train,'Test AUC:',auc_test)

In [None]:
# Final classifier: fixed hyper-parameters, fitted on the training sample
# using the per-event weights read from the `contReweight` column.
bdt_settings = dict(nTrees=300, depth=2, shrinkage=0.1)
clf = FastBDT.Classifier(**bdt_settings)
clf.fit(
    X=x_train,
    y=y_train,
    weights=x_train_weights,
)

In [None]:
if apply_on_files == 1:
    # Evaluate the trained classifier on a list of parquet files in parallel
    # and write each file back out with the BDT score added as a new column.

    extra = 1

    from multiprocessing import Process, Pool
    import multiprocessing.managers
    import tqdm

    import warnings
    warnings.filterwarnings("ignore")

    # Files to process (placeholder names).
    inFilePath = [
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
        a_v.input_dir + '....parq',
    ]

    def apply_bdt(i):
        """Score one parquet file with `clf` and save the result.

        Parameters
        ----------
        i : str
            Path of the input parquet file.

        The frame is read, `clf.predict` is evaluated on the `cs_vars`
        columns, the scores are stored in the new column
        `contSuppBDT_BB_output_lep_new`, and the frame is written to the
        input path with '_CSBB' replaced by '_CSBB_2'.
        """
        print('Working on', i)
        df = pd.read_parquet(i, engine='pyarrow')
        BDT_out = clf.predict(df[cs_vars]).tolist()
        df = df.assign(contSuppBDT_BB_output_lep_new = BDT_out)
        # NOTE(review): if the path does not contain '_CSBB' the replace is a
        # no-op and the input file is overwritten in place - confirm that all
        # input names carry that tag.
        df.to_parquet(i.replace('_CSBB','_CSBB_2'), compression='GZIP')

    n_cpus = 3
    run_list = inFilePath

    # The context manager guarantees the worker processes are cleaned up even
    # when one of the tasks raises; `map` blocks until every file is done, so
    # nothing is still running when the pool is torn down on exit.
    with Pool(n_cpus) as pool:
        pool_result = pool.map(apply_bdt, run_list)

In [None]:
# Global matplotlib style used by all plots below.
plt.rcParams.update({
          'font.size': 20,
          'figure.figsize': (12, 10),
          'axes.grid': False,
          'grid.linestyle': '-',
          'grid.alpha': 0.2,
          'lines.markersize': 5.0,
          'xtick.minor.visible': True,
          'xtick.direction': 'in',
          'xtick.major.size': 10.0,
          'xtick.minor.size': 5.0,
          'xtick.top': True,
          'ytick.minor.visible': True,
          'ytick.direction': 'in',
          'ytick.major.size': 10.0,
          'ytick.minor.size': 5.0,
          'ytick.right': True,
          'errorbar.capsize': 0.0,
        })

# Create the output directory that every savefig call below writes into.
# It has to be the same relative path as used in the savefig file names;
# exist_ok makes the cell safe to re-run.
os.makedirs('BDT_plots', exist_ok=True)

# ROC curve and AUC on the training sample.
y_pred_proba = clf.predict(x_train).tolist()
fpr_train, tpr_train, threshold_train = metrics.roc_curve(y_train,  y_pred_proba)
auc_train = metrics.roc_auc_score(y_train, y_pred_proba)

# ROC curve and AUC on the test sample.
y_pred_proba = clf.predict(x_test).tolist()
fpr_test, tpr_test, threshold_test = metrics.roc_curve(y_test,  y_pred_proba)
auc_test = metrics.roc_auc_score(y_test, y_pred_proba)

# Draw both ROC curves on the same axes.
plt.plot(fpr_train,tpr_train,label="Train AUC="+str(round(auc_train,3)))
plt.plot(fpr_test,tpr_test,label="Test AUC="+str(round(auc_test,3)))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
# contSupp_var is a one-element list: index it, otherwise the list repr
# ("['contSupp_BB_lep']") ends up in the file name.
plt.savefig(f'BDT_plots/{contSupp_var[0]}_ROC_curve.pdf', bbox_inches='tight')

In [None]:
# Build per-sample frames holding the classifier inputs, the BDT output and
# the truth label, then split the BDT output by class for the over-training
# check.
#
# The scored frames get their own names: x_train / x_test must keep holding
# only the classifier input columns, because other cells pass them directly
# to clf.predict and clf.individualFeatureImportance.
train_scored = train[cs_vars].assign(
    BDT_output=clf.predict(train[cs_vars]).tolist(),
    Signal=train['Signal'],
)
test_scored = test[cs_vars].assign(
    BDT_output=clf.predict(test[cs_vars]).tolist(),
    Signal=test['Signal'],
)

# BDT output split by truth label.
sig_train = train_scored.query('Signal==1').BDT_output
bkg_train = train_scored.query('Signal==0').BDT_output
sig_test = test_scored.query('Signal==1').BDT_output
bkg_test = test_scored.query('Signal==0').BDT_output

# Per-event weights 1/N so that each histogram integrates to one.
norm_sig_train = [1/len(sig_train)] * len(sig_train)
norm_bkg_train = [1/len(bkg_train)] * len(bkg_train)
norm_sig_test = [1/len(sig_test)] * len(sig_test)
norm_bkg_test = [1/len(bkg_test)] * len(bkg_test)

In [None]:
plt.rcParams.update({
          'figure.figsize': (14, 12),
        })

bins_ = 80
alpha_ = 0.6
range_ = (0, 1)


def _overlay_train_test(ax_main, ax_diff, train_out, test_out,
                        train_norm, test_norm, colour, diff_colour, name):
    """Overlay train/test BDT-output histograms and plot their difference.

    Parameters
    ----------
    ax_main : axes receiving the normalised train (step line) and test
        (points) histograms.
    ax_diff : axes receiving the bin-by-bin train - test difference.
    train_out, test_out : pandas Series of BDT outputs.
    train_norm, test_norm : per-event weights; only a single common value per
        sample is assumed (element 0 is used to scale the errors).
    colour : colour of the curves on ``ax_main``.
    diff_colour : colour of the points on ``ax_diff``.
    name : class name used in the legend labels.
    """
    hist_kw = dict(bins=bins_, range=range_)

    # Normalised contents and raw (unweighted) counts for both samples.
    test_counts, edges = np.histogram(test_out.to_numpy(), weights=test_norm, **hist_kw)
    test_raw, _ = np.histogram(test_out.to_numpy(), **hist_kw)
    train_counts, _ = np.histogram(train_out.to_numpy(), weights=train_norm, **hist_kw)
    train_raw, _ = np.histogram(train_out.to_numpy(), **hist_kw)

    centres = (edges[1:] + edges[:-1]) / 2

    # sqrt(N) error on the raw counts, scaled by the per-event weight.
    test_err = np.sqrt(test_raw) * test_norm[0]
    train_err = np.sqrt(train_raw) * train_norm[0]

    ax_main.errorbar(x=centres,
                     y=train_counts,
                     yerr=train_err,
                     marker=None,
                     color=colour,
                     drawstyle='steps-mid'
                     )

    ax_main.errorbar(x=centres,
                     y=test_counts,
                     yerr=test_err,
                     fmt='o',
                     color=colour,
                     label=f'Test-{name}',
                     capsize=2)

    difference = train_counts - test_counts
    # Propagate the two bin errors in quadrature. (A fraction of the
    # difference itself is not an uncertainty and is negative wherever
    # test > train, which errorbar does not accept.)
    difference_errors = np.sqrt(train_err**2 + test_err**2)
    half_bin_width = (range_[1] - range_[0]) / bins_ * 0.5
    ax_diff.errorbar(x=centres,
                     y=difference,
                     xerr=np.full(len(centres), half_bin_width),
                     yerr=difference_errors,
                     color=diff_colour,
                     label=f'Train-Test {name}',
                     fmt='o'
                     )


# Top panel: BDT output distributions; middle: signal train-test difference;
# bottom: background train-test difference.
fig, ax = plt.subplots(nrows=3,
                       ncols=1,
                       gridspec_kw={'height_ratios': [3, 1, 1]},
                       sharex=True,
)
plt.subplots_adjust(hspace=0.05)

# Filled histograms of the training samples.
ax[0].hist(sig_train,
         bins=bins_,
         range=range_,
         histtype='stepfilled',
         color=['lightskyblue'],
         label=['Train-Signal'],
         weights=norm_sig_train,
         edgecolor='deepskyblue',
         linewidth=2,
         alpha=alpha_
        )
ax[0].hist(bkg_train,
         bins=bins_,
         range=range_,
         histtype='stepfilled',
         color=['lightsalmon'],
         label=['Train-Background'],
         weights=norm_bkg_train,
         edgecolor='tomato',
         linewidth=2,
         alpha=alpha_
        )

# Background first, then signal: the legend re-ordering below relies on the
# order in which the labelled artists are created.
_overlay_train_test(ax[0], ax[2], bkg_train, bkg_test,
                    norm_bkg_train, norm_bkg_test,
                    'tomato', 'darkred', 'Background')
_overlay_train_test(ax[0], ax[1], sig_train, sig_test,
                    norm_sig_train, norm_sig_test,
                    'deepskyblue', 'navy', 'Signal')

#ax.set_title('BDT output')
ax[0].legend(loc=0, ncol=1, fontsize='small', frameon=False)
handles, labels = ax[0].get_legend_handles_labels()
# Swap the last two entries so that test-signal precedes test-background.
order = [0,1,3,2]
ax[0].legend([handles[idx] for idx in order],[labels[idx] for idx in order])
ax[1].legend(loc=0, ncol=1, fontsize='small', frameon=False)
ax[2].legend(loc=0, ncol=1, fontsize='small', frameon=False)
ax[2].set_xlabel('contSupp BDT output', fontsize=26)
ax[0].set_ylabel('Events/Bins norm', fontsize=26)
ax[1].set_ylabel('Diff.', fontsize=26)
ax[2].set_ylabel('Diff.', fontsize=26)
ax[0].set_xlim(0,1)
ax[0].set_ylim(0,None)
ax[1].set_ylim(-0.01,0.01)
ax[2].hlines(0, 0, 1, color='tomato')
ax[2].set_ylim(-0.01,0.01)
ax[1].hlines(0, 0, 1, color='deepskyblue')
plt.tight_layout()

# contSupp_var is a one-element list: index it, otherwise the list repr
# ("['contSupp_BB_lep']") ends up in the file name.
plt.savefig(f'BDT_plots/{contSupp_var[0]}_overtraining_BDToutput.pdf', bbox_inches='tight')

In [None]:
plt.rcParams.update({
          'figure.figsize': (14, 10),
        })

# Feature importance as returned by the classifier, keyed by feature index.
# Only the classifier input columns are passed in.
raw_importance = clf.individualFeatureImportance(train[cs_vars])

# Map the index keys onto the column names and sort by increasing importance.
# `importance_corr` keeps the full column names (used to select columns for
# the correlation matrices).
named_importance = {cs_vars[i]: raw_importance[i] for i in range(len(raw_importance))}
importance_corr = dict(sorted(named_importance.items(), key=lambda item: item[1]))

# Same ranking with the 'd0_' tag stripped, for display only. cs_vars itself
# is left untouched: it is still needed to index the data frames, and editing
# it in place would break every later (or repeated) train[cs_vars] lookup.
importance = {name.replace('d0_', ''): value for name, value in importance_corr.items()}

ax = pd.DataFrame.from_dict(importance,orient='index').plot(kind='barh',legend=False)
ax.set_xlabel('Importance')
plt.tight_layout()

# contSupp_var is a one-element list: index it, otherwise the list repr
# ("['contSupp_BB_lep']") ends up in the file name.
plt.savefig(f'BDT_plots/{contSupp_var[0]}_variables_importance.pdf', bbox_inches='tight')

In [None]:
plt.rcParams.update({
          'figure.figsize': (23, 20),
        })

corr_var = list(importance_corr.keys()) #+ ['BDT_output']

# The classifier must only see its input columns. x_train may additionally
# carry 'BDT_output' and 'Signal' if the over-training cell has rebound it,
# so strip them if present. The prediction is evaluated once and shared by
# the signal and background selections.
bdt_inputs = x_train.drop(columns=['BDT_output', 'Signal'], errors='ignore')
train_with_bdt = train.assign(BDT_output = clf.predict(bdt_inputs).tolist())


def _plot_corr_matrix(sample, axis_label, file_tag):
    """Draw and save the correlation matrix (in percent) of `sample`.

    Parameters
    ----------
    sample : DataFrame restricted to the columns to correlate.
    axis_label : text used as the x-axis label of the heat map.
    file_tag : suffix of the output file name ('sig' / 'bkg').

    Returns the figure, the axes and the correlation matrix.
    """
    corr_matrix = sample.corr() * 100
    # Strip the 'd0_' tag from the displayed variable names.
    corr_matrix.columns = corr_matrix.columns.str.replace('d0_', '')
    corr_matrix.index = corr_matrix.index.str.replace('d0_', '')

    fig, ax = plt.subplots()
    colormap = sn.color_palette("Blues",12)
    ax = sn.heatmap(corr_matrix, annot=True, fmt=".0f",linewidth=.5,cmap=colormap)
    ax.set(xlabel=axis_label, ylabel="")
    ax.invert_yaxis()
    ax.xaxis.set_ticks_position('top')
    plt.xticks(rotation='vertical')

    # contSupp_var is a one-element list: index it so that both files share
    # the same plain prefix instead of the list repr.
    plt.savefig(f'BDT_plots/{contSupp_var[0]}_corr_matrix_{file_tag}.pdf', bbox_inches='tight')
    return fig, ax, corr_matrix


sig = train_with_bdt.query('Signal==1')[corr_var]
fig, ax, corr_matrix = _plot_corr_matrix(sig, "Signal", 'sig')

bkg = train_with_bdt.query('Signal==0')[corr_var]
fig, ax, corr_matrix = _plot_corr_matrix(bkg, "Background", 'bkg')