In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier,TabNetRegressor
from sklearn.preprocessing import StandardScaler
import torch
import ROOT as root
import numpy as np
import pandas as pd
import sklearn
import math
import uproot
import os
import sys
import root_pandas as rp
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
from sklearn import metrics

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings that could point
# at real problems are hidden as well; consider narrowing it.
warnings.simplefilter('ignore')

In [None]:
# Run-mode switches. Neither flag is read by any of the visible cells.
apply_on_files = 1 # 0 or 1
do_grid_search = 0

# Feature columns selected from the train/test frames and fed to the scaler
# and the classifier.
# NOTE(review): the entries below look like placeholders ('' / '....');
# the real column names must be filled in before the notebook can run.
cs_vars = ['','','','....']

# Label column (kept as a one-element list so it can be concatenated with cs_vars).
target_var = ['Signal']
# NOTE(review): not referenced by any visible cell; presumably the name of the
# classifier output column — confirm.
contSupp_var = ['contSupp_BB_lep']

# NOTE(review): `a_v` is neither imported nor defined in the visible cells; it
# must be in scope and expose `input_dir` for a fresh-kernel run — confirm
# where it comes from.
train_path = a_v.input_dir + 'BDT_contSupp_BB_train_2023-12-06_RW_2.parq'
test_path = a_v.input_dir + 'BDT_contSupp_BB_test_2023-12-06.parq'

# Load both splits from parquet using the pyarrow engine.
train = pd.read_parquet(train_path, engine='pyarrow')
test = pd.read_parquet(test_path, engine='pyarrow')

In [None]:
# Keep only the columns the model needs. The explicit `.copy()` makes `train_`
# and `test_` independent frames, so the scaled values written below land in
# them instead of in a column slice of `train` / `test` (the chained-assignment
# pattern pandas reports with SettingWithCopyWarning).
train_ = train[cs_vars + target_var + ['contReweight']].copy()
test_ = test[cs_vars + target_var].copy()

# Standardise the feature columns: the scaler is fitted on the training split
# only and the same transformation is then applied to the test split.
scaler = StandardScaler()
train_[cs_vars] = scaler.fit_transform(train_[cs_vars])
test_[cs_vars] = scaler.transform(test_[cs_vars])

# Plain numpy arrays for the classifier: features, per-row weights, and labels.
x_train = train_[cs_vars].values
x_train_weights = train_['contReweight'].values
y_train = train_[target_var].values.flatten()  # 1-D label vector

x_test = test_[cs_vars].values
y_test = test_[target_var].values.flatten()  # 1-D label vector

In [None]:
# Keyword arguments for the TabNet model constructor.
# `optimizer_params` is not set here (a previous `dict(lr = 1e-3)` override
# is disabled).
tabnet_params = {
    "n_d": 15,
    "n_a": 15,
    "n_steps": 8,
    "seed": 0,
    "verbose": 1,
}

In [None]:
# Build the classifier from the shared hyper-parameters.
clf = TabNetClassifier(**tabnet_params)

# Options for the training run, gathered in one place.
# - eval_set: the test split is used as the monitored evaluation set.
# - patience=0: NOTE(review) per the pytorch-tabnet docs this disables early
#   stopping, so all `max_epochs` are run — confirm that is intended.
# - weights: per-row values taken from the 'contReweight' column.
fit_kwargs = {
    "eval_set": [(x_test, y_test)],
    "patience": 0,
    "max_epochs": 100,
    "eval_metric": ['auc'],
    "num_workers": 2,
    "weights": x_train_weights,
}
clf.fit(x_train, y_train, **fit_kwargs)

In [None]:
# Plot the AUC recorded in the training history under "val_0_auc".
# (The training loss in clf.history["loss"] is not drawn; that line was disabled.)
val_auc_history = clf.history["val_0_auc"]
history_ax = plt.gca()
history_ax.plot(val_auc_history, label="test")
history_ax.legend()
plt.show()

In [None]:
# One row per feature name, holding the importance reported by the classifier.
feat_imp = pd.DataFrame(clf.feature_importances_, index=cs_vars)

# Work on a derived frame: add the row-wise mean as "imp_mean" and order the
# features by it (ascending), leaving `feat_imp` untouched.
feature_importance = (
    feat_imp
    .assign(imp_mean=feat_imp.mean(axis=1))
    .sort_values("imp_mean")
)

# Horizontal bar chart of the sorted importances.
bar_labels = feature_importance.index.values
bar_widths = feature_importance["imp_mean"]
plt.tick_params(labelsize=18)
plt.barh(bar_labels, bar_widths)
plt.title("feature_importance", fontsize=18)

In [None]:
# Accuracy on the test split, computed from hard class labels.
# The labels are predicted here so the cell does not depend on a `predictions`
# variable created by another cell (which holds class probabilities, a format
# accuracy_score does not accept).
y_test_pred = clf.predict(x_test)
print(sklearn.metrics.accuracy_score(y_test, y_test_pred))

In [None]:
# Global matplotlib style applied to every figure drawn after this point.
plot_style = {
    # text and canvas
    'font.size': 20,
    'figure.figsize': (12, 10),
    # grid (switched off; the style values apply if it is enabled)
    'axes.grid': False,
    'grid.linestyle': '-',
    'grid.alpha': 0.2,
    # markers
    'lines.markersize': 5.0,
    # x ticks: inward, with minor ticks, mirrored on the top edge
    'xtick.minor.visible': True,
    'xtick.direction': 'in',
    'xtick.major.size': 10.0,
    'xtick.minor.size': 5.0,
    'xtick.top': True,
    # y ticks: inward, with minor ticks, mirrored on the right edge
    'ytick.minor.visible': True,
    'ytick.direction': 'in',
    'ytick.major.size': 10.0,
    'ytick.minor.size': 5.0,
    'ytick.right': True,
    # error bars without caps
    'errorbar.capsize': 0.0,
}
plt.rcParams.update(plot_style)

# Make sure the output directory for saved figures exists. Figures are written
# to the relative path 'BDT_plots/', so the directory is created relative to
# the working directory as well (the previous '/../BDT_plots' resolved to the
# filesystem root). `exist_ok=True` makes the call a no-op when the directory
# is already there, so re-running the cell is safe.
os.makedirs('BDT_plots', exist_ok=True)

# ROC inputs for both splits. `predict_proba` returns one row per sample with
# one column per class; column 1 is used as the score, taken with a single
# array slice instead of a per-row Python loop.

# Training split
predictions = clf.predict_proba(x_train)
y_pred_proba = predictions[:, 1]
fpr_train, tpr_train, threshold_train = metrics.roc_curve(y_train, y_pred_proba)
auc_train = metrics.roc_auc_score(y_train, y_pred_proba)

# Test split (`predictions` / `y_pred_proba` end up holding the test-split values)
predictions = clf.predict_proba(x_test)
y_pred_proba = predictions[:, 1]
fpr_test, tpr_test, threshold_test = metrics.roc_curve(y_test, y_pred_proba)
auc_test = metrics.roc_auc_score(y_test, y_pred_proba)

# Draw the train and test ROC curves on the current axes, with the AUC
# (rounded to 3 decimals) in each legend entry, and save the figure as a PDF.
roc_ax = plt.gca()
roc_ax.plot(fpr_train, tpr_train, label=f"Train AUC={round(auc_train, 3)}")
roc_ax.plot(fpr_test, tpr_test, label=f"Test AUC={round(auc_test, 3)}")
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc=4)  # loc=4 is the lower-right corner
plt.savefig('BDT_plots/contSupp_BB_lep_ROC_curve.pdf', bbox_inches='tight')