In [1]:
from lzt_utils.dataset import LztDataset
from lzt_utils.constants import N_RINGS
from lzt_utils.root import rdf_column_names, rdf_to_pandas, open_vector
from lzt_utils import formulas
import lzt_utils.plotting.pyplot as lplt
import mplhep
import matplotlib.pyplot as plt
import os
from pathlib import Path
import ROOT
import numpy as np
import pandas as pd
from itertools import product
ROOT.EnableImplicitMT()
plt.style.use(mplhep.style.ROOT)

In [2]:
models_dir = Path('/', 'root', 'ext_data', 'aprendizado-profundo', 'models-2024-12-15')
# models_dir = Path('/', 'root', 'ext_data', 'tests', 'lorenzetti')

# Training Progression

In [3]:
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    history_all_folds = pd.read_csv(model_dir / 'history_all_folds.csv', index_col=0)
    auc_all_folds = pd.read_csv(model_dir / 'auc_all_folds.csv', index_col=0).set_index(
        ['dataset', 'fold']
    )

    all_lines = None
    color_cycle = lplt.get_plt_color_cycle()
    for (ifold,), training_history in history_all_folds.groupby(['fold']):
        fig, ax = plt.subplots(figsize=(15, 8))
        ax.grid()
        twinx_ax = ax.twinx()
        lines = ax.plot(training_history['loss'],
                              label='Train',
                              color=color_cycle[0]
                    )
        lines += ax.plot(training_history['val_loss'],
                            label='Val',
                            linestyle='--',
                            color=color_cycle[1])
        diff = training_history['val_loss'] - training_history['loss']
        lines += twinx_ax.plot(diff,
                                   label='Val - Train',
                                   color=color_cycle[2],
                                   linestyle='-.')
        labels = [l.get_label() for l in lines]
        ax.legend(lines, labels, fontsize='x-small')
        ax.set(title=f'Loss progression during training',
               xlabel='Epoch', ylabel='Loss')
        twinx_ax.set(ylabel='Train - Val')
        fig.tight_layout()
        fig.savefig(model_dir / f'loss_progression_fold_{ifold:02d}.png')
        plt.close(fig)

# Output distribution

In [4]:
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    for fold_dir in model_dir.iterdir():
        if not fold_dir.is_dir():
            continue
        fold = int(fold_dir.name.split('_')[-1])
        predictions_df = pd.read_csv(fold_dir / 'predictions.csv', index_col=0)
        for dataset in ['train', 'val']:
            fig, ax = plt.subplots(figsize=(10,8))
            for class_ in range(2):
                class_name = 'Signal' if class_ else 'Background'
                is_class = predictions_df['y_true'] == class_
                is_val = predictions_df['dataset'] == dataset
                class_predictions = predictions_df.loc[is_val & is_class, 'y_pred']
                lplt.histplot(
                    class_predictions,
                    bin_min=0, bin_max=1, ax=ax,
                    hist_kwargs=dict(label=class_name,
                                     histtype='stepfilled',
                                     density=True,
                                     alpha=0.7),
                    metrics=False)
            ax.legend()
            ax.set(title=f'Model output for fold {fold} & dataset {dataset}',
                   xlabel='Model output', ylabel='Density')
            fig.tight_layout()
            fig.savefig(fold_dir / f'output_distribution_fold_{fold:02d}_{dataset}.png')
            plt.close(fig)

# ROC Curves

In [5]:
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    metrics_all_folds = pd.read_csv(model_dir / 'metrics_all_folds.csv', index_col=0)
    auc_all_folds = pd.read_csv(model_dir / 'auc_all_folds.csv', index_col=0).set_index(
        ['dataset', 'fold']
    )
    for (dataset,), dataset_metrics in metrics_all_folds.groupby([ 'dataset']):
        fig, ax = plt.subplots(figsize=(15, 8))
        ax.grid()
        second_ax = fig.add_axes([0.43, 0.2, 0.5, 0.5])
        second_ax.grid()
        for (fold,), fold_metrics in dataset_metrics.groupby(['fold']):
            current_auc = auc_all_folds.loc[(dataset, fold), 'auc']
            lines = lplt.plot_roc_curve(
                fold_metrics['tpr'],
                fold_metrics['fpr'],
                ax=ax,
                add_diagonal=(not fold),
                diagonal_kwargs=dict(label='Random Guess'),
                plot_kwargs=dict(label=f'Fold {fold}, AUC = {current_auc:.5f}', linewidth=3),
                axes_set=dict(xlim=(0, 1.01), ylim=(0, 1.01), title='')
            )
            lplt.plot_roc_curve(
                fold_metrics['tpr'],
                fold_metrics['fpr'],
                ax=second_ax,
                add_diagonal=False,
                plot_kwargs=dict(color=lines[0].get_color()),
                axes_set=dict(xlim=(0, 0.04), ylim=(0.95, 1.01),
                              xlabel='', ylabel='', title='')
            )
        ax.legend()
        ax.set_title(f'ROC Curve for {dataset} data')
        fig.tight_layout()
        fig.savefig(model_dir / f'roc_curve_{dataset}.png')
        plt.close(fig)

  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()


# SP Index

In [6]:
max_sps = {
    'model': [],
    'fold': [],
    'max_sp': [],
    'threshold': [],
    'dataset': [],
    'tpr': [],
    'fpr': []
}
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    metrics_all_folds = pd.read_csv(model_dir / 'metrics_all_folds.csv')
    model_name = model_dir.name
    for (dataset,), dataset_metrics in metrics_all_folds.groupby([ 'dataset']):
        fig, ax = plt.subplots(figsize=(15, 8))
        ax.grid()
        second_ax = fig.add_axes([0.38, 0.2, 0.5, 0.5])
        second_ax.grid()
        for (fold,), fold_metrics in dataset_metrics.groupby(['fold']):
            if not fold:
                ax.axhline(1, color='black', linestyle='--', label='Perfect SP Index')
            lines = ax.plot(
                fold_metrics['thresholds'],
                fold_metrics['sp'],
                label=f'Fold {fold}',
                linewidth=3
            )
            second_ax.plot(
                fold_metrics['thresholds'],
                fold_metrics['sp'],
                color=lines[0].get_color(),
            )
            idxmax = fold_metrics['sp'].idxmax()
            max_sps['model'].append(model_name)
            max_sps['fold'].append(fold)
            max_sps['max_sp'].append(fold_metrics.loc[idxmax, 'sp'])
            max_sps['threshold'].append(fold_metrics.loc[idxmax, 'thresholds'])
            max_sps['dataset'].append(dataset)
            max_sps['tpr'].append(fold_metrics.loc[idxmax, 'tpr'])
            max_sps['fpr'].append(fold_metrics.loc[idxmax, 'fpr'])
        ax.set(
            xlabel='Threshold',
            ylabel='SP Index',
            title=f'SP Index vs Threshold for {dataset} data',
        )
        ax.legend(loc='lower left')
        second_ax.set(
            xlim=(0, 0.1),
            ylim=(0.8, 1.1),
            xlabel='',
            ylabel='',
            title='',
        )
        fig.tight_layout()
        fig.savefig(model_dir / f'sp_{dataset}.png')
        plt.close(fig)

  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()


In [7]:
max_sps = pd.DataFrame(max_sps)
max_sps.to_csv(models_dir / 'max_sps.csv')
max_sps

Unnamed: 0,model,fold,max_sp,threshold,dataset,tpr,fpr
0,mlpv1,0,0.992042,0.424749,train,0.993616,0.009530
1,mlpv1,1,0.991110,0.501672,train,0.995002,0.012775
2,mlpv1,2,0.991152,0.501672,train,0.994415,0.012106
3,mlpv1,3,0.991942,0.434783,train,0.994540,0.010654
4,mlpv1,4,0.991041,0.481605,train,0.995128,0.013038
...,...,...,...,...,...,...,...
105,cnnv4,0,0.996818,0.585284,val,0.998992,0.005354
106,cnnv4,1,0.996447,0.615385,val,0.998656,0.005760
107,cnnv4,2,0.995812,0.638796,val,0.997648,0.006023
108,cnnv4,3,0.996291,0.779264,val,0.998488,0.005904


In [8]:
max_sp_stats = max_sps.groupby(['model', 'dataset']).agg(
    mean_auc=('max_sp', 'mean'),
    std_auc=('max_sp', 'std'),
).reset_index()
max_sp_stats.to_csv(models_dir / 'max_sp_stats.csv')
max_sp_stats

Unnamed: 0,model,dataset,mean_auc,std_auc
0,cnnv1,train,0.994964,0.000366
1,cnnv1,val,0.994825,0.000709
2,cnnv2,train,0.995622,0.00048
3,cnnv2,val,0.995242,0.000816
4,cnnv3,train,0.995182,0.000705
5,cnnv3,val,0.995098,0.000728
6,cnnv4,train,0.996503,0.000231
7,cnnv4,val,0.996279,0.000388
8,mlpv1,train,0.991457,0.000491
9,mlpv1,val,0.991425,0.000779


In [9]:
def select_per_fold(x: pd.DataFrame) -> pd.Series:
    idxmax = x['max_sp'].idxmax()
    return x.loc[idxmax].drop('model')
max_sps[max_sps['dataset'] == 'val'] \
    .groupby(['model']) \
    .apply(select_per_fold) \
    .sort_values(['max_sp', 'threshold'], ascending=[False, True])


  .apply(select_per_fold) \


Unnamed: 0_level_0,fold,max_sp,threshold,dataset,tpr,fpr
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cnnv4,0,0.996818,0.585284,val,0.998992,0.005354
cnnv2,1,0.996292,0.70903,val,0.99916,0.006573
cnnv3,1,0.996171,0.725753,val,0.99832,0.005975
cnnv1,1,0.99604,0.491639,val,0.99832,0.006238
mlpv7,0,0.994005,0.792642,val,0.997984,0.009967
mlpv4,0,0.992616,0.505017,val,0.996136,0.010899
mlpv1,0,0.992435,0.424749,val,0.994457,0.009584
mlpv6,1,0.991454,0.588629,val,0.994289,0.011377
mlpv3,1,0.991035,0.498328,val,0.994793,0.012715
mlpv2,1,0.990963,0.61204,val,0.996304,0.014364


# AUC comparison

In [10]:
aucs = []
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    auc_per_fold = pd.read_csv(model_dir / 'auc_all_folds.csv', index_col=0)
    auc_per_fold['model'] = model_dir.name
    aucs.append(auc_per_fold)
aucs = pd.concat(aucs, axis=0)
aucs_stats = aucs.groupby(['model', 'dataset']).agg(
    mean_auc=('auc', 'mean'),
    std_auc=('auc', 'std'),
).reset_index()
best = aucs.groupby('dataset').apply(
    lambda x: x.sort_values('auc', ascending=False).iloc[0]
)

  best = aucs.groupby('dataset').apply(


In [11]:
aucs_stats

Unnamed: 0,model,dataset,mean_auc,std_auc
0,cnnv1,train,0.999295,5.8e-05
1,cnnv1,val,0.999242,9e-05
2,cnnv2,train,0.999328,9.5e-05
3,cnnv2,val,0.999277,0.000183
4,cnnv3,train,0.999311,6.5e-05
5,cnnv3,val,0.999257,5.4e-05
6,cnnv4,train,0.999417,2.7e-05
7,cnnv4,val,0.999368,6.7e-05
8,mlpv1,train,0.998529,0.000189
9,mlpv1,val,0.998466,0.000253


In [12]:
aucs_stats \
    .sort_values('mean_auc', ascending=False) \
    .reset_index(drop=True) \
    .to_csv(models_dir / 'aucs_stats.csv')

In [13]:
best

Unnamed: 0_level_0,fold,dataset,auc,model
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train,4,train,0.999443,cnnv4
val,2,val,0.999462,cnnv2
