In [1]:
from lzt_utils.dataset import LztDataset
from lzt_utils.constants import N_RINGS
from lzt_utils.root import rdf_column_names, rdf_to_pandas, open_vector
from lzt_utils import formulas
import lzt_utils.plotting.pyplot as lplt
import mplhep
import matplotlib.pyplot as plt
import os
from pathlib import Path
import ROOT
import numpy as np
import pandas as pd
from itertools import product
# Enable ROOT's implicit multi-threading for the rest of the session.
ROOT.EnableImplicitMT()
# Use the mplhep "ROOT" style sheet for every matplotlib figure created below.
plt.style.use(mplhep.style.ROOT)

In [2]:
# Root directory that is scanned below for one sub-directory per trained model.
# Alternative location: Path('/', 'root', 'ext_data', 'tests', 'lorenzetti')
models_dir = Path('/root/ext_data/aprendizado-profundo/models-2024-12-15')

# Training Progression

In [3]:
# For every model directory, plot the training and validation loss of each
# fold against the history index, overlay their difference on a secondary
# y-axis, and save one PNG per fold inside the model directory.
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    history_all_folds = pd.read_csv(model_dir / 'history_all_folds.csv', index_col=0)

    color_cycle = lplt.get_plt_color_cycle()
    # Grouping by a one-element list yields one-element tuple keys.
    for (ifold,), training_history in history_all_folds.groupby(['fold']):
        fig, ax = plt.subplots(figsize=(15, 8))
        ax.grid()
        # The loss gap lives on its own scale, so it gets a twin y-axis.
        twinx_ax = ax.twinx()

        lines = ax.plot(training_history['loss'],
                        label='Train',
                        color=color_cycle[0])
        lines += ax.plot(training_history['val_loss'],
                         label='Val',
                         linestyle='--',
                         color=color_cycle[1])
        diff = training_history['val_loss'] - training_history['loss']
        lines += twinx_ax.plot(diff,
                               label='Val - Train',
                               color=color_cycle[2],
                               linestyle='-.')

        # Build a single legend from the lines of both axes.
        labels = [line.get_label() for line in lines]
        ax.legend(lines, labels, fontsize='x-small')
        ax.set(title='Loss progression during training',
               xlabel='Epoch', ylabel='Loss')
        # Must match the plotted quantity (val_loss - loss) and its legend entry.
        twinx_ax.set(ylabel='Val - Train')
        fig.tight_layout()
        fig.savefig(model_dir / f'loss_progression_fold_{ifold:02d}.png')
        # Close explicitly: many figures are created in this loop.
        plt.close(fig)

# Output distribution

In [4]:
# For each fold directory of each model, histogram the model output of both
# classes (density-normalised, overlaid) separately for the 'train' and 'val'
# rows of predictions.csv, and save each figure in the fold directory.
fold_dirs = (
    fold_dir
    for model_dir in models_dir.iterdir() if model_dir.is_dir()
    for fold_dir in model_dir.iterdir() if fold_dir.is_dir()
)
for fold_dir in fold_dirs:
    # The fold number is the text after the last underscore of the directory name.
    fold = int(fold_dir.name.rsplit('_', 1)[-1])
    predictions_df = pd.read_csv(fold_dir / 'predictions.csv', index_col=0)
    for dataset in ('train', 'val'):
        in_dataset = predictions_df['dataset'] == dataset
        fig, ax = plt.subplots(figsize=(10, 8))
        # Label 0 is plotted first as 'Background', label 1 second as 'Signal'.
        for class_label, class_name in enumerate(('Background', 'Signal')):
            selection = in_dataset & (predictions_df['y_true'] == class_label)
            lplt.histplot(
                predictions_df.loc[selection, 'y_pred'],
                bin_min=0, bin_max=1, ax=ax,
                hist_kwargs={
                    'label': class_name,
                    'histtype': 'stepfilled',
                    'density': True,
                    'alpha': 0.7,
                },
                metrics=False,
            )
        ax.legend()
        ax.set(
            title=f'Model output for fold {fold} & dataset {dataset}',
            xlabel='Model output',
            ylabel='Density',
        )
        fig.tight_layout()
        fig.savefig(fold_dir / f'output_distribution_fold_{fold:02d}_{dataset}.png')
        plt.close(fig)

# ROC Curves

In [5]:
# For every model, draw the ROC curve of each fold on one figure per dataset,
# repeat the curves in a zoomed inset (x in [0, 0.04], y in [0.95, 1.01]),
# and save the figure in the model directory.
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    metrics_all_folds = pd.read_csv(model_dir / 'metrics_all_folds.csv', index_col=0)
    # Indexed by (dataset, fold) so the AUC of each curve is a direct lookup.
    auc_all_folds = pd.read_csv(model_dir / 'auc_all_folds.csv', index_col=0).set_index(
        ['dataset', 'fold']
    )
    for (dataset,), dataset_metrics in metrics_all_folds.groupby(['dataset']):
        fig, ax = plt.subplots(figsize=(15, 8))
        ax.grid()
        # Inset axes placed in figure coordinates: [left, bottom, width, height].
        second_ax = fig.add_axes([0.43, 0.2, 0.5, 0.5])
        second_ax.grid()
        for fold_position, ((fold,), fold_metrics) in enumerate(
            dataset_metrics.groupby(['fold'])
        ):
            current_auc = auc_all_folds.loc[(dataset, fold), 'auc']
            lines = lplt.plot_roc_curve(
                fold_metrics['tpr'],
                fold_metrics['fpr'],
                ax=ax,
                # Draw the reference diagonal exactly once, on the first fold
                # plotted, regardless of how the folds are numbered.
                add_diagonal=(fold_position == 0),
                diagonal_kwargs=dict(label='Random Guess'),
                plot_kwargs=dict(label=f'Fold {fold}, AUC = {current_auc:.5f}'),
                axes_set=dict(xlim=(0, 1.01), ylim=(0, 1.01), title='')
            )
            # Same curve in the inset, reusing the colour chosen on the main axes.
            lplt.plot_roc_curve(
                fold_metrics['tpr'],
                fold_metrics['fpr'],
                ax=second_ax,
                add_diagonal=False,
                plot_kwargs=dict(color=lines[0].get_color()),
                axes_set=dict(xlim=(0, 0.04), ylim=(0.95, 1.01),
                              xlabel='', ylabel='', title='')
            )
        ax.legend()
        ax.set_title(f'ROC Curve for {dataset} data')
        # NOTE(review): the recorded output of this cell suggests tight_layout
        # warns here, presumably because of the manually placed inset - confirm.
        fig.tight_layout()
        fig.savefig(model_dir / f'roc_curve_{dataset}.png')
        plt.close(fig)

  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()


# SP Index

In [6]:
# For every model, plot the SP index of each fold as a function of the
# decision threshold, one figure per dataset, with a zoomed inset
# (x in [0, 0.1], y in [0.8, 1.1]), and save it in the model directory.
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    metrics_all_folds = pd.read_csv(model_dir / 'metrics_all_folds.csv')
    for (dataset,), dataset_metrics in metrics_all_folds.groupby(['dataset']):
        fig, ax = plt.subplots(figsize=(15, 8))
        ax.grid()
        # Inset axes placed in figure coordinates: [left, bottom, width, height].
        second_ax = fig.add_axes([0.38, 0.2, 0.5, 0.5])
        second_ax.grid()
        # Reference line drawn once per figure, before any fold curve, so it
        # does not depend on a fold being labelled 0.
        ax.axhline(1, color='black', linestyle='--', label='Perfect SP Index')
        for (fold,), fold_metrics in dataset_metrics.groupby(['fold']):
            lines = ax.plot(
                fold_metrics['thresholds'],
                fold_metrics['sp'],
                label=f'Fold {fold}',
            )
            # Same curve in the inset, reusing the colour chosen on the main axes.
            second_ax.plot(
                fold_metrics['thresholds'],
                fold_metrics['sp'],
                color=lines[0].get_color(),
            )
        ax.set(
            xlabel='Threshold',
            ylabel='SP Index',
            title=f'SP Index vs Threshold for {dataset} data',
        )
        ax.legend(loc='lower left')
        second_ax.set(
            xlim=(0, 0.1),
            ylim=(0.8, 1.1),
            xlabel='',
            ylabel='',
            title='',
        )
        # NOTE(review): the recorded output of this cell suggests tight_layout
        # warns here, presumably because of the manually placed inset - confirm.
        fig.tight_layout()
        fig.savefig(model_dir / f'sp_{dataset}.png')
        plt.close(fig)

  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()
  fig.tight_layout()


# AUC comparison

In [7]:
# Gather the per-fold AUC table of every model into one frame (`aucs`), then
# derive:
#   aucs_stats - mean and standard deviation of the AUC per (model, dataset)
#   best       - the single highest-AUC row of each dataset, indexed by dataset
auc_frames = []
for model_dir in models_dir.iterdir():
    if not model_dir.is_dir():
        continue
    auc_per_fold = pd.read_csv(model_dir / 'auc_all_folds.csv', index_col=0)
    # Tag every row with the model it belongs to before stacking.
    auc_per_fold['model'] = model_dir.name
    auc_frames.append(auc_per_fold)
aucs = pd.concat(auc_frames, axis=0)

aucs_stats = aucs.groupby(['model', 'dataset']).agg(
    mean_auc=('auc', 'mean'),
    std_auc=('auc', 'std'),
).reset_index()

# Sort by AUC and keep the first (highest) row of each dataset. This avoids
# groupby().apply() acting on the grouping column, while keeping 'dataset'
# both as the index and as a regular column of the result.
best = (
    aucs.sort_values('auc', ascending=False)
    .drop_duplicates('dataset')
    .set_index('dataset', drop=False)
    .sort_index()
)

  best = aucs.groupby('dataset').apply(


In [12]:
# Validation-set AUC statistics per model, highest mean AUC first.
is_val_row = aucs_stats['dataset'].eq('val')
aucs_stats.loc[is_val_row].sort_values(by='mean_auc', ascending=False)

Unnamed: 0,model,dataset,mean_auc,std_auc
7,cnnv4,val,0.999368,6.7e-05
3,cnnv2,val,0.999277,0.000183
5,cnnv3,val,0.999257,5.4e-05
1,cnnv1,val,0.999242,9e-05
21,mlpv7,val,0.998917,5e-05
15,mlpv4,val,0.998638,0.000138
9,mlpv1,val,0.998466,0.000253
19,mlpv6,val,0.998187,0.000226
13,mlpv3,val,0.998056,0.000122
11,mlpv2,val,0.898333,0.222682


In [9]:
# Display the highest-AUC row selected for each dataset.
best

Unnamed: 0_level_0,fold,dataset,auc,model
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train,4,train,0.999443,cnnv4
val,2,val,0.999462,cnnv2
