In [1]:
import os
import numpy as np
import pandas as pd
from ringer.utils import significant_around, get_number_order
from ringer.data import load_var_infos
from ringer.latex import confidence_region_df_as_latex

In [2]:
# Derived quantities computed from the three distance columns
# ('boosted_el', 'boosted_jet', 'el_jet'). Each callable takes any object that
# supports lookup by those keys and returns the derived value.
ratios = {
    # boosted_el relative to boosted_jet
    'ratio1': lambda dists: dists['boosted_el'] / dists['boosted_jet'],
    # boosted_el relative to el_jet
    'ratio1_extra': lambda dists: dists['boosted_el'] / dists['el_jet'],
    # (boosted_jet - boosted_el) relative to el_jet
    'ratio2': lambda dists: (dists['boosted_jet'] - dists['boosted_el']) / dists['el_jet'],
    # plain difference boosted_jet - boosted_el (not normalised)
    'ratio3': lambda dists: dists['boosted_jet'] - dists['boosted_el'],
}
# LaTeX column headers used when the ratio columns are rendered as a table.
# Every ratio needs a distinct label: the labels become DataFrame column names,
# and a repeated label makes two table columns indistinguishable.
ratios_labels = {
    'ratio1': '$rd_1$',
    'ratio1_extra': '$rd_1\'$',
    'ratio2': '$rd_2$',
    'ratio3': '$rd_3$',
}
# Variable metadata table; its 'label', 'l2calo' and 'offline' columns are used
# further down to map data column names to LaTeX labels.
var_infos = load_var_infos()
# Number of folds iterated when comparing train and test orderings.
n_folds = 10
var_infos

Unnamed: 0_level_0,label,type,lower_lim,upper_lim,l2calo,offline,TaP,formula,description
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
et,$E_T$,var,0.0,inf,trig_L2_cl_et,el_et,,,transverse particle energy on the calorimeter
eta,$\eta$,var,-2.5,2.5,trig_L2_cl_eta,el_eta,,,pseudorapidity
reta,$R_{\eta}$,shower_shape,0.0,1.0,trig_L2_cl_reta,el_reta,,$R_{\eta} = \frac{E^{3x7}_{EM2}}{E^{7x7}_{EM2}}$,Ratio of the energy in 3x7 cells over the ener...
eratio,$E_{ratio}$,shower_shape,0.0,1.0,trig_L2_cl_eratio,el_eratio,,$E_{ratio} = \frac{E^{max}_{EM1} - E^{2^{nd}ma...,Ratio of the energy difference between the max...
f1,$f_1$,shower_shape,0.0,1.0,trig_L2_cl_f1,el_f1,,$f_1 = \frac{E^{total}_{EM1}}{E^{total}_{EM}}$,Ratio of the energy in the first layer to the ...
ehad1,$E_{had1}$,unidentified,-inf,inf,trig_L2_cl_ehad1,el_ehad1,,,unidentified
f3,$f_3$,shower_shape,0.0,1.0,trig_L2_cl_f3,el_f3,,$f_3 = \frac{E^{total}_{EM3}}{E^{total}_{EM}}$,Ratio of the energy in the third layer to the ...
wstot,$\omega_{stot}$,shower_shape,0.0,inf,trig_L2_cl_wstot,el_wstot,,$\omega_{stot} = \sqrt{\frac{\Sigma(E_i - E_{i...,Shower width er > 150 GeV only on EM1
weta2,$\omega_{\eta^2}$,shower_shape,0.0,inf,trig_L2_cl_weta2,el_weta2,,$\omega_{\eta2} = \sqrt{\frac{\Sigma E_i \: X\...,Lateral shower width on EM2
e2tsts1,e2tsts1,unidentified,-inf,inf,trig_L2_cl_e2tsts1,el_e2tsts1,,,unidentified


In [3]:
# Load the distances table; the first CSV column is used as the row index.
wass_distances = pd.read_csv(os.path.join('..', '..', 'data', 'wass_distances.csv'), index_col=0)
# Every entry of `ratios` is plain column arithmetic, so it is evaluated once on
# the whole frame (vectorised) instead of once per row via `apply(axis=1)`.
for name, func in ratios.items():
    wass_distances[name] = func(wass_distances)
# Index the rows by variable name; the same name appears once per fold/split
# (see the 'description' column).
wass_distances = wass_distances.set_index('name')
wass_distances.head(10)

Unnamed: 0_level_0,boosted_el,boosted_jet,el_jet,description,ratio1,ratio1_extra,ratio2,ratio3
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
trig_L2_cl_reta,0.003692,0.034417,0.031252,fold_0_train,0.107279,0.118144,0.983138,0.030725
trig_L2_cl_reta,0.003664,0.03435,0.031288,fold_0_test,0.106667,0.117108,0.980774,0.030686
trig_L2_cl_reta,0.003693,0.034409,0.031257,fold_1_train,0.107315,0.118137,0.982702,0.030717
trig_L2_cl_reta,0.003683,0.03442,0.031238,fold_1_test,0.106998,0.1179,0.983982,0.030737
trig_L2_cl_reta,0.003697,0.034414,0.031256,fold_2_train,0.107435,0.118288,0.982731,0.030716
trig_L2_cl_reta,0.003643,0.034379,0.031249,fold_2_test,0.105962,0.116577,0.983604,0.030736
trig_L2_cl_reta,0.007819,0.073079,0.066388,fold_3_train,0.106999,0.117782,0.982998,0.065259
trig_L2_cl_reta,0.007992,0.073187,0.066392,fold_3_test,0.1092,0.120376,0.981972,0.065195
trig_L2_cl_reta,0.003688,0.034408,0.031256,fold_4_train,0.10718,0.117989,0.982864,0.03072
trig_L2_cl_reta,0.003702,0.034427,0.031249,fold_4_test,0.107545,0.11848,0.983197,0.030724


In [4]:
def get_var_label_from_col(col_name: str) -> str:
    """Return the label registered in ``var_infos`` for a data column.

    Column names starting with ``'trig_L2'`` are matched against the
    ``'l2calo'`` column of ``var_infos``; every other name is matched against
    the ``'offline'`` column.

    Parameters
    ----------
    col_name : str
        Name of the data column to look up.

    Returns
    -------
    str
        The value of the ``'label'`` column for the matching variable.

    Raises
    ------
    IndexError
        If no variable in ``var_infos`` maps to ``col_name``.
    RuntimeError
        If more than one variable maps to ``col_name``.
    """
    source_col = 'l2calo' if col_name.startswith('trig_L2') else 'offline'
    name = var_infos.index[var_infos[source_col] == col_name]

    # An empty match used to surface as an opaque out-of-bounds IndexError from
    # `name[0]`; keep the exception type but say which column was not found.
    if len(name) == 0:
        raise IndexError(
            f'No variable in var_infos has {source_col} == {col_name!r}'
        )
    if len(name) > 1:
        raise RuntimeError('There was more than one name')

    return var_infos.loc[name[0], 'label']

# The index repeats every variable name once per fold/split, so resolve each
# distinct name only once; the resulting dict is the same as iterating all rows.
col2label_mapping = {
    col: get_var_label_from_col(col)
    for col in wass_distances.index.unique()
}
col2label_mapping

{'trig_L2_cl_reta': '$R_{\\eta}$',
 'trig_L2_cl_eratio': '$E_{ratio}$',
 'trig_L2_cl_f1': '$f_1$',
 'trig_L2_cl_f3': '$f_3$',
 'trig_L2_cl_wstot': '$\\omega_{stot}$',
 'trig_L2_cl_weta2': '$\\omega_{\\eta^2}$',
 'el_rhad': '$R_{had}$',
 'el_rhad1': '$R_{had1}$',
 'el_rphi': '$R_{\\phi}$'}

In [5]:
# Rows whose description contains 'fold_<number>_train' are the training folds.
train_fold_regex = 'fold_[0-9]+_train'
is_train_fold = wass_distances['description'].str.contains(train_fold_regex)
# Every column except the textual 'description' one takes part in the statistics.
selected_columns = wass_distances.columns[
    [col != 'description' for col in wass_distances.columns]
]
train_folds = wass_distances.loc[is_train_fold, selected_columns]   #type: ignore
# Mean and standard deviation across training folds for each variable, ordered
# by the mean of 'boosted_el'.
train_wass_confidence = (
    train_folds
    .groupby('name')
    .agg(['mean', 'std'])
    .sort_values(by=('boosted_el', 'mean'))
)
train_wass_confidence

Unnamed: 0_level_0,boosted_el,boosted_el,boosted_jet,boosted_jet,el_jet,el_jet,ratio1,ratio1,ratio1_extra,ratio1_extra,ratio2,ratio2,ratio3,ratio3
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
el_rhad,0.000171,2.5e-05,0.006624,0.000938,0.00669,0.000947,0.025745,0.000463,0.025491,0.000453,0.96463,0.000715,0.006453,0.000913
el_rhad1,0.000234,1.3e-05,0.008519,0.000432,0.008626,0.000438,0.027483,0.00038,0.027142,0.000368,0.960424,0.000635,0.008285,0.00042
trig_L2_cl_eratio,0.000383,1e-06,0.00554,3e-06,0.005175,2e-06,0.069132,0.000218,0.074008,0.000249,0.996521,0.000143,0.005157,2e-06
trig_L2_cl_wstot,0.000464,6e-05,0.006078,0.000784,0.005784,0.000746,0.076417,0.000127,0.080299,0.000145,0.970496,0.000111,0.005614,0.000724
trig_L2_cl_reta,0.004108,0.001304,0.038323,0.012213,0.03481,0.011096,0.107215,0.000146,0.118036,0.000159,0.982897,0.000273,0.034215,0.010909
trig_L2_cl_f3,0.005925,7.4e-05,0.027526,0.000359,0.033427,0.000432,0.215255,0.000634,0.177258,0.000434,0.646224,0.000845,0.021601,0.000287
el_rphi,0.010848,8e-06,0.061227,1.5e-05,0.051055,6e-06,0.177178,0.000125,0.212476,0.000162,0.986746,0.000243,0.050379,1.3e-05
trig_L2_cl_weta2,0.019238,0.000117,0.073836,0.000413,0.055204,0.000304,0.26055,0.000217,0.348487,0.000369,0.989019,0.000154,0.054598,0.000297
trig_L2_cl_f1,0.116652,0.000146,0.014876,6.6e-05,0.103883,0.000117,7.841756,0.028978,1.122919,0.000686,-0.97972,0.00014,-0.101776,0.000111


In [6]:
# Render the three distance columns of the training folds as a LaTeX table,
# with rows relabelled by their LaTeX variable label.
wass_distance_columns = ['boosted_el', 'boosted_jet', 'el_jet']
train_data = train_folds.loc[:, wass_distance_columns].rename(index=col2label_mapping)
train_data.index.name = 'Shower Shape Variable'
latex_repr = confidence_region_df_as_latex(
    train_data,
    groupby='Shower Shape Variable',
    sort_values={'by': 'boosted_el', 'ascending': True},
)
print(latex_repr)

\begin{tabular}{||c|c|c|c|c||}
 & Shower Shape Variable & boosted_el & boosted_jet & el_jet \\
0 & $R_{had}$ & $0.00017 \pm 0.00002$ & $0.0066 \pm 0.0009$ & $0.0067 \pm 0.0009$ \\
\cline{1-5}
1 & $R_{had1}$ & $0.00023 \pm 0.00001$ & $0.0085 \pm 0.0004$ & $0.0086 \pm 0.0004$ \\
\cline{1-5}
2 & $E_{ratio}$ & $0.000383 \pm 0.000001$ & $0.005540 \pm 0.000003$ & $0.005175 \pm 0.000002$ \\
\cline{1-5}
3 & $\omega_{stot}$ & $0.00046 \pm 0.00006$ & $0.0061 \pm 0.0008$ & $0.0058 \pm 0.0007$ \\
\cline{1-5}
4 & $R_{\eta}$ & $0.004 \pm 0.001$ & $0.04 \pm 0.01$ & $0.03 \pm 0.01$ \\
\cline{1-5}
5 & $f_3$ & $0.00593 \pm 0.00007$ & $0.0275 \pm 0.0004$ & $0.0334 \pm 0.0004$ \\
\cline{1-5}
6 & $R_{\phi}$ & $0.010848 \pm 0.000008$ & $0.06123 \pm 0.00001$ & $0.051055 \pm 0.000006$ \\
\cline{1-5}
7 & $\omega_{\eta^2}$ & $0.0192 \pm 0.0001$ & $0.0738 \pm 0.0004$ & $0.0552 \pm 0.0003$ \\
\cline{1-5}
8 & $f_1$ & $0.1167 \pm 0.0001$ & $0.01488 \pm 0.00007$ & $0.1039 \pm 0.0001$ \\
\cline{1-5}
\end{tabular}



In [7]:
# Render the derived ratio columns of the training folds as a LaTeX table.
# Rows are relabelled with the LaTeX variable labels and columns with the
# LaTeX ratio labels. (This cell never reads `wass_distance_columns`, so it is
# not re-assigned here.)
train_data = (
    train_folds[list(ratios.keys())]
    .rename(col2label_mapping, axis=0)
    .rename(ratios_labels, axis=1)
)
train_data.index.name = 'Shower Shape Variable'
latex_repr = confidence_region_df_as_latex(train_data,
                                           groupby='Shower Shape Variable')
print(latex_repr)

\begin{tabular}{||c|c|c|c|c|c||}
 & Shower Shape Variable & $rd_1$ & $rd_1'$ & $rd_2$ & $rd_1$ \\
0 & $E_{ratio}$ & $0.0691 \pm 0.0002$ & $0.0740 \pm 0.0002$ & $0.9965 \pm 0.0001$ & $0.005157 \pm 0.000002$ \\
\cline{1-6}
1 & $R_{\eta}$ & $0.1072 \pm 0.0001$ & $0.1180 \pm 0.0002$ & $0.9829 \pm 0.0003$ & $0.03 \pm 0.01$ \\
\cline{1-6}
2 & $R_{\phi}$ & $0.1772 \pm 0.0001$ & $0.2125 \pm 0.0002$ & $0.9867 \pm 0.0002$ & $0.05038 \pm 0.00001$ \\
\cline{1-6}
3 & $R_{had1}$ & $0.0275 \pm 0.0004$ & $0.0271 \pm 0.0004$ & $0.9604 \pm 0.0006$ & $0.0083 \pm 0.0004$ \\
\cline{1-6}
4 & $R_{had}$ & $0.0257 \pm 0.0005$ & $0.0255 \pm 0.0005$ & $0.9646 \pm 0.0007$ & $0.0065 \pm 0.0009$ \\
\cline{1-6}
5 & $\omega_{\eta^2}$ & $0.2605 \pm 0.0002$ & $0.3485 \pm 0.0004$ & $0.9890 \pm 0.0002$ & $0.0546 \pm 0.0003$ \\
\cline{1-6}
6 & $\omega_{stot}$ & $0.0764 \pm 0.0001$ & $0.0803 \pm 0.0001$ & $0.9705 \pm 0.0001$ & $0.0056 \pm 0.0007$ \\
\cline{1-6}
7 & $f_1$ & $7.84 \pm 0.03$ & $1.1229 \pm 0.0007$ & $-0.9797 \

In [None]:
def get_order(df: pd.DataFrame, sort_col: str) -> pd.Series:
    """Rank the variables of ``df`` by ``sort_col``.

    Columns whose name starts with ``'ratio1'`` are sorted ascending; every
    other column is sorted descending.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding ``sort_col``. The variable names are read from a
        ``'name'`` column when present, otherwise from the index.
    sort_col : str
        Column to sort by.

    Returns
    -------
    pd.Series
        Two entries: ``'order'`` (array of variable names in sorted order) and
        ``'values'`` (array of the matching ``sort_col`` values).
    """
    ascending = sort_col.startswith('ratio1')
    df = df.sort_values(sort_col, ascending=ascending)
    # The distances frame is indexed by 'name' (set_index('name') earlier in the
    # notebook), in which case df['name'] would raise KeyError; use the index.
    names = df['name'] if 'name' in df.columns else df.index
    ordered_ss = names.values
    values = df[sort_col].values
    row = pd.Series(
        [ordered_ss, values],
        index=['order', 'values']
    )
    return row
# For every ratio, build one ordering row per description (fold/split) by
# applying get_order to each 'description' group of the distances table.
orders = {
    ratio_name: wass_distances.groupby('description').apply(get_order, sort_col=ratio_name)  # type: ignore
    for ratio_name in ratios
}
orders['ratio1']

In [None]:
def get_order_differences(df: pd.DataFrame, n_folds: int):
    """List the folds whose train and test orderings are not identical.

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by ``'fold_<i>_train'`` / ``'fold_<i>_test'`` with an
        ``'order'`` column holding the ordered variable names of each entry.
    n_folds : int
        Number of folds to inspect (folds ``0 .. n_folds - 1``).

    Returns
    -------
    list of str
        ``'fold_<i>'`` for every fold where the two orderings differ.
    """
    differences = []
    for ifold in range(n_folds):
        train_fold_order = df.loc[f'fold_{ifold}_train', 'order']
        test_fold_order = df.loc[f'fold_{ifold}_test', 'order']
        # array_equal also copes with orderings of different length (reported
        # as different), where an elementwise `!=` cannot be evaluated.
        if not np.array_equal(train_fold_order, test_fold_order):
            differences.append(f'fold_{ifold}')
    return differences

# Folds with mismatching train/test ordering, per ratio.
order_differences = {
    ratio_name: get_order_differences(orders[ratio_name], n_folds)
    for ratio_name in orders
}
order_differences

In [None]:
def side_by_side_vis(orders, ratio, fold):
    """Print the orderings, then the values, of one fold for one ratio.

    Parameters
    ----------
    orders : mapping
        Maps a ratio name to a frame with ``'order'`` and ``'values'`` columns
        whose index holds string labels.
    ratio : str
        Key of ``orders`` to display.
    fold : str
        Index prefix; every row whose index label starts with it is printed.
    """
    ratio_orders = orders[ratio]
    fold_rows = ratio_orders.loc[ratio_orders.index.str.startswith(fold)]
    # All 'order' entries are printed first, followed by all 'values' entries.
    for column in ('order', 'values'):
        for description, entry in fold_rows[column].items():
            print(f'{description}: {entry}')

# Print the 'fold_3*' orderings and values for ratio1.
side_by_side_vis(orders, ratio='ratio1', fold='fold_3')

In [None]:
# Print the 'fold_4*' orderings and values for ratio1_extra.
side_by_side_vis(orders, ratio='ratio1_extra', fold='fold_4')

In [None]:
# Print the 'fold_0*' orderings and values for ratio2.
side_by_side_vis(orders, ratio='ratio2', fold='fold_0')

In [None]:
# Print the 'fold_0*' orderings and values for ratio3.
side_by_side_vis(orders, ratio='ratio3', fold='fold_0')