In [34]:
import os
import numpy as np
import pandas as pd
from ringer.utils import significant_around, get_number_order
from ringer.data import load_var_infos

In [36]:
ratios = {
    'ratio1': lambda x: x['boosted_el']/x['boosted_jet'],
    'ratio1_extra': lambda x: x['boosted_el']/x['el_jet'],
    'ratio2': lambda x: (x['boosted_jet']-x['boosted_el'])/x['el_jet'],
    'ratio3': lambda x: x['boosted_jet']-x['boosted_el']
}
ratios_labels = {
'ratio1': '$rd_1$',
'ratio1_extra': '$rd_1\'$',
'ratio2': '$rd_2$',
'ratio3': '$rd_1$'
}
n_folds = 10
var_infos = load_var_infos()
var_infos

Unnamed: 0_level_0,label,type,lower_lim,upper_lim,l2calo,offline,TaP,formula,description
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
et,$E_T$,var,0.0,inf,trig_L2_cl_et,el_et,,,transverse particle energy on the calorimeter
eta,$\eta$,var,-2.5,2.5,trig_L2_cl_eta,el_eta,,,pseudorapidity
reta,$R_{\eta}$,shower_shape,0.0,1.0,trig_L2_cl_reta,el_reta,,$R_{\eta} = \frac{E^{3x7}_{EM2}}{E^{7x7}_{EM2}}$,Ratio of the energy in 3x7 cells over the ener...
eratio,$E_{ratio}$,shower_shape,0.0,1.0,trig_L2_cl_eratio,el_eratio,,$E_{ratio} = \frac{E^{max}_{EM1} - E^{2^{nd}ma...,Ratio of the energy difference between the max...
f1,$f_1$,shower_shape,0.0,1.0,trig_L2_cl_f1,el_f1,,$f_1 = \frac{E^{total}_{EM1}}{E^{total}_{EM}}$,Ratio of the energy in the first layer to the ...
ehad1,$E_{had1}$,unidentified,-inf,inf,trig_L2_cl_ehad1,el_ehad1,,,unidentified
f3,$f_3$,shower_shape,0.0,1.0,trig_L2_cl_f3,el_f3,,$f_3 = \frac{E^{total}_{EM3}}{E^{total}_{EM}}$,Ratio of the energy in the third layer to the ...
wstot,$\omega_{stot}$,shower_shape,0.0,inf,trig_L2_cl_wstot,el_wstot,,$\omega_{stot} = \sqrt{\frac{\Sigma(E_i - E_{i...,Shower width er > 150 GeV only on EM1
weta2,$\omega_{\eta 2}$,shower_shape,0.0,inf,trig_L2_cl_weta2,el_weta2,,$\omega_{\eta2} = \sqrt{\frac{\Sigma E_i \: X\...,Lateral shower width on EM2
e2tsts1,e2tsts1,unidentified,-inf,inf,trig_L2_cl_e2tsts1,el_e2tsts1,,,unidentified


In [3]:
wass_distances = pd.read_csv(os.path.join('..', '..', 'data', 'wass_distances.csv'), index_col=0)
for name, func in ratios.items():
    wass_distances[name] = wass_distances.apply(func, axis=1)
wass_distances.head(10)

Unnamed: 0,name,boosted_el,boosted_jet,el_jet,description,ratio1,ratio1_extra,ratio2,ratio3
0,reta,0.006294,0.103531,0.098794,fold_0_test,0.060793,0.063708,0.984243,0.097237
1,eratio,0.022051,0.571283,0.549389,fold_0_test,0.0386,0.040138,0.999714,0.549232
2,f1,0.140508,0.027239,0.113877,fold_0_test,5.158248,1.233855,-0.994655,-0.113268
3,f3,0.004705,0.007813,0.012518,fold_0_test,0.602123,0.375834,0.248347,0.003109
4,wstot,0.113703,2.803046,2.749357,fold_0_test,0.040564,0.041356,0.978172,2.689343
5,weta2,0.000724,0.004064,0.003344,fold_0_test,0.178033,0.216362,0.998926,0.00334
6,rhad,0.002789,0.470697,0.472688,fold_0_test,0.005924,0.005899,0.989889,0.467908
7,rhad1,0.002174,0.247338,0.249358,fold_0_test,0.008789,0.008718,0.983183,0.245164
8,rphi,0.007152,0.109598,0.105555,fold_0_test,0.065254,0.067754,0.970551,0.102446
9,reta,0.006243,0.103951,0.098914,fold_0_train,0.060056,0.063114,0.987803,0.097708


In [26]:
train_fold_regex = 'fold_[0-9]+_train'
is_train_fold = wass_distances['description'].str.contains(train_fold_regex)
selected_columns = wass_distances.columns[wass_distances.columns != 'description']
train_folds = wass_distances.loc[is_train_fold, selected_columns]   #type: ignore
train_wass = train_folds.groupby('name').agg(['mean', 'std'])
train_wass.head(10)

Unnamed: 0_level_0,boosted_el,boosted_el,boosted_jet,boosted_jet,el_jet,el_jet,ratio1,ratio1,ratio1_extra,ratio1_extra,ratio2,ratio2,ratio3,ratio3
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
eratio,0.021936,2.470997e-05,0.571165,8.556022e-05,0.549797,4.378377e-05,0.038405,4.3e-05,0.039898,4.3e-05,0.998967,0.000125,0.549229,8.509486e-05
f1,0.14001,0.000106193,0.026753,0.000103951,0.113747,1.774802e-05,5.233411,0.016446,1.23089,0.000982,-0.995689,0.000123,-0.113257,1.986351e-05
f3,0.004692,3.599439e-06,0.007801,3.434407e-06,0.012493,2.683044e-06,0.601498,0.000675,0.375591,0.000264,0.248836,0.000526,0.003109,6.496983e-06
reta,0.006247,1.833202e-05,0.103909,2.124478e-05,0.098902,8.81402e-06,0.060125,0.000173,0.063168,0.000189,0.987452,0.000183,0.097661,2.27235e-05
rhad,0.005093,0.0002241383,0.469445,0.0001541826,0.472929,7.9777e-05,0.010849,0.000478,0.010769,0.000474,0.981862,0.000588,0.464352,0.0002881962
rhad1,0.00317,9.633484e-05,0.246327,0.0001084884,0.249258,4.824144e-05,0.01287,0.000396,0.012719,0.000388,0.97552,0.000702,0.243156,0.0002011174
rphi,0.007192,1.349391e-05,0.109661,2.400591e-05,0.105631,1.295185e-05,0.065583,0.000115,0.068085,0.000129,0.970073,0.000163,0.102469,1.933704e-05
weta2,0.000721,4.719121e-07,0.00406,5.681247e-07,0.003343,3.260758e-07,0.177639,9.5e-05,0.215788,0.000135,0.998968,8.5e-05,0.003339,2.601888e-07
wstot,0.113377,0.0003480277,2.804533,0.0004362035,2.750798,0.000254366,0.040426,0.00012,0.041216,0.000129,0.978318,0.000108,2.691156,0.0003125045


In [45]:
wass_distance_columns = ['boosted_el', 'boosted_jet', 'el_jet']
ratio_columns = list(ratios.keys())
def get_latex_text(s: pd.Series):
    smean = float(s.mean())
    serr = 5*float(s.std())
    precision = int(-get_number_order(serr))  # type: ignore
    repr_str = f'${smean:.{precision}f} \\pm {serr:.{precision}f}$'
    return repr_str
latex_df = train_folds.groupby('name').agg(get_latex_text)
latex_df = latex_df[ratio_columns].sort_values(by='ratio1')
latex_df = latex_df.rename(ratios_labels, axis=1).rename(var_infos['label'], axis=0)
latex_df.index.name = 'Shower Shape Variable'
print(latex_df.style.to_latex())

\begin{tabular}{lllll}
 & $rd_1$ & $rd_1'$ & $rd_2$ & $rd_1$ \\
Shower Shape Variable &  &  &  &  \\
$R_{had}$ & $0.011 \pm 0.002$ & $0.011 \pm 0.002$ & $0.982 \pm 0.003$ & $0.464 \pm 0.001$ \\
$R_{had1}$ & $0.013 \pm 0.002$ & $0.013 \pm 0.002$ & $0.976 \pm 0.004$ & $0.243 \pm 0.001$ \\
$E_{ratio}$ & $0.0384 \pm 0.0002$ & $0.0399 \pm 0.0002$ & $0.9990 \pm 0.0006$ & $0.5492 \pm 0.0004$ \\
$\omega_{stot}$ & $0.0404 \pm 0.0006$ & $0.0412 \pm 0.0006$ & $0.9783 \pm 0.0005$ & $2.691 \pm 0.002$ \\
$R_{\eta}$ & $0.0601 \pm 0.0009$ & $0.0632 \pm 0.0009$ & $0.9875 \pm 0.0009$ & $0.0977 \pm 0.0001$ \\
$R_{\phi}$ & $0.0656 \pm 0.0006$ & $0.0681 \pm 0.0006$ & $0.9701 \pm 0.0008$ & $0.10247 \pm 0.00010$ \\
$\omega_{\eta 2}$ & $0.1776 \pm 0.0005$ & $0.2158 \pm 0.0007$ & $0.9990 \pm 0.0004$ & $0.003339 \pm 0.000001$ \\
$f_3$ & $0.601 \pm 0.003$ & $0.376 \pm 0.001$ & $0.249 \pm 0.003$ & $0.00311 \pm 0.00003$ \\
$f_1$ & $5.23 \pm 0.08$ & $1.231 \pm 0.005$ & $-0.9957 \pm 0.0006$ & $-0.11326 \pm 0.00010$ 

In [None]:
def get_order(df: pd.DataFrame, sort_col: str) -> pd.Series:
    ascending = sort_col.startswith('ratio1')
    df = df.sort_values(sort_col, ascending=ascending)
    ordered_ss = df['name'].values
    values = df[sort_col].values
    row = pd.Series(
        [ordered_ss, values],
        index=['order', 'values']
    )
    return row
orders = {
    name: wass_distances.groupby('description').apply(get_order, sort_col=name) # type: ignore
    for name in ratios.keys()
}
orders['ratio1']

In [None]:
def get_order_differences(df: pd.DataFrame, n_folds: int):
    differences = list()
    for ifold in range(n_folds):
        train_fold_order = df.loc[f'fold_{ifold}_train', 'order']
        test_fold_order = df.loc[f'fold_{ifold}_test', 'order']
        is_different = (train_fold_order !=
                        test_fold_order).any()   # type: ignore
        if is_different:
            differences.append(f'fold_{ifold}')
    return differences

order_differences = {
    ratio_name: get_order_differences(ratio_order, n_folds)
    for ratio_name, ratio_order in orders.items()
}
order_differences

In [None]:
def side_by_side_vis(orders, ratio, fold):
    df = orders[ratio]
    is_fold = df.index.str.startswith(fold)
    for col in ['order', 'values']:
        for idx, item in df.loc[is_fold, col].items():
            print(f'{idx}: {item}')

side_by_side_vis(orders, 'ratio1', 'fold_3')

In [None]:
side_by_side_vis(orders, 'ratio1_extra', 'fold_4')

In [None]:
side_by_side_vis(orders, 'ratio2', 'fold_0')

In [None]:
side_by_side_vis(orders, 'ratio3', 'fold_0')