In [6]:
import os
import pandas as pd
from scipy.stats import wasserstein_distance
from itertools import combinations
from datetime import datetime
from packages.constants import VAR_INFOS_DTYPES
from packages.utils import get_electron_label, get_jet_label

## Parameters

In [None]:
basepath = os.path.join('..', '..')
datapath = os.path.join(basepath, 'data')
var_info_path = os.path.join(datapath, 'var_infos.csv')
collision_path = os.path.join(datapath, 
    'ided_data17_13TeV.AllPeriods.sgn.probes_lhvloose_EGAM1.bkg.vprobes_vlhvloose_EGAM7.GRL_v97.25bins.parquet')
#   'ided_data17_13TeV.AllPeriods.sgn.probes_lhvloose_EGAM1.bkg.vprobes_vlhvloose_EGAM7.GRL_v97.25bins.parquet_et4_eta4.parquet')
boosted_path = os.path.join(datapath, 
    'ided_mc16_13TeV.302236_309995_341330.sgn.boosted_probes.WZ_llqq_plus_radion_ZZ_llqq_plus_ggH3000.merge.25bins.v2.parquet')

## Loading data

In [5]:
basepath = os.path.join('..', '..')
datapath = os.path.join(basepath, 'x')
var_infos = pd.read_csv(var_info_path, index_col=0, dtype=VAR_INFOS_DTYPES)
is_ss = var_infos['type'] == 'shower_shape'
shower_shapes =  var_infos.loc[is_ss, 'name'].to_list()
shower_shapes_cols = var_infos.loc[is_ss & (~var_infos['l2calo'].isnull()), 'l2calo'].to_list()
shower_shapes_cols += var_infos.loc[is_ss & (var_infos['l2calo'].isnull()), 'offline'].to_list()
print(f'Selected shower shapes {", " .join(shower_shapes_cols)}')
var_infos

Selected shower shapes trig_L2_cl_reta, trig_L2_cl_eratio, trig_L2_cl_f1, trig_L2_cl_f3, trig_L2_cl_wstot, trig_L2_cl_weta2, el_rhad, el_rhad1, el_rphi


Unnamed: 0,name,label,type,lower_lim,upper_lim,l2calo,offline,TaP,description
0,et,$E_T$,var,0.0,inf,trig_L2_cl_et,el_et,,transverse particle energy on the calorimeter
1,eta,$\eta$,var,-2.5,2.5,trig_L2_cl_eta,el_eta,,pseudorapidity
2,reta,$R_{\eta}$,shower_shape,0.0,1.0,trig_L2_cl_reta,el_reta,,Ratio of the energy in 3x7 cells over the ener...
3,eratio,$E_{ratio}$,shower_shape,0.0,1.0,trig_L2_cl_eratio,el_eratio,,Ratio of the energy difference between the max...
4,f1,$f_1$,shower_shape,0.0,1.0,trig_L2_cl_f1,el_f1,,Ratio of the energy in the first layer to the ...
5,ehad1,$E_{had1}$,unidentified,-inf,inf,trig_L2_cl_ehad1,el_ehad1,,unidentified
6,f3,$f_3$,shower_shape,0.0,1.0,trig_L2_cl_f3,el_f3,,Ratio of the energy in the third layer to the ...
7,wstot,$\omega_{stot}$,shower_shape,0.0,inf,trig_L2_cl_wstot,el_wstot,,Shower width er > 150 GeV only on EM1
8,weta2,$\omega_{\eta 2}$,shower_shape,0.0,inf,trig_L2_cl_weta2,el_weta2,,Lateral shower width on EM2
9,e2tsts1,e2tsts1,unidentified,-inf,inf,trig_L2_cl_e2tsts1,el_e2tsts1,,unidentified


In [None]:
boosted_data = pd.read_parquet(boosted_path, columns=shower_shapes_cols)
boosted_data.tail()

In [None]:
start_time = datetime.now()
print(f'Start: {start_time}')
add_cols = ['target', 'el_lhmedium', 'el_lhvloose']
collision_data = pd.read_parquet(collision_path, columns=shower_shapes_cols + add_cols)
read_time = datetime.now()
print(f'Time to read x {read_time-start_time}')
print(collision_data.shape)
jet_label = get_jet_label(collision_data, 'el_lhvloose')
el_label = get_electron_label(collision_data, 'el_lhmedium')
print(f'There are {jet_label.sum()} jets and {el_label.sum()} electrons')
label_time = datetime.now()
print(f'Time to labeling {label_time-read_time}')
print(f'Do electrons and jet have intersections? {(jet_label & el_label).any()}')
collision_data.drop(add_cols, axis=1, inplace=True)
el_data = collision_data.loc[el_label]
jet_data = collision_data.loc[jet_label]
del collision_data
drop_time = datetime.now()
print(f'Time to drop {drop_time-label_time}')
el_data.head()

In [None]:
data = {
    'boosted': boosted_data,
    'el': el_data,
    'jet': jet_data
}

## Computing distances

In [None]:
ss_filters = {
    'f3': lambda x: x,
    'weta2': lambda x: x[x != 99],
    'reta': lambda x: x,
    'wstot': lambda x: x[x != -9999],
    'eratio': lambda x: x[x < 98],
    'f1': lambda x: x,
    'rphi': lambda x: x[x.between(-0.5, 1.5, inclusive='both')],
    'rhad': lambda x: x,
    'rhad1': lambda x:x
}

In [None]:
data_combinations = combinations(data.keys(), 2)
combinations_str = [f'{left}_{right}' for left, right in combinations(data.keys(), 2)]
wass_distances = pd.DataFrame(index=shower_shapes, columns=combinations_str)
for ss in shower_shapes:
    for left, right in combinations(data.keys(), 2):
        left_data = data[left][ss]
        right_data = data[right][ss]
        wass_distances.loc[ss, f'{left}_{right}'] = wasserstein_distance(left_data, right_data)
wass_distances.to_csv(os.path.join(basepath, 'wass_distances.csv'))
wass_distances

In [None]:
ratios = dict(
ratio1=lambda x: (x['boosted_jet']-x['boosted_el'])/x['boosted_el'],
ratio2=lambda x: (x['boosted_jet']-x['boosted_el'])/x['el_jet'],
ratio3=lambda x: (x['boosted_jet']-x['boosted_el']))

In [17]:
for ratio, get_ratio in ratios.items():
    wass_distances[ratio] = wass_distances.apply(get_ratio)
wass_distances.sort_values(by='ratio1', inplace=True)
wass_distances

Unnamed: 0,var,el,jet,ratios
7,rhad,0.0089,0.346,37.876404
8,rhad1,0.0052,0.1895,35.442308
0,eratio,0.0373,0.5406,13.493298
5,wstot,0.2084,2.7284,12.092131
1,reta,0.0124,0.1168,8.419355
6,rphi,0.0224,0.1238,4.526786
3,f3,0.003,0.0137,3.566667
4,weta2,0.0011,0.0043,2.909091
2,f1,0.1238,0.0158,-0.872375
