In [1]:
from Gaugi import load as gaugi_load
from kepler.pandas.readers import load as kepler_load

import pandas as pd
import numpy as np
import os
from itertools import product
import json

Welcome to JupyROOT 6.16/00
Using all sub packages with ROOT dependence
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Parameters

In [2]:
# Offline/trigger working points, ordered from tightest to loosest.
criterias = ['tight', 'medium', 'loose', 'vloose']
# Performance metrics computed for every working point.
perfomance_metrics = ['pd', 'fr']
# L2Calo cut-based decision columns, listed in the same order as `criterias`.
l2calo_trigers = [
    'T0HLTElectronT2CaloTight',
    'T0HLTElectronT2CaloMedium',
    'T0HLTElectronT2CaloLoose',
    'T0HLTElectronT2CaloVLoose',
]
# Keys of the reference dictionaries, listed in the same order as `perfomance_metrics`.
ref_keys = ['sgnRef', 'bkgRef']

# Pair the parallel lists positionally: metric -> reference key, criteria -> trigger column.
metrics_ref_keys = dict(zip(perfomance_metrics, ref_keys))
triger_criterias = dict(zip(criterias, l2calo_trigers))
triger_criterias

{'tight': 'T0HLTElectronT2CaloTight',
 'medium': 'T0HLTElectronT2CaloMedium',
 'loose': 'T0HLTElectronT2CaloLoose',
 'vloose': 'T0HLTElectronT2CaloVLoose'}

In [3]:
# NOTE(review): the labels are reportedly 0/1 values stored as floats. The original note
# said they were cast to ints to avoid float-comparison errors, but no cast is performed
# in this function -- confirm whether it happens upstream.
def load_as_df(filepath: str) -> pd.DataFrame:
    """Load one ``.npz`` bin file into a single DataFrame.

    The archive must contain the members ``data`` (2-D values), ``features``
    (column names for ``data``) and ``target`` (one label per row).

    Parameters
    ----------
    filepath : str
        Path to the ``.npz`` file.

    Returns
    -------
    pd.DataFrame
        The ``data`` columns named after ``features`` plus a trailing
        ``target`` column.
    """
    # The NpzFile keeps the archive handle open; the context manager closes it and
    # only the three members that are actually needed get read.
    with np.load(filepath) as archive:
        df_data = pd.DataFrame(archive['data'], columns=archive['features'])
        df_target = pd.DataFrame(archive['target'], columns=['target'])
    return pd.concat([df_data, df_target], axis=1)

In [4]:
def get_values_from_ref(ref: dict, et:int, eta:int):
    """Flatten the stored reference counts of one (et, eta) bin into rows.

    Parameters
    ----------
    ref : dict
        Reference dictionary; ``ref[ref_key][f'{criteria}_cutbased']`` must
        provide the ``'passed'`` and ``'total'`` counts.
    et, eta : int
        Bin indices written into the first two fields of every row.

    Returns
    -------
    list of list
        Rows shaped ``[et, eta, 'ref', name, value]``. For every metric and
        criteria there are three rows: the passed count, the total count and
        the ratio passed/total (``-1`` when the total is zero).
    """
    values = []
    for metric, ref_key in metrics_ref_keys.items():
        for criteria in criterias:
            criteria_dict = ref[ref_key][f'{criteria}_cutbased']
            passed = criteria_dict['passed']
            total = criteria_dict['total']
            # Label rows with the et/eta arguments. The previous body read the
            # notebook globals iet/ieta, so it only worked inside the main loop.
            values.append([et, eta, 'ref', f'{criteria}_{ref_key}_passed', int(passed)])
            values.append([et, eta, 'ref', f'{criteria}_{ref_key}_total', int(total)])
            # -1 marks an undefined ratio (empty denominator).
            metric_value = passed/total if total != 0 else -1
            values.append([et, eta, 'ref', f'{criteria}_{metric}', metric_value])
    return values

In [5]:
def get_values_from_data(data: pd.DataFrame, data_name: str, et:int, eta:int, fix_offline: bool = False):
    """Compute the passed/total counts and ratios of one (et, eta) bin from data.

    Parameters
    ----------
    data : pd.DataFrame
        Samples of the bin; forwarded unchanged to ``get_metric``.
    data_name : str
        Label written into the third field of every row.
    et, eta : int
        Bin indices written into the first two fields of every row.
    fix_offline : bool, default False
        Forwarded to ``get_metric``.

    Returns
    -------
    list of list
        Rows shaped ``[et, eta, data_name, name, value]``. For every metric
        and criteria there are three rows: the passed count, the total count
        and the ratio passed/total (``-1`` when the total is zero).
    """
    values = []
    for metric, ref_key in metrics_ref_keys.items():
        for criteria in criterias:
            criteria_dict = get_metric(data, metric, criteria, fix_offline)
            passed = criteria_dict['passed']
            total = criteria_dict['total']
            # Label rows with the et/eta arguments. The previous body read the
            # notebook globals iet/ieta, so it only worked inside the main loop.
            values.append([et, eta, data_name, f'{criteria}_{ref_key}_passed', int(passed)])
            values.append([et, eta, data_name, f'{criteria}_{ref_key}_total', int(total)])
            # -1 marks an undefined ratio (empty denominator).
            metric_value = passed/total if total != 0 else -1
            values.append([et, eta, data_name, f'{criteria}_{metric}', metric_value])
    return values

In [6]:
def get_metric(data: pd.DataFrame, metric:str, criteria: str, fix_offline: bool = False):
    """Count samples for one metric and criteria.

    The denominator holds the rows whose offline column equals 1 when
    ``metric`` is ``'pd'`` and 0 otherwise. The offline column is
    ``el_lh{criteria}``, or always ``el_lhtight`` when ``fix_offline`` is
    set. The numerator holds the denominator rows whose trigger column
    (``triger_criterias[criteria]``) equals 1.

    Returns a dict with ``'total'``, ``'passed'`` and, under the key
    ``metric``, the ratio passed/total (``None`` when the total is zero).
    """
    wanted_label = 1 if metric == 'pd' else 0
    if fix_offline:
        offline_col = 'el_lhtight'
    else:
        offline_col = f'el_lh{criteria}'

    in_denominator = data[offline_col] == wanted_label
    trigger_fired = data[triger_criterias[criteria]] == 1

    n_total = int(in_denominator.sum())
    n_passed = int((in_denominator & trigger_fired).sum())
    ratio = n_passed/n_total if n_total != 0 else None
    return {'total': n_total, 'passed': n_passed, metric: ratio}

In [7]:
# Dataset names; they double as directory names and as file-name prefixes.
dataset = 'data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM1.bkg.VProbes_EGAM7.GRL_v97'
loose_dataset = 'data17_13TeV.AllPeriods.sgn.probes_lhvloose_EGAM1.bkg.vprobes_vlhvloose_EGAM7.GRL_v97.25bins'

# Both datasets are expected under ~/data/<dataset name>.
homepath = os.path.expanduser('~')
datapath = os.path.join(homepath, 'data', dataset)
loose_datapath = os.path.join(homepath, 'data', loose_dataset)

# Path templates: fill the {et}/{eta} (and {type}) placeholders with str.format.
filepath = os.path.join(datapath, dataset + '_et{et}_eta{eta}.npz')
refpath = os.path.join(datapath, 'references')
reffilepath = os.path.join(refpath, dataset + '_et{et}_eta{eta}.ref.pic.gz')
loose_filepath = os.path.join(loose_datapath, loose_dataset + '_et{et}_eta{eta}.npz')
output_dir = os.path.join(datapath, 'ref_analysis')
json_outputpath = os.path.join(output_dir, 'ref{type}_et{et}_eta{eta}.json')

# exist_ok avoids the check-then-create race and fails loudly if the path
# exists but is not a directory.
os.makedirs(output_dir, exist_ok=True)

## Computing reference values

In [10]:
n_ets, n_etas = 5, 5
data_types = [
    'original',
    'no_filter',
    'el_lhmedium_jet_lhvloose',
    'no_filter_jet_lhtight_fixed',
    'filtered_el_lhmedium_jet_lhtight',
]
metrics_per_bin = np.empty((n_ets, n_etas), dtype=object)
data_infos = []
# Offline likelihood working points checked for every loaded frame (el_lh<level> columns).
offline_levels = ('tight', 'medium', 'loose', 'vloose')

# Keep the loop variable names iet/ieta: they are notebook globals and other
# cells may read them.
for iet in range(n_ets):
    for ieta in range(n_etas):
        print(f'Processing et {iet} and eta {ieta}')

        print('Reading data')
        ref = gaugi_load(reffilepath.format(et=iet, eta=ieta))
        medium_data = load_as_df(filepath.format(et=iet, eta=ieta))
        vloose_data = kepler_load(loose_filepath.format(et=iet, eta=ieta))

        print('Computing infos')
        loaded_frames = (('medium_data', medium_data), ('vloose_data', vloose_data))
        # Sample counts of both frames.
        for frame_name, frame in loaded_frames:
            data_infos.append([iet, ieta, frame_name, 'n_samples', frame.shape[0]])
        # Does the target column coincide with each offline decision column?
        for frame_name, frame in loaded_frames:
            for level in offline_levels:
                same_as_target = (frame['target'] == frame[f'el_lh{level}']).all()
                data_infos.append([iet, ieta, frame_name, f'target_equal_{level}', same_as_target])
        # Is each offline decision column entirely zero?
        for frame_name, frame in loaded_frames:
            for level in offline_levels:
                only_zeros = (frame[f'el_lh{level}'] == 0).all()
                data_infos.append([iet, ieta, frame_name, f'{level}_equal_zero', only_zeros])
        # Drop the extra references so the frames can be released as soon as
        # the next bin overwrites medium_data / vloose_data.
        del loaded_frames, frame

        print('Computing metrics')
        for data_type in data_types:
            if data_type == 'original':
                data_infos.extend(get_values_from_ref(ref, iet, ieta))
            elif data_type == 'no_filter':
                data_infos.extend(get_values_from_data(medium_data, data_type, iet, ieta))
            elif data_type == 'el_lhmedium_jet_lhvloose':
                # Keep target==1 rows passing el_lhmedium and target!=1 rows failing el_lhvloose.
                keep_target = (medium_data['target'] == 1) & (medium_data['el_lhmedium'] == 1)
                keep_other = (medium_data['target'] != 1) & (medium_data['el_lhvloose'] != 1)
                filtered_medium = medium_data.loc[keep_target | keep_other]
                data_infos.extend(get_values_from_data(filtered_medium, data_type, iet, ieta))
                # Comparing frames with different row labels raises ValueError;
                # that case means the filter removed rows.
                try:
                    are_equal = (medium_data == filtered_medium).all().all()
                except ValueError:
                    are_equal = False
                data_infos.append([iet, ieta, '', 'medium_equal_filter', are_equal])
                data_infos.append([iet, ieta, 'filtered_medium', 'filtered_samples', filtered_medium.shape[0]])
            elif data_type == 'no_filter_jet_lhtight_fixed':
                data_infos.extend(get_values_from_data(medium_data, data_type, iet, ieta, True))
            elif data_type == 'filtered_el_lhmedium_jet_lhtight':
                # Keep target==1 rows passing el_lhmedium and target!=1 rows failing el_lhtight.
                keep_target = (medium_data['target'] == 1) & (medium_data['el_lhmedium'] == 1)
                keep_other = (medium_data['target'] != 1) & (medium_data['el_lhtight'] != 1)
                filtered_medium = medium_data.loc[keep_target | keep_other]
                data_infos.extend(get_values_from_data(filtered_medium, data_type, iet, ieta))
            else:
                raise ValueError(f'Treatment for filter {data_type} not found')

# One long-format table for all bins, also persisted next to the notebook.
info_columns = ['et', 'eta', 'data', 'metric', 'value']
data_infos = pd.DataFrame(data_infos, columns=info_columns)
data_infos.to_csv('data_infos.csv')
print('Finished')

Processing et 0 and eta 0
Reading data
Computing infos
Computing metrics
Processing et 0 and eta 1
Reading data
Computing infos
Computing metrics
Processing et 0 and eta 2
Reading data
Computing infos
Computing metrics
Processing et 0 and eta 3
Reading data
Computing infos
Computing metrics
Processing et 0 and eta 4
Reading data
Computing infos
Computing metrics
Processing et 1 and eta 0
Reading data
Computing infos
Computing metrics
Processing et 1 and eta 1
Reading data
Computing infos
Computing metrics
Processing et 1 and eta 2
Reading data
Computing infos
Computing metrics
Processing et 1 and eta 3
Reading data
Computing infos
Computing metrics
Processing et 1 and eta 4
Reading data
Computing infos
Computing metrics
Processing et 2 and eta 0
Reading data
Computing infos
Computing metrics
Processing et 2 and eta 1
Reading data
Computing infos
Computing metrics
Processing et 2 and eta 2
Reading data
Computing infos
Computing metrics
Processing et 2 and eta 3
Reading data
Computing in

In [11]:
# Display the long-format summary table (columns: et, eta, data, metric, value).
data_infos

Unnamed: 0,et,eta,data,metric,value
0,0,0,medium_data,n_samples,420458
1,0,0,vloose_data,n_samples,470186
2,0,0,medium_data,target_equal_tight,True
3,0,0,medium_data,target_equal_medium,True
4,0,0,medium_data,target_equal_loose,False
...,...,...,...,...,...
3495,4,4,filtered_el_lhmedium_jet_lhtight,loose_bkgRef_total,183567
3496,4,4,filtered_el_lhmedium_jet_lhtight,loose_fr,0.869917
3497,4,4,filtered_el_lhmedium_jet_lhtight,vloose_bkgRef_passed,161187
3498,4,4,filtered_el_lhmedium_jet_lhtight,vloose_bkgRef_total,183567


In [None]:
# (rows, columns) of the summary table.
data_infos.shape

## Loading specific data

### Medium

In [18]:
# Inspect one specific (et, eta) bin of the medium dataset.
et, eta = 4, 0
medium_data = load_as_df(filepath.format(et=et, eta=eta))
# Number of samples (rows) in the bin.
medium_total = len(medium_data)
medium_total

5469197

In [50]:
# Sorted column names, leaving out the many 'L2Calo_ring*' columns.
non_ring_columns = [name for name in medium_data.columns if not name.startswith('L2Calo_ring')]
medium_data_cols = np.sort(non_ring_columns)
medium_data_cols

array(['DeltaPOverP', 'L2Calo_e2tsts1', 'L2Calo_ehad1', 'L2Calo_eratio',
       'L2Calo_et', 'L2Calo_eta', 'L2Calo_f1', 'L2Calo_f3', 'L2Calo_phi',
       'L2Calo_reta', 'L2Calo_weta2', 'L2Calo_wstot',
       'L2Electron_caloEta', 'L2Electron_etOverPt', 'L2Electron_eta',
       'L2Electron_hastrack', 'L2Electron_phi', 'L2Electron_pt',
       'L2Electron_trkClusDeta', 'L2Electron_trkClusDphi',
       'T0HLTElectronT2CaloLoose', 'T0HLTElectronT2CaloMedium',
       'T0HLTElectronT2CaloTight', 'T0HLTElectronT2CaloVLoose', 'avgmu',
       'd0', 'd0significance', 'deltaEta1', 'deltaPhi2',
       'deltaPhi2Rescaled', 'deltaR', 'eProbabilityHT', 'eeMass',
       'el_lhloose', 'el_lhmedium', 'el_lhtight', 'el_lhvloose', 'eratio',
       'et', 'eta', 'f1', 'f3', 'hastrack', 'numberOfBLayerHits',
       'numberOfPixelHits', 'numberOfTRTHits', 'phi', 'reta', 'rhad',
       'rhad1', 'rphi', 'target', 'trans_TRT_PID', 'weta2', 'wtots1'],
      dtype='<U25')

In [41]:
# Using the filter provided by Micael. This indicates that the medium data is already the
# result of this filter applied on vloose (I believe it is applied on vloose).
medium_keep_target = (medium_data['target'] == 1) & (medium_data['el_lhmedium'] == 1)
medium_keep_other = (medium_data['target'] != 1) & (medium_data['el_lhvloose'] != 1)
filtered_medium = medium_data.loc[medium_keep_target | medium_keep_other]
filtered_medium_total = filtered_medium.shape[0]
# Comparing frames with different row labels raises ValueError, so only compare
# element-wise when the filter kept every row; otherwise report False.
filter_keeps_everything = (
    filtered_medium.shape == medium_data.shape
    and (filtered_medium == medium_data).all().all()
)
filtered_medium.shape, filter_keeps_everything

((5469197, 155), True)

In [15]:
# For each offline working point: is 'target' identical to the el_lh<level> column?
tuple(
    (medium_data['target'] == medium_data[f'el_lh{level}']).all()
    for level in ('tight', 'medium', 'loose', 'vloose')
)

(True, True, False, False)

In [38]:
# For each offline working point: is the el_lh<level> column entirely zero?
tuple(
    (medium_data[f'el_lh{level}'] == 0).all()
    for level in ('tight', 'medium', 'loose', 'vloose')
)

(False, False, True, True)

In [16]:
# For each working point: is 'target' identical to the matching trigger decision column?
tuple(
    (medium_data['target'] == medium_data[triger_criterias[level]]).all()
    for level in ('tight', 'medium', 'loose', 'vloose')
)

(False, False, False, False)

In [19]:
# Load the same (et, eta) bin from the vloose dataset.
vloose_path = loose_filepath.format(et=et, eta=eta)
vloose_data = kepler_load(vloose_path)
# Number of samples (rows) in the bin.
vloose_total = len(vloose_data)
vloose_data.shape

(5599840, 216)

### VLoose

In [49]:
# Sorted column names, leaving out the many 'trig_L2_cl_ring*' columns.
non_ring_vloose_columns = [name for name in vloose_data.columns if not name.startswith('trig_L2_cl_ring')]
vloose_data_cols = np.sort(non_ring_vloose_columns)
vloose_data_cols

array(['L1_EM15VH', 'L1_EM15VHI', 'L1_EM20VH', 'L1_EM20VHI', 'L1_EM22VH',
       'L1_EM22VHI', 'L1_EM24VHI', 'L1_EM3', 'L1_EM7', 'RunNumber',
       'TDT__EFCalo__e28_lhtight_nod0_ivarloose',
       'TDT__EFCalo__e28_lhtight_nod0_noringer_ivarloose',
       'TDT__HLT__e28_lhtight_nod0_ivarloose',
       'TDT__HLT__e28_lhtight_nod0_noringer_ivarloose',
       'TDT__L1Calo__e28_lhtight_nod0_ivarloose',
       'TDT__L1Calo__e28_lhtight_nod0_noringer_ivarloose',
       'TDT__L2Calo__e28_lhtight_nod0_ivarloose',
       'TDT__L2Calo__e28_lhtight_nod0_noringer_ivarloose',
       'TDT__L2__e28_lhtight_nod0_ivarloose',
       'TDT__L2__e28_lhtight_nod0_noringer_ivarloose', 'avgmu',
       'el_TaP_Mass', 'el_TaP_deltaR', 'el_d0', 'el_d0significance',
       'el_deltaEta1', 'el_deltaPOverP', 'el_deltaPhi2',
       'el_deltaPhi2Rescaled', 'el_eProbabilityHT', 'el_eratio', 'el_et',
       'el_eta', 'el_etaBE2', 'el_f1', 'el_f3', 'el_hastrack',
       'el_lhloose', 'el_lhmedium', 'el_lhtight', 'el_l

In [22]:
# Peek at the first rows of one of the trig_L2_cl_lhvloose_* columns.
vloose_data['trig_L2_cl_lhvloose_et12to20'].head()

0    True
1    True
2    True
3    True
4    True
Name: trig_L2_cl_lhvloose_et12to20, dtype: bool

In [23]:
# Distinct values present in that column.
vloose_data['trig_L2_cl_lhvloose_et12to20'].unique()

array([ True, False])

In [24]:
# Row-wise agreement between the et0to12 and et12to20 columns:
# (agree on every row, agree on at least one row).
is_equal = vloose_data['trig_L2_cl_lhvloose_et0to12'].eq(vloose_data['trig_L2_cl_lhvloose_et12to20'])
(is_equal.all(), is_equal.any())

(False, True)

In [25]:
# For each offline working point: is 'target' identical to the el_lh<level> column?
tuple(
    (vloose_data['target'] == vloose_data[f'el_lh{level}']).all()
    for level in ('tight', 'medium', 'loose', 'vloose')
)

(False, False, False, True)

In [44]:
# Same filter as used on the medium data, applied to the vloose data:
# keep target==1 rows passing el_lhmedium and target!=1 rows failing el_lhvloose.
vloose_keep_target = (vloose_data['target'] == 1) & (vloose_data['el_lhmedium'] == 1)
vloose_keep_other = (vloose_data['target'] != 1) & (vloose_data['el_lhvloose'] != 1)
filtered_vloose = vloose_data.loc[vloose_keep_target | vloose_keep_other]
filtered_vloose_total = len(filtered_vloose)
filtered_vloose.shape

(5469055, 216)

In [45]:
# `ref_matrix` is not defined anywhere in this notebook, so load the reference of the
# selected (et, eta) bin directly; this keeps the cell runnable on a fresh kernel.
selected_ref = gaugi_load(reffilepath.format(et=et, eta=eta))
# Total reference samples = background total + signal total (tight cut-based entry).
ref_total = selected_ref['bkgRef']['tight_cutbased']['total'] + selected_ref['sgnRef']['tight_cutbased']['total']
ref_total, medium_total, vloose_total, filtered_vloose_total

(5420314, 5469197, 5599840, 5469055)