# Bin analysis

Notebook for analyzing the bin division on the datasets

In [1]:
import os
import json
from itertools import product
import numpy as np
import pandas as pd

In [6]:
# Configuration: dataset name and the paths derived from it.
ets = np.arange(5)   # bin indices 0..4
etas = np.arange(5)  # bin indices 0..4
dataset = 'data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM1.bkg.VProbes_EGAM7.GRL_v97'
new_dataset = 'extra_bins_' + dataset
homepath = os.path.expanduser('~')
datapath = os.path.join(homepath, 'data', dataset)
# Template for the per-bin input files; fill {et} and {eta} with str.format.
filepath = os.path.join(datapath, dataset + '_et{et}_eta{eta}.npz')
output_dir = os.path.join(datapath, 'bin_analysis')
# exist_ok=True makes the call a no-op when the directory is already there,
# without a separate existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)
# Template for the per-bin JSON outputs; fill {et} and {eta} with str.format.
outpath = os.path.join(output_dir, 'et{et}_eta{eta}_statiscs.json')

In [3]:
# Load every array of the et=0, eta=0 bin file into a plain dict for inspection.
# The context manager closes the .npz archive once the arrays are in memory.
with np.load(filepath.format(et=0, eta=0)) as npz_file:
    data = dict(npz_file)

In [4]:
# Display the array stored under the 'features' key of the loaded file.
data['features']

array(['avgmu', 'L2Calo_ring_0', 'L2Calo_ring_1', 'L2Calo_ring_2',
       'L2Calo_ring_3', 'L2Calo_ring_4', 'L2Calo_ring_5', 'L2Calo_ring_6',
       'L2Calo_ring_7', 'L2Calo_ring_8', 'L2Calo_ring_9',
       'L2Calo_ring_10', 'L2Calo_ring_11', 'L2Calo_ring_12',
       'L2Calo_ring_13', 'L2Calo_ring_14', 'L2Calo_ring_15',
       'L2Calo_ring_16', 'L2Calo_ring_17', 'L2Calo_ring_18',
       'L2Calo_ring_19', 'L2Calo_ring_20', 'L2Calo_ring_21',
       'L2Calo_ring_22', 'L2Calo_ring_23', 'L2Calo_ring_24',
       'L2Calo_ring_25', 'L2Calo_ring_26', 'L2Calo_ring_27',
       'L2Calo_ring_28', 'L2Calo_ring_29', 'L2Calo_ring_30',
       'L2Calo_ring_31', 'L2Calo_ring_32', 'L2Calo_ring_33',
       'L2Calo_ring_34', 'L2Calo_ring_35', 'L2Calo_ring_36',
       'L2Calo_ring_37', 'L2Calo_ring_38', 'L2Calo_ring_39',
       'L2Calo_ring_40', 'L2Calo_ring_41', 'L2Calo_ring_42',
       'L2Calo_ring_43', 'L2Calo_ring_44', 'L2Calo_ring_45',
       'L2Calo_ring_46', 'L2Calo_ring_47', 'L2Calo_ring_48',
       

In [7]:
# Per-bin summary. For every (et, eta) bin file this records the observed range
# of the 'L2Calo_et' column, the range of |'L2Calo_eta'| and the number of
# samples per target value. The ranges are kept as a DataFrame in dfs[et][eta]
# and everything is written to one JSON file per bin.
dfs = [[None for _ in etas] for _ in ets]
for et, eta in product(ets, etas):
    print(f'At et: {et} eta: {eta}')

    # Read only the arrays that are needed and close the archive right away,
    # so no file handle is left open across the iterations.
    with np.load(filepath.format(et=et, eta=eta)) as npz_file:
        feature_names = npz_file['features']
        samples = npz_file['data']
        targets, counts = np.unique(npz_file['target'], return_counts=True)

    # Column lookup by feature name; raises IndexError if the name is absent.
    et_index = np.where(feature_names == 'L2Calo_et')[0][0]
    et_column = samples[:, et_index]
    min_et, max_et = et_column.min(), et_column.max()

    # The eta range is taken on absolute values.
    eta_index = np.where(feature_names == 'L2Calo_eta')[0][0]
    eta_column = np.abs(samples[:, eta_index])
    min_eta, max_eta = eta_column.min(), eta_column.max()

    # Cast numpy scalars to builtin types so json can serialise them.
    json_data = {
        'et': {'min': float(min_et), 'max': float(max_et)},
        'eta': {'min': float(min_eta), 'max': float(max_eta)},
        'target_count': {int(target): int(count)
                         for target, count in zip(targets, counts)},
    }

    dfs[et][eta] = pd.DataFrame(
        [[min_et, max_et], [min_eta, max_eta]],
        columns=['min', 'max'],
        index=['et', 'eta'],
    )
    with open(outpath.format(et=et, eta=eta), 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

    # Release the large sample array before the next bin is loaded.
    del samples, et_column, eta_column

At et: 0 eta: 0
At et: 0 eta: 1
At et: 0 eta: 2
At et: 0 eta: 3
At et: 0 eta: 4
At et: 1 eta: 0
At et: 1 eta: 1
At et: 1 eta: 2
At et: 1 eta: 3
At et: 1 eta: 4
At et: 2 eta: 0
At et: 2 eta: 1
At et: 2 eta: 2
At et: 2 eta: 3
At et: 2 eta: 4
At et: 3 eta: 0
At et: 3 eta: 1
At et: 3 eta: 2
At et: 3 eta: 3
At et: 3 eta: 4
At et: 4 eta: 0
At et: 4 eta: 1
At et: 4 eta: 2
At et: 4 eta: 3
At et: 4 eta: 4


In [11]:
# Configuration: switch to the second dataset and rebuild the derived paths.
ets = np.arange(5)   # bin indices 0..4
etas = np.arange(5)  # bin indices 0..4
dataset = 'mc16_13TeV.302236_309995_341330.sgn.boosted_probes.WZ_llqq_plus_radion_ZZ_llqq_plus_ggH3000.merge.25bins.v2'
new_dataset = 'extra_bins_' + dataset
homepath = os.path.expanduser('~')
datapath = os.path.join(homepath, 'data', dataset)
# Template for the per-bin input files; fill {et} and {eta} with str.format.
filepath = os.path.join(datapath, dataset + '_et{et}_eta{eta}.npz')
output_dir = os.path.join(datapath, 'bin_analysis')
# exist_ok=True makes the call a no-op when the directory is already there,
# without a separate existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)
# Template for the per-bin JSON outputs; fill {et} and {eta} with str.format.
outpath = os.path.join(output_dir, 'et{et}_eta{eta}_statiscs.json')

In [12]:
# Per-bin summary. For every (et, eta) bin file this records the observed range
# of the 'el_et' column, the range of |'el_eta'| and the number of samples per
# target value. The ranges are kept as a DataFrame in dfs[et][eta] and
# everything is written to one JSON file per bin.
dfs = [[None for _ in etas] for _ in ets]
for et, eta in product(ets, etas):
    print(f'At et: {et} eta: {eta}')

    # Read only the arrays that are needed and close the archive right away,
    # so no file handle is left open across the iterations.
    with np.load(filepath.format(et=et, eta=eta)) as npz_file:
        feature_names = npz_file['features_float']
        samples = npz_file['data_float']
        targets, counts = np.unique(npz_file['target'], return_counts=True)

    # Column lookup by feature name; raises IndexError if the name is absent.
    et_index = np.where(feature_names == 'el_et')[0][0]
    et_column = samples[:, et_index]
    min_et, max_et = et_column.min(), et_column.max()

    # The eta range is taken on absolute values.
    eta_index = np.where(feature_names == 'el_eta')[0][0]
    eta_column = np.abs(samples[:, eta_index])
    min_eta, max_eta = eta_column.min(), eta_column.max()

    # Cast numpy scalars to builtin types so json can serialise them.
    json_data = {
        'et': {'min': float(min_et), 'max': float(max_et)},
        'eta': {'min': float(min_eta), 'max': float(max_eta)},
        'target_count': {int(target): int(count)
                         for target, count in zip(targets, counts)},
    }

    dfs[et][eta] = pd.DataFrame(
        [[min_et, max_et], [min_eta, max_eta]],
        columns=['min', 'max'],
        index=['et', 'eta'],
    )
    with open(outpath.format(et=et, eta=eta), 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

    # Release the large sample array before the next bin is loaded.
    del samples, et_column, eta_column

At et: 0 eta: 0
At et: 0 eta: 1
At et: 0 eta: 2
At et: 0 eta: 3
At et: 0 eta: 4
At et: 1 eta: 0
At et: 1 eta: 1
At et: 1 eta: 2
At et: 1 eta: 3
At et: 1 eta: 4
At et: 2 eta: 0
At et: 2 eta: 1
At et: 2 eta: 2
At et: 2 eta: 3
At et: 2 eta: 4
At et: 3 eta: 0
At et: 3 eta: 1
At et: 3 eta: 2
At et: 3 eta: 3
At et: 3 eta: 4
At et: 4 eta: 0
At et: 4 eta: 1
At et: 4 eta: 2
At et: 4 eta: 3
At et: 4 eta: 4
