In [1]:
import uproot
import awkward as ak
import numpy as np
import matplotlib.pyplot as plt

import time
import gc
import yaml
import json
from glob import glob
# from memory_profiler import profile

In [2]:
# Profiling helpers: memory_profiler supplies %memit, line_profiler supplies %lprun.
%load_ext memory_profiler
%load_ext line_profiler
# Re-import edited modules automatically so changes to local .py files are picked up.
%load_ext autoreload
%autoreload 2

### loading data

In [3]:
# Single ROOT file and tree name used by the loading / benchmarking cells below.
file_name, tree_name = 'data/ShuffleMergeSpectral_2.root', 'taus'

In [5]:
# Lazy view of the tree backed by an explicit array cache capped at "1 GB";
# keeping a handle on `cache` lets later cells inspect and clear it.
cache = uproot.LRUArrayCache("1 GB")
taus_lazy = uproot.lazy(
    f'{file_name}:{tree_name}',
    array_cache=cache,
)

In [6]:
taus_lazy

<Array [{run: 1, lumi: 118, ... 1]}] type='44436 * {"run": uint32, "lumi": uint3...'>

In [7]:
cache

<LRUArrayCache (1368636/1000000000 bytes full) at 0x2b67c659a970>

In [8]:
cache.clear()

In [9]:
cache

<LRUArrayCache (0/1000000000 bytes full) at 0x2b67c659a970>

### loading data [2]

In [9]:
# Eagerly read every branch of the tree into memory. With how='zip' the result is
# indexed as taus_zip['pfCand', 'pt'] rather than by the flat name 'pfCand_pt'
# used with the lazy array.
# The context manager closes the ROOT file once the arrays are materialised
# instead of leaving the handle open for the lifetime of the kernel.
with uproot.open(f'{file_name}:{tree_name}') as t:
    taus_zip = t.arrays(how='zip')

In [10]:
# for taus in batch_yielder:
#     break

In [11]:
# taus

### loading data [3]

In [12]:
# NOTE(review): `R` is not imported anywhere in the visible notebook; presumably it
# comes from `import ROOT as R` (PyROOT) — confirm and add it to the import cell,
# otherwise this cell raises NameError on a fresh kernel.
taus_r = R.RDataFrame(tree_name, file_name)

### benchmark means/std

#### RDataFrame

In [13]:
%%timeit
pt_mean = taus_r.Mean('pfCand_pt')
pt_mean.GetValue()

658 ms ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%timeit
pt_std = taus_r.StdDev('pfCand_pt')
pt_std.GetValue()

656 ms ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### lazy awkward

In [15]:
%%timeit
ak.mean(taus_lazy['pfCand_pt'])

657 µs ± 41.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
ak.std(taus_lazy['pfCand_pt'])

17.5 ms ± 194 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### zip awkward

In [17]:
%%timeit
ak.mean(taus_zip['pfCand', 'pt'])

508 µs ± 1.26 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%%timeit
ak.std(taus_zip['pfCand', 'pt'])

17.5 ms ± 157 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
%%timeit
ak.std(ak.flatten(taus_zip['pfCand', 'pt']))

16.7 ms ± 210 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### self-made awkward

In [20]:
%%timeit
# Hand-rolled population standard deviation: sqrt(sum((x - mean)^2) / N),
# with the mean, the sum and the count all taken over the lazy pfCand_pt array.
pt_mean = ak.mean(taus_lazy['pfCand_pt'])
pt_std = np.sqrt(ak.sum((taus_lazy['pfCand_pt'] - pt_mean)**2) / ak.count(taus_lazy['pfCand_pt']))

17.7 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### compare ShuffleMerge (S&M) files

In [5]:
# Read all branches of ShuffleMerge file 1 eagerly; the `with` block closes the
# ROOT file as soon as the arrays are in memory instead of leaking the handle.
with uproot.open(f'data/ShuffleMergeSpectral_1.root:{tree_name}') as _tree:
    taus_1 = _tree.arrays(how='zip')

In [6]:
# Read all branches of ShuffleMerge file 3 eagerly; the `with` block closes the
# ROOT file as soon as the arrays are in memory instead of leaking the handle.
with uproot.open(f'data/ShuffleMergeSpectral_3.root:{tree_name}') as _tree:
    taus_3 = _tree.arrays(how='zip')

In [18]:
ak.count(taus_1), ak.count(taus_3)

(296449347, 283203975)

In [20]:
len(taus_1), len(taus_3)

(191700, 183251)

In [15]:
ak.mean(taus_1['pfCand', 'pt']), ak.mean(taus_3['pfCand', 'pt'])

(6.531693611959568, 6.531911499813611)

In [16]:
ak.std(taus_1['pfCand', 'pt']), ak.std(taus_3['pfCand', 'pt'])

(35.57811990694886, 35.44534597127814)

In [21]:
ak.mean(taus_1['pfCand', 'eta']), ak.mean(taus_3['pfCand', 'eta'])

(-0.0028388169881038566, -0.0022471090837211266)

In [22]:
ak.std(taus_1['pfCand', 'eta']), ak.std(taus_3['pfCand', 'eta'])

(1.259707597657328, 1.2584580628293074)

### full scale

In [4]:
from scaling import nested_dict, compute_mean, compute_std, compute_scaling

In [5]:
# Load the scaling configuration: run-level settings plus the per-variable
# scaling definitions (the settings are popped off in the next cell).
# NOTE(review): FullLoader is acceptable for a trusted local file; if this config
# could ever come from an untrusted source, switch to yaml.safe_load.
with open('scaling_definitions.yml') as f:
    scaling_dict = yaml.load(f, Loader=yaml.FullLoader)

In [6]:
# Split the loaded config: run-level settings are popped off so that only the
# per-variable-type scaling definitions remain in `scaling_dict` (later loops
# iterate over everything that is left in it).
# NOTE: pop() mutates `scaling_dict`, so re-running this cell without reloading
# the YAML first raises KeyError.
file_path = scaling_dict.pop('file_path')
file_range = scaling_dict.pop('file_range')
tree_name = scaling_dict.pop('tree_name')
log_step = scaling_dict.pop('log_step')
version = scaling_dict.pop('version')
selection_dict = scaling_dict.pop('selection')
grid_selection_dict = scaling_dict.pop('grid_selection')

# Check the type first: comparing a non-numeric log_step (e.g. a string) with 0
# would raise TypeError before the type test was ever reached.
assert type(log_step) is int and log_step > 0, \
    f'log_step must be a positive int, got {log_step!r}'
assert len(file_range)==2 and file_range[0]<=file_range[1], \
    f'file_range must be [start, stop] with start <= stop, got {file_range!r}'

# Sort the glob result so the slice picks the same files on every run.
file_names = sorted(glob(file_path))[file_range[0]:file_range[1]]
print(f'\n[INFO] will process {len(file_names)} input files from {file_path}')
print(f'[INFO] will dump means & stds to json after every {log_step} files:\n')


[INFO] will process 5 input files from data/ShuffleMergeSpectral*.root
[INFO] will dump means & stds to json after every 10 files:



In [7]:
file_names

['data/ShuffleMergeSpectral_1.root',
 'data/ShuffleMergeSpectral_2.root',
 'data/ShuffleMergeSpectral_3.root',
 'data/ShuffleMergeSpectral_4.root',
 'data/ShuffleMergeSpectral_5.root']

In [8]:
# Accumulators keyed by variable type -> variable -> grid type. Each leaf is an
# array with one slot per input file: running sum, running sum of squares and
# entry count. `means_stds` starts empty and is filled after the file loop.
sums = nested_dict()
sums2 = nested_dict()
counts = nested_dict()
means_stds = nested_dict()
for var_type, var_dict in scaling_dict.items():
    for var in var_dict:
        for grid_type in grid_selection_dict[var_type]:
            sums[var_type][var][grid_type] = np.zeros(len(file_names), dtype=np.float64)
            sums2[var_type][var][grid_type] = np.zeros(len(file_names), dtype=np.float64)
            counts[var_type][var][grid_type] = np.zeros(len(file_names), dtype=np.int64)

In [9]:
scaling_dict['ele']['ele_rel_pt']

[None, {'ele_rel_pt': 'ele_pt/tau_pt'}]

In [10]:
scaling_dict['global']['tau_dxy']

['(tau_dxy > -10) & (tau_dxy_error > 0)', None]

In [11]:
type(scaling_dict['ele']['ele_rel_pt'][0])

NoneType

#### compute sums

In [12]:
print('[INFO] starting to accumulate sums & counts:\n')
# `last_file_done` is advanced after every file so the printed duration is per
# file rather than cumulative.
program_starts = last_file_done = time.time()
for file_i, file_name in enumerate(file_names):
    # Presumably fills slot `file_i` of the per-file accumulators — see
    # scaling.compute_scaling for the details.
    compute_scaling(
        file_name, file_i, tree_name, scaling_dict, grid_selection_dict,
        sums, sums2, counts, means_stds, log_step, version,
    )
    # Presumably frees the arrays of the finished file before the next one is read.
    gc.collect()
    processed_file = time.time()
    print(f'---> processed {file_name} in {processed_file - last_file_done:.2f} s')
    last_file_done = processed_file

# Combine the per-file accumulators into a single mean/std for every variable
# and grid type.
for var_type, var_dict in scaling_dict.items():
    for var in var_dict:
        for grid_type in grid_selection_dict[var_type]:
            mean = compute_mean(
                sums[var_type][var][grid_type],
                counts[var_type][var][grid_type],
                aggregate=True,
            )
            std = compute_std(
                sums[var_type][var][grid_type],
                sums2[var_type][var][grid_type],
                counts[var_type][var][grid_type],
                aggregate=True,
            )
            means_stds[var_type][var][grid_type] = dict(mean=mean, std=std)

[INFO] starting to accumulate sums & counts:



  return sums.sum()/counts.sum()
  return np.sqrt(sums2.sum()/counts.sum() - (sums.sum()/counts.sum())**2)


---> processed data/ShuffleMergeSpectral_1.root in 13.21 s
---> processed data/ShuffleMergeSpectral_2.root in 3.60 s
---> processed data/ShuffleMergeSpectral_3.root in 12.40 s
---> processed data/ShuffleMergeSpectral_4.root in 18.86 s
---> processed data/ShuffleMergeSpectral_5.root in 15.88 s


In [13]:
means_stds['global']['npv']

defaultdict(<function scaling.nested_dict()>,
            {'inner': {'mean': 28.951920238477335, 'std': 9.490252281368086},
             'outer': {'mean': 28.951920238477335, 'std': 9.490252281368086}})

In [14]:
means_stds['ele']['ele_rel_pt']

defaultdict(<function scaling.nested_dict()>,
            {'inner': {'mean': 0.9702073845641698, 'std': 0.6239547663973649},
             'outer': {'mean': 0.3759259109226752, 'std': 1.3595947142965477}})

In [15]:
sums['ele']['ele_rel_pt']

defaultdict(<function scaling.nested_dict()>,
            {'inner': array([55984.50390625, 13103.36132812, 53713.3046875 , 82421.8203125 ,
                    68188.2421875 ]),
             'outer': array([2145.63818359,  467.59317017, 1827.57531738, 3198.37402344,
                    2652.54296875])})

In [16]:
counts['ele']['ele_rel_pt']

defaultdict(<function scaling.nested_dict()>,
            {'inner': array([57799, 13551, 55563, 84645, 70249]),
             'outer': array([5670, 1255, 5411, 8179, 6862])})

#### dependence on the number of files

In [17]:
grid_type = 'outer'

In [18]:
compute_mean(sums['ele']['ele_rel_pt'][grid_type], counts['ele']['ele_rel_pt'][grid_type], True)

0.3759259109226752

In [19]:
compute_mean(sums['ele']['ele_rel_pt'][grid_type], counts['ele']['ele_rel_pt'][grid_type], False)

array([0.37841943, 0.3725842 , 0.33775186, 0.39104707, 0.38655537])

In [20]:
compute_mean(sums['ele']['ele_rel_pt'][grid_type], counts['ele']['ele_rel_pt'][grid_type], True, 0, 5)

0.3759259109226752

In [21]:
compute_std(sums['global']['npv'][grid_type], sums2['global']['npv'][grid_type], counts['global']['npv'][grid_type], True)

9.490252281368086

In [22]:
compute_std(sums['global']['npv'][grid_type], sums2['global']['npv'][grid_type], counts['global']['npv'][grid_type], False)

array([9.49278462, 9.45873812, 9.4706287 , 9.4973786 , 9.50097665])

In [23]:
compute_std(sums['global']['npv'][grid_type], sums2['global']['npv'][grid_type], counts['global']['npv'][grid_type], True, 0, 5)

9.490252281368086

#### validate [one array]

In [24]:
file_id = 2

In [25]:
# Read the file chosen by `file_id` eagerly for the cross-check; the context
# manager closes the ROOT file once the arrays are in memory instead of leaving
# the handle open.
with uproot.open(f'{file_names[file_id]}:{tree_name}') as t:
    taus = t.arrays(how='zip')

In [26]:
# Rebuild the derived variable by hand, mirroring the config definition
# 'ele_pt/tau_pt' shown above for ele_rel_pt. Adds a new field to `taus` in place.
taus['ele', 'rel_pt'] = taus['ele', 'pt'] / taus['tau_pt']

In [27]:
# Hand-written selection for the 'inner' grid: keep `ele` entries whose eta and phi
# both lie within 0.11 of the tau's.
# NOTE(review): the phi difference is not wrapped into [-pi, pi] and 0.11 is
# hard-coded — confirm both match the 'inner' entry of grid_selection in the config.
grid_type = 'inner'
mask = (abs(taus['ele', 'eta'] - taus['tau_eta']) < 0.11) & (abs(taus['ele', 'phi'] - taus['tau_phi']) < 0.11)

In [28]:
# Keys of the accumulator entry that the hand-computed numbers are compared against.
var_type, var = 'ele', 'ele_rel_pt'

In [29]:
ak.mean(taus[var_type, 'rel_pt'][mask]), ak.std(taus[var_type, 'rel_pt'][mask])

(0.9667099452423376, 0.5659782783184839)

In [30]:
compute_mean(sums[var_type][var][grid_type], counts[var_type][var][grid_type], False)[file_id]

0.9667099452423376

In [31]:
compute_std(sums[var_type][var][grid_type], sums2[var_type][var][grid_type], counts[var_type][var][grid_type], False)[file_id]

0.5659783669268011

#### validate [all array]

In [32]:
# Lazy view over all selected files for the full-sample cross-check.
# The tree name is given explicitly for every file (as in the other loading cells)
# so the call does not depend on each file containing exactly one TTree.
cache = uproot.LRUArrayCache("1 GB")
taus_lazy = uproot.lazy(
    [f'{path}:{tree_name}' for path in file_names],
    array_cache=cache,
)

In [33]:
# Entry to cross-check against the lazy array spanning all files.
var_type, var, grid_type = 'global', 'npv', 'inner'

In [34]:
ak.mean(taus_lazy[var]), ak.std(taus_lazy[var])

(28.951920238477335, 9.490252281368088)

In [35]:
means_stds[var_type][var][grid_type]

{'mean': 28.951920238477335, 'std': 9.490252281368086}

#### write to json

In [142]:
# Persist the aggregated means/stds; the output file name carries the config version.
with open(f'output/means_stds_v{version}_dev.json', 'w') as fout:
    fout.write(json.dumps(means_stds))