In [1]:
import uproot4

In [2]:
%load_ext snakeviz

`performance_study.root` is the file containing the tag-and-probe (TnP) MC ntuples.

In [3]:
# Open the ROOT file with uproot4; the returned handle is indexed by object name.
file = uproot4.open('performance_study.root')

In [4]:
# Fetch the 'Events' object from the opened file; the single-file reads in this notebook go through `tree`.
tree = file['Events']

Measure how the I/O time scales with the number of branches extracted

In [6]:
%%time
#%%snakeviz
# Baseline measurement: read one branch ('mass') in full and let %%time report the cost.
print('Extracting one branch')

# Index the tree by branch name, then materialise that branch as an array.
mass = tree['mass'].array()
print(mass)

Extracting one branch
[56, 77, 68.2, 81.5, 90.4, 93.7, 91.3, ... 112, 77.3, 90.8, 98.3, 90.8, 95.4, 89.9]
CPU times: user 848 ms, sys: 130 ms, total: 978 ms
Wall time: 1.2 s


In [7]:
%%time
#%%snakeviz
# Time a multi-branch read: every branch whose name matches the wildcard pattern below.
flt = 'probeFull*'
# Report how many branch names the filter selects before reading them.
print('Extracting {} branches'.format(len(tree.keys(filter_name=flt))))

# Read all matching branches with a single arrays() call.
probes = tree.arrays(filter_name=flt)
print(probes)

Extracting 9 branches
[{probeFull5x5_e1x5: 10.8, probeFull5x5_e2x5: 14.5, ... probeFull5x5_r9: 0.982}]
CPU times: user 4.55 s, sys: 810 ms, total: 5.36 s
Wall time: 5.36 s


In [22]:
%%time
#%%snakeviz
# Same measurement as for 'probeFull*', with a broader wildcard pattern.
flt = 'probe*'
# Report how many branch names the filter selects before reading them.
print('Extracting {} branches'.format(len(tree.keys(filter_name=flt))))

# Read all matching branches with a single arrays() call.
probes = tree.arrays(filter_name=flt)
print(probes)

Extracting 71 branches
[{probeInitialEnergy: 35.3, probeEnergy: 35.4, ... probeEtaWidth: 0.0127}]
CPU times: user 28.2 s, sys: 3.18 s, total: 31.4 s
Wall time: 31.4 s


In [23]:
%%time
probe_keys = [key for key in tree.keys() if key.startswith('probe')]
print('Extracting {} branches separately'.format(len(probe_keys)))

probes = []
for key in probe_keys:
    probes.append(tree[key].array())

Extracting 71 branches separately
CPU times: user 28.7 s, sys: 614 ms, total: 29.3 s
Wall time: 29.3 s


In [9]:
%%time
#%%snakeviz
# Upper end of the scaling study: no filter, so every key of the tree is counted and read.
print('Extracting all {} branches'.format(len(tree.keys())))

# arrays() without arguments reads the whole tree in one call.
branches = tree.arrays()
print(branches)

Extracting all 152 branches
[{candidate_id: 0, weight: 0.0481, mass: 56, pt: 77, ... npu: 40.4, puweight: 1}]
CPU times: user 59.2 s, sys: 9.33 s, total: 1min 8s
Wall time: 1min 8s


Define new branches as functions of existing ones

In [13]:
%%time
squared_mass = tree.arrays(['squared_mass', 'rho', 'lumi'], aliases = {'squared_mass': 'mass**2'})

print(squared_mass)

[{squared_mass: 3.13e+03, rho: 22.9, lumi: 927273}, ... rho: 21.3, lumi: 1603529}]
CPU times: user 1.91 s, sys: 139 ms, total: 2.05 s
Wall time: 2.04 s


Extract branches after applying a cut that is a function of branches I don't want to extract

In [17]:
%%time
#%%snakeviz
flt = 'probe*'
cut = 'mass > 60 and -5 < eta < -1'
print('Extracting {} branches'.format(len(tree.keys(filter_name=flt))))

probes = tree.arrays(filter_name=flt)
print(probes)

Extracting 71 branches
[{probeInitialEnergy: 35.3, probeEnergy: 35.4, ... probeEtaWidth: 0.0127}]
CPU times: user 28.3 s, sys: 1.99 s, total: 30.2 s
Wall time: 30.2 s


Use `iterate` to separate the file into batches

In [25]:
%%time

# Walk the tree in batches whose size is given as a memory budget ("8 GB").
# In the recorded run a single batch was printed.
for batch in tree.iterate(step_size="8 GB"):
    # repr() shows the array summary (length and type) rather than its contents.
    print(repr(batch))

<Array [{candidate_id: 0, ... puweight: 1}] type='20975999 * {"candidate_id": in...'>
CPU times: user 59.4 s, sys: 11.2 s, total: 1min 10s
Wall time: 1min 10s


Study `TChain`-like behavior

In [26]:
# Data
# file_name has three placeholders: run letter, version tag, and file index.
# NOTE(review): base_dir, file_name, number and tree_path are reassigned by the
# "Simulation" settings below, so after this cell runs only `runs_id` still holds
# a data-specific value.
base_dir = '/work/gallim/root_files/tnp_original/20201130_data_UL18'
file_name = 'output_EGamma_alesauva-UL2018_0-10_6_4-v0-Run2018{}-12Nov2019_UL2018-{}-981b04a73c9458401b9ffd78fdd24189_USER_{}.root'
number = 500
tree_path = 'tagAndProbeDumper/trees/Data_13TeV_All'
# (run letter, version tag) pairs substituted into the first two placeholders.
runs_id = [('A', 'v2'), ('B', 'v2'), ('C', 'v2'), ('D', 'v4')]

# Simulation
# file_name has a single placeholder: the file index.
base_dir = '/work/gallim/root_files/tnp_original/20201130_mc_UL18'
file_name = 'output_DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8_alesauva-UL2018_0-10_6_4-v0-RunIISummer19UL18MiniAOD-106X_upgrade2018_realistic_v11_L1v1-v2-b5e482a1b1e11b6e5da123f4bf46db27_USER_{}.root'
number = 500
tree_path = 'tagAndProbeDumper/trees/DYJetsToLL_amcatnloFXFX_13TeV_All'


def get_all_files(base_dir, file_name, number, tree_path, runs_id = None):
    """Build the list of "path/to/file.root:tree_path" strings for a dataset.

    Parameters
    ----------
    base_dir : str
        Directory prefix, joined to the file name with '/'.
    file_name : str
        Format template. It is filled with (run[0], run[1], index) when
        ``runs_id`` is non-empty, and with (index,) otherwise.
    number : int
        Upper bound (exclusive) of the file index.
    tree_path : str
        Object path appended after ':' to every entry.
    runs_id : sequence or None
        Per-run items; elements 0 and 1 of each item are substituted into the
        template. ``None`` and an empty sequence are treated alike.

    Returns
    -------
    list of str
        With ``runs_id``: one entry per (run, index), runs in the outer order,
        indices 0 .. number-1. Without: one entry per index 1 .. number-1.
        NOTE(review): the two modes start counting at different indices (0 vs 1);
        confirm this matches how the files are numbered on disk.
    """
    def _entry(*fields):
        # Assemble one "<dir>/<file>:<tree>" string from the template fields.
        return base_dir + '/' + file_name.format(*fields) + ':' + tree_path

    # Single-placeholder mode: no run information supplied.
    if not runs_id:
        return [_entry(index) for index in range(1, number)]

    # Three-placeholder mode: every run crossed with every file index.
    return [
        _entry(run[0], run[1], index)
        for run in runs_id
        for index in range(number)
    ]

In [None]:
%%time
# MC
# No runs_id is passed, so get_all_files fills the template with the file index only.
# uproot4.iterate receives the resulting list of "file:tree" strings and yields batches.
for batch in uproot4.iterate(get_all_files(base_dir, file_name, number, tree_path)):
    # repr() shows the array summary (length and type) rather than its contents.
    print(repr(batch))

In [33]:
%%time
# Build one array over the whole file list with uproot4.lazy instead of iterating in batches.
df = uproot4.lazy(get_all_files(base_dir, file_name, number, tree_path))

In [35]:
%%time
df = uproot.lazy(get_all_files(base_dir, file_name, number, tree_path))

CPU times: user 1min 37s, sys: 1.39 s, total: 1min 38s
Wall time: 1min 38s


In [36]:
# Probe the lazy array for a `cache` attribute. In the recorded run this raises
# AttributeError ("no field named 'cache'"), i.e. the name is looked up as a field.
df.cache

AttributeError: no field named 'cache'

(https://github.com/scikit-hep/awkward-1.0/blob/1.0.0/src/awkward/highlevel.py#L1066)