In [1]:
import parsl
import os
import logging
from tqdm import tqdm
import json
import time

import numpy as np

from parsl.app.app import python_app

from coffea.util import save, load
from coffea import hist
from coffea.processor import run_parsl_job
from coffea.processor.parsl.parsl_executor import parsl_executor

# Forwarded as the `chunksize` argument of run_parsl_job below.
# NOTE(review): presumably the maximum number of events per work item — confirm against coffea.
chunksize = 500000

In [2]:
# Pick the Parsl configuration to run with. Exactly one of the imports below
# should be active; the commented-out lines are the alternatives
# (presumably one per site / batch system — confirm against the configs package).
# from configs.unl_condor import config
# from configs.unl_slurm import config
from configs.notre_dame import config

# Register the chosen configuration with Parsl. The call returns the
# DataFlowKernel, which is what this cell displays as its output.
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7f5ea0bfdb70>

In [3]:
# Parse the sample-file catalogue, then keep only the 'Hbb_2017' entry;
# that entry is what the job-submission cell passes on as `datasets`.
with open('metadata/samplefiles.json') as samples_file:
    sample_catalog = json.load(samples_file)

datasets = sample_catalog['Hbb_2017']

In [4]:
# Start the wall-clock timer; loading the processor below is part of the
# measured interval, exactly as the job submission itself is.
start = time.time()

# deal with mixed skims and full derived trees
treenames = ['otree', 'Events']

# the hbb analysis worker
hbb_processor = load('boostedHbbProcessor.coffea')

# NOTE(review): 'config' is None here, presumably because the Parsl config was
# already loaded and the active DataFlowKernel is handed over explicitly —
# confirm against run_parsl_job.
final_accumulator = run_parsl_job(
    datasets,
    treenames,
    hbb_processor,
    parsl_executor,
    executor_args={'config': None},
    data_flow=parsl.dfk(),
    chunksize=chunksize,
)

# Elapsed wall-clock time in seconds.
dt = time.time() - start

parsl version: 0.8.0


Preprocessing: 100%|██████████| 324/324 [01:38<00:00,  3.30files/s] 
Processing: 100%|██████████| 791/791 [06:19<00:00,  2.08items/s] 


In [5]:
# Summarise the run: event throughput, and how sparsely the histograms are populated.
num_events = sum(parsl_executor.counts.values())

# Gather the sum-of-weights arrays of every histogram in the accumulator once;
# accumulator values that are not hist.Hist instances are skipped.
# NOTE(review): `_sumw` is a private attribute of hist.Hist — confirm it is
# still available in the coffea version in use.
sumw_arrays = [
    arr
    for h in final_accumulator.values()
    if isinstance(h, hist.Hist)
    for arr in h._sumw.values()
]

# Total number of bins across all histograms.
nbins = sum(arr.size for arr in sumw_arrays)
# Number of bins with strictly positive content (a bin whose content is
# negative is not counted here).
nfilled = sum(np.sum(arr > 0) for arr in sumw_arrays)

print('processed: {:,d} events'.format(num_events))
print('total time: {:.2f} min'.format(dt / 60))
if num_events > 0 and dt > 0:
    print('{:.2f} μs/event'.format(dt / num_events * 1e6))
    print('{:.2f} Mevent/s'.format(num_events / dt / 1e6))
else:
    # Avoid ZeroDivisionError when nothing was processed.
    print('throughput: n/a (no events processed)')

# The value printed is the TOTAL bin count in millions; the previous label
# ("filled ... bins") described it as filled bins and omitted the unit.
print("total bins: {:.1f} M".format(nbins / 1e6))
if nbins > 0:
    print("nonzero bins: {:.1f}%".format(100 * nfilled / nbins))
else:
    # Avoid ZeroDivisionError when the accumulator holds no histograms.
    print("nonzero bins: n/a (no histograms in the accumulator)")


processed: 290,837,857 events
total time: 8.19 min
1.69 μs/event
0.59 Mevent/s
filled 298.5 bins
nonzero bins: 0.8%


In [6]:
# Persist the merged accumulator to 'hists.coffea' using coffea.util.save
# (the same module whose `load` is used to read the processor in this notebook).
save(final_accumulator, 'hists.coffea')

In [7]:
# Tear down Parsl at the end of the notebook: first clean up the active
# DataFlowKernel, then clear the loaded configuration.
# NOTE(review): presumably parsl.load(config) must be called again before any further jobs — confirm.
parsl.dfk().cleanup()
parsl.clear()