In [70]:
import uproot
from deserialization_hacks import tree_arrays
import awkward as ak
import numpy as np
import time
import os
import h5py
from functools import partial
import timeit

In [2]:
def filter_branch(branch):
    """Return True if ``branch`` should be read, judged by its name only.

    A branch is accepted when ``branch.name`` contains ``"Aux"`` and none of
    the exclusion rules below apply.

    Parameters
    ----------
    branch : object with a ``name`` attribute (a string)

    Returns
    -------
    bool
    """
    name = branch.name

    # everything that is not an "Aux" branch is rejected outright
    if "Aux" not in name:
        return False

    # the following don't contain data (in split files)
    if name.endswith((".", "Base")) or "SG::" in name:
        return False

    # "DescrTags" branches are often empty
    # see https://github.com/scikit-hep/uproot4/issues/126
    # -> now fixed, but my custom deserialization does not work yet with them
    return "DescrTags" not in name

In [40]:
def regroup(array_dict):
    """Regroup a flat ``{branch_name: array}`` mapping into nested records.

    Keys are grouped by the part of the name before the first ``"."``. Each
    group is zipped into one record array whose fields are the remainders of
    the key names, and stored under the group name with ``"AuxDyn"`` /
    ``"Aux"`` removed. All ``AnalysisHLT*`` groups are then replaced by a
    single ``TrigMatchedObjects`` record with one field per such group.

    Parameters
    ----------
    array_dict : mapping of str -> array
        Keys are expected to look like ``"<Container>Aux[Dyn].<field>"``.

    Returns
    -------
    The result of ``ak.zip(regrouped, depth_limit=1)``: one field per group
    plus the ``TrigMatchedObjects`` field.
    """
    regrouped = {}
    # NOTE(review): iteration order of a set is arbitrary. If both "<X>Aux" and
    # "<X>AuxDyn" prefixes are present they map to the same output key "<X>"
    # below, so whichever is visited last overwrites the other.
    for k_top in set(k.split(".")[0] for k in array_dict):
        if k_top == "EventInfoAux":
            # skip that for now - let's use EventInfoAuxDyn
            continue
        if k_top == "EventInfoAuxDyn":
            k_top = "EventInfoAux"
        # zip will put together jagged arrays with common offsets
        def ak_zip(depth_limit=2):
            """Zip all arrays whose key contains ``k_top`` into one record array."""
            # NOTE(review): `k_top in k` is a substring test, not a prefix test.
            # With k_top == "EventInfoAux" it selects both "EventInfoAux.<f>" and
            # "EventInfoAuxDyn.<f>" keys; for the latter, replace(...)[1:] yields
            # the field name "yn.<f>". Confirm this is intended.
            return ak.zip(
                {k.replace(k_top, "")[1:] : array_dict[k] for k in array_dict if k_top in k},
                depth_limit=depth_limit
            )
        # for some containers this will work 2 levels, for some only up to 1
        try:
            v = ak_zip(depth_limit=2)
        except ValueError:
            v = ak_zip(depth_limit=1)
        # output key is the group name without the "AuxDyn" / "Aux" marker
        regrouped[k_top.replace("AuxDyn", "").replace("Aux", "")] = v
    # let's restructure such that we get TrigMatchedObjects.<trigger-name>
    # instead of AnalysisHLT_<trigger_name>.TrigMatchedObjects
    trig_matched_objects = ak.zip(
        {
            k.replace("AnalysisHLT_", "") : regrouped[k].TrigMatchedObjects
            for k in regrouped if "AnalysisHLT" in k
        },
        depth_limit=1
    )
    # iterate over a copy of the keys because entries are removed in the loop
    for k in list(regrouped.keys()):
        if "AnalysisHLT" in k:
            regrouped.pop(k)
    regrouped["TrigMatchedObjects"] = trig_matched_objects
    return ak.zip(regrouped, depth_limit=1)

In [19]:
# Input ROOT files for the read benchmark.
# `rootfile` is given relative to the notebook's working directory.
rootfile = "user.nihartma.22884623.EXT0._000001.DAOD_PHYSLITE.test.pool.root"
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere
# without editing it. The file name suggests zlib level 5 and a very large basket size
# (presumably the same events as `rootfile`, written with different settings -- confirm).
rootfile_large_baskets = "/home/nikolai/code/AODCompressedFileCreation/run/DAOD_PHYSLITE.zlib_lvl5_basket1073741824_flush10000_split99__AOD.14795494._005958/DAOD_PHYSLITE.DAOD_PHYSLITE.zlib_lvl5_basket1073741824_flush10000_split99__AOD.14795494._005958.pool.root"

In [20]:
%%time
# file from user.nihartma.physlite_test_ttbar_split99.001_EXT0
# Open the file, pick the "CollectionTree" tree and read the branches accepted
# by `filter_branch`. `tree_arrays` comes from the local `deserialization_hacks`
# module; presumably it returns a dict keyed by branch name (it is used as such
# by `regroup` and `ak.zip` below).
f = uproot.open(rootfile)
tree = f["CollectionTree"]
array_dict = tree_arrays(tree, filter_branch=filter_branch)

CPU times: user 6.85 s, sys: 25.9 ms, total: 6.87 s
Wall time: 6.88 s


In [21]:
# "Flat" layout: one top-level field per key of array_dict (depth_limit=1).
events = ak.zip(array_dict, depth_limit=1)

In [42]:
# "Regrouped" layout: the same arrays, nested into one record per group by `regroup`.
events_regrouped = regroup(array_dict)

In [57]:
# Write the flat layout to Parquet (read back later by benchmark_parquet).
ak.to_parquet(events, "events_benchmark.parquet")

In [58]:
# Write the regrouped layout to Parquet (read back later by benchmark_parquet).
ak.to_parquet(events_regrouped, "events_benchmark_regrouped.parquet")

In [59]:
def write_hdf(filename, events, compression="lzf"):
    """Store an awkward array in an HDF5 file.

    The array is decomposed with ``ak.to_buffers``. Every buffer becomes one
    dataset inside the group ``"awkward"``; the form (as JSON) and the length
    are stored as attributes of that group.

    Parameters
    ----------
    filename : path of the HDF5 file; opened in mode ``"w"``
    events : the array handed to ``ak.to_buffers``
    compression : forwarded to ``create_dataset`` for every buffer
    """
    form, length, container = ak.to_buffers(events)
    with h5py.File(filename, "w") as h5file:
        awkward_group = h5file.create_group("awkward")
        # one dataset per buffer, named after the buffer key
        for key, buffer in container.items():
            awkward_group.create_dataset(
                key,
                shape=buffer.shape,
                dtype=buffer.dtype,
                data=buffer,
                compression=compression,
            )
        # metadata needed to rebuild the array with ak.from_buffers
        awkward_group.attrs["form"] = form.tojson()
        awkward_group.attrs["length"] = length

In [60]:
# Flat layout, HDF5 with lzf compression.
write_hdf("events_benchmark_lzf.h5", events, compression="lzf")

In [61]:
# Regrouped layout, HDF5 with lzf compression.
write_hdf("events_benchmark_lzf_regrouped.h5", events_regrouped, compression="lzf")

In [62]:
# Flat layout, HDF5 with gzip compression.
write_hdf("events_benchmark_gzip.h5", events, compression="gzip")

In [63]:
# Regrouped layout, HDF5 with gzip compression.
write_hdf("events_benchmark_gzip_regrouped.h5", events_regrouped, compression="gzip")

In [97]:
def write_npz(filename, events):
    """Store an awkward array in a compressed ``.npz`` archive.

    The buffers returned by ``ak.to_buffers`` are saved under their own keys.
    Two extra entries, ``"form"`` (the form as JSON) and ``"length"``, are
    added as numpy arrays so the array can be rebuilt with ``ak.from_buffers``.

    Parameters
    ----------
    filename : target passed to ``np.savez_compressed``
    events : the array handed to ``ak.to_buffers``
    """
    form, length, container = ak.to_buffers(events)
    payload = dict(container)
    # metadata travels in the same archive, next to the buffers
    payload["form"] = np.array(form.tojson())
    payload["length"] = np.array(length)
    np.savez_compressed(filename, **payload)

In [98]:
# Flat layout, compressed npz archive.
write_npz("events_benchmark.npz", events)

In [100]:
# Regrouped layout, compressed npz archive.
write_npz("events_benchmark_regrouped.npz", events_regrouped)

In [64]:
def benchmark_root(rootfile):
    """Read the filtered branches of ``CollectionTree`` from a ROOT file.

    The file is opened in a ``with`` block so the handle is released after
    every call; the function is called repeatedly by ``timeit`` and previously
    left each file open. This also matches ``benchmark_hdf`` and
    ``benchmark_npz``, which close their files inside the timed region.

    Parameters
    ----------
    rootfile : path of the ROOT file, passed to ``uproot.open``

    Returns
    -------
    Whatever ``tree_arrays`` returns for the branches accepted by
    ``filter_branch``.
    """
    with uproot.open(rootfile) as f:
        tree = f["CollectionTree"]
        # NOTE(review): assumes tree_arrays has fully read the data before it
        # returns, so the result stays usable after the file is closed -- confirm.
        return tree_arrays(tree, filter_branch=filter_branch)

In [65]:
def benchmark_parquet(filename):
    """Read ``filename`` with ``ak.from_parquet`` and return the result."""
    loaded = ak.from_parquet(filename)
    return loaded

In [66]:
def benchmark_hdf(filename):
    """Rebuild an awkward array from an HDF5 file written by ``write_hdf``.

    Reads the form and length from the attributes of the ``"awkward"`` group,
    converts every dataset in that group with ``np.asarray`` and passes the
    three pieces to ``ak.from_buffers``.

    Parameters
    ----------
    filename : path of the HDF5 file; opened read-only

    Returns
    -------
    The array produced by ``ak.from_buffers``. It was previously bound to an
    unused local and dropped; it is returned now, like ``benchmark_parquet``
    and ``benchmark_npz`` do.
    """
    with h5py.File(filename, "r") as file:
        group = file["awkward"]
        # the conversion to numpy arrays happens here, while the file is still open
        buffers = {k: np.asarray(v) for k, v in group.items()}
        return ak.from_buffers(
            ak.forms.Form.fromjson(group.attrs["form"]),
            group.attrs["length"],
            buffers,
        )

In [123]:
def benchmark_npz(filename):
    """Rebuild an awkward array from an ``.npz`` archive written by ``write_npz``.

    All entries of the archive are copied into a dict. The ``"form"`` and
    ``"length"`` entries are removed from it and converted to a form object
    and an ``int``; the remaining entries are used as the buffer container.

    Parameters
    ----------
    filename : path passed to ``np.load``

    Returns
    -------
    The array produced by ``ak.from_buffers``.
    """
    with np.load(filename) as archive:
        buffers = {name: entry for name, entry in archive.items()}
        # the two metadata entries must not be passed on as buffers
        form = ak.forms.Form.fromjson(str(buffers.pop("form")))
        length = int(buffers.pop("length"))
        return ak.from_buffers(form, length, buffers)

In [77]:
# Number of top-level fields in the flat layout (one per key of array_dict).
len(events.fields)

1121

In [71]:
# Read time, Parquet, flat layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_parquet, "events_benchmark.parquet")).repeat(number=1, repeat=5))

0.569578504000674

In [72]:
# Read time, Parquet, regrouped layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_parquet, "events_benchmark_regrouped.parquet")).repeat(number=1, repeat=5))

0.5887951499971678

In [73]:
# Read time, HDF5 (gzip), flat layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_hdf, "events_benchmark_gzip.h5")).repeat(number=1, repeat=5))

1.9937425289972452

In [74]:
# Read time, HDF5 (gzip), regrouped layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_hdf, "events_benchmark_gzip_regrouped.h5")).repeat(number=1, repeat=5))

1.5731718110037036

In [75]:
# Read time, HDF5 (lzf), flat layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_hdf, "events_benchmark_lzf.h5")).repeat(number=1, repeat=5))

1.4563617479943787

In [76]:
# Read time, HDF5 (lzf), regrouped layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_hdf, "events_benchmark_lzf_regrouped.h5")).repeat(number=1, repeat=5))

1.091762331998325

In [126]:
# Read time, compressed npz, flat layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_npz, "events_benchmark.npz")).repeat(number=1, repeat=5))

1.9842722680041334

In [127]:
# Read time, compressed npz, regrouped layout: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_npz, "events_benchmark_regrouped.npz")).repeat(number=1, repeat=5))

1.5383685880005942

In [78]:
# Read time, original ROOT file via tree_arrays: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_root, rootfile)).repeat(number=1, repeat=5))

6.026773760997457

In [79]:
# Read time, ROOT file referenced by `rootfile_large_baskets`: fastest of 5 single runs, in seconds.
min(timeit.Timer(partial(benchmark_root, rootfile_large_baskets)).repeat(number=1, repeat=5))

4.355107371004124

In [128]:
!ls -lah events_benchmark*

-rw-r--r-- 1 nikolai nikolai 101M Jan 26 17:31 events_benchmark_gzip.h5
-rw-r--r-- 1 nikolai nikolai  89M Jan 26 17:31 events_benchmark_gzip_regrouped.h5
-rw-r--r-- 1 nikolai nikolai 137M Jan 26 17:31 events_benchmark_lzf.h5
-rw-r--r-- 1 nikolai nikolai 113M Jan 26 17:31 events_benchmark_lzf_regrouped.h5
-rw-r--r-- 1 nikolai nikolai  92M Jan 26 17:51 events_benchmark.npz
-rw-r--r-- 1 nikolai nikolai 121M Jan 26 17:31 events_benchmark.parquet
-rw-r--r-- 1 nikolai nikolai  82M Jan 26 17:52 events_benchmark_regrouped.npz
-rw-r--r-- 1 nikolai nikolai 118M Jan 26 17:31 events_benchmark_regrouped.parquet
