In [1]:
import uproot4
from deserialization_hacks import tree_arrays
import awkward1 as ak
import numpy as np
import matplotlib.pyplot as plt
import time
import os

# Read branches

In [2]:
def filter_branch(branch):
    """Decide whether a branch should be read.

    Parameters
    ----------
    branch
        Object exposing ``name`` (str) and ``interpretation`` (anything whose
        ``str()`` describes how the branch is decoded).

    Returns
    -------
    bool
        ``True`` if the branch should be kept, ``False`` otherwise. Every
        rejection returns ``False`` (never ``None``), so the result can be
        compared or stored safely.
    """
    name = branch.name

    if "Aux" not in name:
        return False

    # the following don't contain data (in split files)
    if name.endswith(".") or "SG::" in name or name.endswith("Base"):
        return False

    # are often empty
    # see https://github.com/scikit-hep/uproot4/issues/126
    if "DescrTags" in name:
        return False

    # Only inspect the interpretation once the cheap name checks have passed.
    interpretation = str(branch.interpretation)
    n_vectors = interpretation.count("AsVector")

    # skip triple-jagged vectors and sets
    if n_vectors > 1 and "AsString" in interpretation:
        return False
    if n_vectors > 2:
        return False
    if "AsSet" in interpretation:
        # what are these anyways?
        return False

    return True

In [3]:
%%time
# file from user.nihartma.physlite_test_ttbar_split99.001_EXT0
# NOTE: the file handle `f` is deliberately not closed here; `tree` is
# queried again by later cells (branch listing, compressed sizes).
f = uproot4.open("user.nihartma.22884623.EXT0._000001.DAOD_PHYSLITE.test.pool.root")
tree = f["CollectionTree"]
# Read every branch accepted by filter_branch into a dict keyed by branch name
# (see the set difference against array_dict.keys() further down).
array_dict = tree_arrays(tree, filter_branch=filter_branch)

CPU times: user 9.53 s, sys: 1.23 s, total: 10.8 s
Wall time: 8.13 s


In [4]:
len(array_dict)

1117

What did we miss (assuming fully split Aux branches)?

In [5]:
# Leaf-level Aux/AuxDyn branches (no sub-branches), named by the last path
# component. The pattern is a raw string because "\." is a regex escape, not a
# valid Python string escape.
all_aux = [
    k.split("/")[-1] for k in tree.keys(r"/(.*Aux\..+|.*AuxDyn\..+)/i")
    if "xAOD::" not in k
    and len(tree[k].branches) == 0
]
# Branches that match the pattern but were not read into array_dict
set(all_aux).difference(array_dict.keys())

{'EventInfoAux.detDescrTags.first',
 'EventInfoAux.detDescrTags.second',
 'EventInfoAuxDyn.streamTagDets',
 'EventInfoAuxDyn.streamTagRobs',
 'METAssoc_AnalysisMETAux.overlapIndices',
 'METAssoc_AnalysisMETAux.overlapTypes'}

# Write to parquet
We can make a flat dictionary of branches (the branches themselves may still be jagged arrays). That is something we can already store in parquet. At the moment parquet can't store jagged arrays of structs, so those need to be exploded as well.

In [6]:
# Flatten one level: an array with fields contributes one "<key>.<field>"
# entry per field; an array without fields is stored under its own key.
d_exploded = {}
for key, array in array_dict.items():
    fields = ak.keys(array)
    if len(fields) == 0:
        d_exploded[key] = array
        continue
    d_exploded.update({f"{key}.{field}": array[field] for field in fields})

Events_flat = ak.zip(d_exploded, depth_limit=1)

In [7]:
ak.to_parquet(Events_flat, "physlite.parquet", explode_records=True)

Even with the default settings, this already gives us a format that is fast to read and takes about the same size as these branches originally took on disk.

In [8]:
os.stat("physlite.parquet").st_size / (1024 ** 2)

120.44293880462646

In [9]:
sum([tree[k].compressed_bytes for k in array_dict]) / (1024 ** 2)

117.07681941986084

In [10]:
%%time
ak.from_parquet("physlite.parquet")

CPU times: user 2.09 s, sys: 438 ms, total: 2.53 s
Wall time: 521 ms


<Array [...] type='10000 * {"EventInfoAux.runNumber": ?int64, "EventInfoAux.even...'>

# Build a nicer structure

We can also structure this much more nicely and remove duplicated indices (e.g. all electron properties share the same offsets) - the naming conventions help us:

In [11]:
def regroup(array_dict):
    """Regroup a flat ``{"<Container>Aux[Dyn].<field>": array}`` dict by container.

    Each key is split at its first "." into a top-level branch name and a field
    name. "AuxDyn"/"Aux" is stripped from the top-level name to obtain the
    container name, and all fields of one container are zipped into a single
    record array.

    The split is an exact prefix split. Substring matching would let a
    "<X>Aux" group also pick up "<X>AuxDyn.*" keys under mangled field names
    ("yn.<field>"), and would let the "<X>Aux" and "<X>AuxDyn" groups overwrite
    each other in an order that depends on set iteration.

    Containers whose name contains "AnalysisHLT" are replaced by a single
    "TrigMatchedObjects" record with one field per trigger name.

    Parameters
    ----------
    array_dict : dict
        Mapping from branch name to array.

    Returns
    -------
    Array with one field per container plus "TrigMatchedObjects".
    """
    # Collect the fields of every container, preserving the input order.
    fields_by_container = {}
    for key, array in array_dict.items():
        top, _, field = key.partition(".")
        container = top.replace("AuxDyn", "").replace("Aux", "")
        fields_by_container.setdefault(container, {})[field] = array

    regrouped = {}
    for container, fields in fields_by_container.items():
        # zip will put together jagged arrays with common offsets;
        # for some containers this will work 2 levels, for some only up to 1
        try:
            regrouped[container] = ak.zip(fields, depth_limit=2)
        except ValueError:
            regrouped[container] = ak.zip(fields, depth_limit=1)

    # let's restructure such that we get TrigMatchedObjects.<trigger-name>
    # instead of AnalysisHLT_<trigger-name>.TrigMatchedObjects
    hlt_keys = [k for k in regrouped if "AnalysisHLT" in k]
    trig_matched_objects = ak.zip(
        {
            k.replace("AnalysisHLT_", ""): regrouped[k].TrigMatchedObjects
            for k in hlt_keys
        },
        depth_limit=1
    )
    for k in hlt_keys:
        regrouped.pop(k)
    regrouped["TrigMatchedObjects"] = trig_matched_objects
    return ak.zip(regrouped, depth_limit=1)

In [12]:
Events = regroup(array_dict)

In [13]:
Events.AnalysisElectrons.pt

<Array [[], [3.37e+03], ... [6.27e+03]] type='10000 * var * float32'>

In [14]:
Events.AnalysisElectrons.eta

<Array [[], [-0.861], ... [-1.21], [-0.238]] type='10000 * var * float32'>

The total in-memory size got a bit smaller due to the removed duplicated indices

In [15]:
Events.nbytes / (1024 ** 2)

360.5864849090576

In [16]:
Events_flat.nbytes / (1024 ** 2)

401.86948585510254

# Store in HDF5

One can store this in basically any data format using `ak.to_arrayset` which will separate the underlying 1-d arrays (content and indices for jagged arrays) and a json spec for the structure. See https://awkward-array.org/how-to-convert-arrayset.html

In [17]:
form, container, num_partitions = ak.to_arrayset(Events)

HDF5 is rather well suited for this since we can put the json form directly into the metadata.

In [18]:
import h5py

In [19]:
# Write every buffer of the arrayset as one gzip-compressed dataset in the
# "awkward" group, and keep the information needed to rebuild the array
# (JSON form and number of events) in the group's attributes.
with h5py.File("physlite.h5", "w") as file:
    group = file.create_group("awkward")
    for k in container:
        v = container[k]
        group.create_dataset(k, shape=v.shape, dtype=v.dtype, data=v, compression="gzip")
    # Structure description, read back via ak.forms.Form.fromjson
    group.attrs["form"] = form.tojson()
    # Array length, passed as lazy_lengths when reading lazily
    group.attrs["length"] = len(Events)

This file is quite a bit smaller. That makes sense since the compression can't compress away duplicated indices of different split branches (which we had before in the ROOT or parquet file).

In [20]:
os.stat("physlite.h5").st_size / (1024 ** 2)

89.55333232879639

Reading is also quite fast, although not as fast as with the parquet file (can be improved by using a faster compression, e.g. "lzf" at the cost of larger file size)

In [21]:
%%time
# Eager read: every dataset in the group is converted to a NumPy array inside
# the `with` block, so the result does not depend on the file staying open.
with h5py.File("physlite.h5", "r") as file:
    group = file["awkward"]
    reconstituted = ak.from_arrayset(
        # Form stored as JSON in the group attributes when the file was written
        ak.forms.Form.fromjson(group.attrs["form"]),
        {k: np.asarray(v) for k, v in group.items()},
    )

CPU times: user 1.66 s, sys: 116 ms, total: 1.77 s
Wall time: 1.78 s


Reading only requested columns also works e.g. via "LazyArrays".

In [22]:
class LazyGet:
    """Read-on-demand wrapper: each lookup fetches ``group[key]`` as a NumPy array."""

    def __init__(self, group):
        # Underlying container; it is only ever indexed by key.
        self.group = group

    def __getitem__(self, key):
        """Report the requested key on stdout and return its data as an ndarray."""
        print(f"Reading array {key}")
        dataset = self.group[key]
        return np.asarray(dataset)

In [23]:
# The file is opened without a context manager and is not closed in this cell:
# LazyGet reads from `group` only when a buffer is requested, so the handle
# presumably has to stay open for as long as `lazy` is used.
file = h5py.File("physlite.h5", "r")
group = file["awkward"]

lazy = ak.from_arrayset(
    ak.forms.Form.fromjson(group.attrs["form"]),
    # Buffers are looked up by key through LazyGet instead of being preloaded
    LazyGet(group),
    lazy=True,
    # Length stored alongside the form when the file was written
    lazy_lengths=group.attrs["length"]
)

In [24]:
%time
lazy.AnalysisElectrons.pt

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs
Reading array node958-offsets
Reading array node964


<Array [[], [3.37e+03], ... [6.27e+03]] type='10000 * var * float32'>

# Behaviour and dynamic quantities
Working in a bit more object-oriented way can be done with "behaviours" (see https://awkward-array.readthedocs.io/en/latest/ak.behavior.html).

For example, coffea has LorentzVectors for awkward array:

In [25]:
import coffea
from coffea.nanoevents.methods import vector
ak.behavior.update(vector.behavior)

The coffea `PtEtaPhiELorentzVector` calls the mass `mass`, but we call it `m`, so let's override that:

In [26]:
@ak.mixin_class(ak.behavior)
class xAODParticle(vector.PtEtaPhiELorentzVector):
    """PtEtaPhiELorentzVector whose ``mass`` is read from the record's ``m`` field."""

    @property
    def mass(self):
        # The records store the mass under the name "m"; expose it as "mass".
        return self.m

Now, if we name our particles "xAODParticle", we can do all the LorentzVector stuff with them:

In [27]:
# Tag the record layout of each analysis collection in place with the
# "xAODParticle" record name.
for collection in ("Electrons", "Jets", "Photons", "Muons"):
    particles = Events[f"Analysis{collection}"]
    particles.layout.content.setparameter("__record__", "xAODParticle")

In [28]:
Events.AnalysisElectrons

<xAODParticleArray [[], ... firstEgMotherPdgId: -11}]] type='10000 * var * xAODP...'>

In [29]:
Events.AnalysisElectrons.delta_r(Events.AnalysisElectrons.nearest(Events.AnalysisJets))

<Array [[], [0.104], ... [0.0422], [0.943]] type='10000 * var * ?float32'>

Or something for track Particles:

In [30]:
@ak.mixin_class(ak.behavior)
class xAODTrackParticle(vector.LorentzVector):
    "see https://gitlab.cern.ch/atlas/athena/-/blob/21.2/Event/xAOD/xAODTracking/Root/TrackParticle_v1.cxx#L82"
    # Cartesian four-vector components derived from the stored track
    # parameters theta, phi and qOverP.

    @property
    def theta(self):
        # Read the stored field by item access, not attribute access.
        return self["theta"]

    @property
    def phi(self):
        # Read the stored field by item access, not attribute access.
        return self["phi"]

    @property
    def p(self):
        # Momentum magnitude from the charge-over-momentum track parameter.
        return 1. / np.abs(self.qOverP)

    @property
    def x(self):
        return self.p * np.sin(self.theta) * np.cos(self.phi)

    @property
    def y(self):
        return self.p * np.sin(self.theta) * np.sin(self.phi)

    @property
    def z(self):
        return self.p * np.cos(self.theta)

    @property
    def t(self):
        # Energy under a fixed mass hypothesis of 139.570 (presumably the
        # charged-pion mass in MeV, as in the referenced TrackParticle_v1.cxx;
        # TODO confirm the units against the input data).
        return np.sqrt(139.570 ** 2 + self.x ** 2 + self.y ** 2 + self.z ** 2)
    
    

In [31]:
# Tag every top-level field whose name contains "TrackParticles" with the
# "xAODTrackParticle" record name (in place).
for k in ak.keys(Events):
    if "TrackParticles" in k:
        Events[k].layout.content.setparameter("__record__", "xAODTrackParticle")

In [32]:
Events.InDetTrackParticles

<xAODTrackParticleArray [[{phi: 2.36, ... ] type='10000 * var * xAODTrackParticl...'>

In [33]:
Events.InDetTrackParticles.pt

<Array [[1.11e+05, 1.7e+04, ... 5.57e+03, 739]] type='10000 * var * float32'>

# ElementLinks

Non-cyclic references can be implemented by just adding new indices and reusing the same contents. For example, let's link Electrons to their track particles:

In [34]:
def element_links(collection1, links, collection2):
    """Build a view of ``collection2`` entries linked from ``collection1``.

    No data from ``collection2`` is copied: the result reuses
    ``collection2.layout.content`` and only adds new index layers on top.

    Parameters
    ----------
    collection1
        Array whose ``layout.offsets`` define the outer (per-event) list
        structure of the result.
    links
        Array of links with an ``m_persIndex`` field; ``links.layout.content``
        must expose ``starts``/``stops`` for the per-object link lists.
    collection2
        Array that is linked to; its ``layout.offsets`` and ``layout.content``
        are used.

    Returns
    -------
    ak.Array
        Nested lists (event -> object of collection1 -> linked entries) of
        records taken from ``collection2``.
    """
    # Note: For proper handling one should read the
    # EventFormat.m_branchNames, EventFormat.m_branchHashes mapping to link to the correct collection
    # (possibly use UnionArray)
    # Also one could see if there is a better way to replicate the exact structure
    # instead of hardcoding
    return ak.Array(
        # Outer level: same per-event partitioning as collection1
        ak.layout.ListOffsetArray64(
            collection1.layout.offsets,
            # Middle level: one list of links per object, boundaries taken from `links`
            ak.layout.ListArray64(
                links.layout.content.starts,
                links.layout.content.stops,
                # Inner level: positions into the flat content of collection2
                ak.layout.IndexedArray64(
                    ak.layout.Index64(
                        ak.flatten(
                            # m_persIndex plus the start offset of each list in
                            # collection2 gives a position in its flat content
                            # (assumes m_persIndex counts within one event - TODO confirm)
                            links.m_persIndex
                            + ak.Array(np.array(collection2.layout.offsets[:-1])),
                            axis=None
                        )
                    ),
                    collection2.layout.content
                )
            )
        )
    )

In [35]:
Events["AnalysisElectrons", "trackParticles"] = element_links(
    Events.AnalysisElectrons,
    Events.AnalysisElectrons.trackParticleLinks,
    Events.GSFTrackParticles
)

In [36]:
Events.AnalysisElectrons.trackParticles

<xAODTrackParticleArray [[], ... chiSquared: 68}]]] type='10000 * var * var * xA...'>

In [37]:
# first track particle pt for each electron
Events.AnalysisElectrons.trackParticles[:,:,0].pt

<Array [[], [4.13e+03], ... [5.68e+03]] type='10000 * var * float32'>