# Convert a "friend-tree-like" augmentation to an optional field

In [1]:
import io
import os

import awkward as ak
import numba as nb
import numpy as np
import requests
import uproot

from physlite_experiments.deserialization_hacks import tree_arrays
from physlite_experiments.utils import zip_physlite, filter_name

In [2]:
# Name of the PHYSLITE test file; a local copy under this name is used when it exists.
filename = "user.nihartma.22884623.EXT0._000001.DAOD_PHYSLITE.test.pool.root"

In [3]:
# Prefer a local copy of the file.  Otherwise download it over HTTPS, using the
# X509 certificate settings from the environment, and open it from memory.
if os.path.exists(filename):
    tree = uproot.open(f"{filename}:CollectionTree")
else:
    response = requests.get(
        "https://lcg-lrz-http.grid.lrz.de:443/pnfs/lrz-muenchen.de/data/atlas/dq2/atlaslocalgroupdisk/rucio/user/nihartma/ce/c4/user.nihartma.22884623.EXT0._000001.DAOD_PHYSLITE.test.pool.root",
        verify=os.environ["X509_CERT_DIR"],
        cert=os.environ["X509_USER_PROXY"],
    )
    # Fail on an HTTP error here rather than handing an error page to uproot,
    # which would only produce a confusing "not a ROOT file" failure later.
    response.raise_for_status()
    data = response.content
    rootfile = uproot.open(io.BytesIO(data))
    tree = rootfile["CollectionTree"]

In [4]:
# Read the arrays from the tree with the project helper, forwarding the project's
# `filter_name`.
# NOTE(review): presumably a dict of arrays keyed by branch name — confirm.
array_dict = tree_arrays(tree, filter_name=filter_name)

In [5]:
# Combine the loaded arrays into one event-level array; the cells below access
# `Electrons`, `egammaClusters` and `EventInfo` as fields of it.
events = zip_physlite(array_dict)

Let's assume we would like to store the `egammaClusters` only for events with more than 2 Electrons. To later join this "friend", we also have to store the `eventNumber` (potentially this might need to be extended with dsid or runNumber ...). In reality, this might be created from a different file, e.g. from a PHYS file to extend PHYSLITE.

In [6]:
# Friend record: the join key (eventNumber) next to the payload collection.
# depth_limit=1 zips the two fields at event level only.
friend = ak.zip(
    dict(
        eventNumber=events.EventInfo.eventNumber,
        egammaClusters=events["egammaClusters"],
    ),
    depth_limit=1,
)

In [7]:
# Keep the friend rows only for events with more than two electrons, then pack
# the result so that `nbytes` reflects the reduced content.
electron_counts = ak.num(events.Electrons)
friend = ak.packed(friend[electron_counts > 2])

For fun, let's also shuffle this to see if the join operation later on does the right thing:

In [8]:
# Shuffle the friend rows with a fixed seed: the join below still sees an
# arbitrary order, but "Restart & Run All" reproduces the same notebook state.
shuffle_rng = np.random.default_rng(42)
friend = ak.packed(friend[shuffle_rng.permutation(len(friend))])

Doing so, we can potentially save storage:

In [9]:
# Byte size of the filtered, packed friend array.
friend.nbytes

32924

In [10]:
# Byte size of the unfiltered egammaClusters collection, for comparison.
events.egammaClusters.nbytes

1239052

Currently (July 2021), awkward array does not support join operations, but we can create an index (to be used in an `IndexedOptionArray`) that will join the two arrays using numba. The index will have `-1` for the entries that have been filtered out and otherwise an index that will join the friend (more or less like SQL "LEFT OUTER JOIN").

In [11]:
@nb.njit
def nb_get_option_index(
    event_numbers,
    argsort_event_numbers,
    event_numbers_friend,
    argsort_event_numbers_friend,
):
    """Build a left-outer-join index from ``event_numbers`` into the friend.

    Both key arrays are walked in ascending order (via their argsort
    permutations) in a single merge pass.

    Parameters
    ----------
    event_numbers : 1-d array of join keys of the main array
    argsort_event_numbers : permutation that sorts ``event_numbers``
    event_numbers_friend : 1-d array of join keys of the friend array
    argsort_event_numbers_friend : permutation that sorts ``event_numbers_friend``

    Returns
    -------
    int64 array of ``len(event_numbers)``: for each event, the position of the
    matching row in the friend array, or ``-1`` if the friend has no such key.
    Each friend row is used at most once.
    """
    n_friend = len(event_numbers_friend)
    out = np.empty(len(event_numbers), dtype=np.int64)
    j = 0
    for i in range(len(event_numbers)):
        pos = argsort_event_numbers[i]
        target = event_numbers[pos]
        # Skip friend keys that can no longer match (absent from the main
        # array, or duplicates of an already consumed key).  Without this the
        # cursor would stall and every later event would be marked missing.
        while (j < n_friend) and (
            event_numbers_friend[argsort_event_numbers_friend[j]] < target
        ):
            j += 1
        if (j < n_friend) and (
            event_numbers_friend[argsort_event_numbers_friend[j]] == target
        ):
            out[pos] = argsort_event_numbers_friend[j]
            j += 1
        else:
            out[pos] = -1
    return out

In [12]:
def get_option_index(event_numbers, event_numbers_friend):
    """Return the option index joining ``event_numbers_friend`` onto ``event_numbers``.

    Sorts both key arrays with ``np.argsort`` and delegates the merge to
    ``nb_get_option_index``; the result holds, per event, the position of the
    matching friend row or ``-1``.
    """
    order = np.argsort(event_numbers)
    order_friend = np.argsort(event_numbers_friend)
    return nb_get_option_index(
        event_numbers, order, event_numbers_friend, order_friend
    )

In [13]:
# Join index: position of each event's row in the friend, -1 where there is none.
option_index = get_option_index(
    ak.to_numpy(events.EventInfo.eventNumber),
    ak.to_numpy(friend.eventNumber),
)
# Wrap the friend's cluster layout in an option type driven by that index.
joined_egammaClusters = ak.Array(
    ak.layout.IndexedOptionArray64(
        ak.layout.Index64(option_index),
        friend.egammaClusters.layout,
    )
)

This array could then be used as an optional field in the top level array:

In [14]:
# Attach the joined, optional collection under the "egammaClusters" field name.
augmented_events = ak.with_field(
    events,
    joined_egammaClusters,
    where="egammaClusters",
)

In [15]:
# Display the joined field; events without a friend row appear as None.
augmented_events.egammaClusters

<Array [None, None, None, ... None, None, None] type='10000 * option[var * {"cal...'>

In [16]:
# The type shows the option layer wrapped around the per-event cluster lists.
augmented_events.egammaClusters.type

10000 * option[var * {"calE": float32, "calEta": float32, "calPhi": float32, "e_sampl": var * float32, "ETA2CALOFRAME": float32, "ETACALOFRAME": float32, "PHI2CALOFRAME": float32, "PHICALOFRAME": float32, "constituentClusterLinks": var * {"m_persKey": int32, "m_persIndex": int32}, "eta_sampl": var * float32}]

In [17]:
# Field access works through the option layer; missing events stay None.
augmented_events.egammaClusters.calE

<Array [None, None, None, ... None, None, None] type='10000 * option[var * float32]'>

In [18]:
# Apply the same selection used to build the friend (more than two electrons).
augmented_events[ak.num(augmented_events.Electrons) > 2].egammaClusters.calE

<Array [[2.94e+04, 9.96e+03, ... 1.73e+04]] type='44 * option[var * float32]'>

Check if `egammaClusters` are actually filled for all events passing the selection:

In [19]:
# Every event passing the selection must have received its friend row.
selected_clusters = augmented_events[
    ak.num(augmented_events.Electrons) > 2
].egammaClusters
assert not ak.any(ak.is_none(selected_clusters))

Check if the content for an example field is the same as for the original array:

In [20]:
# Compare one example field between the original and the joined collection.
reference_calE = events[ak.num(events.Electrons) > 2].egammaClusters.calE
joined_calE = augmented_events[
    ak.num(augmented_events.Electrons) > 2
].egammaClusters.calE
assert ak.all(reference_calE == joined_calE)

Storing the resulting augmented array will reduce storage most significantly if there are multiple fields. Otherwise one has to check if the extra index needed can be compressed sufficiently well (which it might for tight selections due to many repeated `-1`s).

However, it seems we can't store this natively in Arrow/Parquet (yet?) ...

In [21]:
# The write is expected to fail for now.  Catch that specific failure and show
# it, so the notebook still runs top to bottom while demonstrating the limitation.
try:
    ak.to_parquet(augmented_events, "augmented.parquet")
except NotImplementedError as err:
    # pyarrow's ArrowNotImplementedError derives from NotImplementedError.
    print(f"{type(err).__name__}: {err}")

ArrowNotImplementedError: Lists with non-zero length null components are not supported