**Note:** This notebook needs the `awkward` and `vector` packages. You can install them with the following command:

In [None]:
# Uncomment the following line to install the two extra packages this notebook imports.
#!pip install awkward vector

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import awkward as ak
import vector
from tqdm.auto import tqdm

The dataset is the same as in [`CNNTopTaggingPreprocessing.ipynb`](CNNTopTaggingPreprocessing.ipynb) and you can download it from https://desycloud.desy.de/index.php/s/llbX3zpLhazgPJ6 (1.6 GB) (see [arXiv:1707.08966](https://arxiv.org/abs/1707.08966))

Here we are going to preprocess the dataset such that we get an adjacency matrix of the 7 nearest neighbors and the coordinates $p_\mathrm{T}, \eta, \phi, E$ relative to the center of mass of the jet.

Adjust the following path so that it points to the directory containing the downloaded dataset:

In [None]:
# Directory that holds the dataset files; "train.h5" is read from here.
data_path = Path("./top_tagging")

In [None]:
# Read only the first 10000 rows (stop=10000) of the "table" key for the step-by-step cells below.
df = pd.read_hdf(data_path / "train.h5", "table", stop=10000)

First load the list of up to 200 particles into a numpy array

In [None]:
# Take every column up to and including "PZ_199" and reshape to (n_jets, 200, 4).
# The last axis is interpreted as (E, px, py, pz) by `to_ak_vec` below.
jet_4mom = df.loc[:, :"PZ_199"].to_numpy().reshape(-1, 200, 4)

In [None]:
jet_4mom

Next, we need to convert this to an awkward array of vectors to be able to do Lorentz-vector arithmetic:

In [None]:
def to_ak_vec(jet_4mom):
    """Convert a dense four-momentum array into a jagged awkward vector array.

    Parameters
    ----------
    jet_4mom : array
        Array whose last axis holds the components in the order
        (E, px, py, pz).

    Returns
    -------
    Awkward array of vectors with variable-length inner lists: every entry
    whose four components are all exactly zero (presumably padding) is
    removed.
    """
    component_names = ("e", "px", "py", "pz")
    components = {name: jet_4mom[..., idx] for idx, name in enumerate(component_names)}
    # from_regular turns the fixed-size axis into a variable-length one so
    # that the boolean mask below can drop entries per jet
    p4 = ak.from_regular(vector.zip(components))
    all_zero = (p4.e == 0) & (p4.px == 0) & (p4.py == 0) & (p4.pz == 0)
    return p4[~all_zero]

In [None]:
p4 = to_ak_vec(jet_4mom)

This will now have a variable length of jet constituents for each event:

In [None]:
p4

In [None]:
ak.num(p4)

Since we want to have the coordinates relative to the center of mass, we need to sum the constituents:

In [None]:
def vec_sum(p4):
    """Sum the vectors along the last axis and return the result as vectors.

    Parameters
    ----------
    p4 : awkward array of vectors

    Returns
    -------
    Vector array built from the summed ``x``, ``y``, ``z`` and ``t`` fields.
    """
    # sum not yet working with vector without converting again https://github.com/scikit-hep/vector/issues/92
    summed = ak.sum(p4, axis=-1)
    # momentum-style name -> field name carried by the summed record
    field_map = {"px": "x", "py": "y", "pz": "z", "e": "t"}
    return vector.zip({target: summed[source] for target, source in field_map.items()})

In [None]:
vec_sum(p4)

We will scale the transverse momentum and energy relative to the total sum and the eta and phi relative to the center of mass. Furthermore we will pad and clip the array to up to 100 particles:

In [None]:
def transform_for_traindata(p4, npad=100):
    """Build the per-constituent feature array used for training.

    For every constituent the four features are, in this order:
    ``pt / pt_sum``, ``deltaeta`` to the summed vector, ``deltaphi`` to the
    summed vector and ``e / e_sum``, where the sum runs over all constituents
    of the same jet (see ``vec_sum``).

    Parameters
    ----------
    p4 : awkward array of vectors
        Jagged array of constituents per jet.
    npad : int, default 100
        Number of constituents per jet in the output; longer jets are
        clipped, shorter ones are filled with ``[0, 0, 0, 0]``.

    Returns
    -------
    numpy.ndarray of dtype float16 with ``npad`` entries along axis 1 and
    4 features along the last axis.
    """
    total = vec_sum(p4)
    features = [
        p4.pt / total.pt,
        p4.deltaeta(total),
        p4.deltaphi(total),
        p4.e / total.e,
    ]
    # give each feature a trailing length-1 axis and join them along it
    stacked = ak.concatenate([feat[..., np.newaxis] for feat in features], axis=-1)
    padded = ak.pad_none(stacked, npad, axis=1, clip=True)
    filled = ak.fill_none(padded, [0, 0, 0, 0], axis=-2)
    return filled.to_numpy().astype(np.float16)

In [None]:
transform_for_traindata(p4)

For the adjacency matrix we need to look at the distance in the $\eta-\phi$ plane (`deltaR`) for each pair (using `ak.cartesian`) of jet constituents and get the top-$K$ indices that sort these distances for each constituent (using `ak.argsort`)

In [None]:
def get_knn_indices(array, K, chunksize=1000):
    """Find the indices of the ``K`` nearest constituents for each constituent.

    Distances are measured with ``deltaR`` between every pair of constituents
    of the same jet; a constituent is never listed as its own neighbor.

    Parameters
    ----------
    array : awkward array of vectors
        Jagged array of constituents per jet.
    K : int
        Maximum number of neighbors to keep per constituent. Jets with fewer
        than ``K + 1`` constituents yield shorter neighbor lists.
    chunksize : int, default 1000
        Number of jets processed at once (the pairwise combinations are built
        per chunk to limit memory usage).

    Returns
    -------
    Awkward array (jets x constituents x up to K) of dtype uint8 containing,
    for each constituent, the indices of its nearest neighbors ordered by
    increasing ``deltaR``. The indices refer to positions in the constituent
    list of the jet. Because they are stored as uint8, jets must not have
    more than 256 constituents.
    """
    out = []
    for start in tqdm(range(0, len(array), chunksize)):
        # do this in chunks to save memory
        chunk = array[start: start + chunksize]
        p1, p2 = ak.unzip(ak.cartesian([chunk, chunk], axis=-1, nested=True))
        i1, i2 = ak.unzip(ak.argcartesian([chunk, chunk], nested=True))
        not_self = i1 != i2
        # exclude self; filter the partner indices with the same mask so they
        # stay aligned with the distances
        p1, p2, partner = [a[not_self] for a in [p1, p2, i2]]
        dr = p1.deltaR(p2)
        # argsort yields positions within the self-removed list, which are
        # shifted by one for every partner located after the constituent
        # itself; map them back to the original constituent indices
        order = ak.argsort(dr)[..., :K]
        out.append(ak.values_astype(partner[order], np.uint8))
    return ak.concatenate(out)

This will be fastest when we convert the vector into $p_\mathrm{T}, \phi, \eta$ coordinates beforehand.

In [None]:
knn_indices = get_knn_indices(p4.to_rhophieta(), 7)

In [None]:
knn_indices[0].tolist()

We need to convert this into 0-padded adjacency matrices for up to 100 particles:

In [None]:
def to_adjacency(nn_indices, npad=100):
    """Turn jagged neighbor-index lists into dense 0/1 adjacency matrices.

    Parameters
    ----------
    nn_indices : awkward array (jets x constituents x neighbors)
        Neighbor indices per constituent.
    npad : int, default 100
        Size of the square output matrices. Constituents at position
        ``npad`` or later, and neighbor indices ``>= npad``, are dropped.

    Returns
    -------
    numpy.ndarray of shape ``(n_jets, npad, npad)`` and dtype uint8 where
    entry ``[jet, j, k]`` is 1 if ``k`` is a listed neighbor of ``j`` or
    ``k == j`` (self loop), and 0 otherwise.
    """
    # clip at npad particles: drop out-of-range neighbors, then extra rows
    in_range = nn_indices < npad
    clipped = nn_indices[in_range][:, :npad]
    # add self loops by prepending each constituent's own index
    own_index = ak.local_index(clipped, axis=1)[..., np.newaxis]
    with_self = ak.concatenate([own_index, clipped], axis=2)
    # find the indices where the adjacency matrices should be 1
    jet_index = ak.local_index(with_self, axis=0)
    row_index = ak.local_index(with_self, axis=1)
    triples = ak.zip([jet_index, row_index, with_self])
    ii, jj, kk = (ak.flatten(part, axis=None).to_numpy() for part in ak.unzip(triples))
    # create and fill the adjacency matrices
    adjacency = np.zeros((len(with_self), npad, npad), dtype=np.uint8)
    adjacency[ii, jj, kk] = 1
    return adjacency

In [None]:
adj = to_adjacency(knn_indices)

In [None]:
plt.imshow(adj[10], cmap="Greys_r", interpolation="none")

Now, store everything:

In [None]:
def preprocess(path, nexamples, npad=100, K=7):
    """Run the full preprocessing chain on one HDF5 file.

    Parameters
    ----------
    path : path-like
        HDF5 file containing a ``"table"`` key.
    nexamples : int
        Number of rows to read from the start of the table.
    npad : int, default 100
        Number of constituents per jet in the outputs.
    K : int, default 7
        Number of nearest neighbors per constituent.

    Returns
    -------
    x : numpy.ndarray
        Output of ``transform_for_traindata``.
    adj : numpy.ndarray
        Output of ``to_adjacency``.
    y : numpy.ndarray of bool
        The ``"is_signal_new"`` column.
    """
    table = pd.read_hdf(path, "table", stop=nexamples)
    momenta = table.loc[:, :"PZ_199"].to_numpy().reshape(-1, 200, 4)
    p4 = to_ak_vec(momenta)
    x = transform_for_traindata(p4, npad=npad)
    y = table["is_signal_new"].to_numpy().astype(bool)
    neighbors = get_knn_indices(p4.to_rhophieta(), K)
    adj = to_adjacency(neighbors, npad=npad)
    return x, adj, y

In [None]:
# Process the first 100000 rows of the training file with the default npad and K.
x, adj, y = preprocess(data_path / "train.h5", 100000)

In [None]:
# Write the three arrays into one compressed archive under the keys "x", "adj" and "y".
np.savez_compressed("top_tagging_100k.npz", x=x, adj=adj, y=y)

In [None]:
!ls -lah top_tagging_100k.npz