In [1]:
from collections import defaultdict
import awkward as ak
import numba
import numpy as np
import pandas as pd
import h5py
import vector
vector.register_numba()
vector.register_awkward()

#import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
#import mplhep as hep
#hep.style.use(hep.style.ROOT)

# Predictions on fully-matched dataset of model trained on inclusive dataset

In [2]:
# Locations of the test dataset (truth targets) and of the prediction file.
filename_test = "/afs/cern.ch/work/m/mmarcheg/ttHbb/ttHbb_SPANet/datasets/test_file_test_29753.h5"
filename_pred = "/afs/cern.ch/work/m/mmarcheg/ttHbb/ttHbb_SPANet/spanet_output/predictions/predictions_version_4_fullymatched_test_29753.h5"
# Open both files read-only. NOTE: despite the `df_` prefix these are h5py.File
# handles, not DataFrames; they are left open because later cells read from them.
df_test, df_pred = (h5py.File(path, mode="r") for path in (filename_test, filename_pred))

In [3]:
# List the datasets stored under TARGETS/h in the prediction file.
df_pred["TARGETS"]["h"].keys()

<KeysViewHDF5 ['assignment_probability', 'b1', 'b2', 'detection_probability', 'marginal_probability']>

## Compute the jet assignment efficiency
We extract the predicted and true indices for the individual quarks.
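A jet is correctly assigned when its predicted jet index is equal to the true index, and a particle (Higgs, hadronic top or leptonic top) is correctly reconstructed in an event only when all of its jets are correctly assigned. We compute the efficiency as the ratio of the number of events in which the particle is correctly reconstructed over the total number of events.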

In [4]:
# Predicted jet indices of the two Higgs b-quarks. Indexing an h5py dataset
# with [()] reads its whole content into a NumPy array.
idx_b1_pred, idx_b2_pred = (
    df_pred["TARGETS"]["h"][quark][()] for quark in ("b1", "b2")
)
idx_b1_pred

array([0, 0, 2, ..., 0, 4, 1])

In [5]:
# True jet indices of the two Higgs b-quarks, read from the test dataset in the
# same way as the predicted ones.
idx_b1_true, idx_b2_true = (
    df_test["TARGETS"]["h"][quark][()] for quark in ("b1", "b2")
)
idx_b1_true

array([0, 0, 2, ..., 0, 4, 1])

In [6]:
# Turn every flat index array into a column of length-1 lists (unflatten with
# counts of one) and join the columns along axis=1, so that each event holds
# the pair [b1, b2].
idx_h_pred = ak.concatenate(
    tuple(ak.unflatten(idx, ak.ones_like(idx)) for idx in (idx_b1_pred, idx_b2_pred)),
    axis=1,
)
idx_h_true = ak.concatenate(
    tuple(ak.unflatten(idx, ak.ones_like(idx)) for idx in (idx_b1_true, idx_b2_true)),
    axis=1,
)
idx_h_pred

<Array [[0, 1], [0, 6], ... [4, 5], [1, 3]] type='29753 * var * int64'>

In [7]:
# Show the true [b1, b2] index pairs for comparison with idx_h_pred.
idx_h_true

<Array [[0, 1], [0, 2], ... [4, 5], [1, 3]] type='29753 * var * int64'>

In [8]:
# An event counts as correct only if both positions match, i.e. b1 == b1 and
# b2 == b2. The comparison is positional, so a b1 <-> b2 swap is NOT accepted
# here. Every list has exactly two entries, hence "all equal" is the same as
# "two matches".
is_correct_higgs = ak.all(idx_h_pred == idx_h_true, axis=1)
is_correct_higgs

<Array [True, False, True, ... True, True] type='29753 * bool'>

In [9]:
# Total number of events: is_correct_higgs holds one boolean per event.
n_tot = len(is_correct_higgs)
n_tot

29753

In [10]:
# Number of events flagged True, i.e. with both Higgs jets matching the truth.
n_correct = ak.sum(is_correct_higgs)
n_correct

18392

In [11]:
# Higgs reconstruction efficiency: fully-correct events over all events.
eff_h = n_correct / n_tot
eff_h

0.6181561523207744

In [12]:
# Predicted and true jet indices of the three t1 (hadronic top) quarks,
# read as NumPy arrays from the prediction file and the test dataset.
idx_q1_pred, idx_q2_pred, idx_b_pred = (
    df_pred["TARGETS"]["t1"][quark][()] for quark in ("q1", "q2", "b")
)
idx_q1_true, idx_q2_true, idx_b_true = (
    df_test["TARGETS"]["t1"][quark][()] for quark in ("q1", "q2", "b")
)
idx_b_pred

array([6, 5, 0, ..., 1, 0, 4])

In [13]:
# Build one [q1, q2, b] triplet per event: each flat index array becomes a
# column of length-1 lists, and the three columns are joined along axis=1.
idx_thad_pred = ak.concatenate(
    tuple(
        ak.unflatten(idx, ak.ones_like(idx))
        for idx in (idx_q1_pred, idx_q2_pred, idx_b_pred)
    ),
    axis=1,
)
idx_thad_true = ak.concatenate(
    tuple(
        ak.unflatten(idx, ak.ones_like(idx))
        for idx in (idx_q1_true, idx_q2_true, idx_b_true)
    ),
    axis=1,
)
idx_thad_pred

<Array [[2, 4, 6], [1, 4, ... 3, 0], [2, 5, 4]] type='29753 * var * int64'>

In [14]:
# The hadronic top is correct only when q1, q2 and b all match position by
# position (a q1 <-> q2 swap is NOT accepted here). Every list has exactly
# three entries, hence "all equal" is the same as "three matches".
is_correct_thad = ak.all(idx_thad_pred == idx_thad_true, axis=1)
is_correct_thad

<Array [True, True, True, ... True, False] type='29753 * bool'>

In [15]:
# Hadronic-top efficiency: events with all three jets correct over all events.
# NOTE: n_correct and n_tot are reused here and overwrite the Higgs values.
n_correct, n_tot = ak.sum(is_correct_thad), len(is_correct_thad)
eff_thad = n_correct / n_tot
eff_thad

0.6524047995160152

In [16]:
# Predicted and true jet index of the t2 (leptonic top) b-quark.
# NOTE: this overwrites idx_b_pred / idx_b_true, which previously held the
# t1 b-quark indices.
idx_b_pred = df_pred["TARGETS"]["t2"]["b"][()]
idx_b_true = df_test["TARGETS"]["t2"]["b"][()]
# Wrap each index in a length-1 list so the layout matches the Higgs and
# hadronic-top arrays. Each array takes its counts from itself: the true array
# no longer borrows its counts from the predicted one.
idx_tlep_pred = ak.unflatten(idx_b_pred, ak.ones_like(idx_b_pred))
idx_tlep_true = ak.unflatten(idx_b_true, ak.ones_like(idx_b_true))
idx_tlep_pred

<Array [[5], [2], [1], ... [4], [1], [0]] type='29753 * var * int64'>

In [17]:
# The leptonic top has a single jet (b), so the event is correct when that one
# entry matches; every list has exactly one element.
is_correct_tlep = ak.all(idx_tlep_pred == idx_tlep_true, axis=1)
is_correct_tlep

<Array [True, False, True, ... True, False] type='29753 * bool'>

In [18]:
# Leptonic-top efficiency: events with the b jet correct over all events.
# NOTE: n_correct and n_tot are reused here and overwrite the previous values.
n_correct, n_tot = ak.sum(is_correct_tlep), len(is_correct_tlep)
eff_tlep = n_correct / n_tot
eff_tlep

0.7325311733270594

## Resulting Higgs and top reconstruction efficiencies
The Higgs and top efficiencies are defined as the number of events with all the jets correctly assigned over the total number of events:

$\epsilon = \frac{N_{assigned}}{N_{tot}}$

In [19]:
# Summary: efficiencies for the Higgs, the hadronic top (t1) and the leptonic top (t2).
eff_h, eff_thad, eff_tlep

(0.6181561523207744, 0.6524047995160152, 0.7325311733270594)

## Check permutations of output indices
We want to check how many events are predicted to have `q1` and `q2` swapped in the reconstruction of the hadronic top. If there are any events where `q1` and `q2` have swapped indices, they have to be taken into account in the numerator of the efficiency, since we don't distinguish between `q1` and `q2`.
We also check how many events are predicted to have `b1` and `b2` swapped in the reconstruction of the Higgs to check the Higgs reconstruction efficiency.

In [20]:
# Reload the t1 (hadronic top) indices: idx_b_pred and idx_b_true were
# overwritten with the t2 b-quark indices in an earlier cell.
idx_q1_pred, idx_q2_pred, idx_b_pred = (
    df_pred["TARGETS"]["t1"][quark][()] for quark in ("q1", "q2", "b")
)
idx_q1_true, idx_q2_true, idx_b_true = (
    df_test["TARGETS"]["t1"][quark][()] for quark in ("q1", "q2", "b")
)
idx_b_true

array([6, 5, 0, ..., 1, 0, 0])

In [21]:
# Per-event flags: the predicted q1 (resp. q2) index differs from the true one.
has_mismatched_q1 = idx_q1_pred != idx_q1_true
has_mismatched_q2 = idx_q2_pred != idx_q2_true
# Both light quarks are mis-assigned in the same event.
has_mismatched_q1_q2 = has_mismatched_q1 & has_mismatched_q2
# Exact swap: predicted q1 sits on the true q2 jet and vice versa.
has_swapped_q1_q2 = (idx_q1_pred == idx_q2_true) & (idx_q2_pred == idx_q1_true)
# Count the swapped events with a vectorized NumPy call; the builtin sum()
# would iterate over the array element by element in Python.
np.count_nonzero(has_swapped_q1_q2)

0

In [22]:
# First 20 predicted q1 indices (hadronic top), to compare by eye with idx_q1_true.
idx_q1_pred[:20].tolist()

[2, 1, 4, 3, 3, 1, 6, 1, 2, 0, 2, 3, 4, 0, 4, 0, 0, 0, 2, 2]

In [23]:
# First 20 predicted q2 indices (hadronic top), to compare by eye with idx_q2_true.
idx_q2_pred[:20].tolist()

[4, 4, 6, 6, 4, 5, 7, 5, 4, 2, 6, 5, 5, 3, 5, 4, 2, 2, 3, 4]

In [24]:
# First 20 predicted b indices (hadronic top), to compare by eye with idx_b_true.
idx_b_pred[:20].tolist()

[6, 5, 0, 1, 6, 3, 1, 3, 1, 4, 1, 2, 1, 8, 2, 2, 4, 3, 1, 1]

In [25]:
# First 20 true q1 indices (hadronic top).
idx_q1_true[:20].tolist()

[2, 1, 4, 3, 3, 1, 6, 1, 2, 0, 2, 3, 4, 3, 1, 0, 0, 0, 2, 2]

In [26]:
# First 20 true q2 indices (hadronic top).
idx_q2_true[:20].tolist()

[4, 4, 6, 6, 6, 5, 7, 5, 4, 2, 6, 5, 5, 4, 4, 4, 2, 2, 3, 4]

In [27]:
# First 20 true b indices (hadronic top).
idx_b_true[:20].tolist()

[6, 5, 0, 1, 5, 3, 4, 3, 1, 1, 1, 2, 2, 2, 2, 1, 4, 3, 1, 1]

In [28]:
# Read the predicted and true jet indices of the two Higgs b-quarks again,
# so the swap check below starts from the stored values.
idx_b1_pred, idx_b2_pred = (
    df_pred["TARGETS"]["h"][quark][()] for quark in ("b1", "b2")
)
idx_b1_true, idx_b2_true = (
    df_test["TARGETS"]["h"][quark][()] for quark in ("b1", "b2")
)
idx_b1_true

array([0, 0, 2, ..., 0, 4, 1])

In [29]:
# Exact swap: predicted b1 sits on the true b2 jet and vice versa.
has_swapped_b1_b2 = (idx_b1_pred == idx_b2_true) & (idx_b2_pred == idx_b1_true)
# Count the swapped events with a vectorized NumPy call; the builtin sum()
# would iterate over the array element by element in Python.
np.count_nonzero(has_swapped_b1_b2)

0

In [30]:
# First 20 predicted b1 indices (Higgs), to compare by eye with idx_b1_true.
idx_b1_pred[:20].tolist()

[0, 0, 2, 0, 2, 0, 3, 0, 0, 1, 4, 4, 0, 2, 1, 1, 5, 5, 4, 0]

In [31]:
# First 20 predicted b2 indices (Higgs), to compare by eye with idx_b2_true.
idx_b2_pred[:20].tolist()

[1, 6, 3, 4, 5, 2, 4, 2, 5, 5, 5, 6, 2, 4, 3, 6, 6, 6, 5, 3]

In [32]:
# First 20 true b1 indices (Higgs).
idx_b1_true[:20].tolist()

[0, 0, 2, 0, 2, 0, 3, 0, 0, 3, 4, 4, 0, 0, 3, 2, 1, 4, 4, 0]

In [33]:
# First 20 true b2 indices (Higgs).
idx_b2_true[:20].tolist()

[1, 2, 3, 4, 4, 4, 5, 2, 5, 5, 5, 6, 1, 8, 5, 6, 6, 6, 5, 3]