In [None]:
# this notebook needs pyjet - an interface to the FastJet clustering algorithm:
# http://fastjet.fr/
# https://github.com/scikit-hep/pyjet
#!pip install --user pyjet

# Top tagging with n-subjettiness
Here we will try to construct the "n-subjettiness" variables (see [arXiv:1011.2268](https://arxiv.org/pdf/1011.2268.pdf)) and train a simple ML classifier on these instead of the more "deep learning" approach discussed in [CNNTopTagging.ipynb](CNNTopTagging.ipynb).

The n-subjettiness is defined as:

\begin{equation}
\tau_n = \frac{1}{R_0\sum_k p_{\mathrm{T},k}}\sum_k p_{\mathrm{T}, k} \min(\Delta R_{1, k}, \Delta R_{2, k}, \dots, \Delta R_{n, k})
\end{equation}

Here $k$ runs over all constituents of the jet, and $\Delta R_{j,k}$ is the distance in the $\eta-\phi$ plane between constituent $k$ and the axis of the $j$-th subjet ($j = 1, \dots, n$), where the subjets are obtained by re-clustering the jet under the hypothesis that it contains exactly $n$ subjets. $R_0$ is the radius parameter of the original jet clustering (in our case $R_0=0.8$). We can now construct these variables for different hypotheses $n$ and feed them into an ML algorithm. Intuitively, $\tau_n$ measures how well a jet can be described as being composed of $n$ subjets: the smaller $\tau_n$, the closer all constituents lie to one of the $n$ subjet axes. In our case we expect the QCD jets to be very "1-subjetty" and the top-quark jets to be very "3-subjetty".

![Figure 4 from arXiv:1011.2268 [hep-ph]](figures/top_tagging/10112268_Fig4.svg "Figure 4 from arXiv:1011.2268 [hep-ph]")

In [None]:
import pyjet
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# load the training sample, reading rows only up to row number 100000
df = pd.read_hdf(
    "/large_tmp/LMU_DA_ML/top_tagging/train.h5",
    key="table",
    stop=100000,
)

In [None]:
def get_nsubjettiness_features(df):
    """
    Calculate the n-subjettiness variables tau_1 ... tau_4 for every jet.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per jet, with the constituent four-momenta stored in the
        columns ``E_i, PX_i, PY_i, PZ_i`` for ``i = 0 ... 199``. Unused
        constituent slots are expected to be padded with 0.

    Returns
    -------
    pandas.DataFrame
        One row per jet (in input order) with the integer columns
        ``1, 2, 3, 4`` holding tau_1 ... tau_4. If a jet has fewer than
        ``n`` constituents, tau_n is set to 0.

    Notes
    -----
    The subjet axes are taken from an exclusive kt clustering of the jet
    constituents. The azimuthal distance between a constituent and a subjet
    axis is wrapped into [-pi, pi), so jets that straddle phi = +-pi are
    handled correctly.

    Caveat: not sure if this is fully correct - the resulting distributions
    look a bit different from arXiv:1011.2268.
    """

    # visual gimmick: the progress bar is optional, so iterate plainly without tqdm
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    R_0 = 0.8

    # we have up to 200 constituents for each jet (the rest are padded with 0)
    jet_columns = [
        "{}_{}".format(var, i)
        for i in range(200)
        for var in ["E", "PX", "PY", "PZ"]
    ]

    # make plain array of jets with constituents for each jet (one dimension more)
    jet_array = df[jet_columns].values.reshape(-1, 200, 4)

    # we want to calculate n-subjettiness for 1, 2, 3, 4 subjets
    n_subjets_vars = {1: [], 2: [], 3: [], 4: []}

    for jet in tqdm(jet_array):
        # create structured numpy array in the right format for pyjet
        jet = jet.astype(np.float64).view(
            dtype=[("E", np.float64), ("px", np.float64), ("py", np.float64), ("pz", np.float64)]
        ).reshape(-1)

        # throw out 0-padded values for non-existent constituents
        jet = jet[jet["E"] != 0]

        # change of coordinate system: calculate pt, eta, phi from px, py, pz
        pt = np.sqrt(jet["px"] ** 2 + jet["py"] ** 2)
        eta = np.arcsinh(jet["pz"] / pt)
        phi = np.arctan2(jet["py"], jet["px"])

        # the clustering does not depend on n, so run it once per jet and only
        # ask for a different number of exclusive subjets below
        sequence = pyjet.cluster(jet, R=R_0, p=1, ep=True) if len(jet) else None

        for n_subjets in n_subjets_vars:

            # skip if we have less than n_subjets constituents
            if len(jet) < n_subjets:
                n_subjets_vars[n_subjets].append(0.)
                continue

            # exclusive-kt subjets for this n-jet hypothesis
            subjets = sequence.exclusive_jets(n_subjets)

            # distance of every constituent to each subjet axis
            dR = []
            for subjet in subjets:
                d_eta = subjet.eta - eta
                # wrap the azimuthal difference into [-pi, pi); otherwise
                # constituents on the other side of phi = +-pi look ~2*pi away
                d_phi = (subjet.phi - phi + np.pi) % (2 * np.pi) - np.pi
                dR.append(np.sqrt(d_eta ** 2 + d_phi ** 2))

            # for each constituent keep the distance to its closest subjet axis
            closest_dR = np.stack(dR, axis=1).min(axis=1)

            # calculate the actual n-subjettiness (pt-weighted mean distance / R_0)
            n_subjets_vars[n_subjets].append(
                (closest_dR * pt).sum() / (pt.sum() * R_0)
            )

    return pd.DataFrame(n_subjets_vars)

In [None]:
# n-subjettiness features for the training jets
df_nsub = get_nsubjettiness_features(df)
# make nicer for seaborn: readable column names and a categorical class label
df_nsub.columns = ["tau_1", "tau_2", "tau_3", "tau_4"]
# translate the 0/1 truth label into strings in one step; writing strings into
# an integer column via .loc relies on a silent dtype change that newer pandas
# versions warn about (labels other than 0/1 would become missing values)
df_nsub["y"] = pd.Categorical(
    pd.Series(df.is_signal_new.values).map({1: "Top", 0: "QCD"})
)

In [None]:
# quick look at the first rows of the n-subjettiness table
df_nsub.head()

In [None]:
# pair plot of every 100th jet, colored by the class label
pg = sns.pairplot(df_nsub.iloc[::100], hue="y")
# restrict all off-diagonal panels to the range [0, 0.5] on both axes
for (row, col), ax in np.ndenumerate(pg.axes):
    if row != col:
        ax.set_xlim(0, 0.5)
        ax.set_ylim(0, 0.5)

In [None]:
# classifier inputs: the four n-subjettiness columns; target: the truth label
x = df_nsub[["tau_{}".format(n) for n in range(1, 5)]].values
y = df["is_signal_new"].values

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# fix the seed so that re-running the notebook reproduces the same classifier
bdt = GradientBoostingClassifier(random_state=0)

In [None]:
# train the classifier on the n-subjettiness features of the training jets
bdt.fit(x, y)

In [None]:
# class probabilities evaluated on the same jets that were used for the fit
scores = bdt.predict_proba(x)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# distribution of the predicted class-1 probability, separately for each true class
opts = dict(bins=100, range=(0, 1), alpha=0.5)
plt.hist(scores[:,1][y==0], label="QCD", **opts)
plt.hist(scores[:,1][y==1], label="Top", **opts)
plt.xlabel("BDT score")
plt.ylabel("Jets / bin")
plt.yscale("log")
# the legend is needed to tell the two overlaid histograms apart
plt.legend();

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# ROC curve of the class-1 probability on the training jets
roc = roc_curve(y, scores[:,1])

In [None]:
# inverse false positive rate versus true positive rate;
# points with zero false positive rate are left out to avoid dividing by zero
nonzero_fpr = roc[0] > 0
plt.plot(roc[1][nonzero_fpr], 1. / roc[0][nonzero_fpr])
plt.xlabel("Signal efficiency (TPR)")
plt.ylabel("Background rejection (1 / FPR)")
plt.yscale("log")

In [None]:
# unpack the ROC curve into false positive rate, true positive rate and thresholds
fpr, tpr, thr = roc

In [None]:
# highest true positive rate reached while the false positive rate stays below 0.001
tpr[fpr < 0.001].max()

In [None]:
# inverse of the smallest false positive rate among points with true positive rate above 0.3
1. / fpr[tpr > 0.3].min()

In [None]:
# load the test sample (rows up to row number 100000) and compute its features
df_test = pd.read_hdf(
    "/large_tmp/LMU_DA_ML/top_tagging/test.h5",
    key="table",
    stop=100000,
)
df_test_nsub = get_nsubjettiness_features(df_test)

In [None]:
# truth labels and feature matrix (all feature columns, in order) of the test jets
y_test = df_test["is_signal_new"].values
x_test = df_test_nsub.values

In [None]:
# class probabilities of the trained classifier on the test jets
scores_test = bdt.predict_proba(x_test)

In [None]:
# ROC curve on the test jets, drawn as inverse false positive rate versus
# true positive rate
roc_test = roc_curve(y_test, scores_test[:,1])
# points with zero false positive rate are left out to avoid dividing by zero
nonzero_fpr_test = roc_test[0] > 0
plt.plot(roc_test[1][nonzero_fpr_test], 1. / roc_test[0][nonzero_fpr_test])
plt.xlabel("Signal efficiency (TPR)")
plt.ylabel("Background rejection (1 / FPR)")
plt.yscale("log")

In [None]:
# unpack the test ROC curve and print two working points:
# best true positive rate at false positive rate below 0.001, and the inverse
# of the smallest false positive rate at true positive rate above 0.3
fpr_test, tpr_test, thr_test = roc_test
print(tpr_test[fpr_test < 0.001].max())
print(1. / fpr_test[tpr_test > 0.3].min())