# Pre-processing - BB1
Here I will pre-process the data for the black box 1 (BB1) dataset of the LHC Olympics 2020. I will calculate some variables to use them on classification algorithms. Ultimately this will multiple functions in the benchtools package.

In [1]:
# Importing the main libraries
import h5py                             # to handle .h5 files
import pyjet as fj                      # jet clustering
import numpy as np                      # for arrays
import pandas as pd                     # manipulation of tables
import os.path                          # to handle directories
from os import path
from tqdm import tqdm                   # progress bar

In [None]:
# We load the file with the data as a dataframe
# The file can be downloaded at https://zenodo.org/record/4536624
df = pd.read_hdf("../../events_LHCO2020_BlackBox1.h5")

In [2]:
# Choose the number of events to analize
n_events = 100000

In [None]:
# Random sample of the events
dfsample = df.sample(n=n_events)

# And save it on an .h5 file for reproducibility
if path.exists("../data/events_LHCO2020_BlackBox1_tiny_{}.h5".format(n_events))!= True: 
    dfsample.to_hdf("../data/events_LHCO2020_BlackBox1_tiny_{}.h5".format(n_events), key='df', mode='w',complevel=5,complib='blosc')

In [3]:
# We load it on a dataframe (if the file is already created there is no need to run the previous cell)
events_tiny = pd.read_hdf("../data/events_LHCO2020_BlackBox1_tiny_{}.h5".format(n_events))

In [4]:
# Verifying the shape (n_events x nro. hadrones*3(caracteristicas))
events_tiny.shape

(100000, 2100)

Le agregamos la información sobre señal o fondo al dataframe

In [5]:
# Reading the key for the file. It's an ASCII file. Each line is the information about signal of background for the event.
with open("../../events_LHCO2020_BlackBox1.masterkey", 'r') as f:
    data = f.read()
# To a dataframe
df_label = pd.read_csv(pd.compat.StringIO(data), header=None)
# Rename it to avoid problems concatenating
df_label = df_label.rename(columns={0: 'label'})
df_label.head()

Unnamed: 0,label
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [6]:
# We get the elements with the same index as the ones randomly choosen.
df_label = df_label.loc[events_tiny.index]
df_label.head()

Unnamed: 0,label
275159,0.0
798939,0.0
132980,0.0
205190,0.0
614851,0.0


In [7]:
# Concatenating the data and label dataframes
events_tiny = pd.concat([events_tiny, df_label], axis=1, sort=False)
events_tiny.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2091,2092,2093,2094,2095,2096,2097,2098,2099,label
275159,0.377711,-1.988629,-0.233306,0.798198,-0.510899,-2.892249,1.943474,-0.031163,-2.308198,11.07995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
798939,0.387657,-2.223124,1.263529,1.950297,-2.22746,1.860344,0.785118,-2.108313,1.602075,2.289307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132980,0.2373,-1.584368,-0.098571,0.389354,-1.816357,1.134515,0.402276,-1.802233,2.874365,1.045797,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
205190,5.207173,-2.462768,-1.188516,0.179832,-0.191899,-2.557838,0.616229,-1.88371,-1.571692,0.792802,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
614851,0.40488,-1.895718,2.249287,1.381925,-0.190406,-3.012272,1.042806,-1.184693,-0.116218,2.473634,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Functions to calculate the variables

I will calculate the variables:
#### Angular distance between two jets

$$
\Delta R = \sqrt{(\phi_1-\phi_2)^2+(\eta_1-\eta_2)^2}
$$
  
#### N-Subjettiness

The 0-, 1- and 2-subjettiness are defined as:

$$
\begin{align}
\tau_0(\beta) &= \sum_{i\in J} p_{T_i}\Delta R^\beta \\
\tau_1(\beta) &= \frac{1}{\tau_0(\beta)} \sum_{i\in J} p_{T_i}\Delta R_{a_1,i}^\beta \\
\tau_2(\beta) &= \frac{1}{\tau_0(\beta)}\sum_{i\in J} p_{T_i} \text{min}(\Delta R_{a1,i}^\beta,\Delta R_{a_2,i})
\end{align}
$$
    To generate a dimensionless variable:
$$
\tau_{21}=\frac{\tau_2}{\tau_1}
$$

In [8]:
def deltaR(x, y):
    return ((x.phi-y.phi)**2 + (x.eta-y.eta)**2)**0.5

def subjettiness(cndts, cnsts):
    d0 = sum(c.pt for c in cnsts)
    ls = []
    for c in cnsts:
        dRs = [deltaR(c,cd) for cd in cndts]
        ls += [c.pt * min(dRs)]
    return sum(ls)/d0

def tau21(jet,subR=0.2):
    '''Input: jet from the jet clustering result '''
    jet_substruct_features = {}        
    seq = fj.cluster(jet, R=subR, algo='kt')
    cnsts = jet.constituents()
    cndts1 = seq.exclusive_jets(1)
    tau1 = subjettiness(cndts1, cnsts)
    if (len(cnsts)>1):
        cndts2 = seq.exclusive_jets(2)
        tau2 = subjettiness(cndts2, cnsts)
    else: 
        tau2 = 0
    
    try:
        return tau2/tau1
    
    except ZeroDivisionError:
        return 0

### Clustering and table
We will do the clustering of the data separately for the signal and for the background, so we will create a function that does the grouping and generates a table. The table will contain $p_T$, $m$, $\eta$, $\phi$, $E$, $\tau_{21}$, $m_{jj}$ and $\Delta R_{12}$ of the two main jets, as well as the * no. hadrons * of the event. 

In [9]:
def tabla(data):
    n_events = data.shape[0]                   # no. of events (1000)
    n_hadrones_gen = int((data.shape[1]-1)/3)  # no. of hadrons (700) 
                                               # [-1 to eliminate the label column, /3 for the 3 values for each hadron]
    data_ss = data.iloc[:,:-1]                 # the dataframe without labels
    
    # Defining the variables
    df = pd.DataFrame(columns=['pT_j1', 'm_j1', 'eta_j1', 'phi_j1', 'E_j1', 'tau_21_j1',  
                                'pT_j2', 'm_j2', 'eta_j2', 'phi_j2', 'E_j2', 'tau_21_j2',
                                'm_jj', 'deltaR_j12', 'label'])

    for event in tqdm(range(n_events)):

        pseudojets_input = np.zeros(len([data for data in data_ss.iloc[event,::3] if data > 0]), dtype= fj.DTYPE_PTEPM) 

        for hadron in range(n_hadrones_gen):
            if (data_ss.iloc[event,hadron*3] > 0): ## si pT > 0 

                ## Filling the array with pT, eta y phi for each hadron
                pseudojets_input[hadron]['pT'] = data_ss.iloc[event,hadron*3] 
                pseudojets_input[hadron]['eta'] = data_ss.iloc[event,hadron*3+1]
                pseudojets_input[hadron]['phi'] = data_ss.iloc[event,hadron*3+2]

                pass
            pass

        ## Returns a "ClusterSequence" (pyjet type of list)
        secuencia = fj.cluster(pseudojets_input, R=1.0, p=-1) 

        ## With inclusive_jets you get all the clustered jets
        ## and filter the ones with pT greater than 20.
        
        ## Making a list of the pseudojets
        jets = secuencia.inclusive_jets(ptmin=20) 

        # Getting the variables from the most energetic jet
        pT_j1 = jets[0].pt
        m_j1 = np.abs(jets[0].mass)
        eta_j1 = jets[0].eta
        phi_j1 = jets[0].phi
        E_j1 = jets[0].e
        tau_21_j1= tau21(jets[0])

        # Try getting the variables for the second most energetic jet (if it exists)
        try:
            pT_j2 = jets[1].pt
            m_j2 = np.abs(jets[1].mass)
            eta_j2 = jets[1].eta
            phi_j2 = jets[1].phi
            E_j2 = jets[1].e
            tau_21_j2= tau21(jets[1])
    
        # If not, all zero
        except IndexError:
            pT_j2 = 0
            m_j2 = 0
            eta_j2 = 0
            phi_j2 = 0
            E_j2 = 0
            tau_21_j2 = 0
        
        # Calculating the other variables
        deltaR_j12 = deltaR(jets[0], jets[1])
        mjj = m_j1 + m_j2
        n_hadrones = data.iloc[event,:].astype(bool).sum(axis=0)/3
        label = data.iloc[event,-1]

        # Adding it to the dataframe
        entry = pd.DataFrame([[pT_j1, m_j1, eta_j1, phi_j1, E_j1, tau_21_j1,  
                                pT_j2, m_j2, eta_j2, phi_j2, E_j2, tau_21_j2, 
                                mjj,deltaR_j12, n_hadrones, label]],
                            columns=['pT_j1', 'm_j1', 'eta_j1', 'phi_j1', 'E_j1', 'tau_21_j1',  
                                    'pT_j2', 'm_j2', 'eta_j2', 'phi_j2', 'E_j2', 'tau_21_j2',
                                    'm_jj', 'deltaR_j12', 'n_hadrones', 'label'])
        df = df.append(entry, sort=True)
    return df

Using the function on the BB1 dataset:

In [10]:
# Generating the table
df = tabla(events_tiny)
df.head()

100%|██████████| 100000/100000 [43:16<00:00, 38.51it/s]


Unnamed: 0,E_j1,E_j2,deltaR_j12,eta_j1,eta_j2,label,m_j1,m_j2,m_jj,n_hadrones,pT_j1,pT_j2,phi_j1,phi_j2,tau_21_j1,tau_21_j2
0,1563.704411,1802.812644,3.154878,0.215797,0.724064,0.0,162.112941,153.211835,315.324776,77.0,1519.754514,1410.194772,-0.320496,2.793171,0.8894,0.978155
0,2672.474719,1776.179488,2.69911,0.659932,0.648318,0.0,999.760867,549.878495,1549.639362,288.0,2021.928491,1387.062287,-1.640937,1.058148,0.356523,0.360868
0,1419.017839,1758.557371,3.277478,-0.147723,0.725017,0.0,151.422868,640.423695,791.846563,214.0,1395.659808,1285.009488,0.979668,-2.179476,0.585259,0.181974
0,1305.230912,3047.866378,3.474055,0.214087,-1.718316,0.0,70.102505,252.674687,322.777192,117.0,1274.038565,1055.652281,1.325886,-1.561132,0.690229,0.240786
0,1537.733425,1370.397248,3.120373,0.271324,0.093318,0.0,55.673065,118.221098,173.894163,72.0,1481.845642,1359.365265,0.849572,-2.26572,0.570342,0.95722


Saving the dataframe as a csv:

In [11]:
outname = 'eventosBB1_{}.csv'.format(n_events)
outdir = '../data'
# If the path does not exist, then create it
if not os.path.exists(outdir):
    os.makedirs(outdir)

path = os.path.join(outdir, outname)    
df.to_csv(path, sep=',', index=False)