In [1]:
import uproot 
import numpy as np 
import pandas as pd 
import h5py 
import tqdm 
import numba
import os 
from collections import OrderedDict


In [2]:
class pdgid():
    """
    This is a function for configurating PDG id.
    """
    def __init__(self):
        self.w_plus = 24
        self.w_minus = -24
        self.down = 1
        self.anti_down = -1 
        self.up = 2
        self.anti_up = -2 
        self.strange = 3 
        self.anti_strange = -3
        self.charm = 4
        self.anti_charm = -4
        self.bottom = 5 
        self.anti_bottom = -5
        self.top = 6
        self.anti_top = -6
        self.higgs = 25
        self.electron = 11 
        self.positron = -11
        self.electron_neutrino = 12
        self.anti_electron_neutrino = -12
        self.muon = 13
        self.anti_muon = -13
        self.muon_neutrino = 14
        self.anti_muon_neutrino = -14
        self.tau = 17 
        self.anti_tau = -17
        self.tau_neutrino = 18
        self.anti_tau_neutrino = -18
        self.z_plus = 23
        self.z_minus = -23
PID = pdgid()
class IO_module:
    """
    This is a I/O module for HEP-jet-assignment project.
    This module can help to load the data from root files/npz files and output as npz/hdf5 files.
    """
    def __init__(self, PATH, MODEL, MULTI=False):
        self.path = PATH
        self.multi = MULTI
        self.model = MODEL
        self.require_lepton = ["ttbar_lep_left", "ttbar_lep_right"]
    def read_ROOT(self) -> dict:
        # If loading multi-root files, using this function to concatenate dataset.
        if self.multi:
            files_under_path = os.listdir(self.path)
            file_list = [os.path.join(self.path, a) for a in files_under_path]
            num_of_files = len(file_list)
            _data = []
            count = 1
            for a in file_list:
                try:
                    print(f"Padding root file from {a}. Progress: {count}/{num_of_files}.")
                    tmp = uproot.open(a)['Delphes']
                    _data.append(tmp)
                    count += 1
                except:
                    print('Please check input file path.')
            for i in tqdm.trange(num_of_files):
                if i == 0 :
                    _particle_event = _data[i].array('Event')
                    _particle_pt = _data[i].array('Particle.PT')
                    _particle_eta = _data[i].array('Particle.Eta')
                    _particle_phi = _data[i].array('Particle.Phi')
                    _particle_pid = _data[i].array('Particle.PID')
                    _particle_M1 = _data[i].array('Particle.M1')
                    _particle_M2 = _data[i].array('Particle.M2')
                    _particle_D1 = _data[i].array('Particle.D1')
                    _particle_D2 = _data[i].array('Particle.D2')
                    _particle_status = _data[i].array('Particle.Status')
                    _particle_rapidity = _data[i].array('Particle.Rapidity')
                    _particle_mass = _data[i].array('Particle.Mass')
                    _particle_charge = _data[i].array('Particle.Charge')

                    _jet_event = _data[i].array('Event')
                    _jet_pt = _data[i].array('Jet.PT')
                    _jet_eta = _data[i].array('Jet.Eta')
                    _jet_phi = _data[i].array('Jet.Phi')
                    _jet_btag = _data[i].array('Jet.BTag')
                    _jet_area = _data[i].array('Jet.Area')
                    _jet_mass = _data[i].array('Jet.Mass')
                    _jet_charge = _data[i].array('Jet.Charge')
                    _num_of_jets = _data[i].array('Jet')
                else: 
                    _particle_event = np.concatenate((_particle_event, _data[i].array('Event')))
                    _particle_pt = np.concatenate((_particle_pt, _data[i].array('Particle.PT')))
                    _particle_eta = np.concatenate((_particle_eta, _data[i].array('Particle.Eta')))
                    _particle_phi = np.concatenate((_particle_phi, _data[i].array('Particle.Phi')))
                    _particle_pid = np.concatenate((_particle_pid,_data[i].array('Particle.PID')))
                    _particle_M1 = np.concatenate((_particle_M1, _data[i].array('Particle.M1')))
                    _particle_M2 = np.concatenate((_particle_M2, _data[i].array('Particle.M2')))
                    _particle_D1 = np.concatenate((_particle_D1, _data[i].array('Particle.D1')))
                    _particle_D2 = np.concatenate((_particle_D2, _data[i].array('Particle.D2')))
                    _particle_status = np.concatenate((_particle_status, _data[i].array('Particle.Status')))
                    _particle_rapidity = np.concatenate((_particle_rapidity, _data[i].array('Particle.Rapidity')))
                    _particle_mass = np.concatenate((_particle_mass, _data[i].array('Particle.Mass')))
                    _particle_charge = np.concatenate((_particle_charge, _data[i].array('Particle.Charge')))

                    _jet_event = np.concatenate((_jet_event,_data[i].array('Event')))
                    _jet_pt = np.concatenate((_jet_pt, _data[i].array('Jet.PT')))
                    _jet_eta = np.concatenate((_jet_eta, _data[i].array('Jet.Eta')))
                    _jet_phi = np.concatenate((_jet_phi, _data[i].array('Jet.Phi')))
                    _jet_btag = np.concatenate((_jet_btag, _data[i].array('Jet.BTag')))
                    _jet_area = np.concatenate((_jet_area, _data[i].array('Jet.Area')))
                    _jet_mass = np.concatenate((_jet_mass, _data[i].array('Jet.Mass')))
                    _jet_charge = np.concatenate((_jet_charge, _data[i].array('Jet.Charge')))
                    _num_of_jets =  np.concatenate((_num_of_jets, _data[i].array('Jet')))
                
            if self.model in self.require_lepton:
                for i in tqdm.trange(num_of_files):
                    if i == 0 :
                        _missing_et_met = _data[i].array('MissingET.MET')
                        _missing_et_eta = _data[i].array('MissingET.Eta')
                        _missing_et_phi = _data[i].array('MissingET.Phi')
                        
                        _muon_pt = _data[i].array('Muon.PT')
                        _muon_eta = _data[i].array('Muon.Eta')
                        _muon_phi = _data[i].array('Muon.Phi')
                        
                        _electron_pt = _data[i].array('Electron.PT')
                        _electron_eta = _data[i].array('Electron.Eta')
                        _electron_phi = _data[i].array('Electron.Phi')
                    else: 
                        _missing_et_met = np.concatenate((_missing_et_met, _data[i].array('MissingET.MET')))
                        _missing_et_eta = np.concatenate((_missing_et_eta, _data[i].array('MissingET.Eta')))
                        _missing_et_phi = np.concatenate((_missing_et_phi, _data[i].array('MissingET.Phi')))
                        
                        _muon_pt = np.concatenate((_muon_pt, _data[i].array('Muon.PT')))
                        _muon_eta = np.concatenate((_muon_eta, _data[i].array('Muon.Eta')))
                        _muon_phi = np.concatenate((_muon_phi, _data[i].array('Muon.Phi')))
                        
                        _electron_pt = np.concatenate((_electron_pt, _data[i].array('Electron.PT')))
                        _electron_eta = np.concatenate((_electron_eta, _data[i].array('Electron.Eta')))
                        _electron_phi = np.concatenate((_electron_phi, _data[i].array('Electron.Phi')))

        else:
            _data = uproot.open(self.path)['Delphes']
            
            print("Loading particle information.")
            _particle_event = _data.array('Event')
            _particle_pt = _data.array('Particle.PT')
            _particle_eta = _data.array('Particle.Eta')
            _particle_phi = _data.array('Particle.Phi')
            _particle_pid = _data.array('Particle.PID')
            _particle_M1 = _data.array('Particle.M1')
            _particle_M2 = _data.array('Particle.M2')
            _particle_D1 = _data.array('Particle.D1')
            _particle_D2 = _data.array('Particle.D2')
            _particle_status = _data.array('Particle.Status')
            _particle_rapidity = _data.array('Particle.Rapidity')
            _particle_mass = _data.array('Particle.Mass')
            _particle_charge = _data.array('Particle.Charge')
            
            print("Loading jet information.")
            _jet_event = _data.array('Event')
            _jet_pt = _data.array('Jet.PT')
            _jet_eta = _data.array('Jet.Eta')
            _jet_phi = _data.array('Jet.Phi')
            _jet_btag = _data.array('Jet.BTag')
            _jet_area = _data.array('Jet.Area')
            _jet_mass = _data.array('Jet.Mass')
            _jet_charge = _data.array('Jet.Charge')
            _num_of_jets = _data.array('Jet')
            
            print("Loading MET information.")
            _missing_et_met = _data.array('MissingET.MET')
            _missing_et_eta = _data.array('MissingET.Eta')
            _missing_et_phi = _data.array('MissingET.Phi')

            print("Loading muon information.")
            _muon_pt = _data.array('Muon.PT')
            _muon_eta = _data.array('Muon.Eta')
            _muon_phi = _data.array('Muon.Phi')

            print("Loading electron information.")
            _electron_pt = _data.array('Electron.PT')
            _electron_eta = _data.array('Electron.Eta')
            _electron_phi = _data.array('Electron.Phi')
        
        jet_dataset = {
            "event": _jet_event, 
            "pt": _jet_pt, 
            "eta": _jet_eta, 
            "phi": _jet_phi, 
            "btag": _jet_btag, 
            "area": _jet_area, 
            "mass": _jet_mass, 
            "num_of_jets": _num_of_jets, 
            "charge": _jet_charge,                                     
        }
        particle_dataset = {
            "event": _particle_event, 
            "pt": _particle_pt, 
            "eta": _particle_eta, 
            "phi": _particle_phi, 
            "pid": _particle_pid, 
            "M1": _particle_M1, 
            "M2": _particle_M2, 
            "D1": _particle_D1, 
            "D2": _particle_D2, 
            "status": _particle_status, 
            "rapidity": _particle_rapidity, 
            "mass": _particle_mass, 
            "charge": _particle_charge,                                      
        }
        if self.model in self.require_lepton:
            muon_dataset = {
                "pt": _muon_pt, 
                "eta": _muon_eta, 
                "phi": _muon_phi, 
            }
            electron_dataset = {
                "pt": _electron_pt, 
                "eta": _electron_eta, 
                "phi": _electron_phi, 
            }
            MissingET_dataset = {
                "MET": _missing_et_met, 
                "eta": _missing_et_eta, 
                "phi": _missing_et_phi, 
            }
            dataset = {
                "particle": particle_dataset,
                "jet": jet_dataset,
                "muon": muon_dataset,
                "electron": electron_dataset,
                "MissingET": MissingET_dataset,
            }
        else: 
            dataset = {
                "particle": particle_dataset,
                "jet": jet_dataset,
            }
        return dataset

In [3]:
path = 'data/ttbar_data/tag_1_delphes_events_1.root'
MODEL = 'ttbar'
GENERATOR = 'py8'
read_dataset = IO_module( path, MODEL, MULTI = False)

# Setting `STATUS_CODE` for different shower generator.
if GENERATOR == 'py8':
    STATUS_CODE = 22
elif GENERATOR == 'herwig7':
    STATUS_CODE = 11
else: 
    print("Please select a correct shower generator. 1. py8, 2. herwig7.")

MAX_NUM_OF_JETS = 20

# Setting barcode, `NUM_OF_PARTON`, and `NUM_OF_DAUGHTER` for different model
if MODEL == "ttbar":
    """
    Barcode system
    t t~ W+ W- b b~ 
    0 0  0  0  0 0
    daughter of top and W+: 101000 ----> 40
    daughter of top and b: 101000 ----> 34
    daughter of anti top and W-: 100100 ----> 20
    daughter of anti top and b~: 100001 ----> 17
    """
    barcode = np.array([34, 40, 40, 17, 20, 20])
    NUM_OF_PARTON = 6
    NUM_OF_DAUGHTER = 6
elif MODEL == "ttbar_lep_left":
    """
    Barcode system
    t t~ W+ W- b b~ 
    0 0  0  0  0 0
    daughter of top and W+: 101000 ----> 40
    daughter of top and b: 101000 ----> 34
    daughter of anti top and W-: 100100 ----> 20
    daughter of anti top and b~: 100001 ----> 17
    """
    barcode = np.array([34, 40, 40, 17, 20, 20])
    NUM_OF_PARTON = 4
    NUM_OF_DAUGHTER = 6
elif MODEL == "ttbar_lep_right":
    """
    Barcode system
    t t~ W+ W- b b~ 
    0 0  0  0  0 0
    daughter of top and W+: 101000 ----> 40
    daughter of top and b: 101000 ----> 34
    daughter of anti top and W-: 100100 ----> 20
    daughter of anti top and b~: 100001 ----> 17
    """
    barcode = np.array([34, 40, 40, 17, 20, 20])
    NUM_OF_PARTON = 4
    NUM_OF_DAUGHTER = 6
elif MODEL == "ttH":
    """
    Barcode system
    t t~ W+ W- b b~ H
    0 0  0  0  0 0  0
    daughter of t and b = 1000100  ----> 68
    daughter of t and W+ = 1010000 ----> 80
    daughter of t~ and W- = 0101000 ----> 34
    daughter of t~ and b~ = 0100010 ----> 40
    daughter of H = 0000001 ----> 1
    """
    barcode = np.array([68, 80, 80, 34, 40, 40, 1, 1])
    NUM_OF_PARTON = 8
    NUM_OF_DAUGHTER = 8
elif MODEL == "four_top":
    """
    Barcode system
    t1 t2 t1~ t2~ W+1 W-1 W+2 W-2 b1 b2 b1~ b2~             describe          barcode   sequence
    0  0   0   0   0   0   0   0  0  0   0   0

    1  0   0   0   1   0   0   0  0  0   0   0  <--- daughter of t1 and W+1   2176         2,3
    1  0   0   0   0   0   0   0  1  0   0   0  <--- daughter of t1 and b1    2056          1
    0  0   1   0   0   1   0   0  0  0   0   0  <--- daughter of t1~ and W-1  576          5,6
    0  0   1   0   0   0   0   0  0  1   0   0  <--- daughter of t1~ and b1~  516           4

    0  1   0   0   0   0   1   0  0  0   0   0  <--- daughter of t2 and W+2   1056         7,8
    0  1   0   0   0   0   0   0  0  1   0   0  <--- daughter of t2 and b2    1028          9
    0  0   0   1   0   0   0   1  0  0   0   0  <--- daughter of t2~ and W-2  272          11.12
    0  0   0   1   0   0   0   0  0  0   0   1  <--- daughter of t2~ and b2~  257           10

    """
    barcode = np.array([2056, 2176, 2176, 516, 576, 576, 1028, 1056, 1056, 257, 272, 272])
    NUM_OF_PARTON = 12
    NUM_OF_DAUGHTER = 12
elif MODEL == "ZH":
    """
    Barcode system
    Z H W+ W- b b~ 
    0 0  0  0  0 0
    daughter of higgs and W+: 011000 ----> 24
    daughter of higgs and W-: 010100 ----> 20
    daughter of Z and b: 100010 ----> 34
    daughter of Z and b~: 100001 ----> 33
    """
    barcode = np.array([24, 24, 20, 20, 34, 33])
    NUM_OF_PARTON = 6
    NUM_OF_DAUGHTER = 6
else:
    print("Please select a correct model.")

In [4]:
dataset = read_dataset.read_ROOT()

Loading particle information.
Loading jet information.
Loading MET information.
Loading muon information.
Loading electron information.


In [5]:
dataset.keys()

dict_keys(['particle', 'jet'])

In [6]:
class calc_helper:
    @staticmethod
    def deltaPhi(phi1: float,phi2: float) -> float:
        """
        This is a function deltaPhi value between two target.
        phi1: phi value from target one. 
        phi2: phi value from target two. 
        """
        phi = phi1-phi2
        while phi >= np.pi: phi -= np.pi*2.
        while phi < -np.pi: phi += np.pi*2.
        return phi
    
    @staticmethod
    def delta_R(eta1: float, phi1: float, eta2: float, phi2: float) -> np.float64:
        """
        This is a function delta_R value between two target.
        phi1: phi value from target one. 
        eta1: eta value from target one. 
        phi2: phi value from target two. 
        eta2: eta value from target two. 
        """
        return np.sqrt(calc_helper.deltaPhi(phi1,phi2)**2+(eta1-eta2)**2).astype(np.float64)

class process_methods:  
    @staticmethod
    def event_selection(MODEL: str, **kargs) -> np.ndarray:
        marker_event = []
        marker_jet = []
        marker_btag = []
        PT = kargs['pt']
        ETA = kargs['eta']
        BTAG = kargs['btag']
        print("MODE: {0}, Number of events: {1}.".format(MODEL, len(PT)))
        
        requirement = {
            "ttbar": [2, 6],
            "ttH": [2, 8],
            "four_top": [2, 12],
            "ttbar_lep_left": [2, 4],
            "ttbar_lep_right": [2, 4], 
            "ZH": [2, 6],
        }     
        
        if MODEL != 'ttbar_lep_left' and MODEL != "ttbar_lep_right":
            print("Start jet marking.")
            for i in tqdm.trange(len(PT)):

                _marker_jet = []
                _marker_btag = []

                for j in range(len(PT[i])):
                    if BTAG[i][j] == 1 and PT[i][j] > 25 and np.abs(ETA[i][j]) < 2.5:
                        _marker_btag.append(1) 
                    else: 
                        _marker_btag.append(0) 

                    if PT[i][j] > 25 and np.abs(ETA[i][j]) <= 2.5:
                        _marker_jet.append(1)
                    else:
                        _marker_jet.append(0)

                marker_jet.append(np.asanyarray(_marker_jet, dtype=object))
                marker_btag.append(np.asanyarray(_marker_btag, dtype=object))

            marker_jet = np.asanyarray(marker_jet, dtype=object)
            marker_btag = np.asanyarray(marker_btag, dtype=object)
            print("Start event marking.")
            for i in tqdm.trange(len(PT)):

                if np.sum(marker_jet[i] == 1) >= requirement[MODEL][1] and np.sum(marker_btag[i] == 1) >= requirement[MODEL][0] :
                    marker_event.append(1)
                else:
                    marker_event.append(0)
            marker_event = np.asanyarray(marker_event, dtype=object)
        else: 
            PHI = kargs['phi']

            ELECTRON_PT = [ list(x) if x.size>=2 else x.item() if x.size==1 else 99999 for x in kargs['electron_pt']]
            ELECTRON_ETA = [ list(x) if x.size>=2 else x.item() if x.size==1 else 99999 for x in kargs['electron_eta']]
            ELECTRON_PHI = [ list(x) if x.size>=2 else x.item() if x.size==1 else 99999 for x in kargs['electron_phi']]

            MUON_PT = [ list(x) if x.size>=2 else x.item() if x.size==1 else 99999 for x in kargs['muon_pt']]
            MUON_ETA = [ list(x) if x.size>=2 else x.item() if x.size==1 else 99999 for x in kargs['muon_eta']]
            MUON_PHI = [ list(x) if x.size>=2 else x.item() if x.size==1 else 99999 for x in kargs['muon_phi']]

            marker_lepton = []
            LEPTON_PT = []
            LEPTON_ETA = []
            LEPTON_PHI = []
            for a,b in zip(ELECTRON_PT, MUON_PT):
                _tmp = []
                if isinstance(a, float) or isinstance(a, int):
                    _tmp.append(a)
                elif isinstance(a, list):
                    for c in a:
                        _tmp.append(c)
                else: 
                    print('error', type(a))

                if isinstance(b, float) or isinstance(b, int):
                    _tmp.append(b)
                elif isinstance(b, list):
                    for c in b:
                        _tmp.append(c)
                else: 
                    print('error', type(b))
                LEPTON_PT.append(_tmp)

            for a,b in zip(ELECTRON_ETA, MUON_ETA):
                _tmp = []
                if isinstance(a, float) or isinstance(a, int):
                    _tmp.append(a)
                elif isinstance(a, list):
                    for c in a:
                        _tmp.append(c)
                else: 
                    print('error', type(a))

                if isinstance(b, float) or isinstance(b, int):
                    _tmp.append(b)
                elif isinstance(b, list):
                    for c in b:
                        _tmp.append(c)
                else: 
                    print('error', type(b))
                LEPTON_ETA.append(_tmp)
            for a,b in zip(ELECTRON_PHI, MUON_PHI):
                _tmp = []
                if isinstance(a, float) or isinstance(a, int):
                    _tmp.append(a)
                elif isinstance(a, list):
                    for c in a:
                        _tmp.append(c)
                else: 
                    print('error', type(a))

                if isinstance(b, float) or isinstance(b, int):
                    _tmp.append(b)
                elif isinstance(b, list):
                    for c in b:
                        _tmp.append(c)
                else: 
                    print('error', type(b))
                LEPTON_PHI.append(_tmp)
            print("Start jet marking.")
            for i in tqdm.trange(len(PT)):
                _marker_event = []
                _marker_jet = []
                _marker_btag = []
                for j in range(len(PT[i])):
                    if BTAG[i][j] == 1 and PT[i][j] > 25 and np.abs(ETA[i][j]) < 2.5:
                        _marker_btag.append(1) 
                    else: 
                        _marker_btag.append(0) 

                    if PT[i][j] > 25 and np.abs(ETA[i][j]) <= 2.5:
                        _marker_jet.append(1)
                    else:
                        _marker_jet.append(0)
                marker_jet.append(np.asanyarray(_marker_jet, dtype=object))
                marker_btag.append(np.asanyarray(_marker_btag, dtype=object))

            marker_jet = np.asanyarray(marker_jet, dtype=object)
            marker_btag = np.asanyarray(marker_btag, dtype=object)

            #Remove electron from jets catogary
            for i in tqdm.trange(len(PT)):
                for j in range(len(PT[i])):
                    for k in range(len(LEPTON_PT[i])):
                        if delta_R(ETA[i][j], PHI[i][j], LEPTON_ETA[i][k], LEPTON_PHI[i][k]) < 0.4:
                            marker_jet[i][j] = 0
                        else : pass 

            for i in tqdm.trange(len(LEPTON_PT)):
                _marker_lepton = []
                for j in range(len(LEPTON_PT[i])):
                    if LEPTON_PT[i][j] > 25 and np.abs(LEPTON_ETA[i][j]) < 2.5:
                        _marker_lepton.append(1)
                    else :
                        _marker_lepton.append(0)
                marker_lepton.append(np.asanyarray(_marker_lepton, dtype=object))
            marker_lepton = np.asanyarray(marker_lepton, dtype=object)
            print("Start event marking.")
            for i in tqdm.trange(len(PT)):
                if np.sum(marker_jet[i] == 1) >= 4 and np.sum(marker_btag[i] == 1) >= 2 and np.sum(marker_lepton[i] ==1) == 1 and (np.array(LEPTON_PT[i]) != 99999).sum() == 1:
                    marker_event.append(1)
                else:
                    marker_event.append(0)
            marker_event = np.asanyarray(marker_event, dtype=object)
        return marker_event
    
    @staticmethod
    def shifted_particle_tracing(dataset: pd.core.frame.DataFrame, PID_daughter: int, idx: int) -> int:
        """
        This is a frunction tracing the on-flying particle. 
        """
        if (dataset.iloc[idx,6] == PID_daughter):
            return int(dataset.iloc[idx,4])
        
    @staticmethod
    def daughter_finder(dataset: pd.core.frame.DataFrame, PID: int, not_stable_FSP=True, **kargs) -> dict:
        if not_stable_FSP:
            last_idx = np.array(dataset[dataset["PID"] == PID]["Index"])[-1]
            D1 = np.array(dataset[dataset["PID"] == PID]["Daughter_1"])[-1]
            D2 = np.array(dataset[dataset["PID"] == PID]["Daughter_2"])[-1]
            if dataset["PID"][int(D1)] < 0:
                D3 = D2 
                D2 = D1 
                D1 = D3
            _result = {
                "mother_idx": last_idx,
                "daughter_1_idx": D1,
                "daughter_2_idx": D2,
            }
        else:
            STATUS = kargs['status']
            MODEL = kargs['model']
            if MODEL == "ttbar_lep_right" or MODEL == "ttbar_lep_left":
                for i in range(len(dataset)):
                    if(dataset.iloc[i,1] == STATUS and dataset.iloc[i,6] == PID ): 
                        daughter_index = int(dataset.iloc[i,0])
                if( dataset.iloc[daughter_index,6] == PID ):
                    shifted_particle_index = dataset.iloc[daughter_index, 4]


                while dataset.iloc[shifted_particle_index,6] == PID:
                    init_shifted_particle_index = shifted_particle_index
                    shifted_particle_index = shifted_particle_tracing(dataset, PID, init_shifted_particle_index)       

                dauthter_idx_1 = dataset.iloc[init_shifted_particle_index, 4]
                daughter_pid_1 = dataset.iloc[dauthter_idx_1, 6]

                dauthter_idx_2 = dataset.iloc[init_shifted_particle_index, 5]
                daughter_pid_2 = dataset.iloc[dauthter_idx_2, 6]

                _result = {
                    "mother_idx": init_shifted_particle_index,
                    "daughter_1_idx": dauthter_idx_1,
                    "daughter_2_idx": dauthter_idx_2,
                }
        return _result

    #tracing the daughters
    #Input two daughter of top/top_bar and find their daughter
    @staticmethod
    def quark_finder(dataset: pd.core.frame.DataFrame, mother_idx_1: int, mother_idx_2: int) -> dict:
        """
        This is a function finding the daughter of bosons.
        This function will no longer be deprecated. 
        Check before calling this function.
        """
        #Specific two daughter of top
        def W_b_specifier(dataset, input_1_idx, input_2_idx):
            if dataset.iloc[int(input_1_idx),6] == PID.w_plus or dataset.iloc[int(input_1_idx),6] == PID.w_minus :
                return int(input_1_idx), int(dataset.iloc[int(input_1_idx),6]), int(input_2_idx)
            elif dataset.iloc[int(input_1_idx),6] == PID.bottom or dataset.iloc[int(input_1_idx),6] == PID.anti_bottom :
                return  int(input_2_idx), int(dataset.iloc[int(input_1_idx),6]), int(input_1_idx)
            else :
                pass
                #print("Please check your data.")

        W_boson_idx, mother_pid, b_quark_idx = W_b_specifier(dataset, mother_idx_1, mother_idx_2)

        #Find the two daughters of boson
        daughter_1_idx = dataset.iloc[W_boson_idx, 4]
        daughter_1_pid = dataset.iloc[daughter_1_idx, 6]
        daughter_2_idx = dataset.iloc[W_boson_idx, 5]
        daughter_2_pid = dataset.iloc[daughter_2_idx, 6]

        if daughter_1_pid == mother_pid or daughter_2_pid == mother_pid:

            init_idx = W_boson_idx
            daughter_pid = daughter_1_pid
            if daughter_2_pid == mother_pid:
                daughter_pid = daughter_2_pid
            while daughter_pid == mother_pid :
                daughter_1_idx = dataset.iloc[int(init_idx), 4]
                daughter_2_idx = dataset.iloc[int(init_idx), 5]

                daughter_1_pid = dataset.iloc[int(daughter_1_idx), 6]
                daughter_2_pid = dataset.iloc[int(daughter_2_idx), 6]

                daughter_pid = daughter_1_pid
                init_idx = daughter_1_idx
                if daughter_2_pid == mother_pid:
                    daughter_pid = daughter_2_pid
                    init_idx = daughter_2_idx
        _result = {
                "b_idx": b_quark_idx,
                "W_daughter_1_idx": daughter_1_idx,
                "W_dauthter_2_idx": daughter_2_idx,
            }
        return  _result

In [7]:
dataset["particle"].keys()

dict_keys(['event', 'pt', 'eta', 'phi', 'pid', 'M1', 'M2', 'D1', 'D2', 'status', 'rapidity', 'mass', 'charge'])

In [9]:

if MODEL == 'ttbar_lep_left' or MODEL == "ttbar_lep_right":
    marker_event = process_methods.event_selection(MODEL, 
                                    pt=dataset["jet"]["pt"], 
                                    eta=dataset["jet"]["eta"], 
                                    phi=dataset["jet"]["phi"],
                                    btag=dataset["jet"]["btag"],
                                    electron_pt=dataset["electron"]["pt"],
                                    electron_eta=dataset["electron"]["eta"],
                                    electron_phi=dataset["electron"]["phi"],
                                    muon_pt=dataset["muon"]["pt"],
                                    muon_eta=dataset["muon"]["eta"],
                                    muon_phi=dataset["muon"]["phi"],
                                    )
else:
    marker_event = process_methods.event_selection(MODEL, pt=dataset["jet"]["pt"], eta=dataset["jet"]["eta"], btag=dataset["jet"]["btag"])
#     del marker_jet, marker_btag, marker_lepton
passed = np.where(marker_event == 1)[0]

  1%|          | 974/100000 [00:00<00:10, 9737.99it/s]

MODE: ttbar, Number of events: 100000.
Start jet marking.


100%|██████████| 100000/100000 [00:09<00:00, 10090.91it/s]
 27%|██▋       | 27071/100000 [00:00<00:00, 135364.53it/s]

Start event marking.


100%|██████████| 100000/100000 [00:00<00:00, 133311.68it/s]


In [10]:
class helper:
    @staticmethod
    def to_dataframe(DATASET: dict, index: int) -> pd.core.frame.DataFrame:
        idx = np.linspace(0, len( DATASET["pt"][index])-1, num = len( DATASET["pt"][index]) )
        dic = {
             "Index": idx,
            "Status":  DATASET["status"][index],
            "Mother_1":  DATASET["M1"][index],
            "Mother_2":  DATASET["M2"][index],
            "Daughter_1":  DATASET["D1"][index],
            "Daughter_2":  DATASET["D2"][index],
            "PID": DATASET["pid"][index],
            "PT":  DATASET["pt"][index],
            "Eta":  DATASET["eta"][index],
            "Phi":  DATASET["phi"][index],
            "Mass":  DATASET["mass"][index],
        }
        return pd.DataFrame(dic)
    @staticmethod
    def fetch_kinematics_properties_from_dataset(dataset: pd.core.frame.DataFrame, index: np.ndarray, colume_name: str)-> list:
        _result = [dataset[colume_name][_index] for _index in index]
        return _result

In [11]:
%%time 
daughter_t1 = [process_methods.daughter_finder(helper.to_dataframe(dataset["particle"], i), PID.top) for i in passed]
daughter_t2 = [process_methods.daughter_finder(helper.to_dataframe(dataset["particle"], i), PID.anti_top) for i in passed]
daughter_t1_W = [process_methods.daughter_finder(helper.to_dataframe(dataset["particle"], passed[i]), 24) for i in range(len(passed))]
daughter_t2_W = [process_methods.daughter_finder(helper.to_dataframe(dataset["particle"], passed[i]), -24) for i in range(len(passed))]
daughter_t1_W_idx = np.array([[ dic[item] for item in dic if item in ["daughter_1_idx", "daughter_2_idx"]] for dic in daughter_t1_W])
daughter_t2_W_idx = np.array([[ dic[item] for item in dic if item in ["daughter_1_idx", "daughter_2_idx"]] for dic in daughter_t1_W])

daughter_t1_W_1 = daughter_t1_W_idx[:,0]
daughter_t1_W_2 = daughter_t1_W_idx[:,1]
daughter_t1_b = np.array([a["daughter_2_idx"] for a in daughter_t1])

daughter_t2_W_1 = daughter_t2_W_idx[:,0]
daughter_t2_W_2 = daughter_t2_W_idx[:,1]
daughter_t2_b = np.array([a["daughter_2_idx"] for a in daughter_t2])


CPU times: user 1min 57s, sys: 88.9 ms, total: 1min 57s
Wall time: 1min 57s


In [12]:
daughter_t1_W[:10]

[{'mother_idx': 427.0, 'daughter_1_idx': 469, 'daughter_2_idx': 470},
 {'mother_idx': 940.0, 'daughter_1_idx': 991, 'daughter_2_idx': 992},
 {'mother_idx': 1174.0, 'daughter_1_idx': 1225, 'daughter_2_idx': 1226},
 {'mother_idx': 997.0, 'daughter_1_idx': 1042, 'daughter_2_idx': 1043},
 {'mother_idx': 498.0, 'daughter_1_idx': 501, 'daughter_2_idx': 502},
 {'mother_idx': 169.0, 'daughter_1_idx': 205, 'daughter_2_idx': 206},
 {'mother_idx': 546.0, 'daughter_1_idx': 621, 'daughter_2_idx': 622},
 {'mother_idx': 382.0, 'daughter_1_idx': 406, 'daughter_2_idx': 407},
 {'mother_idx': 302.0, 'daughter_1_idx': 339, 'daughter_2_idx': 340},
 {'mother_idx': 794.0, 'daughter_1_idx': 803, 'daughter_2_idx': 804}]

In [13]:
if MODEL == 'ttbar':
    parton_idx = np.stack((daughter_t1_b, daughter_t1_W_1, daughter_t1_W_2, daughter_t2_b, daughter_t2_W_1, daughter_t2_W_2), axis=1)
parton_idx

array([[ 424,  469,  470,  464,  469,  470],
       [ 937,  991,  992,  928,  991,  992],
       [1171, 1225, 1226, 1159, 1225, 1226],
       ...,
       [1024, 1030, 1031, 1028, 1030, 1031],
       [1275, 1326, 1327, 1306, 1326, 1327],
       [ 402,  429,  430,  418,  429,  430]], dtype=int32)

In [14]:
%%time 

sourece_features = ["PT", "Eta", "Phi", "Mass", "PID"]

storage_name = ["parton_pt", "parton_eta", "parton_phi", "parton_mass", "parton_pdgid"]

parton_features = OrderedDict()
for a, b in tqdm.tqdm(zip(sourece_features, storage_name), total=(len(sourece_features)), desc="Storing parton's kinematics information"):
    parton_features[b] = np.array([helper.fetch_kinematics_properties_from_dataset(helper.to_dataframe(dataset["particle"], passed[i]), parton_idx[i], a) for i in range(len(passed))])

parton_barcode = np.tile(barcode, (len(passed),1))
parton_features["parton_barcode"] = parton_barcode

for a in dataset.keys():
    for b in dataset[a].keys():
        dataset[a][b] = dataset[a][b][passed]


Storing parton's kinematics information: 100%|██████████| 5/5 [01:03<00:00, 12.66s/it]

CPU times: user 1min 3s, sys: 27.7 ms, total: 1min 3s
Wall time: 1min 3s





In [15]:
def deltaR_matching(NUM_OF_PARTON, NUM_OF_JET, PARTON_ETA, PARTON_PHI, JET_ETA, JET_PHI, CUTS, MODEL):
    """
    This is a function for doing delta R matching.
    PARTON_ETA: Array, a list of partons's eta in a event.
    PARTON_PHI: Array, a list of partons's phi in a event.
    JET_ETA: Array, a list of jet's eta in a event.
    JET_PHI: Array, a list of jet's phi in a event.
    """
    _dR_between_parton_jet = []
    
    _parton_jet_index = np.full(NUM_OF_PARTON, -1)
    _jet_parton_index = np.full(NUM_OF_JET, -1)
    
    _jet_to_parton_list = np.zeros(len(PARTON_ETA))
    _parton_to_jet_list = np.zeros(len(JET_ETA))

    j = 0
    a = 0
    b = 0
    while a < NUM_OF_PARTON :
        for b in range( NUM_OF_JET ):
            _dR_between_parton_jet.append(calc_helper.delta_R( PARTON_ETA[a], PARTON_PHI[a], JET_ETA[b], JET_PHI[b]))
            j +=1
        a += 1 

    array = np.reshape(np.array(_dR_between_parton_jet), [NUM_OF_PARTON, NUM_OF_JET])
    array_index = [x for x in range(len(PARTON_ETA))]

    _dataset = pd.DataFrame(index = array_index, data = array).T
    
    for j in range(len(PARTON_ETA)):
        min_val = _dataset.stack().min()
        if min_val < CUTS:
            min_idx, min_col = _dataset.stack().idxmin()
            
            _jet_to_parton_list[j] = int(min_idx)
            _parton_to_jet_list[j] = int(min_col)
            _dataset = _dataset.drop([min_col], axis=1)
            _dataset = _dataset.drop([min_idx], axis=0)

        else:
            _jet_to_parton_list[j] = 'Nan'
            _parton_to_jet_list[j] = 'Nan'
    for k in range(NUM_OF_PARTON, NUM_OF_JET):
        _parton_to_jet_list[k] = 'Nan'

    if MODEL == 'ttbar' or MODEL == 'ZH':
        for j in range(len(JET_ETA)):
            if _parton_to_jet_list[j] == 0 :
                _parton_jet_index[0] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 1 :
                _parton_jet_index[1] = int(_jet_to_parton_list[j])
            else: 
                pass
            if _parton_to_jet_list[j] == 2 :
                _parton_jet_index[2] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 3 :
                _parton_jet_index[3] = int(_jet_to_parton_list[j])
            else:
                pass

            if _parton_to_jet_list[j] == 4 :
                _parton_jet_index[4] = int(_jet_to_parton_list[j])
            else:
                pass

            if _parton_to_jet_list[j] == 5 :
                _parton_jet_index[5] = int(_jet_to_parton_list[j])
            else: 
                pass
    elif MODEL == 'ttH':
        for j in range(len(JET_ETA)):
            if _parton_to_jet_list[j] == 0 :
                _parton_jet_index[0] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 1 :
                _parton_jet_index[1] = int(_jet_to_parton_list[j])
            else: 
                pass
            if _parton_to_jet_list[j] == 2 :
                _parton_jet_index[2] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 3 :
                _parton_jet_index[3] = int(_jet_to_parton_list[j])
            else:
                pass

            if _parton_to_jet_list[j] == 4 :
                _parton_jet_index[4] = int(_jet_to_parton_list[j])
            else:
                pass

            if _parton_to_jet_list[j] == 5 :
                _parton_jet_index[5] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 6 :
                _parton_jet_index[6] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 7 :
                _parton_jet_index[7] = int(_jet_to_parton_list[j])
            else: 
                pass
    elif MODEL == 'four_top':
        for j in range(len(JET_ETA)):
            if _parton_to_jet_list[j] == 0 :
                _parton_jet_index[0] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 1 :
                _parton_jet_index[1] = int(_jet_to_parton_list[j])
            else: 
                pass
            if _parton_to_jet_list[j] == 2 :
                _parton_jet_index[2] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 3 :
                _parton_jet_index[3] = int(_jet_to_parton_list[j])
            else:
                pass

            if _parton_to_jet_list[j] == 4 :
                _parton_jet_index[4] = int(_jet_to_parton_list[j])
            else:
                pass

            if _parton_to_jet_list[j] == 5 :
                _parton_jet_index[5] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 6 :
                _parton_jet_index[6] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 7 :
                _parton_jet_index[7] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 8 :
                _parton_jet_index[8] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 9 :
                _parton_jet_index[9] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 10 :
                _parton_jet_index[10] = int(_jet_to_parton_list[j])
            else: 
                pass
            
            if _parton_to_jet_list[j] == 11 :
                _parton_jet_index[11] = int(_jet_to_parton_list[j])
            else: 
                pass
    elif MODEL == 'ttbar_lep_left' or MODEL == 'ttbar_lep_right':
        for j in range(len(JET_ETA)):
            if _parton_to_jet_list[j] == 0 :
                _parton_jet_index[0] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 1 :
                _parton_jet_index[1] = int(_jet_to_parton_list[j])
            else: 
                pass
            if _parton_to_jet_list[j] == 2 :
                _parton_jet_index[2] = int(_jet_to_parton_list[j])
            else: 
                pass

            if _parton_to_jet_list[j] == 3 :
                _parton_jet_index[3] = int(_jet_to_parton_list[j])
            else:
                pass
    else:
        print("Delta R matching faild, please check your model.")

    ll = len(JET_ETA)
    for k in range(NUM_OF_PARTON):
        for m in range(ll):
            if _jet_to_parton_list[k] == int(m):
                _jet_parton_index[int(m)] = _parton_to_jet_list[k]
            else: pass

    
    return _jet_parton_index, _parton_jet_index

In [17]:
%%time 
result = [deltaR_matching(NUM_OF_PARTON, 
                          len(dataset['jet']['eta'][i]), 
                          parton_features["parton_eta"][i], 
                          parton_features["parton_phi"][i], 
                          dataset['jet']['eta'][i], 
                          dataset['jet']['phi'][i],
                          0.4, 
                          MODEL) for i in range(len(passed))]


CPU times: user 1min 23s, sys: 24 ms, total: 1min 23s
Wall time: 1min 23s


In [18]:
result[0]

(array([-1, -1, -1,  1,  0, -1,  2]), array([ 4,  3,  6, -1, -1, -1]))

In [26]:
%%time
MAX_JET_MULTIPLICITY = 30
jet_parton_index = np.array([np.pad(x[0], (0, MAX_JET_MULTIPLICITY - len(x[0])), 'constant', constant_values=(0, -999)).tolist() for x in result])
parton_jet_index = np.array([x[1].tolist() for x in result])

CPU times: user 395 ms, sys: 108 ms, total: 503 ms
Wall time: 407 ms


In [27]:
jet_parton_index

array([[  -1,   -1,   -1, ..., -999, -999, -999],
       [   0,   -1,   -1, ..., -999, -999, -999],
       [  -1,    1,   -1, ..., -999, -999, -999],
       ...,
       [  -1,    1,   -1, ..., -999, -999, -999],
       [   3,    1,    2, ..., -999, -999, -999],
       [  -1,    1,    0, ..., -999, -999, -999]])

In [28]:
parton_jet_index

array([[ 4,  3,  6, -1, -1, -1],
       [ 0,  4,  7, -1, -1, -1],
       [ 5,  1,  4, -1, -1, -1],
       ...,
       [ 3,  1, -1, -1, -1, -1],
       [-1,  1,  2,  0, -1, -1],
       [ 2,  1,  5, -1, -1, -1]])

In [None]:
source = np.array(delta_R_matrix).reshape(6,7)
print(source)
test = np.argpartition(source,7)
print(test)
source[test]


In [None]:
np.array(delta_R_matrix)[test]

In [None]:
test = np.argpartition(delta_R_matrix, 6)
delta_R_matrix[test[:3,:3]]

In [None]:
# parton_pdgid = np.zeros((len(passed), NUM_OF_DAUGHTER), dtype=np.int8)
# parton_barcode = np.zeros((len(passed), NUM_OF_DAUGHTER), dtype=np.int8)
# parton_pt = np.zeros((len(passed), NUM_OF_DAUGHTER))
# parton_eta = np.zeros((len(passed), NUM_OF_DAUGHTER))
# parton_phi = np.zeros((len(passed), NUM_OF_DAUGHTER))
# parton_mass = np.zeros((len(passed), NUM_OF_DAUGHTER))

# for i in tqdm.trange(len(passed)):
#     idx = passed[i]
#     for j in range(NUM_OF_DAUGHTER):
#         ix = int(parton_array[i][j])
#         parton_pdgid[i][j] = particle.pid[idx][ix]
#         parton_barcode[i][j] = barcode[j]
#         parton_pt[i][j] = particle.pt[idx][ix]
#         parton_eta[i][j] = particle.eta[idx][ix]
#         parton_phi[i][j] = particle.phi[idx][ix]
#         parton_mass[i][j] = particle.mass[idx][ix]
        
#     if MODEL == 'ttbar_lep_left' or MODEL == "ttbar_lep_right":
#         print("Recording simulation lepton kinematic properties.")
#         simulation_lepton_pdgid = np.zeros(len(passed))
#         simulation_lepton_barcode = np.zeros(len(passed))
#         simulation_lepton_pt = np.zeros(len(passed))
#         simulation_lepton_eta = np.zeros(len(passed))
#         simulation_lepton_phi = np.zeros(len(passed))
#         simulation_lepton_mass = np.zeros(len(passed))
#         simulation_neutrino_pdgid = np.zeros(len(passed))
#         simulation_neutrino_barcode = np.zeros(len(passed))
#         simulation_neutrino_pt = np.zeros(len(passed))
#         simulation_neutrino_eta = np.zeros(len(passed))
#         simulation_neutrino_phi = np.zeros(len(passed))
#         simulation_neutrino_mass = np.zeros(len(passed))

#         if MODEL == 'ttbar_lep_left':
#             for i in tqdm.trange(len(passed)):
#                 for j in range(1,3):
#                     if parton_pdgid[i][j] == -11 or parton_pdgid[i][j] == -13:
#                         simulation_lepton_pdgid[i] = parton_pdgid[i][j]
#                         simulation_lepton_barcode[i] = parton_barcode[i][j]
#                         simulation_lepton_pt[i] = parton_pt[i][j]
#                         simulation_lepton_eta[i] = parton_eta[i][j]
#                         simulation_lepton_phi[i] = parton_phi[i][j]
#                         simulation_lepton_mass[i] = parton_mass[i][j]
                        
#                     else: 
#                         simulation_neutrino_pdgid[i] = parton_pdgid[i][j]
#                         simulation_neutrino_barcode[i] = parton_barcode[i][j]
#                         simulation_neutrino_pt[i] = parton_pt[i][j]
#                         simulation_neutrino_eta[i] = parton_eta[i][j]
#                         simulation_neutrino_phi[i] = parton_phi[i][j]
#                         simulation_neutrino_mass[i] = parton_mass[i][j]

#             parton_pdgid = np.delete(parton_pdgid, [1,2], 1)
#             parton_barcode = np.delete(parton_barcode, [1,2], 1)
#             parton_pt = np.delete(parton_pt, [1,2], 1)
#             parton_eta = np.delete(parton_eta, [1,2], 1)
#             parton_phi = np.delete(parton_phi, [1,2], 1)
#             parton_mass = np.delete(parton_mass, [1,2], 1)

#         elif MODEL == "ttbar_lep_right":
#             for i in tqdm.trange(len(passed)):
#                 for j in range(4,6):
#                     if parton_pdgid[i][j] == 11 or parton_pdgid[i][j] == 13:
#                         simulation_lepton_pdgid[i] = parton_pdgid[i][j]
#                         simulation_lepton_barcode[i] = parton_barcode[i][j]
#                         simulation_lepton_pt[i] = parton_pt[i][j]
#                         simulation_lepton_eta[i] = parton_eta[i][j]
#                         simulation_lepton_phi[i] = parton_phi[i][j]
#                         simulation_lepton_mass[i] = parton_mass[i][j]
#                     else: 
#                         simulation_neutrino_pdgid[i] = parton_pdgid[i][j]
#                         simulation_neutrino_barcode[i] = parton_barcode[i][j]
#                         simulation_neutrino_pt[i] = parton_pt[i][j]
#                         simulation_neutrino_eta[i] = parton_eta[i][j]
#                         simulation_neutrino_phi[i] = parton_phi[i][j]
#                         simulation_neutrino_mass[i] = parton_mass[i][j]
#             parton_pdgid = np.delete(parton_pdgid, [4, 5], 1)
#             parton_barcode = np.delete(parton_barcode, [4, 5], 1)
#             parton_pt = np.delete(parton_pt, [4, 5], 1)
#             parton_eta = np.delete(parton_eta, [4, 5], 1)
#             parton_phi = np.delete(parton_phi, [4, 5], 1)
#             parton_mass = np.delete(parton_mass, [4, 5], 1)
#         else: 
#             print("Wrong model, please check your model setting.")