In [1]:
# This notebook creates a training sample and a separate testing sample
# that include the lepton pt, eta and phi, the MET and the HT of each event
# from the event file all_jets_fullRun2_v1.parquet

In [2]:
import awkward as ak
import numba
import numpy as np
import pandas as pd
import awkward as ak
import h5py
import vector
# Register vector's Numba and Awkward Array integrations; the cells below tag
# the jet, lepton and MET records as "Momentum4D" through ak.with_name.
vector.register_numba()
vector.register_awkward()

#import matplotlib.pyplot as plt
#from matplotlib.colors import LogNorm
#import mplhep as hep
#hep.style.use(hep.style.ROOT)

In [3]:
# Input events: absolute AFS path, so it has to be adapted when running elsewhere.
filename = "/afs/cern.ch/work/m/mmarcheg/ttHbb/ttHbb_SPANet/test_dataset/output_test_v3/ttHTobb_ttToSemiLep.parquet"
# Read the whole parquet file into an Awkward Array.
df = ak.from_parquet(filename)

In [4]:
# Display the JetGood collection to inspect its content.
df.JetGood

In [5]:
# Tag the jet, lepton and MET collections as "Momentum4D" records, in that
# order, so that they are handled as four-vectors further down.
jets, leptons, met = (
    ak.with_name(df[collection], name="Momentum4D")
    for collection in ("JetGood", "LeptonGood", "MET")
)
jets

In [6]:
# Fraction of the events that goes into the training sample; the remainder
# becomes the testing sample.
frac_train = 0.8
# Number of events. len() counts entries along the event axis, whereas
# ak.count() counts non-missing leaf values, which is not guaranteed to be
# the number of events for a record array such as MET.
ntot = len(met)
# First event index that belongs to the testing sample.
index_train_max = int(np.ceil(frac_train*ntot))
ntot

1277812

In [7]:
# Sequential (unshuffled) split: the first index_train_max events are used for
# training, all remaining events for testing.
jets_train = jets[:index_train_max]
leptons_train = leptons[:index_train_max]
met_train = met[:index_train_max]

jets_test = jets[index_train_max:]
leptons_test = leptons[index_train_max:]
met_test = met[index_train_max:]

# The two samples must partition the full event list.
assert len(met_train) + len(met_test) == len(met)

# Number of events per sample. len() is used because ak.count() counts leaf
# values rather than events.
len(met_train), len(met_test)

(1022250, 255562)

In [60]:
def create_groups(file):
    """Create the empty TARGETS/INPUTS group layout in an open HDF5 file.

    The jet group is named "INPUTS/Jet" because that is where create_inputs
    writes the jet datasets. The previous name, "INPUTS/Source", was never
    filled and only left an empty group in the output file.

    Parameters
    ----------
    file : object with a ``create_group(name)`` method (e.g. ``h5py.File``)
        Writable file in which the groups are created.

    Returns
    -------
    The same ``file`` object, so that the call can be chained.
    """
    group_names = (
        "TARGETS/t1",  # hadronic top -> q1 q2 b
        "TARGETS/t2",  # leptonic top -> b
        "TARGETS/h",   # higgs -> b1 b2
        "INPUTS",
        "INPUTS/Jet",
        "INPUTS/Lepton",
        "INPUTS/Met",
        "INPUTS/ht",
    )
    for group_name in group_names:
        file.create_group(group_name)
    return file

def create_targets(file, particle, jets):
    """Write the jet-index targets of one particle to ``file``.

    For every event, the position (index within the event) of the jets whose
    ``prov`` code matches a decay product of ``particle`` is stored; a slot
    without a matched jet is filled with -1. Every dataset has exactly one
    entry per event, so the targets always stay aligned with the inputs.
    If an event has more matched jets than slots, the first ones in jet
    order are kept.

    Parameters
    ----------
    file : h5py.File or h5py.Group
        Writable destination; datasets are created under ``TARGETS/<particle>``.
    particle : str
        ``"h"``  (prov 1 -> b1, b2),
        ``"t1"`` (prov 5 -> q1, q2 and prov 2 -> b) or
        ``"t2"`` (prov 3 -> b).
    jets : awkward array
        Jagged (event, jet) array with a ``prov`` field.

    Raises
    ------
    ValueError
        If ``particle`` is not one of the three supported names.
    """
    # Position of every jet inside its own event.
    jet_index = ak.local_index(jets, axis=1)

    def _matched_indices(prov, n_slots):
        # (n_events, n_slots) int64 array of the matched jet positions, -1 = missing.
        matched = jet_index[jets.prov == prov]
        padded = ak.fill_none(ak.pad_none(matched, n_slots, clip=True), -1)
        return ak.to_numpy(padded).astype("int64")

    def _write(path, values):
        file.create_dataset(path, np.shape(values), dtype='int64', data=values)

    if particle == "h":
        higgs_b = _matched_indices(1, 2)  # H->b1b2
        _write("TARGETS/h/b1", higgs_b[:, 0])
        _write("TARGETS/h/b2", higgs_b[:, 1])

    elif particle == "t1":
        w_quarks = _matched_indices(5, 2)  # W->q1q2 from t1
        had_b = _matched_indices(2, 1)     # t1->Wb
        _write("TARGETS/t1/q1", w_quarks[:, 0])
        _write("TARGETS/t1/q2", w_quarks[:, 1])
        _write("TARGETS/t1/b", had_b[:, 0])

    elif particle == "t2":
        lep_b = _matched_indices(3, 1)  # t2->b
        _write("TARGETS/t2/b", lep_b[:, 0])

    else:
        raise ValueError(
            "unknown particle {!r}: expected 'h', 't1' or 't2'".format(particle)
        )

def create_inputs(file, jets, lep, met, max_jets=16):
    """Write the jet, lepton, MET and HT input datasets to ``file``.

    Jet features are padded with 0 / truncated to ``max_jets`` jets per event.
    ``INPUTS/Jet/MASK`` is True where the padded pt is non-zero. HT is the sum
    of the pt of the jets that are actually stored (the previous version
    summed over at most 15 jets although 16 were written).

    Parameters
    ----------
    file : h5py.File or h5py.Group
        Writable destination.
    jets : awkward array
        Jagged (event, jet) array exposing ``pt``, ``phi``, ``eta``, ``btag``.
    lep, met : awkward arrays
        Arrays exposing ``pt``, ``phi`` and ``eta``. They are written as-is,
        so they must contain the same events as ``jets``, in the same order.
    max_jets : int, optional
        Number of jet slots per event (default 16).
    """
    def _pad_jets(values):
        # Regular (n_events, max_jets) numpy array, missing jets set to 0.
        return ak.to_numpy(ak.fill_none(ak.pad_none(values, max_jets, clip=True), 0))

    # Fill Jet
    jet_pt = _pad_jets(jets.pt)
    jet_mask = jet_pt != 0  # padded slots have pt == 0
    file.create_dataset("INPUTS/Jet/MASK", np.shape(jet_mask), dtype='bool', data=jet_mask)
    file.create_dataset("INPUTS/Jet/pt", np.shape(jet_pt), dtype='float32', data=jet_pt)
    for feature in ("phi", "eta", "btag"):
        values = _pad_jets(getattr(jets, feature))
        file.create_dataset("INPUTS/Jet/" + feature, np.shape(values), dtype='float32', data=values)

    # Fill Lepton and Met
    for group, obj in (("Lepton", lep), ("Met", met)):
        for feature in ("pt", "phi", "eta"):
            # Attribute access is kept so that derived quantities keep working.
            values = ak.to_numpy(getattr(obj, feature))
            file.create_dataset(
                "INPUTS/" + group + "/" + feature,
                np.shape(values), dtype='float32', data=values,
            )

    # Fill ht: scalar sum of the pt of the stored jets (padding adds 0).
    ht_array = np.sum(jet_pt, axis=1)
    file.create_dataset("INPUTS/ht/ht", np.shape(ht_array), dtype='float32', data=ht_array)

In [61]:
# Count the matched jets of every test event, then keep the events that have
# at least six of them.
n_matched_per_event = ak.sum(jets_test.matched == True, axis=1)
mask_fullymatched = n_matched_per_event >= 6
jets_test_fullymatched = jets_test[mask_fullymatched]

In [62]:
# Start again from the >=6-matched preselection of jets_test instead of
# filtering jets_test_fullymatched with itself: the cell can then be re-run
# and mask_match always has one entry per event of jets_test[mask_fullymatched].
jets_test_presel = jets_test[mask_fullymatched]

# Exactly two jets from H -> b1 b2 (prov == 1).
higgs = jets_test_presel[jets_test_presel.prov == 1]
mask_match = ak.num(higgs) == 2

# Hadronic top: exactly two W -> q1 q2 jets (prov == 5) and exactly one b jet
# (prov == 2). Asking only for three jets in total would also accept 3+0 or
# 1+2 splits, which the q1/q2/b targets written by create_targets cannot hold.
w_or_t_jets = jets_test_presel[(jets_test_presel.prov == 5)|(jets_test_presel.prov == 2)]
mask_match = mask_match & (ak.sum(jets_test_presel.prov == 5, axis=1) == 2)
mask_match = mask_match & (ak.sum(jets_test_presel.prov == 2, axis=1) == 1)

# Leptonic top: exactly one b jet (prov == 3).
lep_top = jets_test_presel[jets_test_presel.prov == 3]
mask_match = mask_match & (ak.num(lep_top) == 1)

jets_test_fullymatched = jets_test_presel[mask_match]
jets_test_fullymatched

In [63]:
# Prepare files for inputs and targets
# Mode "w" creates the file and truncates an existing file of the same name.
test_file = h5py.File("test_lep_met_ht_matched_v2.h5", "w")
# Lay out the TARGETS/... and INPUTS/... groups; create_groups hands back the same handle.
test_file = create_groups(test_file)

In [64]:
# Write the jet-index targets of the Higgs ("h"), the hadronic top ("t1") and
# the leptonic top ("t2"), in that order.
for particle in ("h", "t1", "t2"):
    create_targets(test_file, particle, jets_test_fullymatched)

In [65]:
# Create input arrays in the files
# jets_test_fullymatched went through two successive event selections
# (mask_fullymatched on jets_test, then mask_match on the result). Apply the
# same two selections to the leptons and the MET so that row i of every
# dataset describes the same event.
leptons_test_fullymatched = leptons_test[mask_fullymatched][mask_match]
met_test_fullymatched = met_test[mask_fullymatched][mask_match]
assert len(leptons_test_fullymatched) == len(jets_test_fullymatched)
assert len(met_test_fullymatched) == len(jets_test_fullymatched)
create_inputs(test_file, jets_test_fullymatched, leptons_test_fullymatched, met_test_fullymatched)

In [66]:
# Display the HDF5 file handle (its repr shows the file name and the mode).
test_file

<HDF5 file "test_lep_met_ht_matched_v2.h5" (mode r+)>

In [67]:
# Show the hadronic-top b-jet index of the first five events
test_file["TARGETS/t1/b"][:5]

array([6, 5, 0, 1, 5])

In [69]:
# Show the padded per-jet btag scores of the first five events
test_file["INPUTS/Jet/btag"][:5]

array([[0.49682617, 0.43969727, 0.02072144, 0.00827789, 0.0051384 ,
        0.9663086 , 0.8642578 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.5883789 , 0.328125  , 0.9995117 , 0.00226212, 0.03013611,
        0.8149414 , 0.40551758, 0.00651169, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.9765625 , 0.99316406, 0.9995117 , 0.83251953, 0.05593872,
        0.0138855 , 0.01021576, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.10681152, 0.9790039 , 0.0066452 , 0.00436401, 0.9995117 ,
        0.99609375, 0.00643921, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.02229309, 0.9980469 , 0.99560547, 0.00672913, 0.06240845,
        0.95751953, 0.00355911, 

In [70]:
# Close the HDF5 file so that everything written so far is flushed to disk.
test_file.close()