In [327]:
#Import libraries
import uproot
import numpy as np
import awkward as ak
import matplotlib.pyplot as plt
import time
import math
import pandas as pd

In [328]:
#Define helper functions. These functions display relevant info about a branch if needed.
def dp(branch, length):
    if length > len(branch):
        length = len(branch)  # Ensures we do not go out of range if length is greater than the branch size
    for i, value in enumerate(branch[:length]):
        print("Value:", value, "   Index:", i)

def ratio(branch):
    ratio = ak.count_nonzero(branch)/ak.count(branch)
    print(ratio) #Display ratio of number of elements that pass

In [329]:
#Display all columns
pd.set_option('display.max_columns', None)

## Begin with Vertex Data

In [330]:
#Load Background Vertex Data
bkg_mdt = uproot.concatenate("bkg_files/*:trees_DV_","MSVtx_nMDT")
bkg_rpc = uproot.concatenate("bkg_files/*:trees_DV_","MSVtx_nRPC")
bkg_tgc = uproot.concatenate("bkg_files/*:trees_DV_","MSVtx_nTGC")
bkg_ntrks = uproot.concatenate("bkg_files/*:trees_DV_","MSVtx_nTrks")
bkg_vtx_eta = uproot.concatenate("bkg_files/*:trees_DV_","MSVtx_eta")
bkg_vtx_phi = uproot.concatenate("bkg_files/*:trees_DV_","MSVtx_phi")
bkg_sumpt = uproot.concatenate("bkg_files/*:trees_DV_","MSVtx_sumTrackPt0p2Cone")

In [331]:
#Create bakg array features with just values (no field name)
mdt = ak.Array(bkg_mdt.MSVtx_nMDT)
rpc = ak.Array(bkg_rpc.MSVtx_nRPC)
tgc = ak.Array(bkg_tgc.MSVtx_nTGC)
ntrks = ak.Array(bkg_ntrks.MSVtx_nTrks)
vtx_eta = ak.Array(bkg_vtx_eta.MSVtx_eta)
vtx_phi = ak.Array(bkg_vtx_phi.MSVtx_phi)
sumpt = ak.Array(bkg_sumpt.MSVtx_sumTrackPt0p2Cone)

In [332]:
#Before flattening to numpy, I need to reference event # to each value of vertex
##### NOTE: LOOK FOR A WORKAROUND THAT DOESN'T INVOLVE FLATTENING TO NUMPY #####

dummy_vtx= ak.zeros_like((mdt)) #Create a dummy array that captures the structure of the vertex branches
lengths = ak.num(dummy) 
eventnp = ak.to_numpy((ak.concatenate([ak.Array([i]*lengths[i]) for i in range(len(mdt))])))
eventnp_series = pd.Series(eventnp)

In [333]:
#Define function to duplicate necessary rows in a series

#Create a duplication index that corresponds to rows >1 vertex per event
dup_index=[]
for i in range(len(eventnp)-1):
    if eventnp[i]==eventnp[i+1]:
        dup_index.append(i)
        
#This function duplicates a row that exists in a pandas dataframe. 
    #Necessary for mapping event-level data to vtx level dimension

def dup_row(df,index):
    for i in index:
        # Index of the row to duplicate
        row_index = i

        # Split the DataFrame into two parts: before and after the row to be duplicated
        before_row = df.iloc[:row_index + 1]
        after_row = df.iloc[row_index + 1:]

        # Duplicate the row
        duplicated_row = df.iloc[row_index:row_index + 1]

        # Concatenate the parts along with the duplicated row in between
        df = pd.concat([before_row, duplicated_row, after_row], ignore_index=True)
    return df

In [334]:
#Calculate vertex multiplicity
mult = np.zeros(len(dummy_vtx))
for i in range(len(dummy_vtx)):
    mult[i] = ak.num(dummy_vtx[i],axis=0)

In [335]:
#Create series for vertex data 
mdt_series = pd.Series(ak.to_numpy(ak.flatten(mdt)))
rpc_series = pd.Series(ak.to_numpy(ak.flatten(rpc)))
tgc_series = pd.Series(ak.to_numpy(ak.flatten(tgc)))
ntrks_series = pd.Series(ak.to_numpy(ak.flatten(ntrks)))
vtx_eta_series = pd.Series(ak.to_numpy(ak.flatten(vtx_eta)))
vtx_phi_series = pd.Series(ak.to_numpy(ak.flatten(vtx_phi)))
sumpt_series = pd.Series(ak.to_numpy(ak.flatten(sumpt)))
mult_series=dup_row(pd.Series(mult), dup_index)

In [336]:
#Create vertex data Frame
df_vtx = pd.concat({"Event #":eventnp_series, "Mult":mult_series, "MDT":mdt_series,"RPC":rpc_series,"TGC":tgc_series,"ntrks":ntrks_series,"Vtx_eta":vtx_eta_series,"Vtx_phi":vtx_phi_series,"Vtx_Sum_pT":sumpt_series},axis=1)

In [337]:
df_vtx   #Most of these values are correspond to no vertex present.

Unnamed: 0,Event #,Mult,MDT,RPC,TGC,ntrks,Vtx_eta,Vtx_phi,Vtx_Sum_pT
0,0,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
1,1,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
2,2,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
3,3,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
4,4,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
...,...,...,...,...,...,...,...,...,...
107390,107286,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
107391,107287,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
107392,107288,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0
107393,107289,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0


In [338]:
#Check pct of values that are bad vertices
pct = ak.sum(mdt==-99999)/ak.count(mdt)
print(pct*100)

88.55905768424974


In [339]:
#Verify length of dataframe is correct
len(ak.flatten(dummy)) == len(df_vtx)

True

In [340]:
#Inspect what this looks like with erroneos values removed
df_vtx_clean = df_vtx[df_vtx['MDT']!=-99999]
df_vtx_clean   #Looks good, will remove after data is complete

Unnamed: 0,Event #,Mult,MDT,RPC,TGC,ntrks,Vtx_eta,Vtx_phi,Vtx_Sum_pT
15,15,1.0,348,269,0,3,-0.516751,-0.077480,14812.751151
35,35,1.0,786,619,64,6,-0.892273,2.179616,8976.236513
61,61,1.0,664,446,104,6,0.929319,1.601917,12842.178683
72,72,1.0,1133,891,97,8,0.905441,0.758930,6912.988572
76,76,1.0,354,272,0,4,-0.562586,-0.095184,4940.262423
...,...,...,...,...,...,...,...,...,...
107348,107244,1.0,359,13,248,4,1.575715,0.658992,11393.661010
107350,107246,1.0,1503,7,841,5,-1.530452,-1.342251,9598.069259
107360,107256,1.0,393,513,0,3,-0.629165,-1.441538,22073.654373
107368,107264,1.0,946,120,838,6,1.434448,1.678872,5689.162224


## Jet Data

In [341]:
#Load Jet Branches
bkg_jet_pt = uproot.concatenate("bkg_files/*:trees_DV_","jet_pT")
bkg_jet_logRatio = uproot.concatenate("bkg_files/*:trees_DV_","jet_logRatio")
bkg_jet_jvt = uproot.concatenate("bkg_files/*:trees_DV_","jet_jvt")
bkg_jet_eta = uproot.concatenate("bkg_files/*:trees_DV_","jet_eta")
bkg_jet_phi = uproot.concatenate("bkg_files/*:trees_DV_","jet_phi")

#Convert to Awkward
jet_pt = ak.Array(jet_pT_raw.jet_pT)
jet_logRatio = ak.Array(jet_logRatio_raw.jet_logRatio)
jet_jvt = ak.Array(jet_jvt_raw.jet_jvt)
jet_eta = ak.Array(jet_eta_raw.jet_eta)
jet_phi = ak.Array(jet_phi_raw.jet_phi)



In [342]:
#Create dummy_jet as index reference
dummy_jet=ak.zeros_like(jet_pt)

In [343]:
#Calculate dR for jet-vertex
jet_vtx_etaPairs = ak.cartesian({"x":jet_eta, "y":vtx_eta}, axis=1) 
jet_vtx_phiPairs = ak.cartesian({"x":jet_phi, "y":vtx_phi}, axis=1) 

delta_jet_eta = abs(jet_vtx_etaPairs["x"] - jet_vtx_etaPairs["y"])
delta_jet_phi = abs(jet_vtx_phiPairs["x"] - jet_vtx_phiPairs["y"])

delta_jet = (delta_jet_eta * delta_jet_eta) + (delta_jet_phi * delta_jet_phi)

In [344]:
#This function will find the nth smallest values  for each event, and create attach a corresponding index to reference in the training set
def find_nth_smallest_values(awk_array, n_values):
    nth_smallest_values = {n: [] for n in n_values}
    indices_of_nth_smallest_values = {n: [] for n in n_values}

    for branch in awk_array:
        branch_length = len(branch)
        sorted_array = np.sort(ak.to_numpy(branch)) if branch_length else []
        
        for n in n_values:
            if branch_length >= n:
                value = sorted_array[n-1]
                original_index = np.where(ak.to_numpy(branch) == value)[0][0]
                nth_smallest_values[n].append(value)
                indices_of_nth_smallest_values[n].append(original_index)
            else:
                nth_smallest_values[n].append(np.nan)
                indices_of_nth_smallest_values[n].append(-1)

    # Convert lists to awkward arrays before returning
    for n in n_values:
        nth_smallest_values[n] = ak.Array(nth_smallest_values[n])
        indices_of_nth_smallest_values[n] = ak.Array(indices_of_nth_smallest_values[n])
    
    return nth_smallest_values, indices_of_nth_smallest_values

#This function will select the element of a given array that corresponds to the elements determined by nth smallest values
def element_select(awk_array, index_array):

    result = [branch[index] if (0 <= index < len(branch)) else np.nan for branch, index in zip(awk_array, index_array)]

    # Otherwise, keep it as a list or convert to a NumPy array if preferred
    result_np = np.array(result)

    return result_np


In [345]:
#Before running, check which indices of jet_pt have 0 length. Use these indices to check in your training data set
#Be prepared for divide by zero warning!
jet_length = ak.num(dummy_jet)

# Finding indices where lengths are zero
jet_length_indices = ak.where(jet_length == 0)[0]

# Print or use the zero length indices
print("Indices with zero length in jet_pt:", jet_length_indices)   #No jets have zero length.

Indices with zero length in jet_pt: []


In [346]:
#This is the main function that finds the smallest delta jet per event and maps it correctly to the appropriate 
#index in desired jet array. n_indices is used to compute the first n jets that correspond to the n smallest deltas

def jet_index_mod(jet_index, dummy_vtx, dummy_jet):
    # Initialize the corrected indices array
    jet_index_cor = np.empty_like(jet_index)
    
    # Iterate over each index in jet_index
    for i in range(len(jet_index)):
        len_a = len(dummy_vtx[i])  # Length of the corresponding entry in sig_eta
        len_b = len(dummy_jet[i])  # Length of the corresponding entry in jet_pt
        k = jet_index[i]           # Current flat index from the Cafrtesian product

        # Calculate the corrected index for jet_pt
        jet_index_cor[i] = k % len_b

    return jet_index_cor

# Array of indices for which you want to find the N-th smallest values
n_indices = [1,2,3]   #Define how many jets you'd like to display in the training set

# Call the master function once instead of multiple times for different values
nth_smallest_deltas, nth_indices = find_nth_smallest_values(delta_jet, n_indices)

#Create the appopriate arrays with selected jet elements

for n in n_indices:
    # Extract the current jet_index
    jet_index = ak.to_numpy(nth_indices[n])
    
    # Correct the jet_index based on the lengths in sig_eta and jet_pt
    corrected_jet_index = jet_index_mod(jet_index, dummy_vtx, dummy_jet)
    
    # Now you can use corrected_jet_index for further operations, e.g., accessing jet_pt
    globals()[f'jet_pt_{n}'] = element_select(jet_pt, corrected_jet_index)
    globals()[f'jet_eta_{n}'] = element_select(jet_eta, corrected_jet_index)
    globals()[f'jet_phi_{n}'] = element_select(jet_phi, corrected_jet_index)
    globals()[f'jet_logRatio_{n}'] = element_select(jet_logRatio, corrected_jet_index)
    globals()[f'jet_jvt_{n}'] = element_select(jet_jvt, corrected_jet_index)
    globals()[f'delta_jet_{n}'] = element_select(delta_jet, corrected_jet_index)


In [347]:
# Create pandas Series for each array
series_list = []
for n in n_indices:  # For each of the N-th smallest values
    jet_pt_series = pd.Series(globals()[f'jet_pt_{n}'], name=f'jet_pt_{n}')
    jet_eta_series = pd.Series(globals()[f'jet_eta_{n}'], name=f'jet_eta_{n}')
    jet_phi_series = pd.Series(globals()[f'jet_phi_{n}'], name=f'jet_phi_{n}')
    jet_logRatio_series = pd.Series(globals()[f'jet_logRatio_{n}'], name=f'jet_logRatio_{n}')
    jet_jvt_series = pd.Series(globals()[f'jet_jvt_{n}'], name=f'jet_jvt_{n}')
    delta_jet_series = pd.Series(globals()[f'delta_jet_{n}'], name=f'delta_jet_{n}')
    
    # Concatenate the Series for the current N-th smallest values into a DataFrame
    nth_dataframe = pd.concat([jet_pt_series, jet_eta_series, jet_phi_series, jet_logRatio_series, jet_jvt_series, delta_jet_series], axis=1)
    
    # Append the DataFrame to the list
    series_list.append(nth_dataframe)

# Concatenate all DataFrames vertically
df_jet = pd.concat(series_list, axis=1)

In [348]:
#Map to vertex index
df_jet = dup_row(df_jet, dup_index)

#Check length of dataframe to make sure it agrees with df_vtx
df_jet

Unnamed: 0,jet_pt_1,jet_eta_1,jet_phi_1,jet_logRatio_1,jet_jvt_1,delta_jet_1,jet_pt_2,jet_eta_2,jet_phi_2,jet_logRatio_2,jet_jvt_2,delta_jet_2,jet_pt_3,jet_eta_3,jet_phi_3,jet_logRatio_3,jet_jvt_3,delta_jet_3
0,28.996857,-1.959088,0.461107,-0.470726,0.988170,1.999930e+10,1237.142750,1.458728,-2.732167,-0.377053,0.898570,1.999935e+10,24.126797,-0.065182,0.062034,0.003970,0.986200,1.999960e+10
1,15.079721,-3.837992,-2.108300,-0.733645,-0.100000,1.999841e+10,885.581063,-2.149996,-3.106153,-0.504855,0.996025,1.999855e+10,21.923271,-3.033454,-2.159912,-1.294541,-0.100000,1.999856e+10
2,13.829452,-1.407122,-3.065858,-1.107421,0.000000,1.999871e+10,9.419872,-1.164427,-2.355565,-1.204881,0.000000,1.999890e+10,13.439811,-2.651500,-0.680673,-0.455490,0.000000,1.999893e+10
3,17.729188,-4.023110,-2.286916,-0.373216,-0.100000,1.999834e+10,26.013756,-4.076387,-1.716907,-0.678501,-0.100000,1.999844e+10,10.887927,-3.552225,-1.620498,-0.342823,-0.100000,1.999857e+10
4,6.571671,-1.371131,-3.095470,-0.588383,0.000000,1.999871e+10,16.260961,0.238942,-2.517350,-2.544390,0.000000,1.999914e+10,9.643190,-1.150869,-0.902023,-0.312297,0.000000,1.999919e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107390,145.851031,-2.115791,-2.852622,-2.459502,0.000000,1.999861e+10,1014.189938,-1.320149,-2.512095,-0.012703,0.908859,1.999883e+10,36.183793,-0.397878,-2.852291,-0.587980,0.933840,1.999895e+10
107391,14.568896,-2.693866,-1.993347,-999.000000,0.000000,1.999866e+10,1145.069625,0.184501,-2.400172,-0.014807,0.994574,1.999916e+10,80.141961,0.611706,-2.786465,0.122620,0.998418,1.999917e+10
107392,19.796543,-4.496781,-1.680207,-1.012308,-0.100000,1.999836e+10,58.562191,-3.350639,-1.320141,-0.649090,-0.100000,1.999867e+10,9.420502,-1.404429,-2.960955,-999.000000,0.000000,1.999873e+10
107393,17.801785,-4.597728,-2.952024,-999.000000,-0.100000,1.999809e+10,51.539613,-0.291970,-1.436939,-1.474495,0.000764,1.999925e+10,814.860750,-1.538431,-0.101332,-0.041433,0.993123,1.999927e+10


In [349]:
#Concatenate the two dataframes and inspect
df_vtx_jet = pd.concat([df_vtx, df_jet], axis=1)

In [350]:
df_vtx_jet

Unnamed: 0,Event #,Mult,MDT,RPC,TGC,ntrks,Vtx_eta,Vtx_phi,Vtx_Sum_pT,jet_pt_1,jet_eta_1,jet_phi_1,jet_logRatio_1,jet_jvt_1,delta_jet_1,jet_pt_2,jet_eta_2,jet_phi_2,jet_logRatio_2,jet_jvt_2,delta_jet_2,jet_pt_3,jet_eta_3,jet_phi_3,jet_logRatio_3,jet_jvt_3,delta_jet_3
0,0,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,28.996857,-1.959088,0.461107,-0.470726,0.988170,1.999930e+10,1237.142750,1.458728,-2.732167,-0.377053,0.898570,1.999935e+10,24.126797,-0.065182,0.062034,0.003970,0.986200,1.999960e+10
1,1,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,15.079721,-3.837992,-2.108300,-0.733645,-0.100000,1.999841e+10,885.581063,-2.149996,-3.106153,-0.504855,0.996025,1.999855e+10,21.923271,-3.033454,-2.159912,-1.294541,-0.100000,1.999856e+10
2,2,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,13.829452,-1.407122,-3.065858,-1.107421,0.000000,1.999871e+10,9.419872,-1.164427,-2.355565,-1.204881,0.000000,1.999890e+10,13.439811,-2.651500,-0.680673,-0.455490,0.000000,1.999893e+10
3,3,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,17.729188,-4.023110,-2.286916,-0.373216,-0.100000,1.999834e+10,26.013756,-4.076387,-1.716907,-0.678501,-0.100000,1.999844e+10,10.887927,-3.552225,-1.620498,-0.342823,-0.100000,1.999857e+10
4,4,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,6.571671,-1.371131,-3.095470,-0.588383,0.000000,1.999871e+10,16.260961,0.238942,-2.517350,-2.544390,0.000000,1.999914e+10,9.643190,-1.150869,-0.902023,-0.312297,0.000000,1.999919e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107390,107286,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,145.851031,-2.115791,-2.852622,-2.459502,0.000000,1.999861e+10,1014.189938,-1.320149,-2.512095,-0.012703,0.908859,1.999883e+10,36.183793,-0.397878,-2.852291,-0.587980,0.933840,1.999895e+10
107391,107287,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,14.568896,-2.693866,-1.993347,-999.000000,0.000000,1.999866e+10,1145.069625,0.184501,-2.400172,-0.014807,0.994574,1.999916e+10,80.141961,0.611706,-2.786465,0.122620,0.998418,1.999917e+10
107392,107288,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,19.796543,-4.496781,-1.680207,-1.012308,-0.100000,1.999836e+10,58.562191,-3.350639,-1.320141,-0.649090,-0.100000,1.999867e+10,9.420502,-1.404429,-2.960955,-999.000000,0.000000,1.999873e+10
107393,107289,1.0,-99999,-99999,-99999,-99999,-99999.0,-99999.0,-99999.0,17.801785,-4.597728,-2.952024,-999.000000,-0.100000,1.999809e+10,51.539613,-0.291970,-1.436939,-1.474495,0.000764,1.999925e+10,814.860750,-1.538431,-0.101332,-0.041433,0.993123,1.999927e+10


In [351]:
#Inspect clean (What this looks like with erroneous vertices removed. Just a check for now.)
df_vtx_jet_clean = df_vtx_jet[df_vtx_jet['MDT']!= -99999] 
df_vtx_jet_clean
#Looks Good!

Unnamed: 0,Event #,Mult,MDT,RPC,TGC,ntrks,Vtx_eta,Vtx_phi,Vtx_Sum_pT,jet_pt_1,jet_eta_1,jet_phi_1,jet_logRatio_1,jet_jvt_1,delta_jet_1,jet_pt_2,jet_eta_2,jet_phi_2,jet_logRatio_2,jet_jvt_2,delta_jet_2,jet_pt_3,jet_eta_3,jet_phi_3,jet_logRatio_3,jet_jvt_3,delta_jet_3
15,15,1.0,348,269,0,3,-0.516751,-0.077480,14812.751151,1131.384375,-0.506697,-0.257358,-0.646162,0.993258,0.032457,25.284336,-1.340412,-0.601828,-0.450745,0.065803,0.953358,4.279414,-1.345918,-1.121843,-0.579970,0.331109,1.778214
35,35,1.0,786,619,64,6,-0.892273,2.179616,8976.236513,1234.027625,-0.881845,2.140195,0.006201,0.990799,0.001663,11.807636,-0.198379,2.840555,0.712338,0.000000,0.918330,8.094094,-0.710317,0.101340,-1.279725,0.000000,4.352337
61,61,1.0,664,446,104,6,0.929319,1.601917,12842.178683,1011.941500,0.955463,1.655387,-0.413563,1.000000,0.003543,43.275977,1.384035,1.318174,-1.982363,1.000000,0.287276,8.536609,1.313195,2.119973,-0.637014,0.000000,0.415742
72,72,1.0,1133,891,97,8,0.905441,0.758930,6912.988572,687.284625,0.927673,0.726632,0.577885,0.960834,0.001537,34.001961,0.250312,0.766721,-0.272485,0.997251,0.429254,40.427914,-0.564507,0.010195,-1.068103,0.969564,2.721351
76,76,1.0,354,272,0,4,-0.562586,-0.095184,4940.262423,629.444062,-0.744940,-0.132211,-1.230658,0.992054,0.034624,12.665992,-0.207462,1.053014,-999.000000,0.000000,1.444470,17.690818,-0.090972,-1.616239,-0.801931,0.400756,2.536029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107348,107244,1.0,359,13,248,4,1.575715,0.658992,11393.661010,870.460625,1.568147,0.750515,-0.325631,0.992422,0.008434,12.769302,2.119348,1.048101,-0.198653,0.000000,0.446942,23.639617,1.381605,1.299445,0.247697,0.000000,0.447858
107350,107246,1.0,1503,7,841,5,-1.530452,-1.342251,9598.069259,856.955438,-1.482701,-1.319592,-0.087235,0.997414,0.002794,53.231535,-1.269135,-1.816236,0.105958,0.974725,0.292949,64.509305,0.290300,0.207084,0.096601,0.990000,5.715572
107360,107256,1.0,393,513,0,3,-0.629165,-1.441538,22073.654373,1375.531000,-0.611510,-1.400218,0.273299,1.000000,0.002019,18.615428,-0.147189,-1.217119,-1.267676,0.067641,0.282665,12.764722,0.203252,-2.093118,-999.000000,0.000000,1.117475
107368,107264,1.0,946,120,838,6,1.434448,1.678872,5689.162224,839.558875,1.415776,1.704664,0.039104,0.999872,0.001014,229.211109,-0.036955,1.769148,-0.145164,0.998659,2.173179,8.796845,0.143881,2.723663,-1.066577,0.000000,2.757153


## Now add Track Data

In [352]:
#Background track data 
bkg_tracketa = uproot.concatenate("bkg_files/*:trees_DV_","track_eta")
bkg_trackphi = uproot.concatenate("bkg_files/*:trees_DV_","track_phi")
bkg_trackpt = uproot.concatenate("bkg_files/*:trees_DV_","track_pT")

#Create awkard arrays
track_eta = ak.Array(bkg_tracketa.track_eta)
track_phi = ak.Array(bkg_trackphi.track_phi)
track_pt = ak.Array(bkg_trackpt.track_pT)

In [353]:
#Create dummy_jet as index reference
dummy_track=ak.zeros_like(track_pt)

In [354]:
#Find dR for track-vtx pair
track_vtx_etaPairs = ak.cartesian({"x":track_eta, "y":vtx_eta}, axis=1) 
track_vtx_phiPairs = ak.cartesian({"x":track_phi, "y":vtx_phi}, axis=1) 

delta_track_eta = abs(track_vtx_etaPairs["x"] - track_vtx_etaPairs["y"])
delta_track_phi = abs(track_vtx_phiPairs["x"] - track_vtx_phiPairs["y"])

delta_track = (delta_track_eta * delta_track_eta) + (delta_track_phi * delta_track_phi)

In [355]:
#Before running, check which indices of track_pt have 0 length. Use these indices to check in your training data set
#Be prepared for divide by zero warning!
lengths = ak.num(dummy_track)

# Finding indices where lengths are zero
zero_length_indices = ak.where(lengths == 0)[0]

# Print or use the zero length indices
print("Indices with zero length in track_pt:", zero_length_indices)

Indices with zero length in track_pt: []


In [356]:
#This is the main function that finds the smallest delta track per event and maps it correctly to the appropriate index in
#desired track array. n_indices is used to compute the first n tracks that correspond to the n smallest deltas

def track_index_mod(track_index, dummy_vtx, dummy_track):
    # Initialize the corrected indices array
    track_index_cor = np.empty_like(track_index)
    
    # Iterate over each index in track_index
    for i in range(len(track_index)):
        len_a = len(dummy_vtx[i])  # Length of the corresponding entry in sig_eta
        len_b = len(dummy_track[i])  # Length of the corresponding entry in track_pt
        k = track_index[i]           # Current flat index from the Cartesian product

        # Calculate the corrected index for track_pt
        track_index_cor[i] = k % len_b

    return track_index_cor

# Array of indices for which you want to find the N-th smallest values
n_indices = [1,2,3]   #Choose # of tracks to display in training set

# Call the master function once instead of multiple times for different values
nth_smallest_deltas, nth_indices = find_nth_smallest_values(delta_track, n_indices)

for n in n_indices:
    # Extract the current track_index
    track_index = ak.to_numpy(nth_indices[n])
    
    # Correct the track_index based on the lengths in sig_eta and track_pt
    corrected_track_index = track_index_mod(track_index, dummy_vtx, dummy_track)
    
    # Now you can use corrected_track_index for further operations, e.g., accessing track_pt
    globals()[f'track_pt_{n}'] = element_select(track_pt, corrected_track_index)
    globals()[f'delta_track_{n}'] = element_select(delta_track, corrected_track_index)


In [357]:
# Create pandas Series for each array
series_list = []
for n in n_indices:  # For each of the N-th smallest values
    track_pt_series = pd.Series(globals()[f'track_pt_{n}'], name=f'track_pt_{n}')
    delta_track_series = pd.Series(globals()[f'delta_track_{n}'], name=f'delta_track_{n}')
    
    # Concatenate the Series for the current N-th smallest values into a DataFrame
    nth_dataframe = pd.concat([track_pt_series, delta_track_series], axis=1)
    
    # Append the DataFrame to the list
    series_list.append(nth_dataframe)

# Concatenate all DataFrames vertically
df_track = pd.concat(series_list, axis=1)

In [358]:
#Map to vertex index and inspect
df_track=dup_row(df_track, dup_index)
df_vtx_track = pd.concat([df_vtx, df_track], axis=1)

In [359]:
#Inspect Clean (Just a check with bad vertices removed)
df_vtx_track_clean = df_vtx_track[df_vtx_track['MDT']!= -99999] 
df_vtx_track_clean   #Inspect dimension and make sure it agrees with that of df_jet_vtx_clean

Unnamed: 0,Event #,Mult,MDT,RPC,TGC,ntrks,Vtx_eta,Vtx_phi,Vtx_Sum_pT,track_pt_1,delta_track_1,track_pt_2,delta_track_2,track_pt_3,delta_track_3
15,15,1.0,348,269,0,3,-0.516751,-0.077480,14812.751151,1.671232,0.002017,0.961285,0.008059,1.375790,0.008294
35,35,1.0,786,619,64,6,-0.892273,2.179616,8976.236513,367.948293,0.001240,3.870365,0.001419,58.440482,0.001452
61,61,1.0,664,446,104,6,0.929319,1.601917,12842.178683,3.176802,0.000670,17.585302,0.000709,10.538464,0.000982
72,72,1.0,1133,891,97,8,0.905441,0.758930,6912.988572,1.845835,0.000683,29.897336,0.001425,80.814554,0.001462
76,76,1.0,354,272,0,4,-0.562586,-0.095184,4940.262423,107.580620,0.002954,6.667366,0.004745,2.544301,0.008668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107348,107244,1.0,359,13,248,4,1.575715,0.658992,11393.661010,8.727006,0.002084,59.587031,0.002771,8.216458,0.003100
107350,107246,1.0,1503,7,841,5,-1.530452,-1.342251,9598.069259,5.516313,0.000299,6.119189,0.000693,17.096308,0.001185
107360,107256,1.0,393,513,0,3,-0.629165,-1.441538,22073.654373,2.455945,0.000427,9.970311,0.001365,21.829408,0.001726
107368,107264,1.0,946,120,838,6,1.434448,1.678872,5689.162224,43.185385,0.000306,11.478952,0.000459,15.971504,0.001159


In [360]:
#Concatenate vertex, jet and track dataframes
df_vtx_jet_track= pd.concat([df_vtx, df_jet, df_track], axis=1)

## Build Training Set

In [361]:
#Create target and concatenate
label_series=pd.Series(np.zeros(len(eventnps)),name='Label')
df = pd.concat([df_vtx_jet_track,label_series],axis=1)

In [362]:
#Now that all data is present, remove bad vertices for final dataframe to be used for training
df_train = df[df['MDT']!= -99999] 
df_train
#Looks Good!

Unnamed: 0,Event #,Mult,MDT,RPC,TGC,ntrks,Vtx_eta,Vtx_phi,Vtx_Sum_pT,jet_pt_1,jet_eta_1,jet_phi_1,jet_logRatio_1,jet_jvt_1,delta_jet_1,jet_pt_2,jet_eta_2,jet_phi_2,jet_logRatio_2,jet_jvt_2,delta_jet_2,jet_pt_3,jet_eta_3,jet_phi_3,jet_logRatio_3,jet_jvt_3,delta_jet_3,track_pt_1,delta_track_1,track_pt_2,delta_track_2,track_pt_3,delta_track_3,Label
15,15,1.0,348,269,0,3,-0.516751,-0.077480,14812.751151,1131.384375,-0.506697,-0.257358,-0.646162,0.993258,0.032457,25.284336,-1.340412,-0.601828,-0.450745,0.065803,0.953358,4.279414,-1.345918,-1.121843,-0.579970,0.331109,1.778214,1.671232,0.002017,0.961285,0.008059,1.375790,0.008294,0.0
35,35,1.0,786,619,64,6,-0.892273,2.179616,8976.236513,1234.027625,-0.881845,2.140195,0.006201,0.990799,0.001663,11.807636,-0.198379,2.840555,0.712338,0.000000,0.918330,8.094094,-0.710317,0.101340,-1.279725,0.000000,4.352337,367.948293,0.001240,3.870365,0.001419,58.440482,0.001452,0.0
61,61,1.0,664,446,104,6,0.929319,1.601917,12842.178683,1011.941500,0.955463,1.655387,-0.413563,1.000000,0.003543,43.275977,1.384035,1.318174,-1.982363,1.000000,0.287276,8.536609,1.313195,2.119973,-0.637014,0.000000,0.415742,3.176802,0.000670,17.585302,0.000709,10.538464,0.000982,0.0
72,72,1.0,1133,891,97,8,0.905441,0.758930,6912.988572,687.284625,0.927673,0.726632,0.577885,0.960834,0.001537,34.001961,0.250312,0.766721,-0.272485,0.997251,0.429254,40.427914,-0.564507,0.010195,-1.068103,0.969564,2.721351,1.845835,0.000683,29.897336,0.001425,80.814554,0.001462,0.0
76,76,1.0,354,272,0,4,-0.562586,-0.095184,4940.262423,629.444062,-0.744940,-0.132211,-1.230658,0.992054,0.034624,12.665992,-0.207462,1.053014,-999.000000,0.000000,1.444470,17.690818,-0.090972,-1.616239,-0.801931,0.400756,2.536029,107.580620,0.002954,6.667366,0.004745,2.544301,0.008668,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107348,107244,1.0,359,13,248,4,1.575715,0.658992,11393.661010,870.460625,1.568147,0.750515,-0.325631,0.992422,0.008434,12.769302,2.119348,1.048101,-0.198653,0.000000,0.446942,23.639617,1.381605,1.299445,0.247697,0.000000,0.447858,8.727006,0.002084,59.587031,0.002771,8.216458,0.003100,0.0
107350,107246,1.0,1503,7,841,5,-1.530452,-1.342251,9598.069259,856.955438,-1.482701,-1.319592,-0.087235,0.997414,0.002794,53.231535,-1.269135,-1.816236,0.105958,0.974725,0.292949,64.509305,0.290300,0.207084,0.096601,0.990000,5.715572,5.516313,0.000299,6.119189,0.000693,17.096308,0.001185,0.0
107360,107256,1.0,393,513,0,3,-0.629165,-1.441538,22073.654373,1375.531000,-0.611510,-1.400218,0.273299,1.000000,0.002019,18.615428,-0.147189,-1.217119,-1.267676,0.067641,0.282665,12.764722,0.203252,-2.093118,-999.000000,0.000000,1.117475,2.455945,0.000427,9.970311,0.001365,21.829408,0.001726,0.0
107368,107264,1.0,946,120,838,6,1.434448,1.678872,5689.162224,839.558875,1.415776,1.704664,0.039104,0.999872,0.001014,229.211109,-0.036955,1.769148,-0.145164,0.998659,2.173179,8.796845,0.143881,2.723663,-1.066577,0.000000,2.757153,43.185385,0.000306,11.478952,0.000459,15.971504,0.001159,0.0


In [363]:
#Check NAN values
nan_counts = df_train.isna().sum()
print(nan_counts)
#Looks Good

Event #           0
Mult              0
MDT               0
RPC               0
TGC               0
ntrks             0
Vtx_eta           0
Vtx_phi           0
Vtx_Sum_pT        0
jet_pt_1          0
jet_eta_1         0
jet_phi_1         0
jet_logRatio_1    0
jet_jvt_1         0
delta_jet_1       0
jet_pt_2          0
jet_eta_2         0
jet_phi_2         0
jet_logRatio_2    0
jet_jvt_2         0
delta_jet_2       0
jet_pt_3          0
jet_eta_3         0
jet_phi_3         0
jet_logRatio_3    0
jet_jvt_3         0
delta_jet_3       0
track_pt_1        0
delta_track_1     0
track_pt_2        0
delta_track_2     0
track_pt_3        0
delta_track_3     0
Label             0
dtype: int64


In [364]:
#Export for training
df_clean.to_csv('bkg_train',index=False)