In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from tqdm import tqdm
import json 

# Importing the Main tabular data

In [2]:
# Directory holding the tab-separated event tables of dataset3.
# NOTE(review): this is an absolute, machine-specific path; change DATA_DIR
# (one place) when running the notebook on another machine.
DATA_DIR = '/teamspace/studios/this_studio/Preprocessing Eflow Images/data/dataset3'

# One row per event; columns are listed in the cells below.
signal = pd.read_csv(f'{DATA_DIR}/sig_btagged2jets.txt', sep = '\t')
background = pd.read_csv(f'{DATA_DIR}/bg_btagged2jets.txt', sep = '\t')

In [3]:
# Add the event-level class label as a new column: 1 = signal, 0 = background.
# Note: this mutates the loaded DataFrames in place.
signal['label'] = 1
background['label'] = 0

In [4]:
# List every column of the signal table (including the 'label' column added above).
print(list(signal.columns))

['eventno', 'jetmultiplicity', 'axion_Eta', 'axion_Phi', 'axion_pT', 'axion_Px', 'axion_Py', 'axion_Pz', 'axion_E', 'Z_Eta', 'Z_Phi', 'Z_pT', 'Z_Px', 'Z_Py', 'Z_Pz', 'Z_E', 'highb_Eta', 'highb_Phi', 'highb_pT', 'highb_Px', 'highb_Py', 'highb_Pz', 'highb_E', 'lowb_Eta', 'lowb_Phi', 'lowb_pT', 'lowb_Px', 'lowb_Py', 'lowb_Pz', 'lowb_E', 'genphoton_Eta', 'genphoton_Phi', 'genphoton_pT', 'genphoton_Px', 'genphoton_Py', 'genphoton_Pz', 'genphoton_E', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'jet3_Eta', 'jet3_Phi', 'jet3_pT', 'jet3_Px', 'jet3_Py', 'jet3_Pz', 'jet3_E', 'jet4_Eta', 'jet4_Phi', 'jet4_pT', 'jet4_Px', 'jet4_Py', 'jet4_Pz', 'jet4_E', 'isophoton_Eta', 'isophoton_Phi', 'isophoton_pT', 'isophoton_Px', 'isophoton_Py', 'isophoton_Pz', 'isophoton_E', 'jet1_btag', 'jet2_btag', 'jet3_btag', 'jet4_btag', 'invmass_2j1p', 'delta_eta_jet1_jet2', 'delta_phi_jet1_jet2', 'deltaR_jet1

In [5]:
# List every column of the background table; per the printed output it has no
# axion_* / Z_* columns, which appear only in the signal table.
print(list(background.columns))

['eventno', 'jetmultiplicity', 'highb_Eta', 'highb_Phi', 'highb_pT', 'highb_Px', 'highb_Py', 'highb_Pz', 'highb_E', 'lowb_Eta', 'lowb_Phi', 'lowb_pT', 'lowb_Px', 'lowb_Py', 'lowb_Pz', 'lowb_E', 'genphoton_Eta', 'genphoton_Phi', 'genphoton_pT', 'genphoton_Px', 'genphoton_Py', 'genphoton_Pz', 'genphoton_E', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'jet3_Eta', 'jet3_Phi', 'jet3_pT', 'jet3_Px', 'jet3_Py', 'jet3_Pz', 'jet3_E', 'jet4_Eta', 'jet4_Phi', 'jet4_pT', 'jet4_Px', 'jet4_Py', 'jet4_Pz', 'jet4_E', 'isophoton_Eta', 'isophoton_Phi', 'isophoton_pT', 'isophoton_Px', 'isophoton_Py', 'isophoton_Pz', 'isophoton_E', 'jet1_btag', 'jet2_btag', 'jet3_btag', 'jet4_btag', 'invmass_2j1p', 'delta_eta_jet1_jet2', 'delta_phi_jet1_jet2', 'deltaR_jet1_jet2', 'label']


In [6]:
# Class balance: number of rows (events) in each table.
print(f"Signal Events: {len(signal)}")
print(f"Background Events: {len(background)}")

Signal Events: 886
Background Events: 1529


### ADDITIONAL STEP: Renaming the isophoton columns to isophoton1

In [7]:
# The tables carry a single, un-numbered isolated-photon column set
# ('isophoton_*'). Give it an explicit index ('isophoton1_*') so the feature
# builders below can address it as 'isophoton' + str(index) + '_' + feature.
rename_dict = {
    f'isophoton_{suffix}': f'isophoton1_{suffix}'
    for suffix in ('Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E')
}

# Renaming is done in place on both tables; columns missing from a table are ignored.
signal.rename(columns = rename_dict, inplace = True)
background.rename(columns = rename_dict, inplace = True)

# Aggregating Node Features

In [8]:
def get_node_features(x:pd.DataFrame,*, cols = ['Eta', 'Phi', 'pT', 'E'], num_isophoton_cols:int = None, num_jets_cols:int = None) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray]]:
    """Collect per-event node features, node-type labels and b-tag flags.

    Every row of ``x`` is one event. For each isophoton index
    ``1..num_isophoton_cols`` and each jet index ``1..num_jets_cols`` the
    columns ``'<object><idx>_<feature>'`` (feature taken from ``cols``) are
    read. An object whose selected features are all zero is treated as absent
    and is not turned into a node.

    Parameters
    ----------
    x : pd.DataFrame
        Event table containing the ``isophoton<idx>_*``, ``jet<idx>_*`` and
        ``jet<idx>_btag`` columns that the requested indices refer to.
    cols : list of str, keyword-only
        Feature suffixes read for every object, in output column order.
        The list is only read, never modified.
    num_isophoton_cols : int, keyword-only
        Number of isophoton column sets to scan.
    num_jets_cols : int, keyword-only
        Number of jet column sets to scan.

    Returns
    -------
    tuple of three lists, each with one entry per event (in row order):
        * node features: array with one row per kept node, isophotons first,
          then jets;
        * node labels: array with 0 for an isophoton node and 1 for a jet node;
        * b-tag labels: array with 0 for an isophoton node and the value of
          ``jet<idx>_btag`` for a jet node.
    If either count argument is ``None`` a message is printed and ``None`` is
    returned instead.
    """
    if num_isophoton_cols is None or num_jets_cols is None:
        print('***num_isophoton_cols and num_jets_cols should not be None***')
        return None

    # The column labels depend only on the arguments, so build them once
    # instead of re-concatenating the strings for every object of every event.
    isophoton_columns = [
        [f'isophoton{idx}_{feature}' for feature in cols]
        for idx in range(1, num_isophoton_cols + 1)
    ]
    jet_columns = [
        ([f'jet{idx}_{feature}' for feature in cols], f'jet{idx}_btag')
        for idx in range(1, num_jets_cols + 1)
    ]

    list_of_node_features = []
    list_of_node_labels = []
    list_of_btag_labels = []
    print('Initializing the Process...')

    with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
        for _, event in x.iterrows():
            event_node_features = []
            event_node_labels = []
            jet_btag = []

            for columns in isophoton_columns:
                isophoton_features = event[columns].values
                # All-zero features mean the object slot is empty for this event.
                if np.sum(np.abs(isophoton_features)) > 0:
                    event_node_features.append(isophoton_features.tolist())
                    event_node_labels.append(0)
                    jet_btag.append(0)

            for columns, btag_column in jet_columns:
                jet_features = event[columns].values
                if np.sum(np.abs(jet_features)) > 0:
                    event_node_features.append(jet_features.tolist())
                    event_node_labels.append(1)
                    jet_btag.append(event[btag_column])

            list_of_node_features.append(np.array(event_node_features))
            list_of_node_labels.append(np.array(event_node_labels))
            list_of_btag_labels.append(np.array(jet_btag))
            pbar.update(1)

    print('Process Complemented :)')
    return list_of_node_features, list_of_node_labels, list_of_btag_labels

In [9]:
# Build node features with the same object counts that the get_edge_info and
# get_invMass calls further down use (1 isophoton, 2 jets), so that node lists,
# edge indices and invariant masses all describe the same set of objects.
# NOTE(review): this call used num_jets_cols = 4 before; with 4, an event that
# has a non-zero jet3/jet4 would get nodes that no edge refers to. Confirm that
# 2 jets is the intended graph definition for this b-tagged 2-jet sample.
signal_node_features, signal_node_labels, signal_btag = get_node_features(signal, 
                                                                          num_isophoton_cols = 1, 
                                                                          num_jets_cols = 2)

background_node_features, background_node_labels, background_btag = get_node_features(background, 
                                                                                      num_isophoton_cols = 1, 
                                                                                      num_jets_cols = 2)

Initializing the Process...


Progress: 100%|██████████| 886/886 [00:00<00:00, 965.43it/s]


Process Complemented :)
Initializing the Process...


Progress: 100%|██████████| 1529/1529 [00:01<00:00, 903.12it/s]

Process Complemented :)





In [10]:
# Shape of the signal node-feature list
print(len(signal_node_features))           # Number of events (graphs)
print(len(signal_node_features[0]))        # Number of nodes kept for the first event (all-zero objects are dropped)
print(len(signal_node_features[0][0]))     # Number of features per node

886
3
4


In [11]:
# Shape of the background node-feature list
print(len(background_node_features))           # Number of events (graphs)
print(len(background_node_features[0]))        # Number of nodes kept for the first event (all-zero objects are dropped)
print(len(background_node_features[0][0]))     # Number of features per node

1529
3
4


In [12]:
# First signal event: one row per kept node (isophoton first, then jets);
# columns follow the default `cols` order of get_node_features: Eta, Phi, pT, E.
signal_node_features[0]

array([[ 2.14053e-01,  2.49454e+00,  3.78202e+01,  3.86899e+01],
       [-1.43928e-02, -2.08869e-01,  2.86807e+01,  2.86837e+01],
       [-1.15036e-01, -1.73382e+00,  2.32269e+01,  2.33808e+01]])

In [13]:
# Node-type labels of the first signal event: 0 = isophoton node, 1 = jet node.
signal_node_labels[0]

array([0, 1, 1])

In [14]:
# Raw table row of the first signal event, for comparison with the node features above.
signal.iloc[0,:]

eventno                228.000000
jetmultiplicity          2.000000
axion_Eta               -0.034237
axion_Phi               -0.864124
axion_pT                44.492100
                          ...    
invmass_2j1p            90.217625
delta_eta_jet1_jet2      0.100643
delta_phi_jet1_jet2      1.524951
deltaR_jet1_jet2         1.528268
label                    1.000000
Name: 0, Length: 81, dtype: float64

# Aggregating Edge Index and Edge Features

In [15]:
from helper_function_module import Euclidean_distance as ed
def get_edge_info(x:pd.DataFrame,*, cols = ['Eta', 'Phi'], num_isophoton_cols:int = None, num_jets_cols:int = None) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Build a fully connected, undirected edge list for every event.

    Every row of ``x`` is one event. Objects are collected in the order
    isophoton ``1..num_isophoton_cols`` followed by jet ``1..num_jets_cols``;
    an object whose ``cols`` values are all zero is treated as absent. Each
    unordered pair of kept objects ``(i, j)`` with ``i < j`` yields one edge
    whose attribute is ``ed(object_i, object_j)``.

    ``ed`` is ``Euclidean_distance`` from ``helper_function_module``; its exact
    metric is defined outside this file (presumably a Delta-R-like distance in
    the (Eta, Phi) plane -- confirm against the helper).

    NOTE(review): presence is decided here from ``cols`` (Eta, Phi by default),
    while get_node_features decides it from its own ``cols``. An object lying
    at exactly Eta = Phi = 0 would be dropped here but kept there.

    Parameters
    ----------
    x : pd.DataFrame
        Event table containing the ``isophoton<idx>_*`` and ``jet<idx>_*``
        columns that the requested indices refer to.
    cols : list of str, keyword-only
        Coordinate suffixes passed to ``ed``. The list is only read.
    num_isophoton_cols : int, keyword-only
        Number of isophoton column sets to scan.
    num_jets_cols : int, keyword-only
        Number of jet column sets to scan.

    Returns
    -------
    tuple of two lists, each with one entry per event (in row order):
        * edge attributes: 1-D array with one value per edge;
        * edge index: array of shape (2, n_edges), row 0 holding ``i`` and
          row 1 holding ``j``. For an event with fewer than two kept objects
          both arrays are empty.
    If either count argument is ``None`` a message is printed and ``None`` is
    returned instead.
    """
    if num_isophoton_cols is None or num_jets_cols is None:
        print('***num_isophoton_cols and num_jets_cols should not be None***')
        return None

    # Column labels depend only on the arguments: build them once, isophotons
    # first and jets second, which fixes the node numbering used in edge_index.
    object_columns = [
        [f'isophoton{idx}_{feature}' for feature in cols]
        for idx in range(1, num_isophoton_cols + 1)
    ] + [
        [f'jet{idx}_{feature}' for feature in cols]
        for idx in range(1, num_jets_cols + 1)
    ]

    list_edge_attr = []
    list_edge_index = []
    print('Initializing the Process...')

    with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
        for _, event in x.iterrows():
            event_eta_phi = []
            for columns in object_columns:
                coordinates = event[columns].values
                # All-zero coordinates mean the object slot is empty.
                if np.sum(np.abs(coordinates)) > 0:
                    event_eta_phi.append(coordinates.tolist())
            event_eta_phi = np.array(event_eta_phi)

            event_edge_attr = []
            event_edge_index = []
            n_nodes = len(event_eta_phi)
            # Visit every unordered pair once (i < j), in row-major order.
            for i in range(n_nodes):
                for j in range(i + 1, n_nodes):
                    event_edge_index.append([i, j])
                    event_edge_attr.append(ed(event_eta_phi[i], event_eta_phi[j]))

            list_edge_attr.append(np.array(event_edge_attr))
            list_edge_index.append(np.array(event_edge_index).T)
            pbar.update(1)

    print('Process Complemented :)')
    return list_edge_attr, list_edge_index
        

In [16]:
# Edge attributes and edge indices per event, built from 1 isophoton and 2 jets.
signal_edge_attr, signal_edge_index = get_edge_info(
    signal,
    num_isophoton_cols = 1,
    num_jets_cols = 2,
)
background_edge_attr, background_edge_index = get_edge_info(
    background,
    num_isophoton_cols = 1,
    num_jets_cols = 2,
)

Initializing the Process...


Progress: 100%|██████████| 886/886 [00:00<00:00, 1209.92it/s]


Process Complemented :)
Initializing the Process...


Progress: 100%|██████████| 1529/1529 [00:01<00:00, 1370.06it/s]

Process Complemented :)





In [17]:
# Number of signal events for which edge attributes were built
print(len(signal_edge_attr))
# Number of edges (node pairs) in the signal event at position 4
print(len(signal_edge_attr[4]))

886
3


In [18]:
# Edge attributes of the signal event at position 3: one `ed` value per node pair.
signal_edge_attr[3]

array([2.22072783, 3.70415451, 1.90379881])

In [19]:
# Matching edge index: row 0 holds node i, row 1 holds node j; each pair i < j appears once.
signal_edge_index[3]

array([[0, 0, 1],
       [1, 2, 2]])

In [20]:
# Number of background events for which edge attributes were built
print(len(background_edge_attr))
# Number of edges (node pairs) in the first background event
print(len(background_edge_attr[0]))

1529
3


# Invariant Mass Calculation for each Event

In [21]:
def get_invMass(x:pd.DataFrame,*, num_isophoton_cols:int = None, num_jets_cols:int = None) ->list[float]:
    """Compute one invariant mass per event from the selected objects.

    Every row of ``x`` is one event. The (E, Px, Py, Pz) columns of isophoton
    ``1..num_isophoton_cols`` and jet ``1..num_jets_cols`` are read; an object
    whose four components are all zero is treated as absent. The components of
    the kept objects are summed and the mass is
    ``sqrt(|E_sum**2 - (Px_sum**2 + Py_sum**2 + Pz_sum**2)|)``.

    The returned list always has exactly one entry per row of ``x`` so that it
    stays aligned with the other per-event lists. An event without any kept
    object contributes ``0.0`` (the mass of an empty four-momentum sum);
    such events were previously skipped, which shifted all later masses onto
    the wrong events.

    Parameters
    ----------
    x : pd.DataFrame
        Event table containing the ``isophoton<idx>_{E,Px,Py,Pz}`` and
        ``jet<idx>_{E,Px,Py,Pz}`` columns that the requested indices refer to.
    num_isophoton_cols : int, keyword-only
        Number of isophoton column sets to include.
    num_jets_cols : int, keyword-only
        Number of jet column sets to include.

    Returns
    -------
    list of float
        Invariant mass per event, in row order. If either count argument is
        ``None`` a message is printed and ``None`` is returned instead.
    """
    if num_isophoton_cols is None or num_jets_cols is None:
        print('***num_isophoton_cols and num_jets_cols should not be None***')
        return None

    cols = ['E', 'Px', 'Py', 'Pz']
    # Column labels depend only on the arguments, so build them once.
    object_columns = [
        [f'isophoton{idx}_{feature}' for feature in cols]
        for idx in range(1, num_isophoton_cols + 1)
    ] + [
        [f'jet{idx}_{feature}' for feature in cols]
        for idx in range(1, num_jets_cols + 1)
    ]

    list_inv_mass = []
    print('Initializing the Process...')

    with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
        for _, event in x.iterrows():
            event_cols = []
            for columns in object_columns:
                four_vector = event[columns].values
                # An all-zero four-vector means the object slot is empty.
                if np.sum(np.abs(four_vector)) > 0:
                    event_cols.append(four_vector.tolist())

            if event_cols:
                # Square each component of the summed four-vector: index 0 is
                # E**2, indices 1..3 are Px**2, Py**2, Pz**2.
                sumSquared_EPxPyPz = (np.sum(np.array(event_cols), axis = 0))**2
                # abs() keeps the square root real if E**2 - |p|**2 comes out
                # negative (presumably only through rounding).
                inv_mass = np.sqrt(np.abs(sumSquared_EPxPyPz[0] - np.sum(sumSquared_EPxPyPz[1:])))
            else:
                # Keep the list aligned with the rows of x.
                inv_mass = 0.0

            list_inv_mass.append(inv_mass)
            pbar.update(1)

    print('Process Complemented :)')
    return list_inv_mass

In [22]:
# Invariant mass per event from 1 isophoton and 2 jets.
signal_invMass = get_invMass(
    signal,
    num_isophoton_cols = 1,
    num_jets_cols = 2,
)
background_invMass = get_invMass(
    background,
    num_isophoton_cols = 1,
    num_jets_cols = 2,
)

Initializing the Process...


Progress: 100%|██████████| 886/886 [00:00<00:00, 1499.86it/s]


Process Complemented :)
Initializing the Process...


Progress: 100%|██████████| 1529/1529 [00:01<00:00, 1520.36it/s]

Process Complemented :)





In [23]:
# Computed mass of the first signal event; compare with the table's
# invmass_2j1p value for the same event (shown in the next cell).
signal_invMass[0]

90.21762513276815

In [24]:
# Raw table row of the first signal event; its invmass_2j1p entry is the cross-check value.
signal.iloc[0,:]

eventno                228.000000
jetmultiplicity          2.000000
axion_Eta               -0.034237
axion_Phi               -0.864124
axion_pT                44.492100
                          ...    
invmass_2j1p            90.217625
delta_eta_jet1_jet2      0.100643
delta_phi_jet1_jet2      1.524951
deltaR_jet1_jet2         1.528268
label                    1.000000
Name: 0, Length: 81, dtype: float64

# Aggregating Data for Graphs

In [25]:
def get_graph_features(eventno:list[int], node_features_list:list[np.array], edge_features_list:list[np.array], edge_index_list:list[np.array], node_label:list[np.array], btag_label:list[np.array], jetmultiplicity:list[int], inv_mass:list[float], label:int) -> list[dict]:
    """Assemble one JSON-serialisable graph dictionary per event.

    All list arguments are per-event sequences and must have the same length;
    entry ``i`` of each one describes the same event. ``eventno`` is part of
    that check: it was previously left out, so a mismatching ``eventno`` list
    either attached wrong event numbers silently or raised an IndexError.

    Parameters
    ----------
    eventno : list of int
        Event identifiers.
    node_features_list, edge_features_list, edge_index_list : list of array
        Per-event node features, edge attributes and edge indices; each entry
        is converted with ``.tolist()``.
    node_label, btag_label : list of array
        Per-event node-type and b-tag labels; each entry is converted with
        ``.tolist()``.
    jetmultiplicity : list of int
        Per-event jet multiplicity, stored as given.
    inv_mass : list of float
        Per-event invariant mass, stored as given.
    label : int
        Event-level class label written into every graph.

    Returns
    -------
    list of dict
        One dictionary per event with the keys ``eventno``, ``nodes``,
        ``edges``, ``edge_index``, ``node_labels``, ``jet_btag_label``,
        ``jetmultiplicity``, ``inv_mass`` and ``event_label``. If the input
        lengths differ a message is printed and ``None`` is returned instead.
    """
    per_event_inputs = (
        eventno,
        node_features_list,
        edge_features_list,
        edge_index_list,
        node_label,
        btag_label,
        jetmultiplicity,
        inv_mass,
    )
    # Every per-event input, including eventno, must describe the same events.
    if len({len(values) for values in per_event_inputs}) != 1:
        print('***Number of Events in the lists do not match***')
        return None

    return [
        {
            'eventno' : event_id,
            'nodes' : nodes.tolist(),
            'edges' : edges.tolist(),
            'edge_index' : edge_index.tolist(),
            'node_labels' : node_labels.tolist(),
            'jet_btag_label' : btags.tolist(),
            'jetmultiplicity' : multiplicity,
            'inv_mass' : mass,
            'event_label' : label
        }
        for event_id, nodes, edges, edge_index, node_labels, btags, multiplicity, mass
        in zip(*per_event_inputs)
    ]

In [26]:
# Combine all per-event pieces into one list of graph dictionaries per sample.
signal_graphs_dict = get_graph_features(
    eventno = signal['eventno'].values.tolist(),
    node_features_list = signal_node_features,
    edge_features_list = signal_edge_attr,
    edge_index_list = signal_edge_index,
    node_label = signal_node_labels,
    btag_label = signal_btag,
    jetmultiplicity = signal.jetmultiplicity.values.tolist(),
    inv_mass = signal_invMass,
    label = 1,
)
background_graphs_dict = get_graph_features(
    eventno = background['eventno'].values.tolist(),
    node_features_list = background_node_features,
    edge_features_list = background_edge_attr,
    edge_index_list = background_edge_index,
    node_label = background_node_labels,
    btag_label = background_btag,
    jetmultiplicity = background.jetmultiplicity.values.tolist(),
    inv_mass = background_invMass,
    label = 0,
)

In [33]:
# Sanity check: the number of graphs should equal the number of events loaded per sample.
print(f"Signal Events Count: {len(signal_graphs_dict)}")
print(f"Background Events Count: {len(background_graphs_dict)}")

Signal Events Count: 886
Background Events Count: 1529


In [28]:
# Inspect the first signal graph (event_label is 1 for signal).
signal_graphs_dict[0]

{'eventno': 228,
 'nodes': [[0.214053, 2.49454, 37.8202, 38.6899],
  [-0.0143928, -0.208869, 28.6807, 28.6837],
  [-0.115036, -1.73382, 23.2269, 23.3808]],
 'edges': [2.713043992422283, 2.0810109593528523, 1.5282684993505689],
 'edge_index': [[0, 0, 1], [1, 2, 2]],
 'node_labels': [0, 1, 1],
 'jet_btag_label': [0.0, 1.0, 1.0],
 'jetmultiplicity': 2,
 'inv_mass': 90.21762513276815,
 'event_label': 1}

In [30]:
# Inspect the first background graph (event_label is 0 for background).
background_graphs_dict[0]

{'eventno': 469,
 'nodes': [[1.13126, -0.431804, 19.0611, 32.6154],
  [1.11429, 0.962557, 51.6036, 87.0954],
  [1.74965, -1.97337, 50.5056, 149.659]],
 'edges': [1.394464262439522, 1.6609731859533434, 3.0038890923149943],
 'edge_index': [[0, 0, 1], [1, 2, 2]],
 'node_labels': [0, 1, 1],
 'jet_btag_label': [0.0, 1.0, 1.0],
 'jetmultiplicity': 2,
 'inv_mass': 123.59113817850537,
 'event_label': 0}

# Exporting the Dictionaries as JSON File
*Uncomment the cells below to save the dictionaries as JSON files.*

In [31]:
# import json
# with open('signal_btagged_2j1p.json', 'w') as file:
#     json.dump(signal_graphs_dict, file)

In [32]:
# with open('background_btagged_2j1p.json', 'w') as file:
#     json.dump(background_graphs_dict, file)

In [38]:
# with open('signal_btagged_2j1p.json', 'r') as file:
#     loaded_data = json.load(file)

# print(loaded_data[885])