In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from tqdm import tqdm
import json 
import os

print(f"Curren Working Directory: {os.getcwd()}")

Curren Working Directory: /teamspace/studios/this_studio/all/GNN/DifferentAxionMass/preprocessing


# Importing the Main tabular data

In [2]:
signal = pd.read_csv('/teamspace/studios/this_studio/all/GNN/DifferentAxionMass/txtData/EtaBounded_txtData/processed_ppzaxbba_ax75_200k_minpt20_13jets.txt', sep = '\t')

In [3]:
signal = signal.drop(['Unnamed: 0'], axis = 1)

In [4]:
#Adding Labels in the DataFrame
signal['label'] = 1

In [5]:
print(list(signal.columns))

['eventno', 'jetmultiplicity', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet1_btag', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'jet2_btag', 'jet3_Eta', 'jet3_Phi', 'jet3_pT', 'jet3_Px', 'jet3_Py', 'jet3_Pz', 'jet3_E', 'jet3_btag', 'jet4_Eta', 'jet4_Phi', 'jet4_pT', 'jet4_Px', 'jet4_Py', 'jet4_Pz', 'jet4_E', 'jet4_btag', 'jet5_Eta', 'jet5_Phi', 'jet5_pT', 'jet5_Px', 'jet5_Py', 'jet5_Pz', 'jet5_E', 'jet5_btag', 'jet6_Eta', 'jet6_Phi', 'jet6_pT', 'jet6_Px', 'jet6_Py', 'jet6_Pz', 'jet6_E', 'jet6_btag', 'jet7_Eta', 'jet7_Phi', 'jet7_pT', 'jet7_Px', 'jet7_Py', 'jet7_Pz', 'jet7_E', 'jet7_btag', 'jet8_Eta', 'jet8_Phi', 'jet8_pT', 'jet8_Px', 'jet8_Py', 'jet8_Pz', 'jet8_E', 'jet8_btag', 'jet9_Eta', 'jet9_Phi', 'jet9_pT', 'jet9_Px', 'jet9_Py', 'jet9_Pz', 'jet9_E', 'jet9_btag', 'jet10_Eta', 'jet10_Phi', 'jet10_pT', 'jet10_Px', 'jet10_Py', 'jet10_Pz', 'jet10_E', 'jet10_btag', 'jet11_Eta', 'jet11_Phi', 'jet11_pT', 'jet11_Px', '

In [6]:
print(f"Signal Size: {len(signal)}")

Signal Size: 133375


# Aggregating Node Features

In [7]:
def get_node_features(x:pd.DataFrame,*, cols = ['Eta', 'Phi', 'pT', 'E'], num_isophoton_cols = None, num_jets_cols = None) -> list[np.array]:
    """
    Extracts node features for isophotons and the jets from a DataFrame
    and returns a list of node features for all events.

    Parameters:
    -----------
    x : pd.DataFrame
        Input DataFrame with columns containing features for each object in the event.

    Returns:
    --------
    list_of_node_features : list of lists
        Each element in the list is a 2D list of shape (<num_jets>, num_features) 
        containing node features for isophoton, jet1, and jet2.
    """
    if ((num_isophoton_cols is not None) & (num_jets_cols is not None)):
        list_of_node_features = []
        list_of_node_labels = []
        list_of_btag_labels = []
        print('Initializing the Process...')
        
        with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
            for _, event in x.iterrows():
                event_node_features = []
                event_node_labels = []
                jet_btag = []
                for isophoton_idx in range(1,num_isophoton_cols + 1):
                    isophoton_features =  event[[('isophoton' + str(isophoton_idx) + '_' + feature) for feature in cols]].values
                    if np.sum(np.abs(isophoton_features)) > 0 :
                        event_node_features.append(isophoton_features.tolist())
                        event_node_labels.append(0)
                        jet_btag.append(0)
        
                for jet_idx in range(1, num_jets_cols + 1):
                    jet_features = event[[('jet' + str(jet_idx) + '_' + feature) for feature in cols]].values
                    if (np.sum(np.abs(jet_features)) > 0):
                        event_node_features.append(jet_features.tolist())
                        event_node_labels.append(1)
                        jet_btag.append(event['jet' + str(jet_idx) + '_btag'])
                
                list_of_node_features.append(np.array(event_node_features))
                list_of_node_labels.append(np.array(event_node_labels))
                list_of_btag_labels.append(np.array(jet_btag))
                pbar.update(1)
        print('Process Complemented :)')
        return list_of_node_features, list_of_node_labels, list_of_btag_labels
    else:
        print('***num_isophoton_cols and num_jets_cols should not be None***')

In [8]:
signal_node_features, signal_node_labels, signal_btag = get_node_features(signal, num_isophoton_cols = 3, num_jets_cols = 13)

Initializing the Process...


Progress: 100%|██████████| 133375/133375 [05:58<00:00, 372.37it/s]

Process Complemented :)





In [9]:
# Shape of node_features_list
print(len(signal_node_features))           # Number of events (graphs)
print(len(signal_node_features[0]))        # Number of nodes (5: isophoton, jet1, ...)
print(len(signal_node_features[0][0]))     # Number of features per node

133375
1
4


In [10]:
signal_node_features[10] #Each Row represents node feature

array([[ 1.02579 ,  1.95808 , 31.8803  , 50.1767  ],
       [ 0.334965, -1.31648 , 30.1612  , 31.8692  ]])

In [11]:
signal_node_labels[10]

array([1, 1])

In [12]:
signal_btag[10]

array([0., 0.])

In [13]:
signal.iloc[10,:]

eventno            14.00000
jetmultiplicity     2.00000
jet1_Eta            1.02579
jet1_Phi            1.95808
jet1_pT            31.88030
                     ...   
isophoton3_Px       0.00000
isophoton3_Py       0.00000
isophoton3_Pz       0.00000
isophoton3_E        0.00000
label               1.00000
Name: 10, Length: 129, dtype: float64

# Aggregating Edge Index and Edge Features

In [14]:
from helper_function_module import Euclidean_distance as ed
def get_edge_info(x:pd.DataFrame,*, cols = ['Eta', 'Phi'], num_isophoton_cols = None, num_jets_cols = None) -> list[np.array]:
    list_edge_attr =[]
    list_edge_index = []

    if ((num_isophoton_cols is not None) & (num_jets_cols is not None)):
        print('Initializing the Process...')
        
        with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
            for _, event in x.iterrows():
                event_eta_phi = []
                event_edge_attr = []
                event_edge_index = []
                for isophoton_idx in range(1,num_isophoton_cols + 1):
                    isophoton_features =  event[[('isophoton' + str(isophoton_idx) + '_' + feature) for feature in cols]].values
                    if np.sum(np.abs(isophoton_features)) > 0 :
                        event_eta_phi.append(isophoton_features.tolist())
        
                for jet_idx in range(1, num_jets_cols + 1):
                    jet_features = event[[('jet' + str(jet_idx) + '_' + feature) for feature in cols]].values
                    if (np.sum(np.abs(jet_features)) > 0 ):
                        event_eta_phi.append(jet_features.tolist())

                event_eta_phi = np.array(event_eta_phi)

                for i in range(len(event_eta_phi)):
                    for j in range(len(event_eta_phi)):
                        if j>i:
                            event_edge_index.append([i,j])
                            delta_R = ed(event_eta_phi[i], event_eta_phi[j])
                            event_edge_attr.append(delta_R)
                list_edge_attr.append(np.array(event_edge_attr))
                list_edge_index.append(np.array(event_edge_index).T)
                
                pbar.update(1)
            
        print('Process Complemented :)')
        return list_edge_attr, list_edge_index
    else:
        print('***num_isophoton_cols and num_jets_cols should not be None***')
        

In [15]:
signal_edge_attr, signal_edge_index = get_edge_info(signal, num_isophoton_cols = 3, num_jets_cols = 13)

Initializing the Process...


Progress: 100%|██████████| 133375/133375 [05:53<00:00, 376.78it/s]

Process Complemented :)





In [16]:
print(len(signal_edge_attr))
print(len(signal_edge_attr[4]))

133375
0


In [17]:
signal_edge_attr[3]

array([3.120346  , 2.23430459, 1.4377249 , 1.17109574, 2.60761439,
       3.49731196])

In [18]:
signal_edge_index[3]

array([[0, 0, 0, 1, 1, 2],
       [1, 2, 3, 2, 3, 3]])

# Invariant Mass Calculation for each Event

In [19]:
def get_invMass(x:pd.DataFrame,*, num_isophoton_cols = None, num_jets_cols = None) ->list[float]:
    list_inv_mass = []
    cols = ['E', 'Px', 'Py', 'Pz']

    if ((num_isophoton_cols is not None) & (num_jets_cols is not None)):
        print('Initializing the Process...')
        
        with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
            for _, event in x.iterrows():
                event_cols = []
                for isophoton_idx in range(1,num_isophoton_cols + 1):
                    isophoton_features =  event[[('isophoton' + str(isophoton_idx) + '_' + feature) for feature in cols]].values
                    if (np.sum(np.abs(isophoton_features)) >= 0) :
                        event_cols.append(isophoton_features.tolist())
        
                for jet_idx in range(1, num_jets_cols + 1):
                    jet_features = event[[('jet' + str(jet_idx) + '_' + feature) for feature in cols]].values
                    if (np.sum(np.abs(jet_features)) >= 0):
                        event_cols.append(jet_features.tolist())
                if event_cols:
                    event_cols = np.array(event_cols)
                    # print(event_cols)
                    sumSquared_EPxPyPz = (np.sum(event_cols, axis = 0))**2
                    inv_mass = np.sqrt(np.abs(sumSquared_EPxPyPz[0] - np.sum(sumSquared_EPxPyPz[1:])))
    
                    list_inv_mass.append(inv_mass)
                
                pbar.update(1)
            
        print('Process Complemented :)')
        return list_inv_mass
    else:
        print('***num_isophoton_cols and num_jets_cols should not be None***')

In [20]:
# signal_invMass = get_invMass(signal.iloc[:5, :], num_isophoton_cols = 3, num_jets_cols = 13)

In [21]:
signal_invMass = get_invMass(signal, num_isophoton_cols = 3, num_jets_cols = 13)

Initializing the Process...


Progress: 100%|██████████| 133375/133375 [06:00<00:00, 369.78it/s]

Process Complemented :)





In [22]:
print(len(signal_invMass))

133375


In [23]:
signal_invMass[0]

0.11940477377004514

In [24]:
signal.iloc[0,:]

eventno             0.000000
jetmultiplicity     3.000000
jet1_Eta           -2.196800
jet1_Phi            0.731212
jet1_pT            29.732800
                     ...    
isophoton3_Px       0.000000
isophoton3_Py       0.000000
isophoton3_Pz       0.000000
isophoton3_E        0.000000
label               1.000000
Name: 0, Length: 129, dtype: float64

# Aggregating Data for Graphs

In [25]:
def get_graph_features(eventno:list[int], node_features_list:list[np.array], edge_features_list:list[np.array], edge_index_list:list[np.array], node_label:list[np.array], btag_label:list[int], jetmultiplicity:list[int], inv_mass:list[float], label:int) -> dict:
    '''
    Extract node features,  edge features and edge index for each event and put then into a dictionary
    representating each event and return a list containing the dictionaries.

    Parameters:
    -----------
    node_features_list : list
        Each element in the list is a 2D list of shape (3, num_features) 
        containing node features for isophoton, jet1, and jet2.

    edge_features_list : list
        Each element in the list is a 1D list of size 3 
        containing edge features for isophoton, jet1, and jet2.

    edge_index_list : list
        2D list of shape (2, num_edges), connectivity information.

    Returns:
    --------
    event_graph : list of dictionaries
        Each element contains:
            - 'nodes': 2D list of shape (3, num_features), node features for isophoton, jet1, and jet2.
            - 'edges': 2D list of shape (num_edges, edge_features), edge features (Delta R) for the fully connected graph.
            - 'edge_index': 2D list of shape (2, num_edges), connectivity information.
    '''
    event_graphs = []
    if len(node_features_list) == len(edge_features_list) == len(edge_index_list) == len(node_label) == len(jetmultiplicity) == len(inv_mass) == len(btag_label):
        for i in range(len(node_features_list)):
            graph_dict = {
                'eventno' : eventno[i],
                'nodes' : node_features_list[i].tolist(),
                'edges' : edge_features_list[i].tolist(),
                'edge_index' : edge_index_list[i].tolist(),
                'node_labels' : node_label[i].tolist(),
                'jet_btag_label' : btag_label[i].tolist(),
                'jetmultiplicity' : jetmultiplicity[i],
                'inv_mass' : inv_mass[i],
                'event_label' : label
            }
            event_graphs.append(graph_dict)
        return event_graphs

    else:
        print('***Number of Events in the lists do not match***')

In [26]:
signal_graphs_dict = get_graph_features(signal['eventno'].values.tolist(), signal_node_features, signal_edge_attr, signal_edge_index, signal_node_labels, signal_btag,signal.jetmultiplicity.values.tolist(), signal_invMass, 1)

In [27]:
print(len(signal_graphs_dict))
print(len(signal_graphs_dict[0]))

133375
9


In [28]:
signal_graphs_dict[0]

{'eventno': 0,
 'nodes': [[-2.1968, 0.731212, 29.7328, 135.393]],
 'edges': [],
 'edge_index': [],
 'node_labels': [1],
 'jet_btag_label': [1.0],
 'jetmultiplicity': 3,
 'inv_mass': 0.11940477377004514,
 'event_label': 1}

# Exporting the Dictionaries as JSON File

In [29]:
import json
with open('../jsonData/ax75_signal_btag_label.json', 'w') as file:
    json.dump(signal_graphs_dict, file)

In [30]:
with open('../jsonData/ax75_signal_btag_label.json', 'r') as file:
    loaded_data = json.load(file)

print(loaded_data[3])

{'eventno': 4, 'nodes': [[0.268358, -2.75891, 77.7872, 80.605], [1.10843, 0.51914, 75.5951, 126.987], [0.996547, -0.646599, 48.9889, 75.3955], [-0.665076, 2.43077, 29.4138, 36.1624]], 'edges': [3.12034600319281, 2.2343045854229455, 1.4377249003846044, 1.1710957355442808, 2.607614386548747, 3.497311961534172], 'edge_index': [[0, 0, 0, 1, 1, 2], [1, 2, 3, 2, 3, 3]], 'node_labels': [1, 1, 1, 1], 'jet_btag_label': [1.0, 0.0, 0.0, 1.0], 'jetmultiplicity': 5, 'inv_mass': 276.27463947423763, 'event_label': 1}
