In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from tqdm import tqdm
import json 

# Importing the Main tabular data

In [2]:
signal = pd.read_csv('/teamspace/studios/this_studio/GNN/data/sig_200k_minpt20_13jets_processed.txt', sep = '\t')
background = pd.read_csv('/teamspace/studios/this_studio/GNN/data/bg_500k_minpt20_13jets_processed.txt', sep = '\t')

In [3]:
signal = signal.drop(['Unnamed: 0'], axis = 1)
background = background.drop(['Unnamed: 0'], axis = 1)

In [4]:
#Adding Labels in the DataFrame
signal['label'] = 1
background['label'] = 0

In [5]:
print(list(signal.columns))

['eventno', 'jetmultiplicity', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet1_btag', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'jet2_btag', 'jet3_Eta', 'jet3_Phi', 'jet3_pT', 'jet3_Px', 'jet3_Py', 'jet3_Pz', 'jet3_E', 'jet3_btag', 'jet4_Eta', 'jet4_Phi', 'jet4_pT', 'jet4_Px', 'jet4_Py', 'jet4_Pz', 'jet4_E', 'jet4_btag', 'jet5_Eta', 'jet5_Phi', 'jet5_pT', 'jet5_Px', 'jet5_Py', 'jet5_Pz', 'jet5_E', 'jet5_btag', 'jet6_Eta', 'jet6_Phi', 'jet6_pT', 'jet6_Px', 'jet6_Py', 'jet6_Pz', 'jet6_E', 'jet6_btag', 'jet7_Eta', 'jet7_Phi', 'jet7_pT', 'jet7_Px', 'jet7_Py', 'jet7_Pz', 'jet7_E', 'jet7_btag', 'jet8_Eta', 'jet8_Phi', 'jet8_pT', 'jet8_Px', 'jet8_Py', 'jet8_Pz', 'jet8_E', 'jet8_btag', 'jet9_Eta', 'jet9_Phi', 'jet9_pT', 'jet9_Px', 'jet9_Py', 'jet9_Pz', 'jet9_E', 'jet9_btag', 'jet10_Eta', 'jet10_Phi', 'jet10_pT', 'jet10_Px', 'jet10_Py', 'jet10_Pz', 'jet10_E', 'jet10_btag', 'jet11_Eta', 'jet11_Phi', 'jet11_pT', 'jet11_Px', '

In [6]:
print(list(background.columns))

['eventno', 'jetmultiplicity', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet1_btag', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'jet2_btag', 'jet3_Eta', 'jet3_Phi', 'jet3_pT', 'jet3_Px', 'jet3_Py', 'jet3_Pz', 'jet3_E', 'jet3_btag', 'jet4_Eta', 'jet4_Phi', 'jet4_pT', 'jet4_Px', 'jet4_Py', 'jet4_Pz', 'jet4_E', 'jet4_btag', 'jet5_Eta', 'jet5_Phi', 'jet5_pT', 'jet5_Px', 'jet5_Py', 'jet5_Pz', 'jet5_E', 'jet5_btag', 'jet6_Eta', 'jet6_Phi', 'jet6_pT', 'jet6_Px', 'jet6_Py', 'jet6_Pz', 'jet6_E', 'jet6_btag', 'jet7_Eta', 'jet7_Phi', 'jet7_pT', 'jet7_Px', 'jet7_Py', 'jet7_Pz', 'jet7_E', 'jet7_btag', 'jet8_Eta', 'jet8_Phi', 'jet8_pT', 'jet8_Px', 'jet8_Py', 'jet8_Pz', 'jet8_E', 'jet8_btag', 'jet9_Eta', 'jet9_Phi', 'jet9_pT', 'jet9_Px', 'jet9_Py', 'jet9_Pz', 'jet9_E', 'jet9_btag', 'jet10_Eta', 'jet10_Phi', 'jet10_pT', 'jet10_Px', 'jet10_Py', 'jet10_Pz', 'jet10_E', 'jet10_btag', 'jet11_Eta', 'jet11_Phi', 'jet11_pT', 'jet11_Px', '

In [7]:
print(f"Signal Size: {len(signal)}")
print(f"Background Size: {len(background)}")

Signal Size: 119966
Background Size: 126444


# Aggregating Node Features

In [16]:
def get_node_features(x:pd.DataFrame,*, cols = ['Eta', 'Phi', 'pT', 'E'], num_isophoton_cols = None, num_jets_cols = None) -> list[np.array]:
    """
    Extracts node features for isophotons and the jets from a DataFrame
    and returns a list of node features for all events.

    Parameters:
    -----------
    x : pd.DataFrame
        Input DataFrame with columns containing features for each object in the event.

    Returns:
    --------
    list_of_node_features : list of lists
        Each element in the list is a 2D list of shape (<num_jets>, num_features) 
        containing node features for isophoton, jet1, and jet2.
    """
    if ((num_isophoton_cols is not None) & (num_jets_cols is not None)):
        list_of_node_features = []
        list_of_node_labels = []
        print('Initializing the Process...')
        
        with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
            for _, event in x.iterrows():
                event_node_features = []
                event_node_labels = []
                for isophoton_idx in range(1,num_isophoton_cols + 1):
                    isophoton_features =  event[[('isophoton' + str(isophoton_idx) + '_' + feature) for feature in cols]].values
                    if np.sum(np.abs(isophoton_features)) > 0 :
                        event_node_features.append(isophoton_features.tolist())
                        event_node_labels.append(0)
        
                for jet_idx in range(1, num_jets_cols + 1):
                    jet_features = event[[('jet' + str(jet_idx) + '_' + feature) for feature in cols]].values
                    if (np.sum(np.abs(jet_features)) > 0) & (event['jet' + str(jet_idx) + '_btag'] == 1) :
                        event_node_features.append(jet_features.tolist())
                        event_node_labels.append(1)
                
                list_of_node_features.append(np.array(event_node_features))
                list_of_node_labels.append(np.array(event_node_labels))
                pbar.update(1)
        print('Process Complemented :)')
        return list_of_node_features, list_of_node_labels
    else:
        print('***num_isophoton_cols and num_jets_cols should not be None***')

In [17]:
signal_node_features, signal_node_labels = get_node_features(signal, num_isophoton_cols = 3, num_jets_cols = 13)
background_node_features, background_node_labels = get_node_features(background, num_isophoton_cols = 3, num_jets_cols = 13)

Initializing the Process...


Progress: 100%|██████████| 119966/119966 [06:13<00:00, 321.60it/s]


Process Complemented :)
Initializing the Process...


Progress: 100%|██████████| 126444/126444 [06:29<00:00, 324.52it/s]

Process Complemented :)





In [30]:
# Shape of node_features_list
print(len(signal_node_features))           # Number of events (graphs)
print(len(signal_node_features[0]))        # Number of nodes (5: isophoton, jet1, ...)
print(len(signal_node_features[0][0]))     # Number of features per node

119966
1
4


In [31]:
# Shape of node_features_list
print(len(background_node_features))           # Number of events (graphs)
print(len(background_node_features[0]))        # Number of nodes (3: isophoton, jet1, ...)
print(len(background_node_features[0][0]))     # Number of features per node

126444
1
4


In [20]:
test = signal_node_features[3][:, :2] #Each Row represents node feature

In [46]:
signal_node_features[3] #Each Row represents node feature

array([[  1.12555,  -1.84877,  96.2593 , 163.948  ],
       [  1.51451,  -1.49654,  34.9717 ,  83.3572 ]])

In [47]:
signal_node_labels[3]

array([1, 1])

In [48]:
signal.iloc[3,:]

eventno              8.00000
jetmultiplicity      5.00000
jet1_Eta             0.43832
jet1_Phi             2.19311
jet1_pT            144.61600
                     ...    
isophoton3_Px        0.00000
isophoton3_Py        0.00000
isophoton3_Pz        0.00000
isophoton3_E         0.00000
label                1.00000
Name: 3, Length: 129, dtype: float64

# Aggregating Edge Index and Edge Features

In [24]:
from helper_function_module import Euclidean_distance as ed
def get_edge_info(x:pd.DataFrame,*, cols = ['Eta', 'Phi'], num_isophoton_cols = None, num_jets_cols = None) -> list[np.array]:
    list_edge_attr =[]
    list_edge_index = []

    if ((num_isophoton_cols is not None) & (num_jets_cols is not None)):
        print('Initializing the Process...')
        
        with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
            for _, event in x.iterrows():
                event_eta_phi = []
                event_edge_attr = []
                event_edge_index = []
                for isophoton_idx in range(1,num_isophoton_cols + 1):
                    isophoton_features =  event[[('isophoton' + str(isophoton_idx) + '_' + feature) for feature in cols]].values
                    if np.sum(np.abs(isophoton_features)) > 0 :
                        event_eta_phi.append(isophoton_features.tolist())
        
                for jet_idx in range(1, num_jets_cols + 1):
                    jet_features = event[[('jet' + str(jet_idx) + '_' + feature) for feature in cols]].values
                    if (np.sum(np.abs(jet_features)) > 0 ) & (event['jet' + str(jet_idx) + '_btag'] == 1) :
                        event_eta_phi.append(jet_features.tolist())

                event_eta_phi = np.array(event_eta_phi)

                for i in range(len(event_eta_phi)):
                    for j in range(len(event_eta_phi)):
                        if j>i:
                            event_edge_index.append([i,j])
                            delta_R = ed(event_eta_phi[i], event_eta_phi[j])
                            event_edge_attr.append(delta_R)
                list_edge_attr.append(np.array(event_edge_attr))
                list_edge_index.append(np.array(event_edge_index).T)
                
                pbar.update(1)
            
        print('Process Complemented :)')
        return list_edge_attr, list_edge_index
    else:
        print('***num_isophoton_cols and num_jets_cols should not be None***')
        

In [25]:
signal_edge_attr, signal_edge_index = get_edge_info(signal, num_isophoton_cols = 3, num_jets_cols = 13)
background_edge_attr, background_edge_index = get_edge_info(background, num_isophoton_cols = 3, num_jets_cols = 13)

Initializing the Process...


Progress: 100%|██████████| 119966/119966 [06:10<00:00, 324.06it/s]


Process Complemented :)
Initializing the Process...


Progress: 100%|██████████| 126444/126444 [06:26<00:00, 327.56it/s]

Process Complemented :)





In [26]:
print(len(signal_edge_attr))
print(len(signal_edge_attr[4]))

119966
1


In [27]:
signal_edge_attr[3]

array([0.52474361])

In [28]:
signal_edge_index[3]

array([[0],
       [1]])

In [29]:
print(len(background_edge_attr))
print(len(background_edge_attr[0]))

126444
0


# Invariant Mass Calculation for each Event

In [57]:
def get_invMass(x:pd.DataFrame,*, num_isophoton_cols = None, num_jets_cols = None) ->list[float]:
    list_inv_mass = []
    cols = ['E', 'Px', 'Py', 'Pz']

    if ((num_isophoton_cols is not None) & (num_jets_cols is not None)):
        print('Initializing the Process...')
        
        with tqdm(total=len(x), desc="Progress", leave=True) as pbar:
            for _, event in x.iterrows():
                event_cols = []
                for isophoton_idx in range(1,num_isophoton_cols + 1):
                    isophoton_features =  event[[('isophoton' + str(isophoton_idx) + '_' + feature) for feature in cols]].values
                    if np.sum(np.abs(isophoton_features)) > 0 :
                        event_cols.append(isophoton_features.tolist())
        
                for jet_idx in range(1, num_jets_cols + 1):
                    jet_features = event[[('jet' + str(jet_idx) + '_' + feature) for feature in cols]].values
                    if (np.sum(np.abs(jet_features)) > 0 ):
                        event_cols.append(jet_features.tolist())
                if event_cols:
                    event_cols = np.array(event_cols)
                    # print(event_cols)
                    sumSquared_EPxPyPz = (np.sum(event_cols, axis = 0))**2
                    inv_mass = np.sqrt(np.abs(sumSquared_EPxPyPz[0] - np.sum(sumSquared_EPxPyPz[1:])))
    
                    list_inv_mass.append(inv_mass)
                
                pbar.update(1)
            
        print('Process Complemented :)')
        return list_inv_mass
    else:
        print('***num_isophoton_cols and num_jets_cols should not be None***')

In [58]:
# signal_invMass = get_invMass(signal.iloc[:5, :], num_isophoton_cols = 3, num_jets_cols = 13)

In [59]:
signal_invMass = get_invMass(signal, num_isophoton_cols = 3, num_jets_cols = 13)
background_invMass = get_invMass(background, num_isophoton_cols = 3, num_jets_cols = 13)

Initializing the Process...


Progress: 100%|██████████| 119966/119966 [06:08<00:00, 325.32it/s]


Process Complemented :)
Initializing the Process...


Progress: 100%|██████████| 126444/126444 [06:29<00:00, 324.53it/s]

Process Complemented :)





In [64]:
print(len(signal_invMass))
print(len(background_invMass))

119966
126444


In [62]:
signal_invMass[0]

34.334697499497636

In [63]:
signal.iloc[0,:]

eventno             2.00000
jetmultiplicity     2.00000
jet1_Eta           -1.74376
jet1_Phi           -1.40230
jet1_pT            22.48450
                     ...   
isophoton3_Px       0.00000
isophoton3_Py       0.00000
isophoton3_Pz       0.00000
isophoton3_E        0.00000
label               1.00000
Name: 0, Length: 129, dtype: float64

# Aggregating Data for Graphs

In [65]:
def get_graph_features(node_features_list:list[np.array], edge_features_list:list[np.array], edge_index_list:list[np.array], node_label:list[np.array], jetmultiplicity:list[int], inv_mass:list[float], label:int) -> dict:
    '''
    Extract node features,  edge features and edge index for each event and put then into a dictionary
    representating each event and return a list containing the dictionaries.

    Parameters:
    -----------
    node_features_list : list
        Each element in the list is a 2D list of shape (3, num_features) 
        containing node features for isophoton, jet1, and jet2.

    edge_features_list : list
        Each element in the list is a 1D list of size 3 
        containing edge features for isophoton, jet1, and jet2.

    edge_index_list : list
        2D list of shape (2, num_edges), connectivity information.

    Returns:
    --------
    event_graph : list of dictionaries
        Each element contains:
            - 'nodes': 2D list of shape (3, num_features), node features for isophoton, jet1, and jet2.
            - 'edges': 2D list of shape (num_edges, edge_features), edge features (Delta R) for the fully connected graph.
            - 'edge_index': 2D list of shape (2, num_edges), connectivity information.
    '''
    event_graphs = []
    if len(node_features_list) == len(edge_features_list) == len(edge_index_list) == len(node_label) == len(jetmultiplicity) == len(inv_mass):
        for i in range(len(node_features_list)):
            graph_dict = {
                'nodes' : node_features_list[i].tolist(),
                'edges' : edge_features_list[i].tolist(),
                'edge_index' : edge_index_list[i].tolist(),
                'node_labels' : node_label[i].tolist(),
                'jetmultiplicity' : jetmultiplicity[i],
                'inv_mass' : inv_mass[i],
                'label' : label
            }
            event_graphs.append(graph_dict)
        return event_graphs

    else:
        print('***Number of Events in the lists do not match***')

In [66]:
signal_graphs_dict = get_graph_features(signal_node_features, signal_edge_attr, signal_edge_index, signal_node_labels, signal.jetmultiplicity.values.tolist(), signal_invMass, 1)
background_graphs_dict = get_graph_features(background_node_features, background_edge_attr, background_edge_index, background_node_labels, background.jetmultiplicity.values.tolist(), background_invMass, 0)

In [67]:
print(len(signal_graphs_dict))
print(len(signal_graphs_dict[0]))

119966
7


In [68]:
signal_graphs_dict[0]

{'nodes': [[-2.22728, 0.220275, 22.3859, 105.017]],
 'edges': [],
 'edge_index': [],
 'node_labels': [0],
 'jetmultiplicity': 2,
 'inv_mass': 34.334697499497636,
 'label': 1}

In [69]:
print(len(background_graphs_dict))
print(len(background_graphs_dict[0]))

126444
7


In [70]:
background_graphs_dict[0]

{'nodes': [[1.65831, 2.39558, 30.7174, 83.5648]],
 'edges': [],
 'edge_index': [],
 'node_labels': [1],
 'jetmultiplicity': 2,
 'inv_mass': 74.9692480591549,
 'label': 0}

# Exporting the Dictionaries as JSON File

In [71]:
import json
with open('signal_btagged.json', 'w') as file:
    json.dump(signal_graphs_dict, file)

In [72]:
with open('background_btagged.json', 'w') as file:
    json.dump(background_graphs_dict, file)

In [76]:
with open('signal_btagged.json', 'r') as file:
    loaded_data = json.load(file)

print(loaded_data[3])

{'nodes': [[1.12555, -1.84877, 96.2593, 163.948], [1.51451, -1.49654, 34.9717, 83.3572]], 'edges': [0.5247436083460189], 'edge_index': [[0], [1]], 'node_labels': [1, 1], 'jetmultiplicity': 5, 'inv_mass': 539.1108908891305, 'label': 1}
