In [2]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import os
import math
import traceback

In [3]:
def load_data(filepath: str, sep: str = '\t') -> pd.DataFrame | None:
    """
    Loads data from a text file (CSV format) into a Pandas DataFrame.

    Args:
        filepath (str): The path to the text file.
        sep (str): Delimiter to use.

    Returns:
        pd.DataFrame | None: DataFrame containing the loaded data, or None if an error occurs.
    """
    try:
        print(f"Loading data from {filepath}...")
        df = pd.read_csv(filepath, sep=sep, low_memory=False)
        print(f"--- Data loaded successfully: {df.shape[0]} events, {df.shape[1]} columns :)")
        return df
    except FileNotFoundError:
        print(f"*** Error: File not found at {filepath}")
        return None
    except pd.errors.EmptyDataError:
        print(f"*** Error: File at {filepath} is empty.")
        return None
    except Exception as e:
        print(f"*** An unexpected error occurred during file loading: {e}")
        traceback.print_exc()
        return None

def save_to_json(data: list[dict], filepath: str) -> bool:
    """
    Saves a list of dictionaries to a JSON file.

    An empty list is not an error: a warning is printed and a file containing
    ``[]`` is written, matching the warning text.

    Args:
        data (list): The list of event dictionaries.
        filepath (str): The path where the JSON file will be saved.

    Returns:
        bool: True if the file was written, False if ``data`` is not a list or
        serialization/writing failed (the reason is printed).
    """
    if not isinstance(data, list):
        print("Error: Data to be saved must be a list of dictionaries.")
        return False
    if not data:
        # Warn and carry on; previously this branch returned False without
        # writing anything, contradicting the message.
        print("Warning: Data list is empty. Saving an empty JSON file.")

    print(f"Attempting to save {len(data)} events to JSON file: {filepath}")
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=1)
    except TypeError as e:
        print(f"Error: Data contains types not serializable to JSON: {e}")
        print("This might indicate NumPy types weren't converted or other complex objects exist.")
        return False
    except OSError as e:  # IOError is an alias of OSError
        print(f"Error: Could not write to file {filepath}: {e}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred during JSON saving: {e}")
        traceback.print_exc()
        return False

    print("JSON file saved successfully.")
    return True

def filter_zero_multiplicity(df:pd.DataFrame) -> pd.DataFrame:
    """
    Drop every event whose 'jetmultiplicity' is not strictly positive.

    Args:
        df (pd.DataFrame): Event table; must contain a 'jetmultiplicity' column.

    Returns:
        pd.DataFrame: An independent copy holding only the rows with
                      jetmultiplicity > 0. None is returned when the input is
                      not a DataFrame or the 'jetmultiplicity' column is absent.
    """
    # None is not a DataFrame, so one isinstance test covers both invalid cases.
    if not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame :(")
        return None
    if 'jetmultiplicity' not in df.columns:
        print("*** Error: 'jetmultiplicity' column not found in DataFrame :(")
        return None

    n_before = len(df)
    print(f"Initial number of events: {n_before}")

    has_jets = df['jetmultiplicity'] > 0
    kept = df.loc[has_jets].copy()
    n_after = len(kept)

    print(f"--- Removed {n_before - n_after} events with zero jetmultiplicity :)")
    print(f"Number of events after filtering: {n_after}")

    return kept

def filter_jets_by_eta(df:pd.DataFrame, eta_min:float=-2.5, eta_max:float=2.5, max_jets:int=13) -> [pd.DataFrame, list]:
    """
    Sets jet quantities to NaN if the jet's Eta is outside the specified range.

    It iterates through each possible jet (1 to max_jets) and checks its Eta value.
    If Eta is outside [eta_min, eta_max], all features (Eta, Phi, pT, Px, Py, Pz, E)
    for that specific jet in that event are set to NaN.

    Args:
        df (pd.DataFrame): The input DataFrame with event data.
        eta_min (float): The minimum allowed Eta value. Defaults to -2.5.
        eta_max (float): The maximum allowed Eta value. Defaults to 2.5.
        max_jets (int): The maximum number of jets to check per event. Defaults to 13.

    Returns:
        pd.DataFrame: The DataFrame with jet quantities potentially modified to NaN.
                      Returns None if the input DataFrame is invalid.
    """
    if df is None or not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame for Eta filtering :(")
        return None

    print(f"Applying Eta filter: Keeping jets with Eta between {eta_min} and {eta_max}.")

    df_modified = df.copy()
    jet_eta_cols_in_df = []
    for i in range(1, max_jets + 1):
        eta_col = f'jet{i}_Eta'
        if eta_col in df_modified.columns:
            jet_eta_cols_in_df.append(eta_col)

            # mask = ~df_modified[eta_col].between(eta_min, eta_max, inclusive='both')
            # jet_cols = [f'jet{i}_{feature}' for feature in JET_FEATURES]
            # existing_jet_cols = [col for col in jet_cols if col in df_modified.columns]
            # if not existing_jet_cols:
            #     continue
            # df_modified.loc[mask, existing_jet_cols] = np.nan
    print("--- Eta filtering complete :)")

    return df_modified, jet_eta_cols_in_df


def filter_empty_events(df:pd.DataFrame, jet_eta_cols:list, max_photons:int=3) -> pd.DataFrame:
    """
    Drop events that contain neither a valid jet nor a valid photon.

    An event has no valid jet when every column in ``jet_eta_cols`` is NaN for
    that row, and no valid photon when every existing ``isophoton{i}_E`` column
    (i = 1..max_photons) is <= 0 or NaN. If a column family is missing
    altogether, all events are treated as lacking that particle type.

    Args:
        df (pd.DataFrame): Event table (after the jet Eta step).
        jet_eta_cols (list): Names of the jet Eta columns present in ``df``.
        max_photons (int): Highest photon index to look for.

    Returns:
        pd.DataFrame: Copy of ``df`` without the empty events, or None if
        ``df`` is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame for empty event filtering :(")
        return None

    # --- jets: a row is jet-less when all of its Eta entries are NaN
    if jet_eta_cols:
        jets_missing = df[jet_eta_cols].isnull().all(axis=1)
    else:
        print("*** Warning: No jet Eta columns found in DataFrame. Cannot filter based on jets :(")
        jets_missing = pd.Series([True] * len(df), index=df.index)

    # --- photons: a row is photon-less when no energy entry is positive
    wanted_photon_cols = (f'isophoton{k}_E' for k in range(1, max_photons + 1))
    present_photon_cols = [name for name in wanted_photon_cols if name in df.columns]
    if present_photon_cols:
        photon_energies = df[present_photon_cols].fillna(0)
        photons_missing = (photon_energies <= 0).all(axis=1)
    else:
        print("*** Warning: No photon Energy columns found in DataFrame. Cannot filter based on photons :(")
        photons_missing = pd.Series([True] * len(df), index=df.index)

    # Keep everything that is not (jet-less AND photon-less).
    keep_mask = ~(jets_missing & photons_missing)
    survivors = df[keep_mask].copy()

    n_dropped = len(df) - len(survivors)
    if n_dropped > 0:
        print(f"Removed {n_dropped} events with no valid jets AND no valid photons :)")
    else:
        print("No events found with both empty jets and empty photons.")
    print(f"Number of events after empty event filtering: {len(survivors)}")

    return survivors


def save_data(df:pd.DataFrame, output_filepath:str) -> bool:
    """
    Write a DataFrame to a tab-separated text file (no index column).

    Args:
        df (pd.DataFrame): Table to write.
        output_filepath (str): Destination path of the text file.

    Returns:
        bool: True when the file was written, False when ``df`` is not a
        DataFrame or writing raised an exception (the reason is printed).
    """
    if not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid DataFrame provided for saving :(")
        return False

    try:
        print(f"Saving processed data to {output_filepath}...")
        df.to_csv(output_filepath, index=False, sep = '\t')
        print("--- Data saved successfully :)")
    except Exception as e:
        print(f"*** An unexpected error occurred during file saving: {e}")
        return False
    return True



In [4]:
# --- Calculation Helper Functions ---

def calculate_delta_r_robust(eta1: float | None, phi1: float | None,
                             eta2: float | None, phi2: float | None) -> float:
    """
    Compute the Delta R distance between two nodes using (Eta, Phi).
    Handles potential None inputs and periodicity in Phi.

    Returns:
    - float: The Delta R distance, or np.nan if inputs are invalid.
    """
    if any(v is None for v in [eta1, phi1, eta2, phi2]):
        return np.nan
    try:
        eta1_f, phi1_f, eta2_f, phi2_f = map(float, [eta1, phi1, eta2, phi2])
        deta = eta1_f - eta2_f
        dphi = phi1_f - phi2_f
        # map dphi between [-pi, pi]
        dphi = np.arctan2(np.sin(dphi), np.cos(dphi))

        delta_r_sq = deta**2 + dphi**2
        # Ensure result is not complex due to tiny numerical errors
        if isinstance(delta_r_sq, complex) or delta_r_sq < 0:
             return np.nan

        result = np.sqrt(delta_r_sq)
        return result if not np.isnan(result) else np.nan
    except (TypeError, ValueError):
        return np.nan

def calculate_invariant_mass_robust(four_vectors: list[list[float]]) -> float:
    """
    Invariant mass of a set of particles given as four-vectors [E, Px, Py, Pz].

    The four-vectors are summed component-wise and the mass is
    sqrt(E^2 - |p|^2) of the total.

    Args:
        four_vectors (list[list[float]]): One [E, Px, Py, Pz] list per particle.

    Returns:
        float: The invariant mass. np.nan is returned for an empty list, a
        malformed or NaN-containing input, or a mass squared below -1e-9;
        values in [-1e-9, 0) are treated as exactly zero.
    """
    if not four_vectors:
        return np.nan

    try:
        momenta = np.array(four_vectors, dtype=float)

        is_table_of_four_vectors = momenta.ndim == 2 and momenta.shape[1] == 4
        if not is_table_of_four_vectors or np.isnan(momenta).any():
            return np.nan

        # Component-wise total over all particles (rows)
        total = np.sum(momenta, axis=0)
        if np.isnan(total).any():
            return np.nan
        energy, px, py, pz = total

        m_squared = energy**2 - (px**2 + py**2 + pz**2)

        # Anything clearly negative is unphysical; tiny negatives are round-off
        if np.isnan(m_squared) or m_squared < -1e-9:
            return np.nan
        if m_squared < 0:
            m_squared = 0.0

        mass = np.sqrt(m_squared)
        if np.isnan(mass):
            return np.nan
        return mass

    except (TypeError, ValueError, IndexError):
        return np.nan

# --- Core Event Processing Function ---

def _nan_to_none(value: float) -> float | None:
    """Return ``value`` as a plain float, or None when it is NaN (JSON has no NaN)."""
    return None if np.isnan(value) else float(value)


def _read_particle(event_series: pd.Series, prefix: str,
                   pt_index: int) -> tuple[list[float], list[float], list[float]] | None:
    """
    Read one particle slot (e.g. prefix 'jet3_' or 'isophoton1_') from an event row.

    Returns ``(node_vals, pos_vals, kin_vals)`` -- the values of the
    NODE_FEATURES, POS_FEATURES and KINEMATIC_FEATURES columns -- or None when
    the slot must be skipped: a node-feature column is missing, a value cannot
    be converted to float, pT is NaN or not positive, or any value is NaN.
    """
    node_cols = [prefix + feat for feat in NODE_FEATURES]
    # The slot only counts when all of its node-feature columns exist.
    if not all(col in event_series.index for col in node_cols):
        return None

    try:
        # Missing position/kinematic columns become NaN and are rejected below.
        node_vals = [float(event_series.get(col, np.nan)) for col in node_cols]
        pos_vals = [float(event_series.get(prefix + feat, np.nan)) for feat in POS_FEATURES]
        kin_vals = [float(event_series.get(prefix + feat, np.nan)) for feat in KINEMATIC_FEATURES]
    except (TypeError, ValueError, KeyError):
        return None

    # pT > 0 is the primary "this particle exists" test.
    pt = node_vals[pt_index]
    if np.isnan(pt) or pt <= 0:
        return None
    if np.isnan(node_vals).any() or np.isnan(pos_vals).any() or np.isnan(kin_vals).any():
        return None
    return node_vals, pos_vals, kin_vals


def process_event_to_gnn(event_series: pd.Series, event_label: int,
                         max_jets: int, max_photons: int) -> dict | None:
    """
    Processes a single event (represented as a Pandas Series)
    and converts it into the GNN dictionary format.

    Photons are added as nodes first (node label 0, b-tag entry 0), then jets
    (node label 1, b-tag entry taken from the jet's b-tag column). Every pair
    of nodes is connected in both directions with the pair's Delta R as edge
    value. Graph-level features are the pT of the leading photon, the
    invariant mass of the two highest-pT jets, and the invariant mass of those
    two jets plus the leading photon.

    Relies on the module-level constants NODE_FEATURES, POS_FEATURES,
    KINEMATIC_FEATURES and BTAG_FEATURE being defined before it is called.
    Assumes POS_FEATURES is [Eta, Phi] and KINEMATIC_FEATURES is
    [E, Px, Py, Pz], as required by the Delta R and invariant-mass helpers.

    Args:
        event_series (pd.Series): A row from the DataFrame representing one event.
        event_label (int): The predetermined label (0 or 1) for this event.
        max_jets (int): Maximum number of potential jet columns (e.g., jet1_, ..., jetN_).
        max_photons (int): Maximum number of potential photon columns.

    Returns:
        dict | None: A dictionary in GNN format, or None if 'eventno' is
        missing/not convertible to int or the event yields no valid node.
    """
    event_no = event_series.get('eventno', None)
    if event_no is None:
        return None
    try:
        event_no = int(event_no)  # Ensure event number is integer
    except (ValueError, TypeError, OverflowError):
        # NaN, infinite or non-numeric event numbers: skip the event.
        return None

    # Position of pT inside a node's feature list (derived, not hard-coded).
    pt_index = NODE_FEATURES.index('pT')

    nodes = []                      # NODE_FEATURES values, one list per node
    node_positions = []             # [Eta, Phi] per node, used for the edges
    node_labels = []                # 0 for photon, 1 for jet
    jet_btag_labels_for_nodes = []  # aligned with nodes: b-tag for jets, 0 for photons
    particle_four_vectors = []      # [E, Px, Py, Pz] per node

    # --- Extract Photons ---
    for i in range(1, max_photons + 1):
        particle = _read_particle(event_series, f'isophoton{i}_', pt_index)
        if particle is None:
            continue
        node_vals, pos_vals, kin_vals = particle
        nodes.append(node_vals)
        node_positions.append(pos_vals)
        node_labels.append(0)
        jet_btag_labels_for_nodes.append(0)  # photons carry a b-tag entry of 0
        particle_four_vectors.append(kin_vals)

    # --- Extract Jets ---
    jets_for_mass_calc = []  # (pT, four-vector) of every accepted jet
    for i in range(1, max_jets + 1):
        j_prefix = f'jet{i}_'
        j_btag_col = j_prefix + BTAG_FEATURE
        if j_btag_col not in event_series.index:
            continue
        particle = _read_particle(event_series, j_prefix, pt_index)
        if particle is None:
            continue
        try:
            btag_val = float(event_series.get(j_btag_col, np.nan))
        except (TypeError, ValueError):
            continue
        if np.isnan(btag_val):
            continue  # a jet without a usable b-tag is not turned into a node

        node_vals, pos_vals, kin_vals = particle
        nodes.append(node_vals)
        node_positions.append(pos_vals)
        node_labels.append(1)
        jet_btag_labels_for_nodes.append(btag_val)
        particle_four_vectors.append(kin_vals)
        jets_for_mass_calc.append((node_vals[pt_index], kin_vals))

    # --- Check if any valid nodes were created ---
    num_nodes = len(nodes)
    if num_nodes == 0:
        return None

    # --- Calculate Edges and Edge Index ---
    edge_index_sources = []
    edge_index_targets = []
    edges = []  # Delta R values, aligned with the edge index

    for i in range(num_nodes):
        eta1, phi1 = node_positions[i]
        for j in range(i + 1, num_nodes):  # each unordered pair once
            eta2, phi2 = node_positions[j]
            delta_r = calculate_delta_r_robust(eta1, phi1, eta2, phi2)
            if np.isnan(delta_r):
                continue
            # Undirected graph: store the edge in both directions.
            edge_index_sources.extend([i, j])
            edge_index_targets.extend([j, i])
            edges.extend([float(delta_r), float(delta_r)])

    # --- Calculate Graph-Level Features ---
    inv_mass_2j = np.nan
    inv_mass_2j1p = np.nan
    isophoton_pt = np.nan
    leading_photon_fv = None

    photons = [(nodes[k][pt_index], particle_four_vectors[k])
               for k, label in enumerate(node_labels) if label == 0]
    if photons:
        # max() returns the first of equal-pT photons, like a stable descending sort.
        isophoton_pt, leading_photon_fv = max(photons, key=lambda p: p[0])

    if len(jets_for_mass_calc) >= 2:
        jets_sorted = sorted(jets_for_mass_calc, key=lambda j: j[0], reverse=True)
        leading_jets_fv = [list(fv) for _, fv in jets_sorted[:2]]
        inv_mass_2j = calculate_invariant_mass_robust(leading_jets_fv)

        # 2 jets + 1 photon mass only when a photon exists
        if leading_photon_fv is not None:
            inv_mass_2j1p = calculate_invariant_mass_robust(leading_jets_fv + [leading_photon_fv])

    # --- Assemble Final GNN Dictionary ---
    # All stored values are NaN-free by construction; only the graph-level
    # features can still be NaN and are mapped to None for JSON.
    return {
        'eventno': event_no,
        'event_label': event_label,
        'nodes': nodes,
        'num_nodes': num_nodes,
        'edges': edges,
        'edge_index': [edge_index_sources, edge_index_targets],
        'node_labels': node_labels,
        'jet_btag_label': jet_btag_labels_for_nodes,
        'num_btag_jets': jet_btag_labels_for_nodes.count(1.0),
        'num_isophotons': node_labels.count(0),
        'inv_mass_2j1p': _nan_to_none(inv_mass_2j1p),
        'inv_mass_2j': _nan_to_none(inv_mass_2j),
        'isophoton_pT': _nan_to_none(isophoton_pt)
    }


# --- Main Pipeline Function ---
def main_pipeline(input_filepath: str, output_filepath: str,
                  MAX_JETS: int, MAX_PHOTONS: int,
                  ETA_MIN: float, ETA_MAX: float, sep: str = ',') -> None:
    """
    Runs the full data pipeline: load, filter, convert events to GNN format, save JSON.

    Only events whose GNN dictionary reports exactly 2 b-tagged jets and
    exactly 1 isolated photon are written to the output file. The pipeline
    stops early (after printing the reason) when loading fails or a filtering
    step leaves nothing to process.

    Args:
        input_filepath (str): Path to the input data file (CSV/TXT).
        output_filepath (str): Path to save the output GNN JSON file.
        MAX_JETS (int): Max number of potential jet columns in input.
        MAX_PHOTONS (int): Max number of potential photon columns in input.
        ETA_MIN (float): Lower Eta bound passed to the jet Eta step.
        ETA_MAX (float): Upper Eta bound passed to the jet Eta step.
        sep (str): Separator for the input file.
    """
    # 1. Load Data
    raw_df = load_data(input_filepath, sep=sep)
    if raw_df is None:
        print("*** Data loading failed. Aborting processing :(")
        return

    # 2. (The jet-multiplicity filter is not applied; the raw table is used as is.)

    # 3. Jet Eta step
    df_eta_filtered, existing_jet_eta_cols = filter_jets_by_eta(raw_df,
                                                                eta_min=ETA_MIN,
                                                                eta_max=ETA_MAX,
                                                                max_jets=MAX_JETS)
    print(existing_jet_eta_cols)
    if df_eta_filtered is None:
        print("*** Eta filtering step failed :(")
        return
    if df_eta_filtered.empty:
        print("*** All events were removed during the Eta filtering step :(")
        return

    # 4. Filter out events with no valid jets AND no valid photons
    df = filter_empty_events(df_eta_filtered,
                             jet_eta_cols=existing_jet_eta_cols,
                             max_photons=MAX_PHOTONS)
    if df is None:
        print("*** Empty event filtering step failed :(")
        return

    # 5. Determine Event Label: one label per file, from the first row of an
    #    'event_label' column if present, otherwise from the file name.
    if 'event_label' in df.columns:
        try:
            event_label = int(df['event_label'].iloc[0])
            print(f"Determined event label '{event_label}' from 'event_label' column.")
        except (ValueError, TypeError, IndexError):
            print("Error: Could not determine event label from 'event_label' column. Defaulting to -1.")
            event_label = -1  # Indicate unknown label
    else:
        # Look at the file name only, so directory names cannot decide the label.
        file_name = os.path.basename(input_filepath).lower()
        if "background" in file_name:
            event_label = 0
            print("Determined event label '0' based on filename (background).")
        elif "ax" in file_name:
            event_label = 1
            print("Determined event label '1' based on filename (signal).")
        else:
            event_label = -1  # Unknown
            print("Warning: Could not determine event label from filename or column. Using '-1'.")

    # 6. Process Events
    gnn_data_list = []
    print(f"\nConverting {len(df)} events to GNN format...")
    # itertuples(index=False, name=None) yields plain tuples of the column
    # values, which are re-wrapped as a Series keyed by column name.
    for event_tuple in tqdm(df.itertuples(index=False, name=None), total=len(df), desc="Processing Events"):
        event_series = pd.Series(event_tuple, index=df.columns)
        gnn_dict = process_event_to_gnn(event_series, event_label, MAX_JETS, MAX_PHOTONS)
        if gnn_dict is None:
            continue
        # Selection: exactly two b-tagged jets and exactly one isolated photon.
        if gnn_dict['num_btag_jets'] == 2 and gnn_dict['num_isophotons'] == 1:
            gnn_data_list.append(gnn_dict)

    print(f"\nSuccessfully converted {len(gnn_data_list)} events out of {len(df)}.")

    # 7. Save Results
    if not gnn_data_list:
        print("\nPipeline finished, but no events were successfully converted.")
        return
    if not save_to_json(gnn_data_list, output_filepath):
        print("\nPipeline finished, but saving the JSON file failed.")
        return

    print(f"\nPipeline finished successfully. GNN data saved to {output_filepath}")
    # Best-effort preview of the first saved event; a failure here is reported
    # but never fails the pipeline.
    try:
        with open(output_filepath, 'r') as f:
            sample_data = json.load(f)
        if sample_data:
            print("\n--- Sample GNN Event:\n", json.dumps(sample_data[0], indent=2))
    except (OSError, ValueError) as e:  # json.JSONDecodeError is a ValueError
        print(f"Warning: Could not read back a sample event: {e}")

In [7]:
# --- Main Execution ---
# Feature-name constants. The functions above read these as module-level
# globals, so they are defined outside the __main__ guard and therefore also
# exist when this file is imported instead of run.
JET_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
PHOTON_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
BTAG_FEATURE = 'btag'
NODE_FEATURES = ['Eta', 'Phi', 'pT', 'E']
KINEMATIC_FEATURES = ['E', 'Px', 'Py', 'Pz']
POS_FEATURES = ['Eta', 'Phi']  # For DeltaR

if __name__ == "__main__":
    MAX_JETS = 15  # Change this to 2 if you just want to use jet1 and jet2
    MAX_PHOTONS = 1

    ETA_MIN = -2.5
    ETA_MAX = 2.5
    INPUT_SEPARATOR = '\t'
    BASE_TXT_DATA_DIR = "./raw_txt_data"

    # Every .txt file in the input directory; sorted because os.listdir()
    # returns entries in arbitrary order.
    INPUT_FILE_PATHS = sorted(
        os.path.join(BASE_TXT_DATA_DIR, file_name)
        for file_name in os.listdir(BASE_TXT_DATA_DIR)
        if file_name.endswith(".txt")
    )

    print(f"Found {len(INPUT_FILE_PATHS)} files to preprocess and convert to JSON file: \n{INPUT_FILE_PATHS}")

    # BASE_OUTPUT_DIR = "./GNN_JSON_DATA/onlyFirst2bj_onlyFirst1p"
    BASE_OUTPUT_DIR = "./"
    os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

    # One output per input: <input stem> + suffix, placed in BASE_OUTPUT_DIR.
    # basename/splitext keep this independent of the OS path separator.
    OUTPUT_SUFFIX = '_onlyFirst2bj_onlyFirst1p_GNN_JSON_data.json'
    GNN_OUTPUT_JSON_FILE_PATHS = [
        os.path.join(BASE_OUTPUT_DIR,
                     os.path.splitext(os.path.basename(input_path))[0] + OUTPUT_SUFFIX)
        for input_path in INPUT_FILE_PATHS
    ]

    print(f"\nFiles will be saved to paths: \n{GNN_OUTPUT_JSON_FILE_PATHS}")
    for _ in range(5):
        print("*")
    print("\nStarting to process the files...")
    for input_file, output_file in zip(INPUT_FILE_PATHS, GNN_OUTPUT_JSON_FILE_PATHS):
        main_pipeline(
            input_filepath=input_file,
            output_filepath=output_file,
            MAX_JETS=MAX_JETS,
            MAX_PHOTONS=MAX_PHOTONS,
            ETA_MIN=ETA_MIN,
            ETA_MAX=ETA_MAX,
            sep=INPUT_SEPARATOR
        )
        for _ in range(3):
            print(".")
    for _ in range(5):
        print("*")
    # NOTE(review): main_pipeline returns None and prints its own failures, so
    # this line is printed even when individual files could not be processed.
    print("All files processed SUCCESSFULLY :)")

Found 6 files to preprocess and convert to JSON file: 
['./raw_txt_data/background_ppbba_500k_minpt10_15jets_etafiltered_corrected.txt', './raw_txt_data/background_ppbba_500k_minpt20_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax15_200k_minpt10_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax15_200k_minpt20_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax45_200k_minpt10_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax45_200k_minpt20_15jets_etafiltered_corrected.txt']

Files will be saved to paths: 
['./background_ppbba_500k_minpt10_15jets_etafiltered_corrected_onlyFirst2bj_onlyFirst1p_GNN_JSON_data.json', './background_ppbba_500k_minpt20_15jets_etafiltered_corrected_onlyFirst2bj_onlyFirst1p_GNN_JSON_data.json', './ppzaxbba_ax15_200k_minpt10_15jets_etafiltered_corrected_onlyFirst2bj_onlyFirst1p_GNN_JSON_data.json', './ppzaxbba_ax15_200k_minpt20_15jets_etafiltered_corrected_onlyFirst2bj_onlyFirst1p_GNN_JSON_data.json', './ppzaxbba

Processing Events: 100%|██████████| 470848/470848 [09:51<00:00, 795.58it/s] 



Successfully converted 5645 events out of 470848.
Attempting to save 5645 events to JSON file: ./background_ppbba_500k_minpt10_15jets_etafiltered_corrected_onlyFirst2bj_onlyFirst1p_GNN_JSON_data.json
JSON file saved successfully.

Pipeline finished successfully. GNN data saved to ./background_ppbba_500k_minpt10_15jets_etafiltered_corrected_onlyFirst2bj_onlyFirst1p_GNN_JSON_data.json

--- Sample GNN Event:
 {
  "eventno": 262,
  "event_label": 0,
  "nodes": [
    [
      0.55687,
      -0.181863,
      45.2844,
      52.4892
    ],
    [
      -0.460313,
      0.224215,
      38.8455,
      43.0342
    ],
    [
      -2.304,
      -2.48777,
      13.1734,
      66.618
    ],
    [
      -1.4672,
      0.447601,
      12.5798,
      28.73
    ]
  ],
  "num_nodes": 4,
  "edges": [
    1.0952445377964684,
    1.0952445377964684,
    3.6744774117619774,
    3.6744774117619774,
    2.119689668842116,
    2.119689668842116,
    3.2793359684841685,
    3.2793359684841685,
    1.03136934982817