In [29]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import os
import traceback
from pprint import pprint

# --- Constants ---
# Per-node feature order of the original GNN 'nodes' entries.
# process_event_combined builds each node as [eta, phi, pt, e_kin] directly,
# so this list documents that layout; it is not read by the functions below.
ORIGINAL_GNN_NODE_FEATURES = ['Eta', 'Phi', 'pT', 'E'] # What you used for 'nodes' previously

# Component order of the LorentzNet 4-vectors ('x_coords' rows are [E, Px, Py, Pz]).
# Not read by the functions below; kept as documentation of the layout.
FOUR_VECTOR_COLS_FROM_DF = ['E', 'Px', 'Py', 'Pz']

# Order of the per-particle scalar features in each 'h_scalars' row.
# This one IS read by process_event_combined when it assembles a row.
LORENTZNET_SCALAR_FEATURE_ORDER = ['particle_type', 'btag_status', 'invariant_mass']
# particle_type: 0 for photon, 1 for jet

# Highest jet / photon index probed in the input DataFrame columns
# (jet{i}_*, isophoton{i}_*). NOTE: the __main__ block rebinds both names
# before running the pipeline.
MAX_JETS_INPUT = 15
MAX_PHOTONS_INPUT = 3

# Per-jet column suffixes blanked by the Eta filter.
# NOTE(review): filter_jets_by_eta defines its own local copy of this list and
# does not read this constant - keep the two in sync if either changes.
JET_DF_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E'] # For filter_jets_by_eta

# --- Utility Functions (load_data, save_to_json) ---
def load_data(filepath: str, sep: str = '\t') -> pd.DataFrame | None:
    try:
        print(f"Loading data from {filepath}...")
        df = pd.read_csv(filepath, sep=sep, low_memory=False)
        print(f"--- Data loaded successfully: {df.shape[0]} events, {df.shape[1]} columns :)")
        return df
    except FileNotFoundError: return None
    except pd.errors.EmptyDataError: return None
    except Exception: traceback.print_exc(); return None

def save_to_json(data: list[dict], filepath: str) -> bool:
    """
    Write a list of event dictionaries to ``filepath`` as indented JSON.

    NaN values are written as ``null`` and NumPy scalars/arrays are converted
    to plain Python values before serialisation. The conversion is done up
    front because ``json.dump`` only consults its ``default=`` hook for
    objects it cannot serialise at all: plain floats and ``np.float64`` (a
    ``float`` subclass) never reach the hook, so a NaN would otherwise be
    emitted as the non-standard token ``NaN``.

    Args:
        data (list[dict]): Events to save. Must be a non-empty list.
        filepath (str): Destination path; an existing file is overwritten.

    Returns:
        bool: True when the file was written, False when ``data`` is not a
        non-empty list or when serialising/writing failed (the traceback is
        printed in that case).
    """
    if not isinstance(data, list) or not data:
        return False

    def _to_json_safe(obj):
        """Recursively convert ``obj`` into JSON-compatible builtins."""
        if isinstance(obj, dict):
            return {key: _to_json_safe(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [_to_json_safe(item) for item in obj]
        if isinstance(obj, np.ndarray):
            return _to_json_safe(obj.tolist())
        # bool must be tested before int: bool is an int subclass.
        if isinstance(obj, (bool, np.bool_)):
            return bool(obj)
        if isinstance(obj, (int, np.integer)):
            return int(obj)
        if isinstance(obj, (float, np.floating)):
            value = float(obj)
            return None if np.isnan(value) else value
        # Anything else is passed through; json.dump raises TypeError for
        # unsupported types, which is reported by the handler below.
        return obj

    print(f"Attempting to save {len(data)} events to JSON file: {filepath}")
    try:
        serialisable = _to_json_safe(data)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(serialisable, f, indent=2)
    except Exception:
        traceback.print_exc()
        return False

    print("JSON file saved successfully.")
    return True

# --- Filtering Functions (filter_zero_multiplicity, filter_jets_by_eta, filter_empty_events) ---
def filter_zero_multiplicity(df:pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the events whose 'jetmultiplicity' is strictly positive.

    Args:
        df (pd.DataFrame): Event table, one row per event.

    Returns:
        pd.DataFrame: A filtered copy of ``df``. If ``df`` has no
                      'jetmultiplicity' column the input frame itself is
                      returned unchanged. None is returned when ``df`` is
                      not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame :(")
        return None

    if 'jetmultiplicity' not in df.columns:
        print("*** Error: 'jetmultiplicity' column not found in DataFrame :(")
        print("*** Returning the Original Dataframe...")
        return df

    n_before = len(df)
    print(f"Initial number of events: {n_before}")

    # Rows with at least one jet survive; the copy detaches the result from df.
    has_jets = df['jetmultiplicity'].gt(0)
    kept = df.loc[has_jets].copy()
    n_after = len(kept)

    print(f"--- Removed {n_before - n_after} events with zero jetmultiplicity :)")
    print(f"Number of events after filtering: {n_after}")

    return kept

def filter_jets_by_eta(df:pd.DataFrame, eta_min:float=-2.5, eta_max:float=2.5, max_jets:int=13) -> [pd.DataFrame, list]:
    """
    Sets jet quantities to NaN if the jet's Eta is outside the specified range.

    It iterates through each possible jet (1 to max_jets) and checks its Eta value.
    If Eta is outside [eta_min, eta_max], all features (Eta, Phi, pT, Px, Py, Pz, E)
    for that specific jet in that event are set to NaN.

    Args:
        df (pd.DataFrame): The input DataFrame with event data.
        eta_min (float): The minimum allowed Eta value. Defaults to -2.5.
        eta_max (float): The maximum allowed Eta value. Defaults to 2.5.
        max_jets (int): The maximum number of jets to check per event. Defaults to 13.

    Returns:
        pd.DataFrame: The DataFrame with jet quantities potentially modified to NaN.
                      Returns None if the input DataFrame is invalid.
    """
    if df is None or not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame for Eta filtering :(")
        return None

    print(f"Applying Eta filter: Keeping jets with Eta between {eta_min} and {eta_max}.")
    JET_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
    df_modified = df.copy()
    jet_eta_cols_in_df = []
    for i in range(1, max_jets + 1):
        eta_col = f'jet{i}_Eta'
        if eta_col in df_modified.columns:
            jet_eta_cols_in_df.append(eta_col)
            mask = ~df_modified[eta_col].between(eta_min, eta_max, inclusive='both')
            jet_cols = [f'jet{i}_{feature}' for feature in JET_FEATURES]
            existing_jet_cols = [col for col in jet_cols if col in df_modified.columns]
            if not existing_jet_cols:
                continue
            df_modified.loc[mask, existing_jet_cols] = np.nan
    print("--- Eta filtering complete :)")

    return df_modified, jet_eta_cols_in_df


def filter_empty_events(df:pd.DataFrame, jet_eta_cols:list, max_photons:int=3) -> pd.DataFrame:
    """
    Drop the events that contain neither a valid jet nor a valid photon.

    - An event has no valid jet when every column in ``jet_eta_cols`` is NaN
      for that row (or when ``jet_eta_cols`` is empty).
    - An event has no valid photon when every existing ``isophoton{i}_E``
      column (i = 1..max_photons) is NaN or <= 0 for that row (or when no
      such column exists).

    Args:
        df (pd.DataFrame): DataFrame after jet Eta filtering.
        jet_eta_cols (list): Names of the jet Eta columns present in ``df``.
        max_photons (int): Highest photon index to look for.

    Returns:
        pd.DataFrame: Copy of ``df`` without the empty events, or None if
                      ``df`` is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame for empty event filtering :(")
        return None

    if jet_eta_cols:
        # A row lacks jets when all of its jet Eta entries are missing.
        jets_missing = df[jet_eta_cols].isna().all(axis=1)
    else:
        print("*** Warning: No jet Eta columns found in DataFrame. Cannot filter based on jets :(")
        jets_missing = pd.Series([True] * len(df), index=df.index)

    candidate_cols = (f'isophoton{k}_E' for k in range(1, max_photons + 1))
    photon_cols = [name for name in candidate_cols if name in df.columns]

    if photon_cols:
        # Missing energies are treated as 0, i.e. as "no photon".
        photons_missing = df[photon_cols].fillna(0).le(0).all(axis=1)
    else:
        print("*** Warning: No photon Energy columns found in DataFrame. Cannot filter based on photons :(")
        photons_missing = pd.Series([True] * len(df), index=df.index)

    # An event is kept unless it is empty on both counts.
    keep_mask = ~(jets_missing & photons_missing)
    survivors = df[keep_mask].copy()

    n_removed = len(df) - len(survivors)
    if n_removed > 0:
        summary = f"Removed {n_removed} events with no valid jets AND no valid photons :)"
    else:
        summary = "No events found with both empty jets and empty photons."
    print(summary)
    print(f"Number of events after empty event filtering: {len(survivors)}")

    return survivors


# --- Calculation Helper Functions ---
def calculate_delta_r_robust(eta1: float | None, phi1: float | None,
                             eta2: float | None, phi2: float | None) -> float:
    # ... (Your existing calculate_delta_r_robust function) ...
    if any(v is None for v in [eta1, phi1, eta2, phi2]): return np.nan
    try:
        eta1_f, phi1_f, eta2_f, phi2_f = map(float, [eta1, phi1, eta2, phi2])
        deta = eta1_f - eta2_f; dphi = phi1_f - phi2_f
        dphi = np.arctan2(np.sin(dphi), np.cos(dphi)) # Normalize phi to [-pi, pi]
        delta_r_sq = deta**2 + dphi**2
        if isinstance(delta_r_sq, complex) or delta_r_sq < 0: return np.nan
        result = np.sqrt(delta_r_sq)
        return result if not np.isnan(result) else np.nan
    except (TypeError, ValueError): return np.nan


def calculate_invariant_mass_robust(four_vectors_list_of_dicts: list[dict]) -> float:
    """
    Invariant mass of a system of particles given as dictionaries.

    Each dictionary must provide numeric 'E', 'Px', 'Py' and 'Pz' entries.
    The whole calculation yields NaN (no particle is skipped) as soon as one
    particle has a missing key, a non-numeric value or a NaN component, and
    also when the input is empty.
    """
    if not four_vectors_list_of_dicts:
        return np.nan

    component_keys = ('E', 'Px', 'Py', 'Pz')
    numeric_vectors = []
    for particle in four_vectors_list_of_dicts:
        try:
            components = [float(particle[key]) for key in component_keys]
        except (KeyError, TypeError, ValueError):
            # Missing key or non-numeric value: the system mass is undefined.
            return np.nan
        if any(np.isnan(component) for component in components):
            return np.nan
        numeric_vectors.append(components)

    if not numeric_vectors:
        return np.nan

    # Delegate the actual mass formula to the list-of-lists helper.
    return _calculate_invariant_mass_from_list_of_lists(numeric_vectors)

def _calculate_invariant_mass_from_list_of_lists(four_vectors: list[list[float]]) -> float:
    """
    Helper: invariant mass of the summed [E, Px, Py, Pz] four-vectors.

    Returns NaN for empty or malformed input (not an N x 4 numeric table),
    for any NaN component, and when the squared mass is below -1e-9.
    Squared masses in [-1e-9, 0) are treated as rounding noise and give 0.
    """
    if not four_vectors:
        return np.nan

    try:
        table = np.array(four_vectors, dtype=float)
        is_n_by_4 = table.ndim == 2 and table.shape[1] == 4
        if not is_n_by_4 or np.isnan(table).any():
            return np.nan

        totals = np.sum(table, axis=0)
        if np.isnan(totals).any():
            return np.nan
        energy, p_x, p_y, p_z = totals

        momentum_squared = p_x**2 + p_y**2 + p_z**2
        m_squared = energy**2 - momentum_squared
        if np.isnan(m_squared) or m_squared < -1e-9:
            return np.nan

        # Clip small negative values caused by floating-point cancellation.
        clipped = 0.0 if m_squared < 0 else m_squared
        mass = np.sqrt(clipped)
        if np.isnan(mass):
            return np.nan
        return mass
    except (TypeError, ValueError, IndexError):
        return np.nan


# --- Core Event Processing Function (Modified) ---
def process_event_combined(event_series: pd.Series, event_label: int,
                           max_jets_input: int, max_photons_input: int) -> dict | None:
    """
    Processes a single event for both original GNN features and LorentzNet features.

    Photons (``isophoton{i}_*`` entries) are read first, then jets
    (``jet{i}_*`` entries), so photons come before jets in every per-node
    list. A particle is kept only when its pT is present and > 0 and none of
    Eta, Phi, pT, E, Px, Py, Pz is NaN.

    Args:
        event_series (pd.Series): One event; must carry an 'eventno' entry.
        event_label (int): Label copied unchanged into the output record.
        max_jets_input (int): Highest jet index probed (jet1..jetN).
        max_photons_input (int): Highest photon index probed.

    Returns:
        dict | None: None when 'eventno' is missing or not convertible to
        int, or when the event has no valid particle. Otherwise a dict with:
          - 'eventno', 'event_label'
          - 'nodes': [Eta, Phi, pT, E] per particle
          - 'edge_index': [sources, targets]; every particle pair appears in
            both directions
          - 'edges': DeltaR per 'edge_index' column
          - 'node_labels': 0 for photon, 1 for jet
          - 'jet_btag_label': btag per node, None for photons
          - 'num_nodes', 'num_btag_jets', 'num_isophotons'
          - 'invMass_2leadingbj', 'invMass_2leadingbj1p',
            'leading_isophoton_pT': float or None
          - 'x_coords': [E, Px, Py, Pz] per particle
          - 'h_scalars': per-particle scalars ordered by
            LORENTZNET_SCALAR_FEATURE_ORDER, NaN replaced by 0.0
    """
    event_no = event_series.get('eventno')
    if event_no is None: return None
    try: event_no = int(event_no)
    except (ValueError, TypeError): return None

    # For original GNN structure
    # The four lists below are index-aligned: entry k of each describes node k.
    original_gnn_nodes = [] # List of [Eta, Phi, pT, E]
    node_positions_for_edges = [] # List of [Eta, Phi] for DeltaR
    original_node_labels = [] # 0 for photon, 1 for jet
    original_jet_btag_labels = [] # btag for jets, NaN for photons (aligned with original_gnn_nodes)

    # For LorentzNet structure (same node order as the lists above)
    lorentz_x_coords = []  # List of 4-vectors [E, Px, Py, Pz]
    lorentz_h_scalars = [] # List of scalar feature vectors

    # To collect particle dicts for invariant mass calculations
    valid_particles_for_mass_calc = [] # List of dicts {'E', 'Px', 'Py', 'Pz', 'pT', 'type'} plus 'btag' for jets

    # --- Extract Photons ---
    for i in range(1, max_photons_input + 1):
        p_prefix = f'isophoton{i}_'
        pt_col = f'{p_prefix}pT'
        # Skip photon slots that are absent, empty (NaN) or have non-positive pT.
        if pt_col not in event_series.index or pd.isna(event_series[pt_col]) or event_series[pt_col] <= 0:
            continue

        # Extract all needed features, handling potential NaNs
        try:
            eta = float(event_series.get(f'{p_prefix}Eta', np.nan))
            phi = float(event_series.get(f'{p_prefix}Phi', np.nan))
            pt = float(event_series.get(f'{p_prefix}pT', np.nan)) # Should be >0 from check
            e_kin = float(event_series.get(f'{p_prefix}E', np.nan))
            px = float(event_series.get(f'{p_prefix}Px', np.nan))
            py = float(event_series.get(f'{p_prefix}Py', np.nan))
            pz = float(event_series.get(f'{p_prefix}Pz', np.nan))

            # Check if essential features for node/4-vector are present
            if any(np.isnan([eta, phi, pt, e_kin, px, py, pz])): continue

            # Original GNN Node
            original_gnn_nodes.append([eta, phi, pt, e_kin])
            node_positions_for_edges.append([eta, phi])
            original_node_labels.append(0) # Photon
            original_jet_btag_labels.append(np.nan) # No btag for photon

            # LorentzNet Features
            lorentz_x_coords.append([e_kin, px, py, pz])
            # Mass of the single four-vector; may be NaN (mapped to 0.0 further down).
            photon_mass = _calculate_invariant_mass_from_list_of_lists([[e_kin, px, py, pz]])
            current_h_scalars = {'particle_type': 0.0, 'btag_status': 0.0, 'invariant_mass': photon_mass}
            lorentz_h_scalars.append([current_h_scalars.get(k, np.nan) for k in LORENTZNET_SCALAR_FEATURE_ORDER])

            valid_particles_for_mass_calc.append({'E':e_kin, 'Px':px, 'Py':py, 'Pz':pz, 'pT':pt, 'type':'photon'})

        except (TypeError, ValueError): continue # Non-numeric entry: drop this photon

    # --- Extract Jets ---
    for i in range(1, max_jets_input + 1):
        j_prefix = f'jet{i}_'
        pt_col = f'{j_prefix}pT'
        # Skip jet slots that are absent, empty (NaN) or have non-positive pT.
        if pt_col not in event_series.index or pd.isna(event_series[pt_col]) or event_series[pt_col] <= 0:
            continue
        try:
            eta = float(event_series.get(f'{j_prefix}Eta', np.nan))
            phi = float(event_series.get(f'{j_prefix}Phi', np.nan))
            pt = float(event_series.get(f'{j_prefix}pT', np.nan))
            e_kin = float(event_series.get(f'{j_prefix}E', np.nan))
            px = float(event_series.get(f'{j_prefix}Px', np.nan))
            py = float(event_series.get(f'{j_prefix}Py', np.nan))
            pz = float(event_series.get(f'{j_prefix}Pz', np.nan))
            btag_val_raw = event_series.get(f'{j_prefix}btag', np.nan)
            btag_val = float(btag_val_raw) if not pd.isna(btag_val_raw) else 0.0 # Default non-btag to 0

            if any(np.isnan([eta, phi, pt, e_kin, px, py, pz, btag_val])): continue

            # Original GNN Node
            original_gnn_nodes.append([eta, phi, pt, e_kin])
            node_positions_for_edges.append([eta, phi])
            original_node_labels.append(1) # Jet
            original_jet_btag_labels.append(btag_val)

            # LorentzNet Features
            lorentz_x_coords.append([e_kin, px, py, pz])
            # Mass of the single four-vector; may be NaN (mapped to 0.0 further down).
            jet_mass = _calculate_invariant_mass_from_list_of_lists([[e_kin, px, py, pz]])
            current_h_scalars = {'particle_type': 1.0, 'btag_status': btag_val, 'invariant_mass': jet_mass}
            lorentz_h_scalars.append([current_h_scalars.get(k, np.nan) for k in LORENTZNET_SCALAR_FEATURE_ORDER])

            valid_particles_for_mass_calc.append({'E':e_kin, 'Px':px, 'Py':py, 'Pz':pz, 'pT':pt, 'type':'jet', 'btag':btag_val})
        except (TypeError, ValueError): continue # Non-numeric entry: drop this jet


    num_total_nodes = len(original_gnn_nodes)
    if num_total_nodes == 0: return None # No valid particles at all

    # --- Original GNN Edges ---
    # Connect every pair of nodes; each pair is stored in both directions
    # with the same DeltaR, so 'edges' lines up with the columns of 'edge_index'.
    edge_index_sources, edge_index_targets, edges_delta_r = [], [], []
    if num_total_nodes >= 2:
        for i in range(num_total_nodes):
            for j in range(i + 1, num_total_nodes):
                eta1, phi1 = node_positions_for_edges[i]
                eta2, phi2 = node_positions_for_edges[j]
                delta_r = calculate_delta_r_robust(eta1, phi1, eta2, phi2)
                if not np.isnan(delta_r):
                    edge_index_sources.extend([i, j]); edge_index_targets.extend([j, i])
                    edges_delta_r.extend([delta_r, delta_r])
    original_edge_index = [edge_index_sources, edge_index_targets]

    # --- Original GNN Graph-Level Features ---
    # Each stays NaN (-> None in the output) when it cannot be computed.
    inv_mass_2leadingbj = np.nan
    inv_mass_2leadingbj1p = np.nan
    leading_isophoton_pt = np.nan

    photons_from_valid = [p for p in valid_particles_for_mass_calc if p['type'] == 'photon']
    jets_from_valid = [p for p in valid_particles_for_mass_calc if p['type'] == 'jet']

    if photons_from_valid:
        # Leading photon = highest pT among the valid photons.
        photons_sorted = sorted(photons_from_valid, key=lambda p: p['pT'], reverse=True)
        leading_photon_dict = photons_sorted[0]
        leading_isophoton_pt = leading_photon_dict['pT']
    else:
        leading_photon_dict = None

    if len(jets_from_valid) >= 2:
        # Sort by b-tag (desc) then pT (desc) to get leading b-jets preferentially
        # NOTE: with fewer than two b-tagged jets, the pair is completed with
        # the highest-pT untagged jets; nothing here requires btag == 1.
        jets_sorted = sorted(jets_from_valid, key=lambda j: (j.get('btag', -np.inf), j.get('pT', -np.inf)), reverse=True)
        leading_2_jets_dicts = jets_sorted[:2]
        inv_mass_2leadingbj = calculate_invariant_mass_robust(leading_2_jets_dicts)
        if leading_photon_dict:
            inv_mass_2leadingbj1p = calculate_invariant_mass_robust(leading_2_jets_dicts + [leading_photon_dict])

    # --- NaN to None for JSON and final counts ---
    final_original_jet_btag = [b if not np.isnan(b) else None for b in original_jet_btag_labels]
    # Unlike the fields above, NaN scalars become 0.0 rather than None.
    final_lorentz_h_scalars = [[0.0 if np.isnan(s) else s for s in h_vec] for h_vec in lorentz_h_scalars]

    num_final_nodes = len(original_gnn_nodes)
    num_final_btag_jets = sum(1 for b in final_original_jet_btag if b == 1.0) # Count btags=1.0
    num_final_isophotons = original_node_labels.count(0) # Count photons

    combined_dict = {
        'eventno': event_no,
        'event_label': event_label,
        'nodes': original_gnn_nodes,
        'edges': [e if not np.isnan(e) else None for e in edges_delta_r],
        'edge_index': original_edge_index,
        'node_labels': original_node_labels,
        'jet_btag_label': final_original_jet_btag, # btag per node, aligned with 'nodes' (None for photons)
        'num_nodes': num_final_nodes,
        'num_btag_jets': num_final_btag_jets,
        'num_isophotons': num_final_isophotons,
        'invMass_2leadingbj1p': float(inv_mass_2leadingbj1p) if not np.isnan(inv_mass_2leadingbj1p) else None,
        'invMass_2leadingbj': float(inv_mass_2leadingbj) if not np.isnan(inv_mass_2leadingbj) else None,
        'leading_isophoton_pT': float(leading_isophoton_pt) if not np.isnan(leading_isophoton_pt) else None,
        # LorentzNet specific features
        'x_coords': lorentz_x_coords,
        'h_scalars': final_lorentz_h_scalars
    }
    return combined_dict


# --- Main Pipeline Function (Modified to use the combined processor) ---
def _infer_event_label(df: pd.DataFrame, input_filepath: str) -> int:
    """Return the class label for one input file, or -1 when it cannot be determined."""
    event_label = -1
    if 'event_label' in df.columns:
        # An explicit column wins over the filename, even if reading it fails.
        try:
            unique_labels = df['event_label'].unique()
            if len(unique_labels) == 1:
                event_label = int(unique_labels[0])
                print(f"Label '{event_label}' from 'event_label' column.")
            else:
                event_label = int(df['event_label'].iloc[0])
                print(f"Warning: Multiple labels. Using label from first row: {event_label}.")
        except (ValueError, TypeError, IndexError):
            # Non-numeric/NaN label or an empty frame: fall back to "undetermined".
            event_label = -1
    else:
        # NOTE(review): the keywords are matched against the whole path, so a
        # directory name containing e.g. "ax" or "sig" also triggers a label.
        lowered_path = input_filepath.lower()
        if "background" in lowered_path or "ppbba" in lowered_path:
            event_label = 0
            print("Label '0' from filename.")
        elif "ax" in lowered_path or "sig" in lowered_path:
            event_label = 1
            print("Label '1' from filename.")
    if event_label == -1:
        print("Warning: Event label undetermined. Using -1.")
    return event_label


def main_combined_pipeline(input_filepath: str, output_filepath: str,
                           max_jets_input_df: int, max_photons_input_df: int,
                           eta_min_filter: float, eta_max_filter: float,
                           sep: str = '\t') -> None:
    """
    Runs the full data pipeline: load, filter, process for combined GNN/LorentzNet, save.

    Steps: load the table, blank jets outside the Eta window, drop events
    with neither jets nor photons, infer the event label, convert each event
    with ``process_event_combined``, keep events with >= 3 nodes, >= 2
    b-tagged jets and >= 1 photon, and write them to ``output_filepath``.

    Every failing stage prints a message and returns early, so a bad input
    file never leaves the function with undefined intermediate results.

    Args:
        input_filepath (str): Delimited text file with one event per row.
        output_filepath (str): Destination JSON file.
        max_jets_input_df (int): Highest jet index present in the input columns.
        max_photons_input_df (int): Highest photon index present in the input columns.
        eta_min_filter (float): Lower bound of the accepted jet Eta range.
        eta_max_filter (float): Upper bound of the accepted jet Eta range.
        sep (str): Column separator of the input file. Defaults to tab.

    Returns:
        None. Progress and problems are reported on stdout.
    """
    # 1. Load Data
    raw_df = load_data(input_filepath, sep=sep)
    if raw_df is None:
        print("*** Data loading failed. Aborting processing :(")
        return

    # 2. Filter jets based on Eta range (sets invalid jets to NaN).
    # NOTE: the zero-jet-multiplicity filter is intentionally not applied here;
    # the raw frame goes straight into the Eta filter.
    eta_result = filter_jets_by_eta(raw_df,
                                    eta_min=eta_min_filter,
                                    eta_max=eta_max_filter,
                                    max_jets=max_jets_input_df)
    # Accept a bare None as well as a (frame, columns) pair from the filter.
    if eta_result is None:
        df_eta_filtered, existing_jet_eta_cols = None, []
    else:
        df_eta_filtered, existing_jet_eta_cols = eta_result
    if df_eta_filtered is None:
        print("*** Eta filtering step failed :(")
        return
    print(existing_jet_eta_cols)
    if df_eta_filtered.empty:
        print("*** All events were removed during the Eta filtering step :(")
        return

    # 3. Filter out events with no valid jets AND no valid photons
    df_final_filtered = filter_empty_events(df_eta_filtered,
                                            jet_eta_cols=existing_jet_eta_cols,
                                            max_photons=max_photons_input_df)
    if df_final_filtered is None:
        print("*** Empty event filtering step failed :(")
        return

    # 4. Determine Event Label
    event_label = _infer_event_label(df_final_filtered, input_filepath)

    # 5. Process Events using the combined function
    combined_data_list = []
    print(f"\nConverting {len(df_final_filtered)} filtered events to combined GNN/LorentzNet format...")
    columns = df_final_filtered.columns
    rows = df_final_filtered.itertuples(index=False, name=None)
    for event_tuple in tqdm(rows, total=len(df_final_filtered), desc="Processing Events"):
        event_series = pd.Series(event_tuple, index=columns)
        combined_dict = process_event_combined(event_series, event_label, max_jets_input_df, max_photons_input_df)
        if combined_dict is None:
            continue
        # Selection: at least 3 particles, 2 b-tagged jets and 1 isolated photon.
        if (combined_dict.get('num_nodes', 0) >= 3
                and combined_dict.get('num_btag_jets', 0) >= 2
                and combined_dict.get('num_isophotons', 0) >= 1):
            combined_data_list.append(combined_dict)

    print(f"\nSuccessfully prepared {len(combined_data_list)} events for combined format out of {len(df_final_filtered)} filtered events.")

    # 6. Save Results
    if not combined_data_list:
        print("\nPipeline finished, but no events were successfully prepared.")
        return
    if not save_to_json(combined_data_list, output_filepath):
        print("\nPipeline finished, but saving the JSON file failed.")
        return
    print(f"\nPipeline finished successfully. Combined data saved to {output_filepath}")

    # Read the file back as a sanity check and show the first event.
    try:
        with open(output_filepath, 'r') as f:
            sample_data = json.load(f)
    except (OSError, ValueError) as exc:
        # The preview is best-effort: the data has already been saved.
        print(f"*** Warning: could not re-read {output_filepath} for the sample preview: {exc}")
        return
    if sample_data:
        print("\n--- Sample Combined Event:\n")
        pprint(sample_data[0])

In [30]:
# --- Main Execution ---
if __name__ == "__main__":
    # Batch driver: convert every .txt file in BASE_TXT_DATA_DIR into one
    # JSON file in BASE_OUTPUT_DIR.
    INPUT_SEPARATOR = '\t'
    ETA_MIN_FILTER = -2.5
    ETA_MAX_FILTER = 2.5

    # Ensure global constants are defined
    # (MAX_PHOTONS_INPUT = 1 means only isophoton1_* columns are read in this run.)
    JET_DF_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
    LORENTZNET_SCALAR_FEATURE_ORDER = ['particle_type', 'btag_status', 'invariant_mass']
    MAX_JETS_INPUT = 15
    MAX_PHOTONS_INPUT = 1
    FOUR_VECTOR_COLS_FROM_DF = ['E', 'Px', 'Py', 'Pz']
    SCALAR_JET_FEATURES_FROM_DF = ['btag']
    SCALAR_PHOTON_FEATURES_FROM_DF = []
    BASE_TXT_DATA_DIR = "./raw_txt_data"

    # os.listdir() returns names in arbitrary order; sort for reproducible runs.
    INPUT_FILE_PATHS = [
        os.path.join(BASE_TXT_DATA_DIR, file_name)
        for file_name in sorted(os.listdir(BASE_TXT_DATA_DIR))
        if file_name.endswith(".txt")
    ]

    print(f"Found {len(INPUT_FILE_PATHS)} files to preprocess and convert to JSON file: \n{INPUT_FILE_PATHS}")

    BASE_OUTPUT_DIR = "./onlyAny2bj_onlyAny1p"
    os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
    # Output name = input base name (without extension) + fixed suffix.
    # basename/splitext instead of split("/") so other path separators work too.
    GNN_OUTPUT_JSON_FILE_PATHS = [
        os.path.join(
            BASE_OUTPUT_DIR,
            os.path.splitext(os.path.basename(input_path))[0]
            + '_onlyAny2bj_onlyAny1p_LorentzNet_data.json',
        )
        for input_path in INPUT_FILE_PATHS
    ]

    print(f"\nFiles will be saved to paths: \n{GNN_OUTPUT_JSON_FILE_PATHS}")
    for _ in range(5):
        print("*")
    print(f"\nStarting to process the files...")
    for input_file, output_file in zip(INPUT_FILE_PATHS, GNN_OUTPUT_JSON_FILE_PATHS):
        main_combined_pipeline(
            input_filepath=input_file,
            output_filepath=output_file,
            max_jets_input_df=MAX_JETS_INPUT,
            max_photons_input_df=MAX_PHOTONS_INPUT,
            eta_min_filter=ETA_MIN_FILTER,
            eta_max_filter=ETA_MAX_FILTER,
            # The separator constant defined above is INPUT_SEPARATOR.
            sep=INPUT_SEPARATOR,
        )
        for _ in range(3):
            print(".")
    for _ in range(5):
        print("*")
    print(f"All files processed SUCCESSFULLY :)")

Found 6 files to preprocess and convert to JSON file: 
['./raw_txt_data/ppbba_500k_minpt10_15jets_etafiltered_corrected.txt', './raw_txt_data/ppbba_500k_minpt20_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax15_200k_minpt10_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax15_200k_minpt20_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax45_200k_minpt10_15jets_etafiltered_corrected.txt', './raw_txt_data/ppzaxbba_ax45_200k_minpt20_15jets_etafiltered_corrected.txt']

Files will be saved to paths: 
['./onlyAny2bj_onlyAny1p/ppbba_500k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json', './onlyAny2bj_onlyAny1p/ppbba_500k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json', './onlyAny2bj_onlyAny1p/ppzaxbba_ax15_200k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json', './onlyAny2bj_onlyAny1p/ppzaxbba_ax15_200k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_Lo

Processing Events: 100%|██████████| 470848/470848 [02:24<00:00, 3259.51it/s]



Successfully prepared 5931 events for combined format out of 470848 filtered events.
Attempting to save 5931 events to JSON file: ./onlyAny2bj_onlyAny1p/ppbba_500k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
JSON file saved successfully.

Pipeline finished successfully. Combined data saved to ./onlyAny2bj_onlyAny1p/ppbba_500k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
{'edge_index': [[0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3],
                [1, 0, 2, 0, 3, 0, 2, 1, 3, 1, 3, 2]],
 'edges': [1.0952445377964684,
           1.0952445377964684,
           3.6744774117619774,
           3.6744774117619774,
           2.119689668842116,
           2.119689668842116,
           3.2793359684841685,
           3.2793359684841685,
           1.0313693498281786,
           1.0313693498281786,
           3.052316685345903,
           3.052316685345903],
 'event_label': 0,
 'eventno': 262,
 'h_scalars': [[0.0, 0.0, 0.053012993692885

Processing Events: 100%|██████████| 355591/355591 [01:27<00:00, 4079.80it/s]



Successfully prepared 1939 events for combined format out of 355591 filtered events.
Attempting to save 1939 events to JSON file: ./onlyAny2bj_onlyAny1p/ppbba_500k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
JSON file saved successfully.

Pipeline finished successfully. Combined data saved to ./onlyAny2bj_onlyAny1p/ppbba_500k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
{'edge_index': [[0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3],
                [1, 0, 2, 0, 3, 0, 2, 1, 3, 1, 3, 2]],
 'edges': [1.5214262041602122,
           1.5214262041602122,
           2.041574429594601,
           2.041574429594601,
           1.257692013262786,
           1.257692013262786,
           0.5410989753841342,
           0.5410989753841342,
           1.801105101150224,
           1.801105101150224,
           2.1094013598617325,
           2.1094013598617325],
 'event_label': 0,
 'eventno': 541,
 'h_scalars': [[0.0, 0.0, 0.0],
             

Processing Events: 100%|██████████| 175296/175296 [01:02<00:00, 2808.68it/s]



Successfully prepared 3488 events for combined format out of 175296 filtered events.
Attempting to save 3488 events to JSON file: ./onlyAny2bj_onlyAny1p/ppzaxbba_ax15_200k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
JSON file saved successfully.

Pipeline finished successfully. Combined data saved to ./onlyAny2bj_onlyAny1p/ppzaxbba_ax15_200k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
{'edge_index': [[0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3],
                [1, 0, 2, 0, 3, 0, 2, 1, 3, 1, 3, 2]],
 'edges': [2.8989862509236968,
           2.8989862509236968,
           3.004379511988102,
           3.004379511988102,
           3.0607649100713368,
           3.0607649100713368,
           0.5335701639184861,
           0.5335701639184861,
           2.2034447201824694,
           2.2034447201824694,
           2.0566762871789037,
           2.0566762871789037],
 'event_label': 1,
 'eventno': 30,
 'h_scalars': [[0.0, 0.0, 

Processing Events: 100%|██████████| 149600/149600 [00:43<00:00, 3450.87it/s]



Successfully prepared 570 events for combined format out of 149600 filtered events.
Attempting to save 570 events to JSON file: ./onlyAny2bj_onlyAny1p/ppzaxbba_ax15_200k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
JSON file saved successfully.

Pipeline finished successfully. Combined data saved to ./onlyAny2bj_onlyAny1p/ppzaxbba_ax15_200k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
{'edge_index': [[0, 1, 0, 2, 1, 2], [1, 0, 2, 0, 2, 1]],
 'edges': [2.9923461582176616,
           2.9923461582176616,
           2.866177727299879,
           2.866177727299879,
           0.5579355171522961,
           0.5579355171522961],
 'event_label': 1,
 'eventno': 292,
 'h_scalars': [[0.0, 0.0, 0.0],
               [1.0, 1.0, 0.0],
               [1.0, 1.0, 0.053466306223098614]],
 'invMass_2leadingbj': 13.256282227680105,
 'invMass_2leadingbj1p': 95.62838682036785,
 'jet_btag_label': [None, 1.0, 1.0],
 'leading_isophoton_pT': 43

Processing Events: 100%|██████████| 176691/176691 [01:06<00:00, 2675.41it/s]



Successfully prepared 6773 events for combined format out of 176691 filtered events.
Attempting to save 6773 events to JSON file: ./onlyAny2bj_onlyAny1p/ppzaxbba_ax45_200k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
JSON file saved successfully.

Pipeline finished successfully. Combined data saved to ./onlyAny2bj_onlyAny1p/ppzaxbba_ax45_200k_minpt10_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
{'edge_index': [[0, 1, 0, 2, 0, 3, 1, 2, 1, 3, 2, 3],
                [1, 0, 2, 0, 3, 0, 2, 1, 3, 1, 3, 2]],
 'edges': [2.2661411295230494,
           2.2661411295230494,
           3.027992839451096,
           3.027992839451096,
           4.3909377077909,
           4.3909377077909,
           2.032559334518183,
           2.032559334518183,
           3.5570066467601658,
           3.5570066467601658,
           3.881371923315389,
           3.881371923315389],
 'event_label': 1,
 'eventno': 66,
 'h_scalars': [[0.0, 0.0, 0.0],
    

Processing Events: 100%|██████████| 148523/148523 [00:42<00:00, 3457.20it/s]



Successfully prepared 1658 events for combined format out of 148523 filtered events.
Attempting to save 1658 events to JSON file: ./onlyAny2bj_onlyAny1p/ppzaxbba_ax45_200k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
JSON file saved successfully.

Pipeline finished successfully. Combined data saved to ./onlyAny2bj_onlyAny1p/ppzaxbba_ax45_200k_minpt20_15jets_etafiltered_corrected_onlyAny2bj_onlyAny1p_LorentzNet_data.json
{'edge_index': [[0, 1, 0, 2, 0, 3, 0, 4, 1, 2, 1, 3, 1, 4, 2, 3, 2, 4, 3, 4],
                [1, 0, 2, 0, 3, 0, 4, 0, 2, 1, 3, 1, 4, 1, 3, 2, 4, 2, 4, 3]],
 'edges': [2.8568541454335303,
           2.8568541454335303,
           2.9492141862111336,
           2.9492141862111336,
           0.7218868799583492,
           0.7218868799583492,
           1.082784586113508,
           1.082784586113508,
           1.1807386463777665,
           1.1807386463777665,
           3.080221988592543,
           3.080221988592543,
           1.810