In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [68]:
# Per-object feature suffixes. Columns are addressed as '<object><index>_<feature>',
# e.g. 'jet1_Eta' or 'isophoton1_E'.
JET_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
PHOTON_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E'] # Assuming same features for photons
BTAG_FEATURE = 'btag' # How the b-tag column is named (e.g., jet1_btag)

# Highest jet / photon index looked up per event (jet1..jet13, isophoton1..isophoton3).
MAX_JETS = 13
MAX_PHOTONS = 3 # Define maximum number of photons
# Inclusive pseudorapidity window; these match the default arguments of filter_jets_by_eta.
ETA_MIN = -2.5
ETA_MAX = 2.5

In [76]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os

def load_data(filepath:str, sep:str = '\t') -> pd.DataFrame:
    """
    Read a delimited text file into a Pandas DataFrame.

    Every failure is reported on stdout and turned into a ``None`` return
    value rather than an exception.

    Args:
        filepath (str): Location of the text file to read.
        sep (str): Column delimiter handed to ``pd.read_csv``. Defaults to a tab.

    Returns:
        pd.DataFrame: The parsed table, or None if the file is missing, empty,
                      or could not be read for any other reason.
    """
    problem = None
    table = None
    try:
        print(f"Loading data from {filepath}...")
        table = pd.read_csv(filepath, sep = sep)
        print("--- Data loaded successfully :)")
    except FileNotFoundError:
        problem = f"*** Error: File not found at {filepath}"
    except pd.errors.EmptyDataError:
        problem = f"*** Error: File at {filepath} is empty."
    except Exception as exc:
        problem = f"*** An unexpected error occurred during file loading: {exc}"

    if problem is None:
        return table
    print(problem)
    return None

def filter_zero_multiplicity(df:pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the events whose 'jetmultiplicity' is strictly positive.

    Args:
        df (pd.DataFrame): Event table containing a 'jetmultiplicity' column.

    Returns:
        pd.DataFrame: An independent copy holding the surviving rows, or None
                      when ``df`` is not a DataFrame or has no
                      'jetmultiplicity' column.
    """
    if not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame :(")
        return None
    if 'jetmultiplicity' not in df.columns:
        print("*** Error: 'jetmultiplicity' column not found in DataFrame :(")
        return None

    n_before = len(df)
    print(f"Initial number of events: {n_before}")

    # Boolean row selector: True where the event has at least one jet.
    has_jets = df['jetmultiplicity'].gt(0)
    kept = df.loc[has_jets].copy()
    n_after = len(kept)

    print(f"--- Removed {n_before - n_after} events with zero jetmultiplicity :)")
    print(f"Number of events after filtering: {n_after}")

    return kept

def filter_jets_by_eta(df:pd.DataFrame, eta_min:float=-2.5, eta_max:float=2.5, max_jets:int=13,
                       jet_features:list=None) -> tuple:
    """
    Sets jet quantities to NaN if the jet's Eta is outside the specified range.

    For every jet index 1..max_jets whose 'jet{i}_Eta' column exists, rows where
    Eta lies outside [eta_min, eta_max] (bounds inclusive; a NaN Eta counts as
    outside) get all of that jet's feature columns set to NaN. The Eta column
    itself is always blanked, even if 'Eta' is not listed in ``jet_features``.
    Columns not listed in ``jet_features`` (for example the b-tag column) are
    left untouched. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The input DataFrame with event data.
        eta_min (float): The minimum allowed Eta value. Defaults to -2.5.
        eta_max (float): The maximum allowed Eta value. Defaults to 2.5.
        max_jets (int): The maximum number of jets to check per event. Defaults to 13.
        jet_features (list): Feature suffixes to blank for an out-of-range jet.
            Defaults to the module-level JET_FEATURES when None.

    Returns:
        tuple: ``(df_modified, jet_eta_cols_in_df)`` where ``df_modified`` is the
               filtered copy and ``jet_eta_cols_in_df`` lists the 'jet{i}_Eta'
               columns that were found. For an invalid input the result is
               ``(None, [])`` so that callers can always unpack two values.
    """
    if df is None or not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame for Eta filtering :(")
        # Keep the two-element shape: callers unpack the result directly.
        return None, []

    if jet_features is None:
        jet_features = JET_FEATURES

    print(f"Applying Eta filter: Keeping jets with Eta between {eta_min} and {eta_max}.")

    df_modified = df.copy()
    jet_eta_cols_in_df = []
    for i in range(1, max_jets + 1):
        eta_col = f'jet{i}_Eta'
        if eta_col not in df_modified.columns:
            continue
        jet_eta_cols_in_df.append(eta_col)

        # Always include the Eta column: later steps use a NaN Eta as the "jet removed" marker.
        jet_cols = [eta_col] + [f'jet{i}_{feature}' for feature in jet_features
                                if f'jet{i}_{feature}' != eta_col]
        existing_jet_cols = [col for col in jet_cols if col in df_modified.columns]

        # between() is False for NaN, so jets that are already missing are selected too (a no-op).
        out_of_range = ~df_modified[eta_col].between(eta_min, eta_max, inclusive='both')
        df_modified.loc[out_of_range, existing_jet_cols] = np.nan
    print("--- Eta filtering complete :)")

    return df_modified, jet_eta_cols_in_df


def filter_empty_events(df:pd.DataFrame, jet_eta_cols:list, max_photons:int=3) -> pd.DataFrame:
    """
    Drop events that are left with neither a valid jet nor a valid photon.

    - An event has no valid jet when every column in ``jet_eta_cols`` is NaN
      (or when ``jet_eta_cols`` is empty).
    - An event has no valid photon when every available 'isophoton{i}_E'
      column is NaN or <= 0 (or when no such column exists).

    Args:
        df (pd.DataFrame): Event table, normally the output of the Eta filter.
        jet_eta_cols (list): Names of the jet Eta columns present in ``df``.
        max_photons (int): Highest photon index to look for.

    Returns:
        pd.DataFrame: Copy of ``df`` without the empty events, or None if
                      ``df`` is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid input DataFrame for empty event filtering :(")
        return None

    # --- Jets: a row is jet-less when all of its Eta entries are missing ---
    if jet_eta_cols:
        jets_all_missing = df[jet_eta_cols].isna().all(axis=1)
    else:
        print("*** Warning: No jet Eta columns found in DataFrame. Cannot filter based on jets :(")
        jets_all_missing = pd.Series([True] * len(df), index=df.index)

    # --- Photons: a row is photon-less when no energy entry is positive ---
    candidate_cols = (f'isophoton{k}_E' for k in range(1, max_photons + 1))
    present_energy_cols = [name for name in candidate_cols if name in df.columns]
    if present_energy_cols:
        energies = df[present_energy_cols].fillna(0)
        photons_all_missing = energies.le(0).all(axis=1)
    else:
        print("*** Warning: No photon Energy columns found in DataFrame. Cannot filter based on photons :(")
        photons_all_missing = pd.Series([True] * len(df), index=df.index)

    # Keep every row that still has at least one jet or one photon.
    keep_row = ~(jets_all_missing & photons_all_missing)
    survivors = df.loc[keep_row].copy()

    n_dropped = len(df) - len(survivors)
    if n_dropped > 0:
        print(f"Removed {n_dropped} events with no valid jets AND no valid photons :)")
    else:
        print("No events found with both empty jets and empty photons.")
    print(f"Number of events after empty event filtering: {len(survivors)}")

    return survivors


def save_data(df:pd.DataFrame, output_filepath:str, sep:str = ',') -> bool:
    """
    Saves the DataFrame to a delimited text file (without the index column).

    Args:
        df (pd.DataFrame): The DataFrame to save.
        output_filepath (str): The path where the file will be saved.
        sep (str): Column delimiter. Defaults to ',' (the previous hard-coded
            behaviour). Pass '\\t' to produce a file that ``load_data`` can
            read back with its default separator.

    Returns:
        bool: True if saving was successful, False otherwise (invalid input or
              any error raised while writing; the error is printed).
    """
    if df is None or not isinstance(df, pd.DataFrame):
        print("*** Error: Invalid DataFrame provided for saving :(")
        return False
    try:
        print(f"Saving processed data to {output_filepath}...")
        df.to_csv(output_filepath, sep=sep, index=False)
        print("--- Data saved successfully :)")
        return True
    except Exception as e:
        print(f"*** An unexpected error occurred during file saving: {e}")
        return False

def _as_python_scalar(value):
    """Return a NumPy scalar as the matching built-in Python value (JSON-friendly); pass others through."""
    return value.item() if hasattr(value, 'item') else value


def create_event_dictionary(row:pd.Series, max_jets:int=13, max_photons:int=3,
                            jet_features:list=None, photon_features:list=None,
                            btag_feature:str=None) -> dict:
    """
    Creates a dictionary for a single event (row) with structured jet/photon info.

    A jet is included when the column for its first feature
    ('jet{i}_<jet_features[0]>') exists and is not NaN. A photon is included
    when 'isophoton{i}_E' exists, is not NaN and is > 0. Features that are
    missing or NaN for an included object are stored as None.

    Args:
        row (pd.Series): A row from the DataFrame representing one event.
        max_jets (int): Maximum number of jets to check.
        max_photons (int): Maximum number of photons to check.
        jet_features (list): List of feature names for jets. Defaults to
            ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E'] when None.
        photon_features (list): List of feature names for photons. Same
            default as ``jet_features`` when None.
        btag_feature (str): Suffix of the b-tag columns and key used in each
            jet dictionary (e.g. 'btag' -> 'jet1_btag'). Defaults to 'btag'
            when None.

    Returns:
        dict: A dictionary with keys 'eventno', 'jets', 'photons',
              'num_btag_jets' (jets whose b-tag equals 1) and 'num_isophoton',
              or None if essential event info (eventno) is missing.
    """
    # The None defaults used to crash on jet_features[0]; resolve them to the
    # standard column layout instead.
    default_features = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
    if jet_features is None:
        jet_features = default_features
    if photon_features is None:
        photon_features = default_features
    if btag_feature is None:
        btag_feature = 'btag'

    if 'eventno' not in row or pd.isna(row['eventno']):
        print("*** Warning: Skipping row due to missing or NaN 'eventno' :(")
        return None

    event_dict = {
        "eventno": int(row['eventno']), # Ensure event number is an integer
        "jets": [],
        "photons": []
    }
    current_btag_jet_count = 0

    # --- Process Jets ---
    for i in range(1, max_jets + 1):
        key_feature_col = f'jet{i}_{jet_features[0]}' # e.g., jet1_Eta

        # Skip jets that are absent or were blanked (NaN key feature) by earlier filtering
        if key_feature_col not in row or pd.isna(row[key_feature_col]):
            continue

        jet_data = {'jet_index': i}
        for feature in jet_features:
            col_name = f'jet{i}_{feature}'
            if col_name in row and pd.notna(row[col_name]):
                jet_data[feature] = _as_python_scalar(row[col_name])
            else:
                jet_data[feature] = None # Mark missing sub-features

        # --- b-tag label: int when available and convertible, otherwise None ---
        btag_col_name = f'jet{i}_{btag_feature}'
        btag_value = None
        if btag_col_name in row and pd.notna(row[btag_col_name]):
            try:
                btag_value = int(row[btag_col_name])
            except (ValueError, TypeError):
                print(f"*** Warning: Could not convert b-tag value '{row[btag_col_name]}' to int for jet {i}, event {event_dict['eventno']}. Setting to None :(")

        # Only a value of exactly 1 counts as b-tagged
        if btag_value == 1:
            current_btag_jet_count += 1

        jet_data[btag_feature] = btag_value
        event_dict["jets"].append(jet_data)

    # --- Process Isolated Photons ---
    for i in range(1, max_photons + 1):
        energy_col = f'isophoton{i}_E'

        # A photon is valid only if its energy exists, is not NaN, and is > 0
        if energy_col not in row or pd.isna(row[energy_col]) or not row[energy_col] > 0:
            continue

        photon_data = {'isophoton_index': i}
        for feature in photon_features:
            col_name = f'isophoton{i}_{feature}'
            if col_name in row and pd.notna(row[col_name]):
                photon_data[feature] = _as_python_scalar(row[col_name])
            else:
                photon_data[feature] = None # Mark missing sub-features
        event_dict["photons"].append(photon_data)

    # --- Add the recalculated counts to the event dictionary ---
    event_dict["num_btag_jets"] = current_btag_jet_count
    event_dict["num_isophoton"] = len(event_dict["photons"])

    return event_dict


def convert_df_to_event_dicts(df:pd.DataFrame, max_jets:int=13, max_photons:int=3,
                            jet_features:list=None, photon_features:list=None,
                            btag_feature:str=None, min_isophotons:int=1,
                            min_btag_jets:int=2, max_isophotons:int=None) -> list:
    """
    Converts the DataFrame into a list of event dictionaries and keeps only the
    events that pass the photon / b-jet multiplicity selection.

    With the default thresholds an event is kept when it has at least one
    isolated photon and at least two b-tagged jets (the selection this function
    has always applied; the old log message described it as "only ONE" photon).
    Pass ``max_isophotons=1`` to require exactly one photon.

    Args:
        df (pd.DataFrame): The processed DataFrame.
        max_jets (int): Maximum number of jets to consider.
        max_photons (int): Maximum number of photons to consider.
        jet_features (list): List of feature names for jets (forwarded to
            create_event_dictionary).
        photon_features (list): List of feature names for photons (forwarded).
        btag_feature (str): Suffix for b-tag columns (forwarded).
        min_isophotons (int): Minimum 'num_isophoton' required. Defaults to 1.
        min_btag_jets (int): Minimum 'num_btag_jets' required. Defaults to 2.
        max_isophotons (int): Optional upper bound on 'num_isophoton';
            None (default) means no upper bound.

    Returns:
        list: A list containing dictionaries, each representing a selected event.
              Returns an empty list if the input DataFrame is invalid or empty.
    """
    if df is None or not isinstance(df, pd.DataFrame):
        print("Error: Invalid DataFrame provided for dictionary conversion.")
        return []
    if df.empty:
        print("Warning: Input DataFrame is empty. Returning empty list.")
        return []

    event_dictionaries = []
    print("Converting DataFrame rows to event dictionaries...")
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Events"):
        event_dict = create_event_dictionary(row, max_jets, max_photons,
                                             jet_features, photon_features, btag_feature)
        if event_dict is None:
            continue  # row had no usable event number
        n_photons = event_dict["num_isophoton"]
        if n_photons < min_isophotons:
            continue
        if max_isophotons is not None and n_photons > max_isophotons:
            continue
        if event_dict["num_btag_jets"] < min_btag_jets:
            continue
        event_dictionaries.append(event_dict)

    # Describe the photon requirement that was actually applied.
    if max_isophotons is None:
        photon_rule = f"AT LEAST {min_isophotons}"
    elif max_isophotons == min_isophotons:
        photon_rule = f"EXACTLY {min_isophotons}"
    else:
        photon_rule = f"BETWEEN {min_isophotons} AND {max_isophotons}"
    print(f"--- Successfully converted {len(event_dictionaries)} events (with {photon_rule} isophoton(s) and AT LEAST {min_btag_jets} btag jets) to dictionaries :)")
    return event_dictionaries

def save_to_json(data:list[dict, ...], filepath:str) -> bool:
    """
    Write a list of event dictionaries to ``filepath`` as indented JSON.

    Args:
        data (list): Event dictionaries to serialise.
        filepath (str): Destination of the JSON file.

    Returns:
        bool: True when the file was written; False when ``data`` is not a
              list or when serialising / writing failed (the reason is printed).
    """
    if not isinstance(data, list):
        print("Error: Data to be saved must be a list of dictionaries.")
        return False

    print(f"Attempting to save {len(data)} events to JSON file: {filepath}")
    failure = None
    try:
        with open(filepath, 'w') as handle:
            # indent=2 keeps the file human-readable at the cost of size
            json.dump(data, handle, indent=2)
        print("JSON file saved successfully.")
    except TypeError as exc:
        # Typically a NumPy value that was not converted to a built-in type
        failure = f"Error: Data contains types not serializable to JSON: {exc}"
    except OSError as exc:
        failure = f"Error: Could not write to file {filepath}: {exc}"
    except Exception as exc:
        failure = f"An unexpected error occurred during JSON saving: {exc}"

    if failure is None:
        return True
    print(failure)
    return False

In [77]:
# Read the tab-separated signal and background tables from the current working directory.
# NOTE(review): load_data() defined above wraps the same pd.read_csv call with error
# reporting but is not used here.
mam_signal = pd.read_csv('./ppzaxbba_ax45_200k_minpt20_13jets_etafiltered.txt', sep = "\t")
mam_background = pd.read_csv("./bg_500k_minpt20_13jets_etafiltered.txt", sep = "\t")

In [78]:
mam_signal.shape

(200000, 113)

In [79]:
mam_background.shape

(500000, 113)

In [80]:
mam_signal.head()

Unnamed: 0,eventno,jet1_Eta,jet1_Phi,jet1_pT,jet1_Px,jet1_Py,jet1_Pz,jet1_E,jet1_btag,jet2_Eta,...,jet13_E,jet13_btag,isophotoncount,isophoton1_Eta,isophoton1_Phi,isophoton1_pT,isophoton1_Px,isophoton1_Py,isophoton1_Pz,isophoton1_E
0,0,,,,,,,,-1,,...,,-1.0,1.0,0.200392,0.092595,36.0136,35.8593,3.32991,7.26522,36.7391
1,1,,,,,,,,-1,,...,,-1.0,1.0,-2.14693,-1.1612,23.7492,9.45782,-21.7847,-100.241,103.016
2,2,-1.74376,-1.4023,22.4845,3.77066,-22.1661,-62.3264,66.2581,0,,...,-102.603,105.017,,,,,,,,
3,3,2.41166,-0.512524,38.5637,33.6086,-18.9108,213.311,216.769,0,0.684929,...,,-1.0,0.0,,,,,,,
4,4,,,,,,,,-1,,...,,-1.0,0.0,,,,,,,


In [81]:
# Signal pipeline: (1) set jets outside the default Eta window to NaN, (2) drop events left
# with neither jets nor photons, (3) build one dictionary per selected event.
# The literals 13 and 3 equal MAX_JETS and MAX_PHOTONS from the configuration cell.
mam_signal_zero_eta_filtered, jet_eta_cols_in_df = filter_jets_by_eta(mam_signal)
mam_signal_filter_empty_events = filter_empty_events(mam_signal_zero_eta_filtered, jet_eta_cols_in_df)
mam_signal_convert_df_to_event_dicts = convert_df_to_event_dicts(mam_signal_filter_empty_events, max_jets =13, max_photons =3,
                            jet_features=JET_FEATURES, photon_features=PHOTON_FEATURES,
                            btag_feature=BTAG_FEATURE)
# NOTE(review): the save below is commented out, so this cell does not refresh
# ./ax45_mam_signal.json; a later cell loads a file of that name — confirm it matches
# the current selection.
# save_to_json(mam_signal_convert_df_to_event_dicts, "./ax45_mam_signal.json")

Applying Eta filter: Keeping jets with Eta between -2.5 and 2.5.
--- Eta filtering complete :)
Removed 16565 events with no valid jets AND no valid photons :)
Number of events after empty event filtering: 183435
Converting DataFrame rows to event dictionaries...


Processing Events: 100%|██████████| 183435/183435 [00:45<00:00, 4036.42it/s]

--- Successfully converted 1425 events (with only ONE isophoton and AT LEAST TWO btag jets) to dictionaries :)





In [82]:
# Background pipeline: same three steps as for the signal sample (Eta filter, empty-event
# removal, conversion to event dictionaries), applied to mam_background.
# The literals 13 and 3 equal MAX_JETS and MAX_PHOTONS from the configuration cell.
mam_background_zero_eta_filtered, jet_eta_cols_in_df_background = filter_jets_by_eta(mam_background)
mam_background_filter_empty_events = filter_empty_events(mam_background_zero_eta_filtered, jet_eta_cols_in_df_background)
mam_background_convert_df_to_event_dicts = convert_df_to_event_dicts(mam_background_filter_empty_events, max_jets = 13, max_photons = 3,
                            jet_features=JET_FEATURES, photon_features=PHOTON_FEATURES,
                            btag_feature=BTAG_FEATURE)
# Saving is disabled; re-enable to persist the selected background events.
# save_to_json(mam_background_convert_df_to_event_dicts, "./mam_background.json")

Applying Eta filter: Keeping jets with Eta between -2.5 and 2.5.
--- Eta filtering complete :)
Removed 139796 events with no valid jets AND no valid photons :)
Number of events after empty event filtering: 360204
Converting DataFrame rows to event dictionaries...


Processing Events: 100%|██████████| 360204/360204 [01:17<00:00, 4645.57it/s]

--- Successfully converted 1637 events (with only ONE isophoton and AT LEAST TWO btag jets) to dictionaries :)





# Comparing Data

In [28]:
import os
print(os.getcwd())

/teamspace/studios/this_studio/all/txt_data_preprocess/raw_txt_data/Tejaswini_Mam_Filtered_dataset


In [25]:
def load_json_data(filepath):
    """
    Read a JSON file that is expected to hold a list of events.

    Args:
        filepath (str): Path of the JSON file.

    Returns:
        list: The decoded list, or None when the path does not exist, the
              top-level JSON value is not a list, or reading/decoding fails.
              Every failure is reported on stdout.
    """
    if os.path.exists(filepath):
        try:
            print(f"Loading event data from {filepath}...")
            with open(filepath, 'r') as handle:
                events = json.load(handle)
            if isinstance(events, list):
                print(f"--- Successfully loaded {len(events)} events :)")
                return events
            print(f"*** Error: Expected a list of events in JSON file, found {type(events)} :(")
        except Exception as exc:
            print(f"*** An unexpected error occurred during JSON loading: {exc} :(")
    else:
        print(f"*** Error: JSON file not found at {filepath} :(")
    return None

In [31]:
signal_886 = load_json_data("/teamspace/studios/this_studio/all/txt_data_preprocess/ax45_sig_200k_minpt20_13jets_onlyFirst2bj_onlyFirst1p_processed.json")

Loading event data from /teamspace/studios/this_studio/all/txt_data_preprocess/ax45_sig_200k_minpt20_13jets_onlyFirst2bj_onlyFirst1p_processed.json...
--- Successfully loaded 886 events :)


In [32]:
signal_836 = load_json_data("/teamspace/studios/this_studio/all/txt_data_preprocess/raw_txt_data/Tejaswini_Mam_Filtered_dataset/ax45_mam_signal.json")

Loading event data from /teamspace/studios/this_studio/all/txt_data_preprocess/raw_txt_data/Tejaswini_Mam_Filtered_dataset/ax45_mam_signal.json...
--- Successfully loaded 836 events :)


In [38]:
# Event numbers of the reference selection (signal_886), in file order.
signal_886_eventno = np.array([event['eventno'] for event in signal_886])

In [39]:
# Event numbers of the selection loaded from ax45_mam_signal.json (signal_836), in file order.
signal_836_eventno = np.array([event['eventno'] for event in signal_836])

In [40]:
def find_missing_elements(array_total, array_subset):
    """
    Return the values of ``array_total`` that do not occur in ``array_subset``.

    Args:
        array_total (np.ndarray): 1-D array holding the full set of values.
        array_subset (np.ndarray): 1-D array of values to exclude.

    Returns:
        np.ndarray: Sorted, de-duplicated values found only in ``array_total``.
                    An empty array is returned (after printing an error) when
                    either input is not a 1-D NumPy array.
    """
    operands = (array_total, array_subset)

    if not all(isinstance(candidate, np.ndarray) for candidate in operands):
        print("Error: Both inputs must be NumPy arrays.")
        return np.array([])

    if any(candidate.ndim != 1 for candidate in operands):
        print("Error: Input arrays must be 1-dimensional.")
        return np.array([])

    # setdiff1d de-duplicates and sorts; duplicates in the inputs are allowed
    # because assume_unique is left at its default (False).
    return np.setdiff1d(array_total, array_subset)

In [41]:
# Event numbers present in the reference selection (signal_886) but absent from signal_836.
missing_eventnos = find_missing_elements(signal_886_eventno, signal_836_eventno)

print(f"\nFound {len(missing_eventnos)} events present in the total array but not in the subset array.")


Found 50 events present in the total array but not in the subset array.


In [43]:
missing_eventnos

array([  1497,   2361,   9894,   9928,  16996,  17997,  26249,  26932,
        33536,  35366,  46082,  46298,  59911,  66668,  68691,  71750,
        73570,  75256,  83766,  85693,  87802,  89897,  92670,  96022,
        98597, 100765, 106704, 108970, 113260, 117396, 124324, 134820,
       136398, 138161, 144708, 145751, 146725, 147433, 158753, 159074,
       160519, 161863, 170509, 173746, 179253, 182657, 183415, 190838,
       192115, 195314])

In [47]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import os

# --- Constants (Ensure these match the structure in your JSON) ---
# NOTE: JET_FEATURES, PHOTON_FEATURES and BTAG_FEATURE are re-declared here with the same
# values as in the configuration cell at the top of the notebook.
JET_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
PHOTON_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
BTAG_FEATURE = 'btag' # The key used for btag inside the jet dict in JSON

# Define the maximum number of jets/photons expected per event
# This determines how many columns like 'jetX_Eta' will be created
# Set these based on the maximum possible in your original data or JSON creation step
# These two values are bound as default arguments of flatten_event_dicts_to_dataframe when
# that function is defined, so changing them later requires re-running its definition.
MAX_JETS_OUTPUT = 2 # Or adjust as needed
MAX_PHOTONS_OUTPUT = 1 # Or adjust as needed

# --- Function to Load JSON ---
def load_json_data(filepath):
    """
    Load a JSON file whose top-level value must be a list of events.

    Note: this re-defines (and therefore shadows) the ``load_json_data``
    declared in an earlier cell; the logic is the same, only the printed
    messages differ.

    Args:
        filepath (str): Path of the JSON file.

    Returns:
        list: The decoded list, or None when the path does not exist, the
              content is not a list, or reading/decoding raises.
    """
    if not os.path.exists(filepath):
        print(f"Error: JSON file not found at {filepath}")
        return None

    events = None
    try:
        print(f"Loading full event data from {filepath}...")
        with open(filepath, 'r') as source:
            parsed = json.load(source)
        if isinstance(parsed, list):
            print(f"Successfully loaded {len(parsed)} total events from JSON.")
            events = parsed
        else:
            print(f"Error: Expected a list of events in JSON file, found {type(parsed)}.")
    except Exception as exc:
        print(f"An unexpected error occurred during JSON loading: {exc}")
    return events

# --- Function to Filter Dictionaries ---
def find_missing_event_dicts(all_events_data, missing_eventnos_set):
    """
    Select the event dictionaries whose 'eventno' is in ``missing_eventnos_set``.

    Args:
        all_events_data (list): Event dictionaries to search.
        missing_eventnos_set (set): Event numbers to look for.

    Returns:
        list: Matching dictionaries in their original order. An empty list is
              returned when either argument has the wrong type or is empty.
    """
    types_ok = isinstance(all_events_data, list) and isinstance(missing_eventnos_set, set)
    if not types_ok or not all_events_data or not missing_eventnos_set:
        return []

    print(f"Searching for {len(missing_eventnos_set)} specific event numbers...")
    matches = [
        candidate
        for candidate in tqdm(all_events_data, desc="Finding Missing Events")
        # events without an 'eventno' key (or with None) never match
        if (number := candidate.get('eventno')) is not None and number in missing_eventnos_set
    ]
    print(f"Found {len(matches)} matching event dictionaries.")
    return matches

# --- NEW Function to Flatten Dictionaries ---
def flatten_event_dicts_to_dataframe(event_dicts,
                                       max_jets=MAX_JETS_OUTPUT,
                                       max_photons=MAX_PHOTONS_OUTPUT,
                                       jet_features=JET_FEATURES,
                                       photon_features=PHOTON_FEATURES,
                                       btag_feature=BTAG_FEATURE):
    """
    Turn nested event dictionaries into one flat DataFrame row per event.

    Jets and photons are placed positionally: the n-th entry of an event's
    'jets' list fills the 'jet{n}_*' columns and the n-th entry of 'photons'
    fills the 'isophoton{n}_*' columns (any 'jet_index' stored in the
    dictionary is not consulted). Slots beyond the number of available
    objects, and features absent from an object, are filled with NaN.

    Args:
        event_dicts (list): Event dictionaries to flatten.
        max_jets (int): Number of jet slots (jet1_... to jet{max_jets}_...).
        max_photons (int): Number of photon slots.
        jet_features (list): Jet feature names (excluding the b-tag).
        photon_features (list): Photon feature names.
        btag_feature (str): Key of the b-tag label inside each jet dictionary.

    Returns:
        pd.DataFrame: The flattened table, or None when ``event_dicts`` is
                      empty / not a list or the DataFrame cannot be built.
    """
    if not (isinstance(event_dicts, list) and event_dicts):
        print("Error: Input 'event_dicts' is empty or not a list.")
        return None

    # Event-level keys copied as-is (missing keys become None).
    scalar_keys = ('eventno', 'event_label', 'num_btag_jets', 'num_isophoton')
    # Per-jet columns: the physics features followed by the b-tag label.
    jet_keys = (*jet_features, btag_feature)

    records = []
    print(f"Flattening {len(event_dicts)} event dictionaries...")

    for event in tqdm(event_dicts, desc="Flattening Events"):
        record = {key: event.get(key) for key in scalar_keys}

        # An empty dict stands in for a slot with no object, so every lookup yields NaN.
        jets = event.get('jets', [])
        for slot in range(1, max_jets + 1):
            jet = jets[slot - 1] if slot <= len(jets) else {}
            for key in jet_keys:
                record[f"jet{slot}_{key}"] = jet.get(key, np.nan)

        photons = event.get('photons', [])
        for slot in range(1, max_photons + 1):
            photon = photons[slot - 1] if slot <= len(photons) else {}
            for key in photon_features:
                record[f"isophoton{slot}_{key}"] = photon.get(key, np.nan)

        records.append(record)

    try:
        flat = pd.DataFrame(records)
        print(f"Successfully created flattened DataFrame with shape {flat.shape}.")
        return flat
    except Exception as exc:
        print(f"An error occurred during DataFrame creation from flattened data: {exc}")
        return None


# --- Main Execution ---
if __name__ == "__main__":
    # 1. Shallow-copy the reference events. signal_886 is None when load_json_data failed;
    #    treat that as "no events" so the message at the bottom is reached instead of an
    #    AttributeError on None.copy().
    all_events = list(signal_886) if signal_886 else []

    if all_events and missing_eventnos.size > 0:
        # 2. Convert missing event numbers to a set
        missing_eventnos_set = set(missing_eventnos)

        # 3. Find the dictionaries for the missing events
        missing_dicts = find_missing_event_dicts(all_events, missing_eventnos_set)

        if missing_dicts:
            # 4. Flatten the list of missing dictionaries into a DataFrame
            df_missing_flat = flatten_event_dicts_to_dataframe(
                missing_dicts,
                max_jets=MAX_JETS_OUTPUT,
                max_photons=MAX_PHOTONS_OUTPUT,
                jet_features=JET_FEATURES,
                photon_features=PHOTON_FEATURES,
                btag_feature=BTAG_FEATURE
            )

            if df_missing_flat is not None:
                # 5. Print a quick overview of the flattened table
                print("\n--- Flattened DataFrame of Missing Events ---")
                print(f"Shape: {df_missing_flat.shape}")
                print("\n--- Columns ---")
                print(df_missing_flat.columns.tolist()) # Show all created columns
                print("\n--- Head ---")
                # Display more columns if needed: pd.set_option('display.max_columns', None)
                print(df_missing_flat.head())
                print("\n--- Info ---")
                df_missing_flat.info()
                print("\n--- Basic Description (numeric columns) ---")
                print(df_missing_flat.describe(include=np.number))

                # Example: Check how many missing events have a valid jet1_Eta
                # valid_jet1_eta_count = df_missing_flat['jet1_Eta'].notna().sum()
                # print(f"\nNumber of missing events with non-NaN jet1_Eta: {valid_jet1_eta_count}")

            else:
                print("\nCould not create flattened DataFrame for missing events.")
        else:
            print("\nNo matching event dictionaries found for the specified missing event numbers.")
    elif not all_events:
         print("\nCould not load JSON data. Aborting.")
    else:
         print("\nThe list of missing event numbers is empty.")

Searching for 50 specific event numbers...


Finding Missing Events: 100%|██████████| 886/886 [00:00<00:00, 1463052.50it/s]


Found 50 matching event dictionaries.
Flattening 50 event dictionaries...


Flattening Events: 100%|██████████| 50/50 [00:00<00:00, 69053.41it/s]

Successfully created flattened DataFrame with shape (50, 27).

--- Flattened DataFrame of Missing Events ---
Shape: (50, 27)

--- Columns ---
['eventno', 'event_label', 'num_btag_jets', 'num_isophoton', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet1_btag', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'jet2_btag', 'isophoton1_Eta', 'isophoton1_Phi', 'isophoton1_pT', 'isophoton1_Px', 'isophoton1_Py', 'isophoton1_Pz', 'isophoton1_E']

--- Head ---
   eventno event_label  num_btag_jets  num_isophoton  jet1_Eta  jet1_Phi  \
0     1497        None              2              1 -1.815330 -1.677930   
1     2361        None              2              1  0.008029 -0.475817   
2     9894        None              2              1  0.561517 -1.504180   
3     9928        None              2              1 -1.847810 -2.145460   
4    16996        None              2              1  1.377990 -1.205140   

   jet1_pT   jet1_Px  je




In [48]:
df_missing_flat.head()

Unnamed: 0,eventno,event_label,num_btag_jets,num_isophoton,jet1_Eta,jet1_Phi,jet1_pT,jet1_Px,jet1_Py,jet1_Pz,...,jet2_Pz,jet2_E,jet2_btag,isophoton1_Eta,isophoton1_Phi,isophoton1_pT,isophoton1_Px,isophoton1_Py,isophoton1_Pz,isophoton1_E
0,1497,,2,1,-1.81533,-1.67793,45.9977,-4.91846,-45.734,-137.54,...,-56.7206,61.971,1,-0.985062,2.49808,24.5172,-19.6136,14.7104,-28.2507,37.4058
1,2361,,2,1,0.008029,-0.475817,33.0508,29.3795,-15.1394,0.26537,...,-8.01525,32.8652,1,1.09583,-3.04158,20.1531,-20.0524,-2.01213,26.7774,33.5139
2,9894,,2,1,0.561517,-1.50418,57.36,3.81833,-57.2328,33.928,...,-82.2554,96.2855,1,1.21058,1.16662,39.897,15.6899,36.6824,60.9906,72.8809
3,9928,,2,1,-1.84781,-2.14546,37.421,-20.3401,-31.4104,-115.786,...,-107.846,114.13,1,-2.01941,0.819483,24.516,16.7346,17.9161,-90.7233,93.9774
4,16996,,2,1,1.37799,-1.20514,62.6876,22.4148,-58.5432,116.437,...,27.1278,58.6211,1,-0.597624,-2.08244,14.8291,-7.26052,-12.9301,-9.39926,17.557


In [49]:
# Event numbers of the signal table loaded above, as a NumPy array (one entry per row).
mam_signal_eventno = mam_signal['eventno'].values

In [51]:
# Row mask: True where the event number is one of the missing ones.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D input the result is identical.
mask_in_subset = np.isin(mam_signal_eventno, missing_eventnos, assume_unique=False)

In [56]:
# Rows of mam_signal whose event number is in missing_eventnos.
missing_events = mam_signal[mask_in_subset]

In [57]:
# missing_events.to_csv('missing_events.csv')

In [None]:
# Original ax45_sig_200k_minpt20_13jets.txt

In [59]:
# Load the original signal table (before the 'etafiltered' processing implied by the other
# file name) for comparison.
# NOTE(review): absolute, machine-specific path — the cell only runs where this directory exists.
signal_886_full = pd.read_csv("/teamspace/studios/this_studio/all/txt_data_preprocess/raw_txt_data/ax45_sig_200k_minpt20_13jets.txt", sep = '\t')

In [62]:
# Event numbers of the original signal table, as a NumPy array (one entry per row).
signal_886_full_eventno = signal_886_full['eventno'].values

In [63]:
# Row mask over the original table: True where the event number is one of the missing ones.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D input the result is identical.
mask_in_subset_signal_886_full_eventno = np.isin(signal_886_full_eventno, missing_eventnos, assume_unique=False)

In [64]:
# Rows of the original signal table that correspond to the missing event numbers.
missing_events_signal_886_full_eventno = signal_886_full[mask_in_subset_signal_886_full_eventno]

In [65]:
missing_events_signal_886_full_eventno.shape

(50, 163)

In [66]:
# Persist the missing rows for offline inspection. to_csv writes the DataFrame index by
# default, so the file gains an unnamed first column holding the original row labels.
missing_events_signal_886_full_eventno.to_csv('missing_events_signal_886_full.csv')