# Signal/Background Classification using Boosted Decision Trees

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import helper_function_module as hfm

# Seed NumPy's legacy global random generator so that any later np.random call
# in this kernel produces the same sequence on every run.
seed = 1234
np.random.seed(seed)

In [3]:
import os
# Show the kernel's working directory; the relative './' paths used when the
# final tables are written resolve against it.
print(os.getcwd())

/teamspace/studios/this_studio/all/BDT/BDT_Old_Preprocessing


## 1. Data Loading

In [4]:
# Directory holding the tab-separated 2j1p input tables. It is defined once so
# the notebook can be pointed at another location by editing a single line.
from pathlib import Path

DATA_DIR = Path('/teamspace/studios/this_studio/all/BDT/BDT_Old_Preprocessing')

# One table per sample (ax15 / ax75). The file names suggest each table holds
# both signal and background events.
ax15_data = pd.read_csv(DATA_DIR / 'ax15_signalBackground_2j1p_data.txt', sep='\t')
ax75_data = pd.read_csv(DATA_DIR / 'ax75_signalBackground_2j1p_data.txt', sep='\t')

In [5]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved with
# the table). errors='ignore' keeps this cell re-runnable: once the column is
# gone, executing the cell again is a no-op instead of raising KeyError.
ax15_data = ax15_data.drop(columns=['Unnamed: 0'], errors='ignore')
ax75_data = ax75_data.drop(columns=['Unnamed: 0'], errors='ignore')

# 2j1p Event Selection (inputs for the Invariant Mass and deltaR_jet12 calculation)

In [6]:
# Preview the first ten events of the ax15 table.
ax15_data.head(10)

Unnamed: 0,eventno,jetmultiplicity,jet1_Eta,jet1_Phi,jet1_pT,jet1_Px,jet1_Py,jet1_Pz,jet1_E,jet1_btag,...,isophoton3_Eta,isophoton3_Phi,isophoton3_pT,isophoton3_Px,isophoton3_Py,isophoton3_Pz,isophoton3_E,event_label,num_btag_jets,num_isophoton
0,387541,4,-0.228061,-2.91466,31.4575,-30.6509,-7.07762,-7.23657,32.2791,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
1,411307,2,0.174646,-2.09081,39.1901,-19.4734,-34.0096,6.87926,39.7893,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
2,48351,3,,,,,,,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
3,265808,2,-1.38352,0.081758,60.4259,60.224,4.93477,-112.942,128.091,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
4,346661,2,0.872268,-1.59261,28.1645,-0.614285,-28.1578,27.8029,39.5757,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
5,104577,2,0.409978,1.90441,31.5756,-10.3397,29.8348,13.311,34.2667,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
6,307466,2,-0.029848,-0.764917,65.198,47.0364,-45.1481,-1.94634,65.227,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
7,138574,3,0.886513,1.5832,145.361,-1.80306,145.35,146.419,206.321,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2,1
8,342677,3,0.434404,-1.41991,85.1319,12.7969,-84.1646,38.1557,93.2915,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1
9,304654,5,-2.39407,-0.385101,61.9448,57.408,-23.2697,-336.57,342.223,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,1


In [7]:
# Show the second event (positional row 1) as a Series, one entry per column.
ax15_data.iloc[1,:]

eventno            411307.000000
jetmultiplicity         2.000000
jet1_Eta                0.174646
jet1_Phi               -2.090810
jet1_pT                39.190100
                       ...      
isophoton3_Pz           0.000000
isophoton3_E            0.000000
event_label             0.000000
num_btag_jets           2.000000
num_isophoton           1.000000
Name: 1, Length: 131, dtype: float64

In [8]:
def calculate_invMass_deltaRjet12(dataset:pd.DataFrame) -> pd.DataFrame:
    """Unimplemented placeholder.

    The function has no logic yet: `dataset` is ignored and None is returned,
    despite the DataFrame return annotation.
    """
    return None

In [9]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Register tqdm for pandas apply. This adds `DataFrame.progress_apply`, which
# filter_and_select_active_particles calls, so it must run before that function is used.
tqdm.pandas(desc="Processing Events")

# --- Constants ---
# Per-particle feature suffixes; particle columns are named '<prefix><index>_<feature>'
# (for example 'jet1_Eta' or 'isophoton1_pT').
BASE_FEATURES = ['Eta', 'Phi', 'pT', 'Px', 'Py', 'Pz', 'E']
# Highest jet index probed when looking for 'jet{i}_pT' columns.
N_JETS_TOTAL = 13
# Highest photon index probed when looking for 'isophoton{i}_pT' columns.
N_PHOTONS_TOTAL = 3

# --- Helper Function to Generate Column Names ---
def generate_feature_columns(prefix, indices, features):
    """
    Build column names of the form '<prefix><index>_<feature>'.

    Names are produced index by index: all features of the first index come
    first, then all features of the second index, and so on.

    Args:
        prefix (str): Particle prefix, e.g. 'jet' or 'isophoton'.
        indices (list[int]): Particle indices to generate names for.
        features (list[str]): Feature suffixes, e.g. ['Eta', 'Phi'].

    Returns:
        list[str]: The generated names, e.g. ['jet3_Eta', 'jet3_Phi', 'jet6_Eta', ...].
    """
    return [f"{prefix}{idx}_{name}" for idx in indices for name in features]

# --- Function to Process a Single Event (Row) ---
def process_event_row(row, base_features, n_jets_total, n_photons_total):
    """
    Reduce one event to its general info plus the features of its top-pT
    jets and photons.

    The number of jets / photons to keep is read from the row's
    'num_btag_jets' / 'num_isophoton' entries (cast to int). For each particle
    type the candidates with the largest '<prefix>{i}_pT' values are chosen
    among the indices 1..n_total that actually exist in the row. The chosen
    particles are then renumbered 1, 2, ... in ascending order of their
    ORIGINAL index (not in pT order), and their features are copied under the
    same naming scheme, e.g. 'jet1_Eta', 'jet2_pT', 'isophoton1_E'.

    Args:
        row (pd.Series): A single row from the input DataFrame.
        base_features (list[str]): Feature suffixes copied for every selected
            particle (e.g. ['Eta', 'Phi', 'pT']).
        n_jets_total (int): Highest jet index to look for (e.g. 13).
        n_photons_total (int): Highest photon index to look for (e.g. 3).

    Returns:
        pd.Series | None: The general columns present in the row ('eventno',
        'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton')
        followed by the renumbered jet and photon features. A feature whose
        source column is absent from the row is set to NaN. Returns None
        (after printing a message) when 'num_btag_jets' or 'num_isophoton'
        is missing from the row.
    """
    try:
        n_jets_to_keep = int(row['num_btag_jets'])
        n_photons_to_keep = int(row['num_isophoton'])
    except KeyError as e:
        print(f"Error: Missing required column '{e}' in row. Skipping event.")
        return None

    selected = (
        ('jet', _top_pt_particle_indices(row, 'jet', n_jets_total, n_jets_to_keep)),
        ('isophoton', _top_pt_particle_indices(row, 'isophoton', n_photons_total, n_photons_to_keep)),
    )

    # General, per-event columns come first; absent ones are simply skipped.
    general_cols = ['eventno', 'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton']
    result_data = {col: row[col] for col in general_cols if col in row.index}

    # Copy the features of each selected particle under its new 1-based number.
    for prefix, particle_indices in selected:
        for output_num, particle_idx in enumerate(particle_indices, start=1):
            for feature in base_features:
                source_col = f'{prefix}{particle_idx}_{feature}'
                result_data[f'{prefix}{output_num}_{feature}'] = (
                    row[source_col] if source_col in row.index else np.nan
                )

    return pd.Series(result_data)


def _top_pt_particle_indices(row, prefix, n_total, n_keep):
    """Return the sorted 1-based indices of the `n_keep` highest-pT '<prefix>{i}' particles in `row`."""
    pt_cols = [
        col for col in (f'{prefix}{i}_pT' for i in range(1, n_total + 1))
        if col in row.index
    ]
    if not pt_cols or n_keep <= 0:
        return []
    # A row taken from a frame with any non-numeric column has object dtype,
    # which nlargest rejects; coercing keeps numeric rows unchanged.
    pt_values = pd.to_numeric(row[pt_cols], errors='coerce')
    top_cols = pt_values.nlargest(n_keep).index
    # Labels look like 'jet5_pT': strip the prefix from the part before '_' to get 5.
    return sorted(int(col.split('_')[0][len(prefix):]) for col in top_cols)


# --- Main Filtering Function ---
def filter_and_select_active_particles(df, base_features, n_jets_total, n_photons_total):
    """
    Apply `process_event_row` to every event and collect the results in a
    DataFrame with a fixed column order.

    The output columns are ordered as: the general columns ('eventno',
    'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton'),
    then 'jet{i}_<feature>' for i = 1..max(num_btag_jets), then
    'isophoton{i}_<feature>' for i = 1..max(num_isophoton). Only columns that
    were actually produced are kept. Events with fewer selected particles than
    the maximum get NaN in the unused slots, and rows that are entirely NaN
    are dropped.

    Prints a start and an end message. A tqdm progress bar is shown when
    `tqdm.pandas()` has been registered in the session.

    Args:
        df (pd.DataFrame): Input DataFrame, one event per row.
        base_features (list[str]): Feature names per particle.
        n_jets_total (int): Max jet index in the columns (e.g., 13).
        n_photons_total (int): Max photon index in the columns (e.g., 3).

    Returns:
        pd.DataFrame: A new DataFrame holding the selected particles' features
                      for each event.
    """
    print("Processing events to select active particles...")

    # `progress_apply` only exists after `tqdm.pandas()` has been called in this
    # kernel; fall back to plain `apply` (same result, no progress bar) otherwise.
    apply_rows = getattr(df, 'progress_apply', None) or df.apply
    processed = apply_rows(
        process_event_row,
        axis=1,
        base_features=base_features,
        n_jets_total=n_jets_total,
        n_photons_total=n_photons_total,
    )

    # Drop rows where processing failed (the row function returned None) or
    # that contain no values at all.
    processed = processed.dropna(how='all')

    # Row-wise apply already yields a DataFrame when the row function returns
    # Series; only the degenerate case (a Series result) needs wrapping.
    result_df = processed if isinstance(processed, pd.DataFrame) else pd.DataFrame(processed)

    # The largest per-event counts determine how many numbered particle slots
    # can appear in the output.
    max_jets_kept = _max_particle_count(df, 'num_btag_jets')
    max_photons_kept = _max_particle_count(df, 'num_isophoton')

    general_cols = ['eventno', 'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton']
    ordered_cols = list(general_cols)
    for prefix, max_kept in (('jet', max_jets_kept), ('isophoton', max_photons_kept)):
        for i in range(1, max_kept + 1):
            ordered_cols.extend(f'{prefix}{i}_{feature}' for feature in base_features)

    # Keep only columns that actually exist in the result and reorder them.
    final_cols = [col for col in ordered_cols if col in result_df.columns]
    result_df = result_df[final_cols]

    print("Processing complete.")
    return result_df


def _max_particle_count(df, column):
    """Return the largest value of `column` as an int, or 0 if the column is absent, empty or all NaN."""
    if column not in df.columns:
        return 0
    max_value = df[column].max()
    # max() of an empty or all-NaN column is NaN, which cannot be used as a count.
    return 0 if pd.isna(max_value) else int(max_value)

In [10]:
# Reduce the ax15 table to the general columns plus the selected jets/photons,
# then print a short summary of the result.
ax15_selected_data = filter_and_select_active_particles(
    df=ax15_data,
    base_features=BASE_FEATURES,
    n_jets_total=N_JETS_TOTAL,
    n_photons_total=N_PHOTONS_TOTAL,
)

print("\n--- Filtered DataFrame Info ---")
print(f"Shape: {ax15_selected_data.shape}")
print("Columns:", ax15_selected_data.columns.tolist())

# Positional row 5 serves as a spot check of one processed event.
print("\nExample processed event (row 5):")
print(ax15_selected_data.iloc[5])

Processing events to select active particles...


Processing Events:   0%|          | 0/2494 [00:00<?, ?it/s]

Processing complete.

--- Filtered DataFrame Info ---
Shape: (2494, 26)
Columns: ['eventno', 'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'isophoton1_Eta', 'isophoton1_Phi', 'isophoton1_pT', 'isophoton1_Px', 'isophoton1_Py', 'isophoton1_Pz', 'isophoton1_E']

Example processed event (row 5):
eventno            104577.000000
jetmultiplicity         2.000000
event_label             0.000000
num_btag_jets           2.000000
num_isophoton           1.000000
jet1_Eta                0.409978
jet1_Phi                1.904410
jet1_pT                31.575600
jet1_Px               -10.339700
jet1_Py                29.834800
jet1_Pz                13.311000
jet1_E                 34.266700
jet2_Eta                0.845292
jet2_Phi               -1.495100
jet2_pT                22.977200
jet2_Px                 1.737650
j

In [11]:
# Reduce the ax75 table to the general columns plus the selected jets/photons,
# then print a short summary of the result.
ax75_selected_data = filter_and_select_active_particles(
    df=ax75_data,
    base_features=BASE_FEATURES,
    n_jets_total=N_JETS_TOTAL,
    n_photons_total=N_PHOTONS_TOTAL,
)

print("\n--- Filtered DataFrame Info ---")
print(f"Shape: {ax75_selected_data.shape}")
print("Columns:", ax75_selected_data.columns.tolist())

# Positional row 5 serves as a spot check of one processed event.
print("\nExample processed event (row 5):")
print(ax75_selected_data.iloc[5])

Processing events to select active particles...


Processing Events:   0%|          | 0/6702 [00:00<?, ?it/s]

Processing complete.

--- Filtered DataFrame Info ---
Shape: (6702, 26)
Columns: ['eventno', 'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'isophoton1_Eta', 'isophoton1_Phi', 'isophoton1_pT', 'isophoton1_Px', 'isophoton1_Py', 'isophoton1_Pz', 'isophoton1_E']

Example processed event (row 5):
eventno            194666.000000
jetmultiplicity         2.000000
event_label             1.000000
num_btag_jets           2.000000
num_isophoton           1.000000
jet1_Eta               -0.113118
jet1_Phi               -0.486638
jet1_pT                35.555900
jet1_Px                31.428200
jet1_Py               -16.628000
jet1_Pz                -4.030610
jet1_E                 35.783600
jet2_Eta               -1.607380
jet2_Phi               -2.907960
jet2_pT                35.310700
jet2_Px               -34.351300
j

# Invariant Mass and DeltaR calculation

In [12]:
def calculate_invariant_mass_jjg(df):
    """
    Compute the invariant mass of the (jet1, jet2, isophoton1) system.

    The E, Px, Py and Pz components of the three particles are summed and the
    mass is taken as M = sqrt(max(0, E^2 - Px^2 - Py^2 - Pz^2)). The result is
    written to a new column 'inv_mass_2j1p'. `df` is modified in place and
    also returned. Progress messages are printed.

    Args:
        df (pd.DataFrame): DataFrame with the columns '<p>_E', '<p>_Px',
                           '<p>_Py', '<p>_Pz' for p in 'jet1', 'jet2',
                           'isophoton1'.

    Returns:
        pd.DataFrame: The same DataFrame with the 'inv_mass_2j1p' column added.
                      A row is NaN when any of its inputs is NaN. A negative
                      M^2 is clamped and yields 0, not NaN. If any required
                      column is missing, a warning is printed and the whole
                      column is NaN.
    """
    print("Calculating invariant mass (inv_mass_2j1p)...")

    particles = ('jet1', 'jet2', 'isophoton1')
    components = ('E', 'Px', 'Py', 'Pz')
    # Particle-major order: jet1_E, jet1_Px, ..., isophoton1_Pz.
    req_cols = [f'{particle}_{component}' for particle in particles for component in components]

    # Without the full set of four-vector components no mass can be formed.
    missing_cols = [col for col in req_cols if col not in df.columns]
    if missing_cols:
        print(f"Warning: Missing required columns for invariant mass calculation: {missing_cols}")
        print("         'inv_mass_2j1p' column will be filled with NaN.")
        df['inv_mass_2j1p'] = np.nan
        return df

    # Total four-momentum of the three-particle system, one Series per component.
    total = {
        component: df[f'jet1_{component}'] + df[f'jet2_{component}'] + df[f'isophoton1_{component}']
        for component in components
    }

    # M^2 = E^2 - Px^2 - Py^2 - Pz^2. Clamp at 0 so rounding errors cannot put a
    # negative number under the square root; NaN inputs stay NaN.
    m_squared = np.maximum(0, total['E']**2 - total['Px']**2 - total['Py']**2 - total['Pz']**2)
    df['inv_mass_2j1p'] = np.sqrt(m_squared)

    print("Invariant mass calculation complete.")
    return df

# --- Function to Calculate Delta R (j1j2) ---
def calculate_deltaR_j1j2(df):
    """
    Compute the angular separation Delta R between jet1 and jet2.

    Delta R = sqrt( (Eta1 - Eta2)^2 + dPhi^2 ), where dPhi is the Phi
    difference wrapped into the range -pi..pi. The result is stored in a new
    column 'deltaR_j1j2'. `df` is modified in place and also returned.
    Progress messages are printed.

    Args:
        df (pd.DataFrame): DataFrame with the columns 'jet1_Eta', 'jet1_Phi',
                           'jet2_Eta' and 'jet2_Phi'.

    Returns:
        pd.DataFrame: The same DataFrame with the 'deltaR_j1j2' column added.
                      A row is NaN when any of its inputs is NaN. If a
                      required column is missing, a warning is printed and
                      the whole column is NaN.
    """
    print("Calculating Delta R (j1j2)...")

    needed = ['jet1_Eta', 'jet1_Phi', 'jet2_Eta', 'jet2_Phi']
    absent = [name for name in needed if name not in df.columns]
    if absent:
        print(f"Warning: Missing required columns for Delta R calculation: {absent}")
        print("         'deltaR_j1j2' column will be filled with NaN.")
        df['deltaR_j1j2'] = np.nan
        return df

    d_eta = df['jet1_Eta'] - df['jet2_Eta']
    d_phi_unwrapped = df['jet1_Phi'] - df['jet2_Phi']

    # Phi is periodic: taking the angle of (cos, sin) folds the raw difference
    # back into the -pi..pi range.
    d_phi = np.arctan2(np.sin(d_phi_unwrapped), np.cos(d_phi_unwrapped))

    df['deltaR_j1j2'] = np.sqrt(d_eta**2 + d_phi**2)

    print("Delta R calculation complete.")
    return df


# --- Combined Function to Add Derived Features ---
def add_derived_features(df):
    """
    Add the derived physics columns 'inv_mass_2j1p' and 'deltaR_j1j2'.

    For a non-empty frame, `calculate_invariant_mass_jjg` runs first and
    `calculate_deltaR_j1j2` second, each adding its column to `df` in place.
    For an empty frame a message is printed and both columns are created
    holding NaN.

    Args:
        df (pd.DataFrame): The DataFrame after selecting active particles,
                           with columns such as 'jet1_E', 'jet2_Phi' and
                           'isophoton1_Pz'.

    Returns:
        pd.DataFrame: The same DataFrame with 'inv_mass_2j1p' and
                      'deltaR_j1j2' added.
    """
    if not df.empty:
        # Mass first, then Delta R, so the printed messages keep their order.
        return calculate_deltaR_j1j2(calculate_invariant_mass_jjg(df))

    print("Input DataFrame is empty. Cannot add derived features.")
    # Still create the columns so the output schema is the same either way.
    df['inv_mass_2j1p'] = np.nan
    df['deltaR_j1j2'] = np.nan
    return df

In [13]:
# Work on a copy so the selected-particle table itself is left unchanged.
ax15_final_invmass_deltaR = add_derived_features(ax15_selected_data.copy())

print("\n--- Final DataFrame Info ---")
print(f"Shape: {ax15_final_invmass_deltaR.shape}")
print("Columns:", ax15_final_invmass_deltaR.columns.tolist())

# Show the derived columns next to the pT values of the particles they come from.
print("\nFinal DataFrame with derived features:")
print(ax15_final_invmass_deltaR[[
    'eventno',
    'jet1_pT',
    'jet2_pT',
    'isophoton1_pT',
    'inv_mass_2j1p',
    'deltaR_j1j2',
]])

Calculating invariant mass (inv_mass_2j1p)...
Invariant mass calculation complete.
Calculating Delta R (j1j2)...
Delta R calculation complete.

--- Final DataFrame Info ---
Shape: (2494, 28)
Columns: ['eventno', 'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'isophoton1_Eta', 'isophoton1_Phi', 'isophoton1_pT', 'isophoton1_Px', 'isophoton1_Py', 'isophoton1_Pz', 'isophoton1_E', 'inv_mass_2j1p', 'deltaR_j1j2']

Final DataFrame with derived features:
       eventno  jet1_pT  jet2_pT  isophoton1_pT  inv_mass_2j1p  deltaR_j1j2
0     387541.0  31.4575  29.7383        49.7504     127.326288     2.604160
1     411307.0  39.1901  29.6660        55.3095     135.737860     0.784236
2      48351.0  28.7248  25.3380        35.0363      88.972461     1.819248
3     265808.0  60.4259  44.4943        17.3703     120.716535     3

In [14]:
# Work on a copy so the selected-particle table itself is left unchanged.
ax75_final_invmass_deltaR = add_derived_features(ax75_selected_data.copy())

print("\n--- Final DataFrame Info ---")
print(f"Shape: {ax75_final_invmass_deltaR.shape}")
print("Columns:", ax75_final_invmass_deltaR.columns.tolist())

# Show the derived columns next to the pT values of the particles they come from.
print("\nFinal DataFrame with derived features:")
print(ax75_final_invmass_deltaR[[
    'eventno',
    'jet1_pT',
    'jet2_pT',
    'isophoton1_pT',
    'inv_mass_2j1p',
    'deltaR_j1j2',
]])

Calculating invariant mass (inv_mass_2j1p)...
Invariant mass calculation complete.
Calculating Delta R (j1j2)...
Delta R calculation complete.

--- Final DataFrame Info ---
Shape: (6702, 28)
Columns: ['eventno', 'jetmultiplicity', 'event_label', 'num_btag_jets', 'num_isophoton', 'jet1_Eta', 'jet1_Phi', 'jet1_pT', 'jet1_Px', 'jet1_Py', 'jet1_Pz', 'jet1_E', 'jet2_Eta', 'jet2_Phi', 'jet2_pT', 'jet2_Px', 'jet2_Py', 'jet2_Pz', 'jet2_E', 'isophoton1_Eta', 'isophoton1_Phi', 'isophoton1_pT', 'isophoton1_Px', 'isophoton1_Py', 'isophoton1_Pz', 'isophoton1_E', 'inv_mass_2j1p', 'deltaR_j1j2']

Final DataFrame with derived features:
       eventno  jet1_pT  jet2_pT  isophoton1_pT  inv_mass_2j1p  deltaR_j1j2
0      54229.0  26.0517  22.9301        27.4569      76.497345     1.729497
1     451531.0  27.9884  27.1288        28.0520      81.254966     0.849681
2      29203.0  66.2397   0.0000        27.8527      80.421951     2.425907
3      52691.0  51.1242  29.3283        15.9033     123.826058     2

# Saving the Final Datasets

In [15]:
# Write both processed tables as tab-separated text into the current working directory.
# NOTE: to_csv is called with its default index handling, so the row index is
# written as an extra unnamed first column; readers of these files see it as
# 'Unnamed: 0' unless they pass index_col=0.
ax75_final_invmass_deltaR.to_csv('./ax75_signalBackground_2j1p_invMass_deltaR_data.txt', sep = '\t')
ax15_final_invmass_deltaR.to_csv('./ax15_signalBackground_2j1p_invMass_deltaR_data.txt', sep = '\t')