In [1]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Input
from keras.callbacks import EarlyStopping
import os

2025-06-19 15:04:26.493787: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Sample label -> HDF5 path for every input file used in this notebook.
file_dict = {
    'Background': '/tmp/all_data/background_for_training.h5',
    'Ato4l': '/tmp/all_data/Ato4l_lepFilter_13TeV_filtered.h5',
    'hChToTauNu': '/tmp/all_data/hChToTauNu_13TeV_PU20_filtered.h5',
    'hToTauTau': '/tmp/all_data/hToTauTau_13TeV_PU20_filtered.h5',
    'leptoquark': '/tmp/all_data/leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5',
}

# Background sample; looked up in the mapping so the path is only typed once.
background_file = file_dict['Background']

# Signal samples: every entry except the background, in mapping order.
signal_files = [path for label, path in file_dict.items() if label != 'Background']

# File-name stems for generated outputs, index-aligned with signal_files.
output_names = ["Ato4l_lepFilter", "hChToTauNu", "hToTauTau", "leptoquark_LOWMASS"]

# Destination folders for generated datasets and trained models.
output_folder = '/tmp/all_data/combined_datasets/'
model_output = "/tmp/all_data/models/"

# exist_ok=True keeps re-running this cell harmless.
os.makedirs(model_output, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)


In [3]:
def load_particles(file_path):
    """Read the whole 'Particles' dataset of an HDF5 file into memory.

    Args:
        file_path: path of the HDF5 file to open read-only.

    Returns:
        The full contents of the file's 'Particles' dataset as an in-memory array.
    """
    with h5py.File(file_path, 'r') as h5_file:
        # Slicing with [:] copies the data out, so it stays valid after the file closes.
        return h5_file['Particles'][:]

def remove_padding(particles):
    """Drop padded particle slots and return the first three features of the rest.

    A slot counts as padding when its feature at index 3 equals 0.

    Args:
        particles: 3-D array indexed as (event, particle slot, feature) with at
            least four features per slot.

    Returns:
        2-D array with one row per kept slot and three columns (features 0-2;
        pt, eta, phi according to the original author's note), in event-major order.
    """
    keep = particles[:, :, 3] != 0
    kept_slots = particles[keep]  # boolean mask over the first two axes -> (n_kept, n_features)
    return kept_slots[:, :3]

def combine_signal_background(signal_path, background_path, output_path):
    """Write one HDF5 file holding a signal sample followed by the background sample.

    Args:
        signal_path: HDF5 file whose 'Particles' dataset forms the first rows.
        background_path: HDF5 file whose 'Particles' dataset is appended after them.
        output_path: HDF5 file to create (an existing file is overwritten).

    Returns:
        None. The result is written to ``output_path`` under the key 'Particles'.
    """
    background_data = load_particles(background_path)
    signal_data = load_particles(signal_path)

    # Signal events come first, then background, stacked along the event axis.
    stacked = np.concatenate((signal_data, background_data), axis=0)

    with h5py.File(output_path, 'w') as out_file:
        out_file.create_dataset('Particles', data=stacked)

In [None]:
# Write one "<name>_with_background.h5" file per signal sample.
for signal_path, output_name in zip(signal_files, output_names):
    output_path = os.path.join(
        output_folder,
        f'{output_name}_with_background.h5',
    )
    combine_signal_background(signal_path, background_file, output_path)
    print(f"saved: {output_path}")

In [4]:
# Gather the non-padded three-feature particle rows from every sample
# (background and all signals).
all_features = []
for filepath in file_dict.values():
    particles = load_particles(filepath)
    all_features.append(remove_padding(particles))

# Stack every sample's rows into one (n_particles, 3) array.
combined_features = np.concatenate(all_features, axis=0)

# Fit a single global scaler, kept as `scaler` for later cells. It is fit on
# per-particle rows (3 columns), not on flattened whole-event rows.
scaler = StandardScaler().fit(combined_features)

OSError: Can't synchronously read data (filter returned failure during read)

In [None]:
def _scale_flattened(scaler, X_flattened):
    """Apply ``scaler`` to flattened events, whatever row width it was fit on.

    A scaler fit on full flattened rows is applied directly. A scaler fit on a
    narrower row (for example one particle's features) is applied by viewing
    each event as consecutive groups of that width, transforming the groups,
    and restoring the flat layout.

    Raises:
        ValueError: if the event width is not a multiple of the scaler's width.
    """
    width = X_flattened.shape[1]
    # n_features_in_ is set by fit(); if the scaler does not expose it, assume
    # it matches the event width and let transform() validate.
    n_fit = getattr(scaler, 'n_features_in_', width)
    if n_fit == width:
        return scaler.transform(X_flattened)
    if width % n_fit != 0:
        raise ValueError(
            f"scaler was fit on {n_fit} features, which does not divide the "
            f"flattened event width {width}"
        )
    # NOTE(review): padded (all-zero) particle slots are transformed like any
    # other row here; confirm that is the intended treatment of padding.
    per_group = X_flattened.reshape(-1, n_fit)
    return scaler.transform(per_group).reshape(X_flattened.shape)


def create_supervised_dataset(bkg_file, signal_file, scaler, output_path, events = None, test_size=0.2, val_size=0.2, input_shape=57, random_num=42):
    """Build a balanced, scaled train/val/test dataset for one signal and save it.

    Background events are labelled 0 and signal events 1. The background is
    shuffled, both samples are optionally capped at ``events``, and the
    background is subsampled down to the signal count when it is larger.
    The flattened events are scaled and split with stratification.

    Args:
        bkg_file: HDF5 file with a 3-D 'Particles' dataset of background events.
        signal_file: HDF5 file with a 3-D 'Particles' dataset of signal events.
        scaler: fitted scaler exposing ``transform``. It may have been fit on
            flattened events (``input_shape`` columns) or on a narrower row
            whose width divides ``input_shape`` (e.g. per-particle features).
        output_path: HDF5 file to create; receives X_train, y_train, X_val,
            y_val, X_test and y_test.
        events: optional cap on the number of events taken from each sample;
            a falsy value (None or 0) keeps everything.
        test_size: fraction of all events held out for testing.
        val_size: fraction of the remaining events held out for validation.
        input_shape: width of one flattened event (slots x kept features).
        random_num: seed used for the shuffle, the balancing draw and both splits.

    Raises:
        ValueError: if an event cannot be reshaped to ``input_shape`` columns
            or the scaler's width is incompatible with ``input_shape``.
    """
    # One seeded generator drives the shuffle and the balancing draw, so the
    # written dataset is reproducible for a given random_num.
    rng = np.random.default_rng(random_num)

    # Background: drop the last feature column, shuffle, then cap.
    with h5py.File(bkg_file, 'r') as file:
        bkg_data = file['Particles'][:, :, :-1]
    rng.shuffle(bkg_data)  # in place, along the event axis
    if events:  # falsy (None/0) means "no cap"
        bkg_data = bkg_data[:events, :, :]
    bkg_flat = bkg_data.reshape(bkg_data.shape[0], input_shape)

    # Signal: same column drop and cap (taken from the start of the file, unshuffled).
    with h5py.File(signal_file, 'r') as file:
        sig_data = file['Particles'][:, :, :-1]
    if events:
        sig_data = sig_data[:events, :, :]
    sig_flat = sig_data.reshape(sig_data.shape[0], input_shape)

    # Subsample background to match the signal size when background is larger.
    n_signal = sig_flat.shape[0]
    if bkg_flat.shape[0] > n_signal:
        keep = rng.choice(bkg_flat.shape[0], size=n_signal, replace=False)
        bkg_flat = bkg_flat[keep]

    # Combine balanced samples: background rows first (label 0), then signal (label 1).
    X_flattened = np.vstack((bkg_flat, sig_flat))  # shape: (n_events, input_shape)
    y = np.concatenate((np.zeros(bkg_flat.shape[0]), np.ones(n_signal)))

    # Normalise using the provided global scaler.
    X_flat_scaled = _scale_flattened(scaler, X_flattened)

    # Train/val/test splitting; stratify keeps the class ratio in every split.
    X_train, X_test, y_train, y_test = train_test_split(X_flat_scaled, y, test_size=test_size, stratify=y, random_state=random_num)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, stratify=y_train, random_state=random_num)

    with h5py.File(output_path, 'w') as h5f:
        h5f.create_dataset('X_train', data=X_train)
        h5f.create_dataset('y_train', data=y_train)
        h5f.create_dataset('X_val', data=X_val)
        h5f.create_dataset('y_val', data=y_val)
        h5f.create_dataset('X_test', data=X_test)
        h5f.create_dataset('y_test', data=y_test)

In [None]:
# Build one balanced train/val/test file per signal sample.
for output_name, signal_file in zip(output_names, signal_files):
    output_path = f"{output_folder}/{output_name}_dataset.h5"
    create_supervised_dataset(
        background_file,
        signal_file,
        scaler,
        output_path,
        events=10000,
        test_size=0.2,
        val_size=0.2,
        input_shape=57,
        random_num=42,
    )

In [None]:
def build_model(input_dim):
    """Create and compile the supervised binary-classifier network.

    Architecture: input of width ``input_dim`` -> Dense(64, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        input_dim: number of features in one input row.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    classifier = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),  # single sigmoid unit -> score in [0, 1]
    ])
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Train one classifier per signal dataset, save it, and plot its learning curves.
for output_name in output_names:
    file_path = f"{output_folder}/{output_name}_dataset.h5"
    with h5py.File(file_path, 'r') as f:
        X_train = f['X_train'][:]
        y_train = f['y_train'][:]
        X_val = f['X_val'][:]
        y_val = f['y_val'][:]

    # Take the input width from the data so it cannot drift from the width
    # used when the dataset file was written.
    model = build_model(input_dim=X_train.shape[1])
    # Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=50, batch_size=128, callbacks=[early_stop])

    # Save the model
    model.save(f"{model_output}/{output_name}_model.h5")

    # Plot training: loss on the left, accuracy on the right.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))

    ax_loss.plot(history.history['loss'], label='Train Loss')
    ax_loss.plot(history.history['val_loss'], label='Validation Loss')
    ax_loss.set_xlabel("Epoch")
    ax_loss.set_ylabel("Loss")
    ax_loss.set_title(f"{output_name} Loss")
    ax_loss.legend()
    ax_loss.grid()

    ax_acc.plot(history.history['accuracy'], label='Train Acc')
    ax_acc.plot(history.history['val_accuracy'], label='Validation Acc')
    ax_acc.set_xlabel("Epoch")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.set_title(f"{output_name} Accuracy")
    ax_acc.legend()
    ax_acc.grid()

    plt.show()
    plt.close(fig)  # release the figure so repeated iterations do not accumulate them



In [None]:
# cross testing models (ROC CURVES)

# Load every trained model once up front; loading inside the nested loops
# would deserialise each model again for every test set.
trained_models = {
    train_name: load_model(f"{model_output}/{train_name}_model.h5")
    for train_name in output_names
}

# One figure per test set, with one ROC curve per trained model.
for test_name in output_names:
    with h5py.File(f"{output_folder}/{test_name}_dataset.h5", 'r') as f:
        X_test = f['X_test'][:]
        y_test = f['y_test'][:]

    plt.figure(figsize=(8,6))

    for train_name, model in trained_models.items():
        # predict() returns one column of scores; flatten it to 1-D for sklearn.
        y_pred = model.predict(X_test).flatten()
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        plt.plot(fpr, tpr, label=f"Model [{train_name}] AUC: {auc:.3f}")

    # Diagonal = performance of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC Curves: All models tested on [{test_name}] dataset")
    plt.legend()
    plt.grid()
    plt.show()
