In [1]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Input
from keras.callbacks import EarlyStopping
import os

2025-06-20 10:53:05.772156: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
file_dict = {
    'Background': '/tmp/all_data/background_for_training.h5',
    'Ato4l': '/tmp/all_data/Ato4l_lepFilter_13TeV_filtered.h5',
    'hChToTauNu': '/tmp/all_data/hChToTauNu_13TeV_PU20_filtered.h5',
    'hToTauTau' : '/tmp/all_data/hToTauTau_13TeV_PU20_filtered.h5', 
    'leptoquark': '/tmp/all_data/leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5',
}

background_file = "/tmp/all_data/background_for_training.h5" # background data file

signal_files = [ # signal data files
    
    '/tmp/all_data/Ato4l_lepFilter_13TeV_filtered.h5',
    '/tmp/all_data/hChToTauNu_13TeV_PU20_filtered.h5',
    '/tmp/all_data/hToTauTau_13TeV_PU20_filtered.h5', 
    '/tmp/all_data/leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5',
]

output_names = [ # names for output files
    "Ato4l_lepFilter",
    "hChToTauNu",
    "hToTauTau",
    "leptoquark_LOWMASS"
]

# creating output folders 
output_folder = '/tmp/all_data/combined_datasets/'
model_output = "/tmp/all_data/models/"

os.makedirs(model_output, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

In [3]:
with h5py.File(file_dict['Ato4l'], 'r') as f:
    particles = f['Particles'][:]
    classes = f['Particles_Classes'][:]
    names = f['Particles_Names'][:]
    event_ids = f.get('EvtId')

print(f'particles shape: {particles.shape}')
print(f'classes shape: {classes.shape}')
print(f'names shape: {names.shape}')
print(f'event_ids shape: {event_ids.shape if event_ids is not None else "No event_ids found"}')

particles shape: (55969, 19, 4)
classes shape: (4,)
names shape: (4,)
event_ids shape: No event_ids found


In [4]:
print('particles')
print(particles)
print('classes')
print(classes)
print('names')
print(names)
print(event_ids)

particles
[[[ 9.23502350e+00  0.00000000e+00 -1.88284254e+00  1.00000000e+00]
  [ 2.70772038e+01  2.10681868e+00 -2.32072282e+00  2.00000000e+00]
  [ 2.27008305e+01  2.02266026e+00  3.09363747e+00  2.00000000e+00]
  ...
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 2.26857662e+01  0.00000000e+00  1.47127521e+00  1.00000000e+00]
  [ 2.84564648e+01 -1.08470809e+00 -3.89027774e-01  2.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  ...
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 6.76132965e+00  0.00000000e+00  1.15189862e+00  1.00000000e+00]
  [ 2.80287857e+01  8.48856047e-02 -1.33467388e+00  2.00000000e+00]
  [ 0.00000000e+00  0.

In [5]:
for name, file_path in file_dict.items():
    with h5py.File(file_dict[name], 'r') as f:
        particles = f['Particles'][:]
        classes = f['Particles_Classes'][:]
        names = f['Particles_Names'][:]
        event_ids = f.get('EvtId')

    print(f'Processing {name} from {file_path}')
    print(f'particles shape: {particles.shape}')
    print(f'classes shape: {classes.shape}')
    print(f'names shape: {names.shape}')
    print(f'event_ids shape: {event_ids.shape if event_ids is not None else "No event_ids found"}')

Processing Background from /tmp/all_data/background_for_training.h5
particles shape: (13451915, 19, 4)
classes shape: (4,)
names shape: (4,)
event_ids shape: No event_ids found
Processing Ato4l from /tmp/all_data/Ato4l_lepFilter_13TeV_filtered.h5
particles shape: (55969, 19, 4)
classes shape: (4,)
names shape: (4,)
event_ids shape: No event_ids found
Processing hChToTauNu from /tmp/all_data/hChToTauNu_13TeV_PU20_filtered.h5
particles shape: (760272, 19, 4)
classes shape: (368,)
names shape: (368,)
event_ids shape: No event_ids found
Processing hToTauTau from /tmp/all_data/hToTauTau_13TeV_PU20_filtered.h5
particles shape: (691283, 19, 4)
classes shape: (388,)
names shape: (388,)
event_ids shape: No event_ids found
Processing leptoquark from /tmp/all_data/leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5
particles shape: (340544, 19, 4)
classes shape: (4,)
names shape: (4,)
event_ids shape: No event_ids found


In [None]:
def remove_padding_flat(particles):
    # reshape to (N * 19, 4)
    flattened = particles.reshape(-1, 4)
    
    # filter out rows where class is 0 (padding)
    non_padded = flattened[flattened[:, 3] != 0]
    
    return non_padded

In [7]:
for name, file_path in file_dict.items():
    with h5py.File(file_dict[name], 'r') as f:
        particles = f['Particles'][:]
        particles = remove_padding_flat(particles)
        classes = f['Particles_Classes'][:]
        names = f['Particles_Names'][:]
        event_ids = f.get('EvtId')

    print(f'Processing {name} from {file_path}')
    print(f'particles shape: {particles.shape}')
    print(f'classes shape: {classes.shape}')
    print(f'names shape: {names.shape}')
    print(f'event_ids shape: {event_ids.shape if event_ids is not None else "No event_ids found"}')

Processing Background from /tmp/all_data/background_for_training.h5
particles shape: (43876265, 4)
classes shape: (4,)
names shape: (4,)
event_ids shape: No event_ids found
Processing Ato4l from /tmp/all_data/Ato4l_lepFilter_13TeV_filtered.h5
particles shape: (348790, 4)
classes shape: (4,)
names shape: (4,)
event_ids shape: No event_ids found
Processing hChToTauNu from /tmp/all_data/hChToTauNu_13TeV_PU20_filtered.h5
particles shape: (4486740, 4)
classes shape: (368,)
names shape: (368,)
event_ids shape: No event_ids found
Processing hToTauTau from /tmp/all_data/hToTauTau_13TeV_PU20_filtered.h5
particles shape: (3325050, 4)
classes shape: (388,)
names shape: (388,)
event_ids shape: No event_ids found
Processing leptoquark from /tmp/all_data/leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5
particles shape: (1955868, 4)
classes shape: (4,)
names shape: (4,)
event_ids shape: No event_ids found


In [8]:
all_features = []

for name, filepath in file_dict.items():
    with h5py.File(file_dict[name], 'r') as f:
        particles = f['Particles'][:]
        particles = remove_padding_flat(particles)
    all_features.append(particles)

# combine all datasets into one array
combined_features = np.vstack(all_features)

# fit scaler to all combined data 
scaler = StandardScaler() # saved as scaler, can now call this later 
scaler.fit(combined_features)

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [9]:
def create_supervised_dataset(bkg_file, signal_file, scaler, output_path, events = None, test_size=0.2, val_size=0.2, input_shape=57, random_num=42):
    # load and process background file 
    with h5py.File(bkg_file, 'r') as file:
        bkg_data = file['Particles'][:, :, :-1] # remove the last feature (label)
        np.random.shuffle(bkg_data) # shuffle data to remove bias and ensure randomness
        if events: # if events not None 
            bkg_data = bkg_data[:events, :, :]
        bkg_data_flattened = bkg_data.reshape(bkg_data.shape[0], input_shape)
        y_bkg = np.zeros(bkg_data.shape[0]) #0's for background events

    # load and process signal file
    with h5py.File(signal_file, 'r') as file:
        sig_data = file['Particles'][:, :, :-1] #drop last feature
        if events: # if events not None
            sig_data = sig_data[:events, :, :] 
        sig_data_flattened = sig_data.reshape(sig_data.shape[0], input_shape)
        y_sig = np.ones(sig_data.shape[0]) # 1's for signal events 

        # Subsample background to match signal size
        n_signal = sig_data.shape[0]
        if bkg_data.shape[0] > n_signal:
            indices = np.random.choice(bkg_data.shape[0], size=n_signal, replace=False)
            bkg_data_balanced = bkg_data[indices]
            bkg_data_flat_balanced = bkg_data_flattened[indices]
            y_bkg_balanced = y_bkg[indices]
        else:
            bkg_data_balanced = bkg_data
            bkg_data_flat_balanced = bkg_data_flattened
            y_bkg_balanced = y_bkg

    # Combine balanced datasets
    #X = np.vstack((bkg_data_balanced, sig_data))  # shape: (n_events, 19, 3)
    X_flattened = np.vstack((bkg_data_flat_balanced, sig_data_flattened))  # shape: (n_events, 57)
    y = np.concatenate((y_bkg_balanced, y_sig))

    # Normalise using provided global scaler
    X_flat_scaled = scaler.transform(X_flattened)

    # Train/val/test splitting
    X_train, X_test, y_train, y_test = train_test_split(X_flat_scaled, y, test_size=test_size, stratify=y, random_state=random_num)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, stratify=y_train, random_state=random_num)

    with h5py.File(output_path, 'w') as h5f:
        h5f.create_dataset('X_train', data=X_train)
        h5f.create_dataset('y_train', data=y_train)
        h5f.create_dataset('X_val', data=X_val)
        h5f.create_dataset('y_val', data=y_val)
        h5f.create_dataset('X_test', data=X_test)
        h5f.create_dataset('y_test', data=y_test)
        #h5f.create_dataset('X', data=X) # save the raw data (shape: (n_events, 19, 3)) for plotting combined distributions 

In [10]:
# create the combined datasets for each signal file 
for signal_file, output_name in zip(signal_files, output_names):
    output_path = f"{output_folder}/{output_name}_dataset.h5"
    create_supervised_dataset(background_file, signal_file, scaler, output_path, events= None, test_size=0.2, val_size=0.2, input_shape=57, random_num=42)

ValueError: X has 57 features, but StandardScaler is expecting 4 features as input.