In [4]:
import h5py
import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import roc_auc_score, roc_curve
# from keras.models import Sequential, load_model
# from keras.layers import Dense, Dropout, Input
# from keras.callbacks import EarlyStopping
import os

In [5]:
# Single root for every input and output path below; edit this one line to
# relocate the whole dataset.
DATA_DIR = '/home/s1974482/Desktop/MScDissertation/all_data'

background_file = os.path.join(DATA_DIR, 'background_for_training.h5') # background data file

# One row per signal sample: (file_dict key, HDF5 file name, output-file stem).
# file_dict, signal_files and output_names are all derived from this table, so
# the three collections cannot drift out of step with one another.
_SIGNAL_SAMPLES = [
    ('Ato4l', 'Ato4l_lepFilter_13TeV_filtered.h5', 'Ato4l_lepFilter'),
    ('hChToTauNu', 'hChToTauNu_13TeV_PU20_filtered.h5', 'hChToTauNu'),
    ('hToTauTau', 'hToTauTau_13TeV_PU20_filtered.h5', 'hToTauTau'),
    ('leptoquark', 'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5', 'leptoquark_LOWMASS'),
]

# signal data files, in the same order as _SIGNAL_SAMPLES
signal_files = [
    os.path.join(DATA_DIR, file_name) for _key, file_name, _stem in _SIGNAL_SAMPLES
]

# names for output files, parallel to signal_files
output_names = [stem for _key, _file_name, stem in _SIGNAL_SAMPLES]

# dictionary for file paths + names: background first, then each signal sample
file_dict = {
    'Background': background_file,
    **{
        key: os.path.join(DATA_DIR, file_name)
        for key, file_name, _stem in _SIGNAL_SAMPLES
    },
}

# creating output folders
# (the trailing empty component keeps the trailing '/' on both folder paths)
output_folder = os.path.join(DATA_DIR, 'combined_datasets', '')
model_output = os.path.join(DATA_DIR, 'models', '')

os.makedirs(model_output, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)


In [6]:
# load particle data from file
def load_particles(file_path):
    """Read the entire 'Particles' dataset of an HDF5 file into memory.

    Parameters
    ----------
    file_path
        Path of the HDF5 file; it is opened read-only.

    Returns
    -------
    The full contents of the file's 'Particles' dataset, obtained by slicing
    the dataset with ``[:]``.
    """
    with h5py.File(file_path, mode='r') as h5_file:
        dataset = h5_file['Particles']
        # slice while the file is still open so the values are copied out
        # before the handle is closed
        return dataset[:]

def remove_padding(particles):
    """Keep the first three features of every particle slot whose column 3 is non-zero.

    Parameters
    ----------
    particles
        Array indexed as ``[event, slot, feature]`` with at least four features
        per slot. The original note describes the first three as pt, eta, phi
        and a zero in column 3 as marking a padded slot -- TODO confirm against
        the dataset documentation.

    Returns
    -------
    A 2-D array with one row per retained slot and three columns. Event
    boundaries are not preserved: rows from all events are stacked in
    event-then-slot order.
    """
    # positions (event, slot) where the fourth feature is different from zero
    event_idx, slot_idx = np.nonzero(particles[:, :, 3])
    # pick those slots and truncate each one to its first three features
    return particles[event_idx, slot_idx, :3]

def combine_signal_background(signal_path, background_path, output_path):
    """Concatenate one signal sample with the background sample and write the result.

    Both inputs are read with ``load_particles`` and joined along the first
    axis, signal rows first and background rows after. No label dataset is
    written, so which rows are signal is implied by row order only.

    Parameters
    ----------
    signal_path
        HDF5 file holding the signal 'Particles' dataset.
    background_path
        HDF5 file holding the background 'Particles' dataset.
    output_path
        Destination HDF5 file; opened with mode 'w' and given a single
        'Particles' dataset containing the merged array.

    Returns
    -------
    None
    """
    # the background file is read first, then the signal file
    bkg_particles = load_particles(background_path)
    sig_particles = load_particles(signal_path)

    # signal block on top, background block underneath
    merged = np.concatenate((sig_particles, bkg_particles), axis=0)

    # write the merged array out under the same dataset name the inputs use
    with h5py.File(output_path, mode='w') as out_file:
        out_file.create_dataset('Particles', data=merged)

# Merge every signal sample with the background and write one combined file per signal.
# output_names and signal_files are parallel lists; zip() would silently drop the
# unmatched tail if they ever differed in length, so check that up front.
if len(output_names) != len(signal_files):
    raise ValueError(
        f"output_names has {len(output_names)} entries "
        f"but signal_files has {len(signal_files)}"
    )

for output_name, signal_path in zip(output_names, signal_files):
    output_path = os.path.join(output_folder, f'{output_name}_with_background.h5')
    combine_signal_background(signal_path, background_file, output_path)
    print(f"saved: {output_path}")

: 

In [None]:
# Build one flattened (events, features) matrix per sample in file_dict so that
# a single scaler can be fitted on all of them together.
# NOTE(review): remove_padding() is not applied here, so every slot read from the
# files takes part in the fit, and the fit covers every entry of file_dict
# (background and signals alike) -- confirm both are intended.
all_features = []

for name, filepath in file_dict.items():
    data = load_particles(filepath)
    data = data[:, :, :-1]  # drop the last per-particle column before scaling
    # flatten each event to one row; the row width is computed from the array
    # itself instead of the hard-coded 57, so other slot/feature counts also work
    # (and a zero-event array still reshapes cleanly)
    flat_width = int(np.prod(data.shape[1:]))
    data = data.reshape(data.shape[0], flat_width)
    all_features.append(data)

# combine all datasets into one array
combined_features = np.vstack(all_features)

# fit scaler to all combined data
scaler = StandardScaler() # kept as `scaler` so later cells can reuse the fitted object
scaler.fit(combined_features)