In [1]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import random
import sklearn
import collections
from sklearn.model_selection import train_test_split
import json
import pylab
from scipy.optimize import curve_fit
from tensorflow.keras import layers, Model
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import load_model
from sklearn.metrics import roc_curve, auc
import matplotlib
import matplotlib.patches as mpatches
#import shap
import os
import pandas as pd
import tensorflow as tf
import tarfile
from tensorflow.keras.models import load_model
from qkeras import QActivation, QDense, QConv2D, QBatchNormalization

2024-10-10 15:57:08.186256: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-10 15:57:08.268256: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [33]:
# This is the dict that will hold all of the data (data22 and MC).
# Each key is a dataset tag; each value is a dict of per-event numpy arrays.

datasets = {}

In [34]:
# Read MC data.
# Every HDF5 file found in data_path becomes one entry of `datasets`, keyed by
# the part of the file name that precedes the first underscore.

data_path = '../../../../ntuples/MC_07-17-2024/'

for filename in os.listdir(data_path):

    # Skip hidden files and files whose name starts with 'N'.
    # NOTE(review): why 'N*' files are excluded is not visible in this notebook.
    if filename.startswith('N') or filename.startswith('.'): continue

    dataset_tag = filename.split('_')[0]
    
    # data_path already ends with '/', so plain string concatenation yields a valid path.
    with h5py.File(data_path+filename, 'r') as hf:
        # NOTE(review): L1_jFexLR_jets, LRT_electrons, LRT_muons and L1_jFex_taus are
        # read here but not used anywhere else in this cell.
        HLT_jets = hf['HLT_jets'][:]
        L1_jFexSR_jets = hf['L1_jFexSR_jets'][:]
        L1_jFexLR_jets = hf['L1_jFexLR_jets'][:]
        HLT_electrons = hf['HLT_electrons'][:]
        LRT_electrons = hf['LRT_electrons'][:]
        L1_egammas = hf['L1_egammas'][:]
        HLT_muons = hf['HLT_muons'][:]
        LRT_muons = hf['LRT_muons'][:]
        L1_muons = hf['L1_muons'][:]
        L1_eFex_taus = hf['L1_eFex_taus'][:]
        L1_jFex_taus = hf['L1_jFex_taus'][:]
        HLT_photons = hf['HLT_photons'][:]
        HLT_MET = hf['HLT_MET'][:].reshape(-1, 1, 4)  # Broadcasting MET
        L1_MET = hf['L1_MET'][:].reshape(-1, 1, 3)
        pass_L1_unprescaled = hf["pass_L1_unprescaled"][:]
        pass_HLT_unprescaled = hf["pass_HLT_unprescaled"][:]

        # Per event: first 6 jets, 3 electrons, 3 muons, 3 photons and the single MET
        # entry (16 objects in total), keeping only columns 0, 2 and 3 of each object.
        HLT_objects = np.concatenate([HLT_jets[:, :6, [0, 2, 3]], HLT_electrons[:, :3, [0, 2, 3]], HLT_muons[:, :3, [0, 2, 3]], HLT_photons[:, :3, [0, 2, 3]], HLT_MET[:, :, [0, 2, 3]]], axis=1)
        # Per event: first 6 SR jets, 3 e/gammas, 3 muons, 3 eFex taus and MET, all columns kept.
        L1_objects = np.concatenate([L1_jFexSR_jets[:, :6, :], L1_egammas[:, :3, :], L1_muons[:, :3, :], L1_eFex_taus[:, :3, :], L1_MET], axis=1)
        
        datasets[dataset_tag] = {
            'HLT_data': HLT_objects,
            'L1_data': L1_objects,
            'passL1': pass_L1_unprescaled==1,
            'passHLT': pass_HLT_unprescaled==1,
            'weights': np.ones(len(HLT_objects)),  # every MC event gets weight 1
        }

        # Cap each MC sample at its first 100000 events.
        if len(HLT_objects) > 100000:
            datasets[dataset_tag] = {key: value[:100000] for key, value in datasets[dataset_tag].items()}

In [35]:
# Collect new EB data that will have incorrect ordering.
# Each HDF5 file in base_path becomes one entry of `datasets`; the tag is the part
# of the file name before the first '_10' (e.g. 'EB_475341_0_10-05-2024.h5' -> 'EB_475341_0').

# Define the base path where the h5 files are stored
base_path = '/eos/home-m/mmcohen/ntuples/EB_h5_10-06-2024/'

# Initialize empty lists to collect arrays from all files
# NOTE(review): none of the *_list variables below is appended to or read in this
# cell; the per-file arrays go straight into `datasets` instead.
HLT_jets_list = []
ofl_jets_list = []
L1_jFexSR_jets_list = []
L1_jFexLR_jets_list = []
HLT_electrons_list = []
LRT_electrons_list = []
ofl_electrons_list = []
L1_egammas_list = []
HLT_muons_list = []
LRT_muons_list = []
ofl_muons_list = []
L1_muons_list = []
L1_eFex_taus_list = []
L1_jFex_taus_list = []
HLT_photons_list = []
ofl_photons_list = []
HLT_MET_list = []
L1_MET_list = []
pass_L1_unprescaled_list = []
pass_HLT_unprescaled_list = []
EB_weights_list = []
event_number_list = []
run_number_list = []

# Iterate over all files in the directory
for file_name in os.listdir(base_path):
    file_path = os.path.join(base_path, file_name)
    
    # Open each h5 file and append data to lists
    with h5py.File(file_path, 'r') as hf:
        HLT_jets = hf['HLT_jets'][:]
        ofl_jets = hf['ofl_jets'][:]
        L1_jFexSR_jets = hf['L1_jFexSR_jets'][:]
        L1_jFexLR_jets = hf['L1_jFexLR_jets'][:]
        HLT_electrons = hf['HLT_electrons'][:]
        LRT_electrons = hf['LRT_electrons'][:]
        ofl_electrons = hf['ofl_electrons'][:]
        L1_egammas = hf['L1_egammas'][:]
        HLT_muons = hf['HLT_muons'][:]
        LRT_muons = hf['LRT_muons'][:]
        ofl_muons = hf['ofl_muons'][:]
        L1_muons = hf['L1_muons'][:]
        L1_eFex_taus = hf['L1_eFex_taus'][:]
        L1_jFex_taus = hf['L1_jFex_taus'][:]
        HLT_photons = hf['HLT_photons'][:]
        ofl_photons = hf['ofl_photons'][:]
        # NOTE(review): HLT_MET is reshaped to 3 columns here, whereas the MC cell
        # reshapes it to 4 columns and then selects [0, 2, 3] - confirm the EB files
        # really store HLT_MET (and electrons/muons/photons) with 3 columns.
        HLT_MET = hf['HLT_MET'][:].reshape(-1, 1, 3)  # Broadcasting MET
        L1_MET = hf['L1_MET'][:].reshape(-1, 1, 3)
        pass_L1_unprescaled = hf["pass_L1_unprescaled"][:]
        pass_HLT_unprescaled = hf["pass_HLT_unprescaled"][:]
        EB_weights = hf["EB_weights"][:]
        event_number = hf["event_number"][:]
        run_number = hf["run_number"][:]
        mu = hf["mu"][:]

    # Only the jets are column-selected with [0, 2, 3]; the other HLT collections are
    # concatenated with all of their columns.
    HLT_objects = np.concatenate([HLT_jets[:, :6, [0, 2, 3]], HLT_electrons[:, :3, :], HLT_muons[:, :3, :], HLT_photons[:, :3, :], HLT_MET], axis=1)
    L1_objects = np.concatenate([L1_jFexSR_jets[:, :6, :], L1_egammas[:, :3, :], L1_muons[:, :3, :], L1_eFex_taus[:, :3, :], L1_MET], axis=1)
    
    datasets[file_name.split('_10')[0]] = {
        'HLT_data': HLT_objects,
        'L1_data': L1_objects,
        'passL1': pass_L1_unprescaled==1,
        'passHLT': pass_HLT_unprescaled==1,
        'weights': EB_weights,
        'event_numbers': event_number,
        'run_numbers': run_number,
        'pileups': mu
    }

def combine_data(datasets, tags_to_combine, new_tag):
    """Merge several entries of ``datasets`` into a single entry, in place.

    For every key of the per-tag dicts, the arrays of all ``tags_to_combine`` are
    concatenated along axis 0 (in the order the tags are given) and stored under
    ``new_tag``. The source tags are then removed.

    All inputs are validated before ``datasets`` is touched, so a failing call
    leaves ``datasets`` exactly as it was. Previously a tag that lacked one of the
    keys of the first tag was silently skipped for that key, producing per-key
    arrays of different lengths.

    Args:
        datasets: dict mapping tag -> dict mapping key -> array-like.
        tags_to_combine: sequence of tags to merge; every tag must exist in
            ``datasets`` and all of them must expose the same set of keys.
        new_tag: tag under which the merged dict is stored. It may be one of
            ``tags_to_combine``.

    Returns:
        The same ``datasets`` object, with the merge applied.

    Raises:
        KeyError: if a tag is missing or the tags do not share the same keys.
        IndexError: if ``tags_to_combine`` is empty.
    """
    missing_tags = [tag for tag in tags_to_combine if tag not in datasets]
    if missing_tags:
        raise KeyError(f"cannot combine unknown dataset tags: {missing_tags}")

    reference_tag = tags_to_combine[0]
    reference_keys = list(datasets[reference_tag].keys())
    for tag in tags_to_combine[1:]:
        if set(datasets[tag].keys()) != set(reference_keys):
            raise KeyError(
                f"dataset '{tag}' has keys {sorted(datasets[tag].keys())} but "
                f"'{reference_tag}' has keys {sorted(reference_keys)}"
            )

    # Build the merged arrays first; nothing in `datasets` changes if this raises.
    combined = {
        key: np.concatenate([datasets[tag][key] for tag in tags_to_combine], axis=0)
        for key in reference_keys
    }

    # Delete old tags (pop tolerates a tag listed twice), then publish the result.
    for tag in tags_to_combine:
        datasets.pop(tag, None)
    datasets[new_tag] = combined

    # Make sure everything is an np array
    for data_dict in datasets.values():
        for key, value in data_dict.items():
            data_dict[key] = np.array(value)

    return datasets

# Merge the per-file EB entries into combined tags.
# combine_data deletes the source tags, so these lines cannot be re-run without
# first reloading the per-file entries.
datasets = combine_data(datasets, tags_to_combine=['EB_475341_0', 'EB_475341_1'], new_tag='HLT_noalg_eb_L1All')
#datasets = combine_data(datasets, tags_to_combine=['EB_473255_0', 'EB_475321_0', 'EB_482596_0'], new_tag='EB')
# 'EB_482596_0' is deliberately left as its own entry here (see the commented-out call above).
datasets = combine_data(datasets, tags_to_combine=['EB_473255_0', 'EB_475321_0'], new_tag='EB')

In [36]:
def has_duplicates(arr):
    """Return True when at least one value occurs more than once in ``arr``.

    ``arr`` is flattened by ``np.unique``, so values are compared across all axes.
    """
    occurrences = np.unique(arr, return_counts=True)[1]
    return (occurrences > 1).any()

def load_and_process_normal_data(file_name):
    """Read one EB HDF5 ntuple and build the flattened, rescaled L1 "Topo" inputs.

    For each L1 collection the leading ``n`` objects are kept, columns 0/1/2
    (labelled pT/eta/phi by the original comments) are multiplied by fixed scale
    factors, and the objects are flattened to one row per event. Columns 0 and 2
    of ``L1_MET`` are rescaled and appended as the final two features. NaN entries
    are replaced by 0.

    Changes relative to the previous version (returned values are unchanged):
      * NaNs are zero-filled with one vectorized assignment. The old helper
        computed ``np.nanmedian`` for every column and then discarded it.
      * ``L1_jFexLR_jets`` is no longer read: it was loaded and scaled but never
        used in any Topo group.

    Args:
        file_name: path of the HDF5 file to read.

    Returns:
        Tuple ``(Topo_2A, Topo_2B, Topo_3A, pass_L1_unprescaled,
        pass_HLT_unprescaled, EB_weights, event_id_signal, run_id_signal)``.
        Each Topo array has one row per event:
          * Topo_2A = SR jets + eFex taus + muons + MET
          * Topo_2B = SR jets + e/gammas + jFex taus + MET
          * Topo_3A = SR jets + e/gammas + eFex taus + MET
    """
    nmuon, nSRjet, negamma, netau, njtau = 4, 6, 4, 4, 4

    with h5py.File(file_name, 'r') as hf:

        def load_and_scale(dataset, n_objects, scale_factor=10/1024, eta_factor=10/16, phi_factor=10/8):
            # Slicing the h5py dataset copies into memory, so the in-place
            # scaling below never touches the file.
            data = hf[dataset][:, 0:n_objects, :]
            data[:, :, 0] *= scale_factor  # Scale the pT value
            data[:, :, 1] *= eta_factor  # Scale the eta value
            data[:, :, 2] *= phi_factor  # Scale the phi value
            return data.reshape(-1, 3 * n_objects)

        L1_jFexSR_jets = load_and_scale('L1_jFexSR_jets', nSRjet)
        L1_egammas = load_and_scale('L1_egammas', negamma)
        L1_muons = load_and_scale('L1_muons', nmuon, scale_factor=10000/64)  # Specific scaling for muons
        L1_eFex_taus = load_and_scale('L1_eFex_taus', netau)
        L1_jFex_taus = load_and_scale('L1_jFex_taus', njtau)

        L1_MET = hf['L1_MET'][:]
        L1_MET[:, 0] *= 10/8192
        L1_MET[:, 2] *= 10/8

        pass_L1_unprescaled = hf["pass_L1_unprescaled"][:]
        pass_HLT_unprescaled = hf["pass_HLT_unprescaled"][:]
        EB_weights = hf["EB_weights"][:]
        event_id_signal = hf['event_number'][:]
        run_id_signal = hf['run_number'][:]

    # NOTE(review): this check looks at event numbers only, while later cells match
    # events on (run, event) pairs - confirm that this is the intended diagnostic.
    if has_duplicates(event_id_signal):
        print("event index show up more than once!!!")
    else:
        print("event index looks good :)")

    # Reformat L1_MET: keep only columns 0 and 2
    L1_MET_fixed = np.zeros((L1_MET.shape[0], 2))
    L1_MET_fixed[:, 0] = L1_MET[:, 0]
    L1_MET_fixed[:, 1] = L1_MET[:, 2]
    L1_MET = L1_MET_fixed

    def fill_nan_with_zero(array):
        # In-place replacement of every NaN entry by 0.
        array[np.isnan(array)] = 0
        return array

    # Combine arrays into Topo groups and handle NaN values
    Topo_2A = fill_nan_with_zero(np.concatenate([L1_jFexSR_jets, L1_eFex_taus, L1_muons, L1_MET], axis=1))
    Topo_2B = fill_nan_with_zero(np.concatenate([L1_jFexSR_jets, L1_egammas, L1_jFex_taus, L1_MET], axis=1))
    Topo_3A = fill_nan_with_zero(np.concatenate([L1_jFexSR_jets, L1_egammas, L1_eFex_taus, L1_MET], axis=1))

    return Topo_2A, Topo_2B, Topo_3A, pass_L1_unprescaled, pass_HLT_unprescaled, EB_weights, event_id_signal, run_id_signal

def load_and_process_anomalous_data(file_name):
    """Read one MC (signal) HDF5 ntuple and build the flattened, rescaled L1 "Topo" inputs.

    For each L1 collection the leading ``n`` objects are kept, columns 0/1/2
    (labelled pT/eta/phi by the original comments) are multiplied by fixed scale
    factors, and the objects are flattened to one row per event. Columns 0 and 2
    of ``L1_MET`` are rescaled and appended as the final two features. NaN entries
    are replaced by 0. The keys of the file are printed as a side effect.

    Changes relative to the previous version (returned values are unchanged):
      * NaNs are zero-filled with one vectorized assignment. The old helper
        computed ``np.nanmedian`` for every column and then discarded it.
      * ``L1_jFexLR_jets`` is no longer read: it was loaded and scaled but never
        used in any Topo group.

    Args:
        file_name: path of the HDF5 file to read.

    Returns:
        Tuple ``(Topo_2A, Topo_2B, Topo_3A, pass_L1_unprescaled)``. Each Topo
        array has one row per event:
          * Topo_2A = SR jets + eFex taus + muons + MET
          * Topo_2B = SR jets + e/gammas + jFex taus + MET
          * Topo_3A = SR jets + e/gammas + eFex taus + MET
    """
    nmuon, nSRjet, negamma, netau, njtau = 4, 6, 4, 4, 4

    with h5py.File(file_name, 'r') as hf:
        print(hf.keys())

        def load_and_scale(dataset, n_objects, scale_factor=10/1024, eta_factor=10/16, phi_factor=10/8):
            # Slicing the h5py dataset copies into memory, so the in-place
            # scaling below never touches the file.
            data = hf[dataset][:, 0:n_objects, :]
            data[:, :, 0] *= scale_factor  # Scale the pT value
            data[:, :, 1] *= eta_factor  # Scale the eta value
            data[:, :, 2] *= phi_factor  # Scale the phi value
            return data.reshape(-1, 3 * n_objects)

        L1_jFexSR_jets = load_and_scale('L1_jFexSR_jets', nSRjet)
        L1_egammas = load_and_scale('L1_egammas', negamma)
        L1_muons = load_and_scale('L1_muons', nmuon, scale_factor=10000/64)  # Specific scaling for muons
        L1_eFex_taus = load_and_scale('L1_eFex_taus', netau)
        L1_jFex_taus = load_and_scale('L1_jFex_taus', njtau)

        L1_MET = hf['L1_MET'][:]
        L1_MET[:, 0] *= 10/8192
        L1_MET[:, 2] *= 10/8

        pass_L1_unprescaled = hf["pass_L1_unprescaled"][:]

    # Reformat L1_MET: keep only columns 0 and 2
    L1_MET_fixed = np.zeros((L1_MET.shape[0], 2))
    L1_MET_fixed[:, 0] = L1_MET[:, 0]
    L1_MET_fixed[:, 1] = L1_MET[:, 2]
    L1_MET = L1_MET_fixed

    def fill_nan_with_zero(array):
        # In-place replacement of every NaN entry by 0.
        array[np.isnan(array)] = 0
        return array

    # Combine arrays into Topo groups and handle NaN values
    Topo_2A = fill_nan_with_zero(np.concatenate([L1_jFexSR_jets, L1_eFex_taus, L1_muons, L1_MET], axis=1))
    Topo_2B = fill_nan_with_zero(np.concatenate([L1_jFexSR_jets, L1_egammas, L1_jFex_taus, L1_MET], axis=1))
    Topo_3A = fill_nan_with_zero(np.concatenate([L1_jFexSR_jets, L1_egammas, L1_eFex_taus, L1_MET], axis=1))

    return Topo_2A, Topo_2B, Topo_3A, pass_L1_unprescaled

def load_model_from_targz(targz_path, model_name):
    """Load a (Q)Keras model stored inside a ``.tar.gz`` archive.

    The archive is unpacked into a private temporary directory that is removed
    again when loading finishes, including when extraction or loading raises.
    The previous version unpacked into a fixed ``temp_model`` directory in the
    working directory: its cleanup deleted any unrelated files already in that
    directory, and it left the extracted files behind if ``load_model`` failed.

    Args:
        targz_path: path of the gzip-compressed tar archive.
        model_name: path of the saved model relative to the archive root.

    Returns:
        The model returned by ``load_model``, loaded with the QKeras layers
        ``QDense``, ``QActivation`` and ``QBatchNormalization`` registered as
        custom objects.
    """
    import tempfile  # local import: only this helper needs it

    custom_objects = {
        'QDense': QDense,
        'QActivation': QActivation,
        'QBatchNormalization': QBatchNormalization
    }

    with tempfile.TemporaryDirectory(prefix='temp_model_') as extract_dir:
        # NOTE(review): the archive is assumed to be a trusted local file; pass an
        # extraction filter to extractall if archives from other sources are used.
        with tarfile.open(targz_path, 'r:gz') as tar:
            tar.extractall(path=extract_dir)

        model_path = os.path.join(extract_dir, model_name)
        model = load_model(model_path, custom_objects=custom_objects)

    return model

In [37]:
# Load and process normal data
(Topo_2A, Topo_2B, Topo_3A,
 pass_L1_unprescaled, pass_HLT_unprescaled,
 EB_weights, event_id, run_id) = load_and_process_normal_data('/eos/home-m/mmcohen/ntuples/EB_ntuples_08-13-2024.h5')


# Rows 450000..799999 are held out as the test split; every other row is training data.
test_rows = slice(450000, 800000)

# Training split: all arrays with the test rows removed
Topo_2A_train = np.delete(Topo_2A, test_rows, axis=0)
Topo_train_weights = np.delete(EB_weights, test_rows, axis=0)
train_event_id = np.delete(event_id, test_rows, axis=0)
train_run_id = np.delete(run_id, test_rows, axis=0)

# Test split: the held-out rows of the same arrays
Topo_2A_test = Topo_2A[test_rows, :]
test_event_id = event_id[test_rows]
test_run_id = run_id[test_rows]
Topo_test_weights = EB_weights[test_rows]

# Output the shapes of the relevant arrays
print("Shape of Topo_2A_test:", Topo_2A_test.shape)
print("Shape of Topo_2A_train:", Topo_2A_train.shape)

event index show up more than once!!!
Shape of Topo_2A_test: (350000, 44)
Shape of Topo_2A_train: (1754833, 44)


In [38]:
# Sort topo2A training data by run number, then by event number within a run
train_sort_indices = np.lexsort((train_event_id, train_run_id))
Topo_2A_train, Topo_train_weights, train_event_id, train_run_id = (
    unsorted[train_sort_indices]
    for unsorted in (Topo_2A_train, Topo_train_weights, train_event_id, train_run_id)
)

# Sort test data the same way
test_sort_indices = np.lexsort((test_event_id, test_run_id))
Topo_2A_test, Topo_test_weights, test_event_id, test_run_id = (
    unsorted[test_sort_indices]
    for unsorted in (Topo_2A_test, Topo_test_weights, test_event_id, test_run_id)
)

print("Sorted shapes:")
print("Topo_2A_train:", Topo_2A_train.shape)
print("Topo_2A_test:", Topo_2A_test.shape)

# Show the leading event and run numbers of each split
first_rows = slice(None, 5)

print("\nFirst few elements of train set:")
print("Event IDs:", train_event_id[first_rows])
print("Run IDs:", train_run_id[first_rows])

print("\nFirst few elements of test set:")
print("Event IDs:", test_event_id[first_rows])
print("Run IDs:", test_run_id[first_rows])




Sorted shapes:
Topo_2A_train: (1754833, 44)
Topo_2A_test: (350000, 44)

First few elements of train set:
Event IDs: [480857418 480857460 480857610 480857642 480857751]
Run IDs: [473255 473255 473255 473255 473255]

First few elements of test set:
Event IDs: [486708620 486708669 486708930 486708934 486709125]
Run IDs: [473255 473255 473255 473255 473255]


In [39]:
# Now split my EB data into train and test sets according to the run / event numbers from topo2A
#
# An EB event goes to the train split when its (run, event) pair occurs in the
# topo2A training data; every other EB event goes to the test split.
# The split is done with a single boolean mask instead of appending each event's
# rows to Python lists: same result and same ordering, but far less time and
# memory, and an empty split keeps the proper shape and dtype.

# Create sets of (run, event) tuples for train and test data
train_set = set(zip(train_run_id, train_event_id))
# NOTE(review): test_set is not consulted for the split below; it is kept because
# it is a notebook-level name that other cells may rely on.
test_set = set(zip(test_run_id, test_event_id))

EB_all = datasets['EB']
in_train = np.array(
    [(run, event) in train_set
     for run, event in zip(EB_all['run_numbers'], EB_all['event_numbers'])],
    dtype=bool,
)

# Every key of the EB dataset is split with the same mask so rows stay aligned
EB_train = {key: values[in_train] for key, values in EB_all.items()}
EB_test = {key: values[~in_train] for key, values in EB_all.items()}

# Print some information about the split
print("EB Train set size:", len(EB_train['run_numbers']))
print("EB Test set size:", len(EB_test['run_numbers']))

# Print the first few elements of event and run numbers for EB train and test sets
print("\nFirst few elements of EB train set:")
print("Event IDs:", EB_train['event_numbers'][:5])
print("Run IDs:", EB_train['run_numbers'][:5])

print("\nFirst few elements of EB test set:")
print("Event IDs:", EB_test['event_numbers'][:5])
print("Run IDs:", EB_test['run_numbers'][:5])



EB Train set size: 1754041
EB Test set size: 511728

First few elements of EB train set:
Event IDs: [507531069 507544334 507542373 507532437 507539498]
Run IDs: [473255 473255 473255 473255 473255]

First few elements of EB test set:
Event IDs: [448033363 448022581 448026386 448024119 448024409]
Run IDs: [473255 473255 473255 473255 473255]


In [40]:
# Order EB train data by run number, then event number (lexsort sorts on the last key first)
train_order = np.lexsort((EB_train['event_numbers'], EB_train['run_numbers']))
EB_train.update({name: column[train_order] for name, column in EB_train.items()})

# Order EB test data the same way
test_order = np.lexsort((EB_test['event_numbers'], EB_test['run_numbers']))
EB_test.update({name: column[test_order] for name, column in EB_test.items()})

# Show the leading event and run numbers of each ordered split
print("\nFirst few elements of ordered EB train set:")
print("Event IDs:", EB_train['event_numbers'][0:5])
print("Run IDs:", EB_train['run_numbers'][0:5])

print("\nFirst few elements of ordered EB test set:")
print("Event IDs:", EB_test['event_numbers'][0:5])
print("Run IDs:", EB_test['run_numbers'][0:5])



First few elements of ordered EB train set:
Event IDs: [480857418 480857460 480857610 480857642 480857751]
Run IDs: [473255 473255 473255 473255 473255]

First few elements of ordered EB test set:
Event IDs: [448012770 448012970 448013451 448013665 448014097]
Run IDs: [473255 473255 473255 473255 473255]


In [41]:
"""now remove events from the topo2A data that dont exist in EB"""

# All (run_number, event_number) pairs present in the EB data (train and test together)
EB_pairs = (
    set(zip(EB_train['run_numbers'], EB_train['event_numbers']))
    | set(zip(EB_test['run_numbers'], EB_test['event_numbers']))
)

# All (run_number, event_number) pairs present in the topo2A data (train and test together)
topo2A_pairs = set(zip(train_run_id, train_event_id)) | set(zip(test_run_id, test_event_id))

# Pairs that occur in exactly one of the two collections, i.e. the symmetric difference
pairs_to_remove = topo2A_pairs ^ EB_pairs

# Function to remove pairs from EB dataset
# def remove_pairs_EB(dataset, pairs_to_remove):
#     mask = np.ones(len(dataset['run_numbers']), dtype=bool)
#     for run, event in pairs_to_remove:
#         mask &= ~((dataset['run_numbers'] == run) & (dataset['event_numbers'] == event))
#     return {key: dataset[key][mask] for key in dataset.keys()}

# # Function to remove pairs from topo2A arrays
# def remove_pairs_topo2A(run_number, event_number, data, pairs_to_remove):
#     mask = np.ones(len(run_number), dtype=bool)
#     for run, event in pairs_to_remove:
#         mask &= ~((run_number == run) & (event_number == event))
#     return run_number[mask], event_number[mask], data[mask]

def remove_pairs_EB(dataset, pairs_to_remove):
    """Drop every event whose (run, event) pair is listed in ``pairs_to_remove``.

    Args:
        dataset: dict of equally long arrays that contains at least the keys
            'run_numbers' and 'event_numbers'.
        pairs_to_remove: container of (run, event) tuples supporting ``in``.

    Returns:
        A new dict with the same keys, each array restricted to the kept events.
    """
    # True for the events that survive the filter
    keep = np.array(
        [(run, event) not in pairs_to_remove
         for run, event in zip(dataset['run_numbers'], dataset['event_numbers'])],
        dtype=bool,
    )

    # The same mask is applied to every array so rows stay aligned
    return {name: dataset[name][keep] for name in dataset}

def remove_pairs_topo2A(run_number, event_number, data, pairs_to_remove):
    """Drop every row whose (run, event) pair is listed in ``pairs_to_remove``.

    Args:
        run_number: array of run numbers, one per row.
        event_number: array of event numbers, one per row.
        data: array whose first axis is aligned with the two arrays above.
        pairs_to_remove: container of (run, event) tuples supporting ``in``.

    Returns:
        Tuple ``(run_number, event_number, data)`` restricted to the kept rows.
    """
    # True for the rows that survive the filter
    keep = np.array(
        [(run, event) not in pairs_to_remove
         for run, event in zip(run_number, event_number)],
        dtype=bool,
    )

    return run_number[keep], event_number[keep], data[keep]

# Remove pairs from topo2A and EB train and test sets
# The filtered topo2A arrays get new lower-case names (topo2A_*); EB_train / EB_test
# are replaced by their filtered versions.
# NOTE(review): Topo_train_weights and Topo_test_weights are not filtered here, so
# they no longer line up row-by-row with topo2A_train / topo2A_test.
topo2A_train_run_number, topo2A_train_event_number, topo2A_train = remove_pairs_topo2A(train_run_id, train_event_id, Topo_2A_train, pairs_to_remove)
topo2A_test_run_number, topo2A_test_event_number, topo2A_test = remove_pairs_topo2A(test_run_id, test_event_id, Topo_2A_test, pairs_to_remove)
EB_train = remove_pairs_EB(EB_train, pairs_to_remove)
EB_test = remove_pairs_EB(EB_test, pairs_to_remove)



In [42]:
# Spot check: the first (event, run) of each topo2A split should equal the first
# (event, run) of the corresponding EB split.
print(topo2A_train_event_number[0], topo2A_train_run_number[0])
print(topo2A_test_event_number[0], topo2A_test_run_number[0])
print(EB_train['event_numbers'][0], EB_train['run_numbers'][0])
print(EB_test['event_numbers'][0], EB_test['run_numbers'][0])

480857418 473255
486708620 473255
480857418 473255
486708620 473255


In [43]:
# Spot check at one hand-picked row index: the topo2A and EB splits should show
# the same (event, run) at the same position.
idx = 345254
print(topo2A_train_event_number[idx], topo2A_train_run_number[idx])
print(topo2A_test_event_number[idx], topo2A_test_run_number[idx])
print(EB_train['event_numbers'][idx], EB_train['run_numbers'][idx])
print(EB_test['event_numbers'][idx], EB_test['run_numbers'][idx])

616549693 473255
700376589 473255
616549693 473255
700376589 473255


In [44]:
# Verify that every row of the topo2A splits refers to the same (run, event) as the
# row at the same position of the corresponding EB split.
# The previous loops iterated over `i` but compared index `idx`, so only one fixed
# row was ever checked. All rows are now compared, on both run and event number.
for split_name, topo_runs, topo_events, eb_split in (
    ('train', topo2A_train_run_number, topo2A_train_event_number, EB_train),
    ('test', topo2A_test_run_number, topo2A_test_event_number, EB_test),
):
    assert len(topo_events) == len(eb_split['event_numbers']), (
        f"{split_name} length mismatch: topo2A has {len(topo_events)} events, "
        f"EB has {len(eb_split['event_numbers'])}"
    )

    # Indices at which either the event number or the run number disagrees
    mismatched = (
        (topo_events != eb_split['event_numbers'])
        | (topo_runs != eb_split['run_numbers'])
    ).nonzero()[0]
    assert mismatched.size == 0, f'{split_name} error at index {mismatched[0]}'

    print(f'all events in topo2A_{split_name} matched!')

all events in topo2A_train matched!
all events in topo2A_test matched!


In [45]:
# Now add the additional run to the EB datasets dict
datasets['EB_train'] = EB_train
datasets['EB_test'] = EB_test

# combine_data removes 'EB_test' and 'EB_482596_0' from `datasets` and stores their
# concatenation (EB_test rows first) under 'EB_test2'. Because the source tags are
# deleted, this cell cannot simply be re-run.
datasets = combine_data(datasets, tags_to_combine=['EB_test', 'EB_482596_0'], new_tag='EB_test2')

In [46]:
# Now add the additional run to the topo2A data
Topo_2A_new, Topo_2B_new, Topo_3A_new, pass_L1_unprescaled_new, pass_HLT_unprescaled_new, EB_weights_new, event_id_new, run_id_new = load_and_process_normal_data('/eos/home-m/mmcohen/ntuples/EB_h5_10-06-2024/EB_482596_0_10-05-2024.h5')

# The new run is appended after the existing test rows.
# NOTE: these assignments extend the test arrays in place of the old ones, so
# re-running this cell appends the new run a second time.
topo2A_test_run_number = np.concatenate((topo2A_test_run_number, run_id_new), axis=0)
topo2A_test_event_number = np.concatenate((topo2A_test_event_number, event_id_new), axis=0)
topo2A_test = np.concatenate((topo2A_test, Topo_2A_new), axis=0)

event index looks good :)


In [47]:
# Load the HLT_noalg_L1All run, which is stored in two files (EB_475341_0 and EB_475341_1)
Topo_2A_L1all, Topo_2B_L1all, Topo_3A_L1all, pass_L1_unprescaled_L1all, pass_HLT_unprescaled_L1all, EB_weights_L1all, event_id_L1all, run_id_L1all = load_and_process_normal_data('/eos/home-m/mmcohen/ntuples/EB_h5_10-06-2024/EB_475341_0_10-05-2024.h5')
Topo_2A_L1all2, Topo_2B_L1all2, Topo_3A_L1all2, pass_L1_unprescaled_L1all2, pass_HLT_unprescaled_L1all2, EB_weights_L1all2, event_id_L1all2, run_id_L1all2 = load_and_process_normal_data('/eos/home-m/mmcohen/ntuples/EB_h5_10-06-2024/EB_475341_1_10-05-2024.h5')

# Append the second file after the first.
# NOTE(review): only Topo_2A and the event / run numbers are concatenated; the
# Topo_2B, Topo_3A, pass-flag and weight arrays of the *_L1all names still hold
# the first file only.
Topo_2A_L1all = np.concatenate((Topo_2A_L1all, Topo_2A_L1all2), axis=0)
event_id_L1all = np.concatenate((event_id_L1all, event_id_L1all2), axis=0)
run_id_L1all = np.concatenate((run_id_L1all, run_id_L1all2), axis=0)

event index looks good :)
event index looks good :)


In [48]:
# Verify that every topo2A array is aligned, row by row, with the EB dataset that
# will receive its anomaly scores.
# Two fixes relative to the previous loops:
#   * they iterated over `i` but compared index `idx`, so only one row was checked;
#   * the topo2A test arrays were extended with the extra run, so they must be
#     compared with datasets['EB_test2'] (which holds the same extension), not with
#     the un-extended EB_test.
alignment_checks = (
    ('train', 'topo2A_train', topo2A_train_run_number, topo2A_train_event_number, EB_train),
    ('test', 'topo2A_test', topo2A_test_run_number, topo2A_test_event_number, datasets['EB_test2']),
    ('test', 'HLT_noalg_eb_L1All', run_id_L1all, event_id_L1all, datasets['HLT_noalg_eb_L1All']),
)

for error_label, checked_name, topo_runs, topo_events, eb_data in alignment_checks:
    assert len(topo_events) == len(eb_data['event_numbers']), (
        f"{checked_name} length mismatch: topo2A has {len(topo_events)} events, "
        f"EB has {len(eb_data['event_numbers'])}"
    )

    # Indices at which either the event number or the run number disagrees
    mismatched = (
        (topo_events != eb_data['event_numbers'])
        | (topo_runs != eb_data['run_numbers'])
    ).nonzero()[0]
    assert mismatched.size == 0, f'{error_label} error at index {mismatched[0]}'

    print(f'all events in {checked_name} matched!')

all events in topo2A_train matched!
all events in topo2A_test matched!
all events in HLT_noalg_eb_L1All matched!


In [49]:
# Load anomalous data
# One call per MC sample; only the Topo_2A inputs and the L1 pass flags are kept,
# the Topo_2B and Topo_3A arrays returned by the loader are discarded.
Topo_2A_HHbbtt, _, _, pass_L1_HHbbtt = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/HHbbttHadHad_07-10-2024.h5')
Topo_2A_A14, _, _, pass_L1_A14 = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/A14N23LO_07-17-2024.h5')
Topo_2A_HAHMggf, _, _, pass_L1_HAHMggf = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/HAHMggfZdZd2l2nu_07-17-2024.h5')
Topo_2A_qqa, _, _, pass_L1_qqa = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/qqa_07-17-2024.h5')
Topo_2A_Zprime, _, _, pass_L1_Zprime = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/Zprime2EJs_07-17-2024.h5')
Topo_2A_ZZ4lep, _, _, pass_L1_ZZ4lep = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/ZZ4lep_07-17-2024.h5')
Topo_2A_jz1, _, _, pass_L1_jz1 = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/jjJZ1_07-17-2024.h5')
Topo_2A_jz2, _, _, pass_L1_jz2 = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/jjJZ2_07-17-2024.h5')
Topo_2A_jz4, _, _, pass_L1_jz4 = load_and_process_anomalous_data('/eos/home-m/mmcohen/ntuples/MC_07-17-2024/jjJZ4_07-17-2024.h5')

# Topo_2A_HLT_passed = Topo_2A_test_signal[HLT_pass_test==1]
# Topo_2A_L1_passed = Topo_2A_test_signal[L1_pass_test==1]
# Topo_2A_just_L1_passed = Topo_2A_L1_passed[HLT_pass_test[L1_pass_test==1]==0]

# Topo_2A_HHbbtt_pure = Topo_2A_HHbbtt[pass_L1_HHbbtt==0]
# Topo_2A_A14_pure = Topo_2A_A14[pass_L1_A14==0]
# Topo_2A_HAHMggf_pure = Topo_2A_HAHMggf[pass_L1_HAHMggf==0]
# Topo_2A_qqa_pure = Topo_2A_qqa[pass_L1_qqa==0]
# Topo_2A_Zprime_pure = Topo_2A_Zprime[pass_L1_Zprime==0]
# Topo_2A_ZZ4lep_pure = Topo_2A_ZZ4lep[pass_L1_ZZ4lep==0]
# Topo_2A_jz1_pure = Topo_2A_jz1[pass_L1_jz1==0]
# Topo_2A_jz2_pure = Topo_2A_jz2[pass_L1_jz2==0]
# Topo_2A_jz4_pure = Topo_2A_jz4[pass_L1_jz4==0]

<KeysViewHDF5 ['HLT_MET', 'HLT_electrons', 'HLT_jets', 'HLT_muons', 'HLT_photons', 'L1_MET', 'L1_eFex_taus', 'L1_egammas', 'L1_jFexLR_jets', 'L1_jFexSR_jets', 'L1_jFex_taus', 'L1_muons', 'LRT_electrons', 'LRT_muons', 'pass_HLT_unprescaled', 'pass_L1_unprescaled']>
<KeysViewHDF5 ['HLT_MET', 'HLT_electrons', 'HLT_jets', 'HLT_muons', 'HLT_photons', 'L1_MET', 'L1_eFex_taus', 'L1_egammas', 'L1_jFexLR_jets', 'L1_jFexSR_jets', 'L1_jFex_taus', 'L1_muons', 'LRT_electrons', 'LRT_muons', 'pass_HLT_unprescaled', 'pass_L1_unprescaled']>
<KeysViewHDF5 ['HLT_MET', 'HLT_electrons', 'HLT_jets', 'HLT_muons', 'HLT_photons', 'L1_MET', 'L1_eFex_taus', 'L1_egammas', 'L1_jFexLR_jets', 'L1_jFexSR_jets', 'L1_jFex_taus', 'L1_muons', 'LRT_electrons', 'LRT_muons', 'pass_HLT_unprescaled', 'pass_L1_unprescaled']>
<KeysViewHDF5 ['HLT_MET', 'HLT_electrons', 'HLT_jets', 'HLT_muons', 'HLT_photons', 'L1_MET', 'L1_eFex_taus', 'L1_egammas', 'L1_jFexLR_jets', 'L1_jFexSR_jets', 'L1_jFex_taus', 'L1_muons', 'LRT_electrons', '

In [50]:
# Load the model
# model_name is the path of the saved model inside the archive; it is joined onto
# the extraction directory by load_model_from_targz.
targz_path = './trained_models/software_model_BESTOFLONGRUN.tar.gz'  # Replace with the actual path to your .tar.gz file
model_name = '2A_AE_model_V9_BESTOFLONGRUN'
model = load_model_from_targz(targz_path, model_name)



In [51]:
# Define signals and signal names
# `signals` and `signal_names` are parallel lists: entry k of `signals` is the model
# input whose scores are stored in datasets[signal_names[k]]. Keep both lists in the
# same order and of the same length.
# The MC inputs are cut to their first 100000 rows, matching the 100000-event cap
# applied when the MC entries of `datasets` were built.
signals = [
    topo2A_test,
    #Topo_2A_HLT_passed,
    #Topo_2A_L1_passed,
    Topo_2A_HHbbtt[0:100000],
    Topo_2A_jz1[0:100000],
    Topo_2A_jz2[0:100000],
    Topo_2A_jz4[0:100000],
    Topo_2A_A14[0:100000],
    Topo_2A_HAHMggf[0:100000],
    Topo_2A_qqa[0:100000],
    Topo_2A_Zprime[0:100000],
    Topo_2A_ZZ4lep[0:100000],
    topo2A_train,
    Topo_2A_L1all
]

# signal_names = [
#     "Topo_2A_test_signal",
#     "Topo_2A_HLT_passed",
#     "Topo_2A_L1_passed",
#     "Topo_2A_HHbbtt",
#     "Topo_2A_jz1",
#     "Topo_2A_jz2",
#     "Topo_2A_jz4",
#     "Topo_2A_A14",
#     "Topo_2A_HAHMggf",
#     "Topo_2A_qqa",
#     "Topo_2A_Zprime",
#     "Topo_2A_ZZ4lep"
# ]

# Each name must be an existing key of `datasets`.
signal_names = [
    "EB_test2",
    "HHbbttHadHad",
    "jjJZ1",
    "jjJZ2",
    "jjJZ4",
    "A14N23LO",
    "HAHMggfZdZd2l2nu",
    "qqa",
    "Zprime2EJs",
    "ZZ4lep",
    "EB_train",
    "HLT_noalg_eb_L1All"
]

In [52]:
# Make predictions and save triggers
# For every (input array, dataset tag) pair the model output is reduced to one score
# per event (mean of the squared outputs) and stored in the matching dataset.
# zip() would silently stop at the shorter list, and a score array of the wrong
# length would be stored without complaint, so both are checked explicitly.
assert len(signals) == len(signal_names), (
    f'{len(signals)} signals but {len(signal_names)} signal names'
)

for signal, signal_name in zip(signals, signal_names):
    n_events = len(datasets[signal_name]['weights'])
    assert len(signal) == n_events, (
        f"'{signal_name}': model input has {len(signal)} rows but the dataset "
        f"holds {n_events} events"
    )

    predictions = model.predict(signal)
    AD_score = np.mean(np.square(predictions), axis=1)
    datasets[signal_name]['topo2A_AD_scores'] = AD_score
    # threshold = 0.0535068511962890625
    # above_threshold = AD_score > threshold
    # trigger = above_threshold.astype(int)
    
    # # Save the trigger for each signal
    # np.save(f'{signal_name}_trigger.npy', trigger)

#print("Predictions and triggers have been saved for all signals.")



In [53]:
# Display the structure of updated datasets: one heading per tag, then the shape
# of every array stored under it
for tag in datasets:
    print(f'\n{tag}:')
    for key in datasets[tag]:
        print(f'{key}: {datasets[tag][key].shape}')


A14N23LO:
HLT_data: (10000, 16, 3)
L1_data: (10000, 16, 3)
passL1: (10000,)
passHLT: (10000,)
weights: (10000,)
topo2A_AD_scores: (10000,)

HAHMggfZdZd2l2nu:
HLT_data: (70000, 16, 3)
L1_data: (70000, 16, 3)
passL1: (70000,)
passHLT: (70000,)
weights: (70000,)
topo2A_AD_scores: (70000,)

HHbbttHadHad:
HLT_data: (100000, 16, 3)
L1_data: (100000, 16, 3)
passL1: (100000,)
passHLT: (100000,)
weights: (100000,)
topo2A_AD_scores: (100000,)

ZZ4lep:
HLT_data: (100000, 16, 3)
L1_data: (100000, 16, 3)
passL1: (100000,)
passHLT: (100000,)
weights: (100000,)
topo2A_AD_scores: (100000,)

Zprime2EJs:
HLT_data: (100000, 16, 3)
L1_data: (100000, 16, 3)
passL1: (100000,)
passHLT: (100000,)
weights: (100000,)
topo2A_AD_scores: (100000,)

jjJZ1:
HLT_data: (100000, 16, 3)
L1_data: (100000, 16, 3)
passL1: (100000,)
passHLT: (100000,)
weights: (100000,)
topo2A_AD_scores: (100000,)

jjJZ2:
HLT_data: (100000, 16, 3)
L1_data: (100000, 16, 3)
passL1: (100000,)
passHLT: (100000,)
weights: (100000,)
topo2A_AD_sc

In [55]:
def save_subdicts_to_h5(main_dict, save_dir):
    """
    Write every sub-dictionary of ``main_dict`` to its own HDF5 file.

    The file for sub-dictionary ``name`` is ``<save_dir>/<name>.h5``; an existing
    file of that name is overwritten. Each key of the sub-dictionary becomes one
    dataset in the file.

    Args:
        main_dict (dict): A dictionary of dictionaries where the innermost values are NumPy arrays.
        save_dir (str): The directory where the HDF5 files will be saved; created if missing.
    """
    os.makedirs(save_dir, exist_ok=True)

    for sub_dict_name in main_dict:
        arrays = main_dict[sub_dict_name]
        file_path = os.path.join(save_dir, f"{sub_dict_name}.h5")

        with h5py.File(file_path, 'w') as h5_file:
            for key in arrays:
                h5_file.create_dataset(key, data=arrays[key])

        print(f"Saved {sub_dict_name} to {file_path}")


def load_subdicts_from_h5(save_dir):
    """
    Loads sub-dictionaries of NumPy arrays from HDF5 files in a directory and reconstructs the original structure.

    Every entry of save_dir whose name ends in ".h5" becomes one sub-dictionary,
    keyed by the file name without its extension. Files are visited in sorted
    name order, so the key order of the returned dict is the same on every run
    and every filesystem (os.listdir on its own returns entries in arbitrary
    order). Note that this is alphabetical order, not necessarily the insertion
    order of the dict that was originally saved.

    Args:
        save_dir (str): The directory where the HDF5 files are stored.

    Returns:
        main_dict (dict): A dictionary of dictionaries where the innermost values are NumPy arrays.
    """
    main_dict = {}

    # sorted(): os.listdir gives no ordering guarantee
    for filename in sorted(os.listdir(save_dir)):
        if not filename.endswith(".h5"):
            continue

        sub_dict_name = os.path.splitext(filename)[0]
        file_path = os.path.join(save_dir, filename)
        with h5py.File(file_path, 'r') as f:
            # Copy every entry into memory while the file is still open
            sub_dict = {key: np.array(f[key]) for key in f}
        main_dict[sub_dict_name] = sub_dict
        print(f"Loaded {sub_dict_name} from {file_path}")

    return main_dict

In [56]:
# Persist every entry of `datasets` as its own "<tag>.h5" file under ./h5_ntuples
# (the directory is created if needed; existing files with the same name are overwritten).
save_subdicts_to_h5(datasets, save_dir='./h5_ntuples')

Saved A14N23LO to ./h5_ntuples/A14N23LO.h5
Saved HAHMggfZdZd2l2nu to ./h5_ntuples/HAHMggfZdZd2l2nu.h5
Saved HHbbttHadHad to ./h5_ntuples/HHbbttHadHad.h5
Saved ZZ4lep to ./h5_ntuples/ZZ4lep.h5
Saved Zprime2EJs to ./h5_ntuples/Zprime2EJs.h5
Saved jjJZ1 to ./h5_ntuples/jjJZ1.h5
Saved jjJZ2 to ./h5_ntuples/jjJZ2.h5
Saved jjJZ4 to ./h5_ntuples/jjJZ4.h5
Saved qqa to ./h5_ntuples/qqa.h5
Saved HLT_noalg_eb_L1All to ./h5_ntuples/HLT_noalg_eb_L1All.h5
Saved EB to ./h5_ntuples/EB.h5
Saved EB_train to ./h5_ntuples/EB_train.h5
Saved EB_test2 to ./h5_ntuples/EB_test2.h5


In [59]:
# Rebuild `datasets` from the HDF5 files under ./h5_ntuples; this rebinds the name,
# replacing the dict that was held in memory.
# NOTE(review): every *.h5 file in that directory is loaded, so files left over from
# an earlier run would be picked up as well — confirm the directory only holds current files.
datasets = load_subdicts_from_h5('./h5_ntuples')

Loaded A14N23LO from ./h5_ntuples/A14N23LO.h5
Loaded EB from ./h5_ntuples/EB.h5
Loaded EB_test2 from ./h5_ntuples/EB_test2.h5
Loaded EB_train from ./h5_ntuples/EB_train.h5
Loaded HAHMggfZdZd2l2nu from ./h5_ntuples/HAHMggfZdZd2l2nu.h5
Loaded HHbbttHadHad from ./h5_ntuples/HHbbttHadHad.h5
Loaded HLT_noalg_eb_L1All from ./h5_ntuples/HLT_noalg_eb_L1All.h5
Loaded ZZ4lep from ./h5_ntuples/ZZ4lep.h5
Loaded Zprime2EJs from ./h5_ntuples/Zprime2EJs.h5
Loaded jjJZ1 from ./h5_ntuples/jjJZ1.h5
Loaded jjJZ2 from ./h5_ntuples/jjJZ2.h5
Loaded jjJZ4 from ./h5_ntuples/jjJZ4.h5
Loaded qqa from ./h5_ntuples/qqa.h5


In [None]:
# Now let's do a few checks

In [60]:
# Spot-check a few events per sample: print the model input, the matching
# L1_data entry held in `datasets`, the AD score stored in `datasets`, and the
# score recomputed by running the event through the model, so they can be
# compared by eye.
idxs = [0, 2314, 132445, -1]

# (label used in the section header, model-input array, key into `datasets`)
# NOTE(review): the same `idxs` are applied to every sample, which assumes each
# array has more than 132445 rows — confirm for the smaller samples.
samples_to_check = [
    ('test data', topo2A_test, 'EB_test2'),
    ('train data', topo2A_train, 'EB_train'),
    ('HLT_noalg_L1_all data', Topo_2A_L1all, 'HLT_noalg_eb_L1All'),
]

for sample_label, model_inputs, dataset_key in samples_to_check:
    # The header and the event label below are derived from the sample being
    # checked, so they cannot drift out of sync with the data that is printed.
    print(f'\n\nStarting checks of {sample_label}!\n\n')

    # run these events through the model
    predictions = model.predict(model_inputs[idxs])
    # score = mean of the squared model outputs along axis 1
    _AD_scores = np.mean(np.square(predictions), axis=1)

    stored = datasets[dataset_key]
    for i, idx in enumerate(idxs):
        print(f'\n---\nStarting check for index {idx}')
        print(f'topo 2A event:\n{model_inputs[idx]}\n')
        print(f"{dataset_key} event:\n{stored['L1_data'][idx]}\n")
        print(f"topo2A AD score held in datasets dict:\n{stored['topo2A_AD_scores'][idx]}")

        # score obtained above by running the event through the model
        print(f'AD score calculate by running the event through the model:\n{_AD_scores[i]}')



Starting checks of test data!



---
Starting check for index 0
topo 2A event:
[ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.09667968 -1.34375006 -3.37475777  0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.01215075  0.48962492]

EB_test event:
[[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [10.5        -2.16250014 -2.69980621]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.   

In [58]:
# Spot-check a few events per sample: print the model input, the matching
# L1_data entry held in `datasets`, the AD score stored in `datasets`, and the
# score recomputed by running the event through the model, so they can be
# compared by eye.
# NOTE(review): this cell repeats the same checks as the preceding check cell;
# consider keeping only one copy.
idxs = [0, 2314, 132445, -1]

# (label used in the section header, model-input array, key into `datasets`)
# NOTE(review): the same `idxs` are applied to every sample, which assumes each
# array has more than 132445 rows — confirm for the smaller samples.
samples_to_check = [
    ('test data', topo2A_test, 'EB_test2'),
    ('train data', topo2A_train, 'EB_train'),
    ('HLT_noalg_L1_all data', Topo_2A_L1all, 'HLT_noalg_eb_L1All'),
]

for sample_label, model_inputs, dataset_key in samples_to_check:
    # The header and the event label below are derived from the sample being
    # checked, so they cannot drift out of sync with the data that is printed.
    print(f'\n\nStarting checks of {sample_label}!\n\n')

    # run these events through the model
    predictions = model.predict(model_inputs[idxs])
    # score = mean of the squared model outputs along axis 1
    _AD_scores = np.mean(np.square(predictions), axis=1)

    stored = datasets[dataset_key]
    for i, idx in enumerate(idxs):
        print(f'\n---\nStarting check for index {idx}')
        print(f'topo 2A event:\n{model_inputs[idx]}\n')
        print(f"{dataset_key} event:\n{stored['L1_data'][idx]}\n")
        print(f"topo2A AD score held in datasets dict:\n{stored['topo2A_AD_scores'][idx]}")

        # score obtained above by running the event through the model
        print(f'AD score calculate by running the event through the model:\n{_AD_scores[i]}')



Starting checks of test data!



---
Starting check for index 0
topo 2A event:
[ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.09667968 -1.34375006 -3.37475777  0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.01215075  0.48962492]

EB_test event:
[[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [10.5        -2.16250014 -2.69980621]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.   