In [1]:
import os
import sys

# Root of the exp-connection-hep checkout. The EXP_CONNECTION_HEP_REPO environment
# variable overrides the default, so the notebook can run from another checkout
# without editing this cell.
repo_path = os.environ.get('EXP_CONNECTION_HEP_REPO',
                           '/home/micael.verissimo/paper_lzt/exp-connection-hep/')

# Put the repository first on the import path so that `src.*` resolves.
# The guard keeps the cell idempotent: re-running it no longer stacks duplicate entries.
if repo_path not in sys.path:
    sys.path.insert(0, repo_path)

In [2]:
import os
import json 
import time 
import pickle
import itertools

import numpy as np
import pandas as pd

# Configure TensorFlow logging before importing TensorFlow itself.
# TF_CPP_MIN_LOG_LEVEL is read by TensorFlow's native logging when the library is
# loaded, so it must already be in the environment when `import tensorflow` runs;
# setting it after the import does not silence the messages emitted during import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf

# Suppress TensorFlow's Python-side warnings (optional)
tf.get_logger().setLevel('ERROR')

import matplotlib.pyplot as plt

from typing import Any, List, Callable, Dict, Union

from src.callbacks import sp_index
from src.constants import GeV, et_bins, eta_bins
from src.decorators import Summary, Reference

from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight




2025-07-09 19:34:00.287630: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-09 19:34:00.287677: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-09 19:34:00.288562: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-09 19:34:00.293916: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Welcome to JupyROOT 6.30/02


In [3]:
# Report the TensorFlow build and the visible devices, then set up GPU memory
# growth and the CPU thread pools.
import tensorflow as tf

print("TensorFlow version:", tf.__version__)
print("Available devices:")
for device in tf.config.list_physical_devices():
    print(f"  {device}")

# Turn on memory growth for every GPU that TensorFlow can see
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("No GPUs detected, using CPU")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")
    else:
        print(f"GPU memory growth enabled for {len(gpus)} GPU(s)")

# If GPU problems persist, CPU execution can be forced by uncommenting:
# tf.config.set_visible_devices([], 'GPU')
# print("Forcing CPU usage")

# Thread-pool sizes for CPU operations; the original notebook passes 0 to use all available cores
tf.config.threading.set_intra_op_parallelism_threads(0)
tf.config.threading.set_inter_op_parallelism_threads(0)

print("TensorFlow configuration completed")

TensorFlow version: 2.15.0
Available devices:
  PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
  PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
GPU memory growth enabled for 1 GPU(s)
TensorFlow configuration completed


2025-07-09 19:34:41.956313: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-09 19:34:42.190267: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-09 19:34:42.190465: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [4]:
# Read the processed signal and background samples (one parquet file each for now;
# more files can be appended to the lists).
signal_files = ['zee_avg250_100k.parquet']
background_files = ['jf17_avg250_100k.parquet']

sgn_df = pd.concat(
    [pd.read_parquet(os.path.join(repo_path, f'data/processed/{iname}')) for iname in signal_files],
    axis=0,
)
bkg_df = pd.concat(
    [pd.read_parquet(os.path.join(repo_path, f'data/processed/{iname}')) for iname in background_files],
    axis=0,
)

# Stack both samples and keep only rows with |cl_eta| <= 2.5 and cl_et >= 15000
m_df = pd.concat([sgn_df, bkg_df], axis=0)
eta_selection = m_df.cl_eta.abs() <= 2.5
et_selection = m_df.cl_et >= 15000
m_df = m_df.loc[eta_selection & et_selection]

# Names of the 100 ring columns fed to the model
input_cols = [f"cl_ring_{ring}" for ring in range(100)]

In [5]:
# Number of selected rows per target class
m_df['target'].value_counts()

target
1    121786
0    113291
Name: count, dtype: int64

In [6]:

# Label every row with the index of its et bin and of its |eta| bin.
# Each axis is labelled once, outside the nested loop: the et label does not
# depend on the eta bin (and vice versa), so there is no need to rewrite it on
# every inner iteration.
for iet, (l_iet, h_iet) in enumerate(et_bins):
    m_df.loc[(m_df.cl_et >= l_iet*GeV) & (m_df.cl_et < h_iet*GeV), 'et_bin'] = iet

abs_eta = m_df.cl_eta.abs()
for ieta, (l_ieta, h_ieta) in enumerate(eta_bins):
    m_df.loc[(abs_eta >= l_ieta) & (abs_eta < h_ieta), 'eta_bin'] = ieta

# Count the rows that fall into each (et, eta) cell; `tot` is compared with
# len(m_df) in the next cell to check that no row was left without a bin.
tot = 0
for iet, (l_iet, h_iet) in enumerate(et_bins):
    for ieta, (l_ieta, h_ieta) in enumerate(eta_bins):
        print(f'Processing et bin = [{l_iet}, {h_iet}[ and eta bin = [{l_ieta}, {h_ieta}[')
        tot += int(((m_df.et_bin == iet) & (m_df.eta_bin == ieta)).sum())

Processing et bin = [15.0, 30[ and eta bin = [0.0, 0.8[
Processing et bin = [15.0, 30[ and eta bin = [0.8, 1.37[
Processing et bin = [15.0, 30[ and eta bin = [1.37, 1.54[
Processing et bin = [15.0, 30[ and eta bin = [1.54, 2.37[
Processing et bin = [15.0, 30[ and eta bin = [2.37, 2.5[
Processing et bin = [30.0, 50.0[ and eta bin = [0.0, 0.8[
Processing et bin = [30.0, 50.0[ and eta bin = [0.8, 1.37[
Processing et bin = [30.0, 50.0[ and eta bin = [1.37, 1.54[
Processing et bin = [30.0, 50.0[ and eta bin = [1.54, 2.37[
Processing et bin = [30.0, 50.0[ and eta bin = [2.37, 2.5[
Processing et bin = [50.0, inf[ and eta bin = [0.0, 0.8[
Processing et bin = [50.0, inf[ and eta bin = [0.8, 1.37[
Processing et bin = [50.0, inf[ and eta bin = [1.37, 1.54[
Processing et bin = [50.0, inf[ and eta bin = [1.54, 2.37[
Processing et bin = [50.0, inf[ and eta bin = [2.37, 2.5[


In [7]:
# Both numbers must match: every selected row belongs to exactly one (et, eta) bin
(len(m_df), tot)


(235077, 235077)

In [8]:
# Bin indices that actually occur in the data
(m_df['et_bin'].unique(), m_df['eta_bin'].unique())

(array([0., 1., 2.]), array([1., 2., 3., 0., 4.]))

In [9]:
def norm1(data):
    """
    Divide every row of `data` by the absolute value of that row's sum.

    Rows whose sum is exactly zero are divided by 1 instead, so they are
    returned unchanged and no division by zero occurs.

    Args:
        data (np.ndarray): Array with one sample per row (summed along axis 1).

    Returns:
        np.ndarray: New array with the same shape as `data`; the input is not modified.
    """
    row_totals = np.abs(data.sum(axis=1, keepdims=True))
    # Swap zero totals for ones (same dtype) before dividing
    divisors = np.where(row_totals == 0, np.ones_like(row_totals), row_totals)
    return data / divisors

In [10]:
def build_simple_mlp(input_shape: int, n_layers: int, n_units: int, seed: int=512) -> tf.keras.Model:
    """
    Builds a simple Multi-Layer Perceptron (MLP) with a single sigmoid output.

    The network is: Input -> `n_layers` x Dense(`n_units`, relu) ->
    Dense(1, linear) named 'output_for_inference' -> sigmoid activation named
    'output_for_training'.

    Args:
        input_shape (int): The number of input features for the model.
        n_layers (int): The number of hidden layers in the MLP. With 0 the
            output layer is connected directly to the input.
        n_units (int): The number of neurons in each hidden layer.
        seed (int): Seed given to the GlorotUniform kernel initializer of every layer.

    Returns:
        tf.keras.Model: The Keras model. It is NOT compiled; the caller must
        call `compile` before training.
    """
    inputs = tf.keras.layers.Input(shape=(input_shape,), name='Input')
    # Start the chain at the input so that n_layers == 0 still yields a valid
    # graph (previously `dense` was unbound in that case).
    hidden = inputs
    for ilayer in range(n_layers):
        # NOTE(review): every layer's initializer is created with the same `seed`;
        # confirm that identical seeding across layers is intended.
        hidden = tf.keras.layers.Dense(n_units, activation='relu', name=f'dense_layer_{ilayer}',
                                       kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed),
                                       bias_initializer='zeros')(hidden)
    # classification layer: linear output first, sigmoid applied by a separate named layer
    logits = tf.keras.layers.Dense(1, activation='linear', name='output_for_inference',
                                   kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed),
                                   bias_initializer='zeros')(hidden)
    output = tf.keras.layers.Activation('sigmoid', name='output_for_training')(logits)
    model = tf.keras.Model(inputs, output, name="model")
    return model

def class_weight(target: np.ndarray) -> pd.DataFrame:
    """
    Compute 'balanced' per-sample weights from the class labels.

    Every sample receives the weight of its own class as returned by
    scikit-learn's `compute_class_weight(class_weight='balanced', ...)`.
    The label-to-weight lookup works for any label encoding ([0, 1], [-1, 1], ...)
    and for a target that contains a single class.

    Args:
        target (np.ndarray): 1-D array of class labels (e.g., 0 and 1).

    Returns:
        pd.DataFrame: One-column frame named 'weight' (float32) holding one
                      weight per entry of `target`, in the same order.
    """
    classes = np.unique(target)
    # weights[k] is the balanced weight of classes[k]
    weights = np.asarray(compute_class_weight(class_weight='balanced', classes=classes, y=target))
    # `classes` is sorted, so searchsorted returns, for each label, the position of its weight
    sample_weight = weights[np.searchsorted(classes, target)].astype(np.float32)
    return pd.Series(sample_weight).to_frame('weight')


In [11]:
def train_model(dataframe: pd.DataFrame, input_cols: List[str], target_col: str,
                n_folds: int=5,
                n_epochs: int=100, 
                batch_size: int=1024, 
                seed: int=512,
                optimizer: Any=None,  # None -> a fresh legacy Adam is created for every fold
                loss: Any='binary_crossentropy',
                decorators: Union[List, None]=None,
                patience: int=25,
                verbose: bool=True,
                save_path: str=os.path.join(repo_path, 'data/models')) -> None:
    """
    Train one MLP per stratified fold and pickle each result to `save_path`.

    For every fold a 1-hidden-layer, 5-unit model from `build_simple_mlp` is
    trained on the `norm1`-normalised inputs with balanced sample weights, the
    decorators are run, and a dictionary with the training history, the model
    JSON, the weights, the fold index and the training time is written to
    `ss_model_et{et_bin}_eta{eta_bin}_fold{i}.pkl`.

    Args:
        dataframe (pd.DataFrame): Samples of a single (et, eta) bin. Must contain
            `input_cols`, `target_col`, and the `et_bin` / `eta_bin` columns
            (their first unique value is used in the output file name).
        input_cols (List[str]): Names of the feature columns.
        target_col (str): Name of the label column.
        n_folds (int): Number of stratified folds.
        n_epochs (int): Maximum number of epochs per fold.
        batch_size (int): Batch size used by `fit`.
        seed (int): Seed of the fold split and of the per-fold weight-initialisation seeds.
        optimizer (Any): Optimizer used as a template. A new instance is built from
            its config for every fold; None (or an object without `get_config`)
            selects legacy Adam with learning rate 0.001.
        loss (Any): Loss passed to `compile`.
        decorators (List): Callables invoked after each fold as
            `decorator(history, context)`, where `context` holds the model and
            the train/validation data. None means no decorators.
        patience (int): Patience forwarded to the `sp_index` callback.
        verbose (bool): Verbosity forwarded to `fit`.
        save_path (str): Output directory; created if it does not exist.

    Returns:
        None
    """
    if decorators is None:
        decorators = []

    # The bin labels only name the output files and are identical for every fold,
    # so resolve them once, and fail before any training if the columns are missing.
    et_bin = int(dataframe.et_bin.unique()[0])
    eta_bin = int(dataframe.eta_bin.unique()[0])
    os.makedirs(save_path, exist_ok=True)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    X, y = norm1(dataframe[input_cols].values), dataframe[target_col].values

    # Draw the per-fold initialisation seeds from a generator seeded with `seed`,
    # so that the initial weights are reproducible from run to run (the global
    # np.random state used before was never seeded).
    init_rng = np.random.default_rng(seed)

    for i, (train_index, test_index) in enumerate(skf.split(X, y)):

        x_train, y_train = X[train_index], y[train_index]
        x_test, y_test   = X[test_index] , y[test_index]

        init_seed = int(init_rng.integers(0, 1000))
        l_model = build_simple_mlp(input_shape=X.shape[1], n_layers=1, n_units=5, seed=init_seed)

        # Create a fresh optimizer instance for each model to avoid state conflicts
        if optimizer is not None and hasattr(optimizer, 'get_config'):
            # New instance of the same optimizer type, rebuilt from its config
            fresh_optimizer = optimizer.__class__.from_config(optimizer.get_config())
        else:
            # Default / fallback: legacy Adam, to avoid the variable recognition issue
            fresh_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)

        l_model.compile(optimizer=fresh_optimizer,
                        loss=loss,
                        metrics=['accuracy'])

        # Project-defined callback from src.callbacks; presumably monitors the SP index
        # on the validation fold and keeps the best weights — confirm in src.callbacks.
        sp_index_callback = sp_index(validation_data=(x_test, y_test),
                                     patience=patience, verbose=False, save_the_best=True)

        # Keras documents `sample_weight` as a flat array with one entry per sample,
        # so pass the 'weight' column as 1-D instead of the one-column DataFrame.
        train_weights = class_weight(y_train)['weight'].to_numpy()

        start = time.time()
        history = l_model.fit(x_train, y_train, epochs=n_epochs,
            batch_size=batch_size,
            verbose=verbose,
            validation_data=(x_test, y_test),
            sample_weight=train_weights,
            callbacks=[sp_index_callback],
            shuffle=True
        ).history
        end = time.time()

        # Decorators receive the history dict and a context with the model and both data splits
        for decorator in decorators:
            decorator(history , {'model':l_model, 'data':(x_train, y_train),  'data_val':(x_test, y_test) })

        d = { 
            'history'          : history, 
            'model'            : json.loads(l_model.to_json()), 
            'weights'          : l_model.get_weights(),
            'folds'            : i,
            'model_type'       : 'mlp_ss',
            'time_to_train'    : (end-start)}
        output = os.path.join(save_path, 
                              f'ss_model_et{et_bin}_eta{eta_bin}_fold{i}.pkl')
        # Context manager guarantees that the file is flushed and closed
        with open(output, 'wb') as f:
            pickle.dump(d, f)
        tf.keras.backend.clear_session()

In [12]:
# Train the k-fold models of every (et, eta) bin.
models_dir = os.path.join(repo_path, 'data/models/rings_v0_models')
# The pickles are written straight into this directory, so make sure it exists
os.makedirs(models_dir, exist_ok=True)

# Iterate over the bin definitions themselves rather than hard-coded 3 x 5 ranges,
# so that this loop cannot drift from et_bins / eta_bins.
for iet, ieta in itertools.product(range(len(et_bins)), range(len(eta_bins))):

    # Reference values for this bin, handed to the Reference decorator
    with open(os.path.join(repo_path, f'data/processed/references_et{iet}_eta{ieta}.json'), 'r') as f:
        ref = json.load(f)

    decorators = [Summary(detailed=True, verbose=False), Reference(ref, verbose=False)]
    l_data = m_df.loc[(m_df.et_bin == iet) & (m_df.eta_bin == ieta)]
    print(f'Training model for et bin = {iet} and eta bin = {ieta}')
    train_model(l_data, input_cols=input_cols, target_col='target',
                n_folds=5,
                n_epochs=1000,
                batch_size=512,
                seed=512,
                optimizer=None,  # None -> train_model uses a fresh legacy Adam per fold
                loss='binary_crossentropy',
                decorators=decorators,
                verbose=False,
                patience=25,
                save_path=models_dir)
    tf.keras.backend.clear_session()
    

Training model for et bin = 0 and eta bin = 0
Training model for et bin = 0 and eta bin = 1
Training model for et bin = 0 and eta bin = 2
Training model for et bin = 0 and eta bin = 3
Training model for et bin = 0 and eta bin = 4
Training model for et bin = 1 and eta bin = 0
Training model for et bin = 1 and eta bin = 1
Training model for et bin = 1 and eta bin = 2
Training model for et bin = 1 and eta bin = 3
Training model for et bin = 1 and eta bin = 4
Training model for et bin = 2 and eta bin = 0
Training model for et bin = 2 and eta bin = 1
Training model for et bin = 2 and eta bin = 2
Training model for et bin = 2 and eta bin = 3
Training model for et bin = 2 and eta bin = 4


2025-07-09 19:34:44.839411: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-09 19:34:44.839628: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-07-09 19:34:44.839758: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-