In [1]:
import sys
repo_path = '/home/micael.verissimo/paper_lzt/exp-connection-hep/'
sys.path.insert(0, repo_path)

In [2]:
import os
import json
import time
import pickle
import itertools

import numpy as np
import pandas as pd

from loguru import logger
from typing import List, Dict, Union
from src.constants import GeV, et_bins, eta_bins

import tensorflow as tf
from tensorflow.keras.models import Model
model_from_json = tf.keras.models.model_from_json

# Configurações do TensorFlow para evitar warnings
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

# Configurações adicionais para melhor performance e menos warnings
tf.get_logger().setLevel('ERROR')  # Reduz logs verbosos
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suprime warnings de compilação

# Tentar importar pandarallel, instalar se necessário
#from pandarallel import pandarallel
#pandarallel.initialize(progress_bar=True, nb_workers=6)

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing as mp

Welcome to JupyROOT 6.30/02


2025-07-11 17:18:51.412308: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-11 17:18:51.412352: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-11 17:18:51.413185: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-11 17:18:51.418100: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-07-11 17:19:08.095556: I external/local_xla/xla/

In [3]:
prefix = "cl_ring_%i"

# rings presample 
presample = [prefix % iring for iring in range(8 // 2)]

# EM1 list
sum_rings = 8
em1 = [prefix % iring for iring in range(sum_rings, sum_rings + (64 // 2))]

# EM2 list
sum_rings = 8 + 64
em2 = [prefix % iring for iring in range(sum_rings, sum_rings + (8 // 2))]

# EM3 list
sum_rings = 8 + 64 + 8
em3 = [prefix % iring for iring in range(sum_rings, sum_rings + (8 // 2))]

# HAD1 list
sum_rings = 8 + 64 + 8 + 8
had1 = [prefix % iring for iring in range(sum_rings, sum_rings + (4 // 2))]

# HAD2 list
sum_rings = 8 + 64 + 8 + 8 + 4
had2 = [prefix % iring for iring in range(sum_rings, sum_rings + (4 // 2))]

# HAD3 list
sum_rings = 8 + 64 + 8 + 8 + 4 + 4
had3 = [prefix % iring for iring in range(sum_rings, sum_rings + (4 // 2))]

input_cols_map = {
    'MLP-5-rings_v0' : [f"cl_ring_{idx}" for idx in range(100)],
    'MLP-5-rings_v1' : presample + em1 + em2 + em3 + had1 + had2 + had3,
    'MLP-5-SS' : ['cl_reta', 'cl_eratio', 'cl_rhad']    
}

In [4]:
sgn_df = pd.concat([pd.read_parquet(os.path.join(repo_path, f'data/processed/{iname}')) for iname in ['zee_avg300_50k.parquet']], axis=0)
bkg_df = pd.concat([pd.read_parquet(os.path.join(repo_path, f'data/processed/{iname}')) for iname in ['jf17_avg300_50k.parquet']], axis=0)

t_df = pd.concat([sgn_df, bkg_df], axis=0)
t_df = t_df.loc[t_df.cl_eta.abs() <= 2.5]
t_df = t_df.loc[t_df.cl_et >= 15000]

In [5]:
t_df.duplicated().sum()

0

In [6]:

tot = 0
for iet, (l_iet, h_iet) in enumerate(et_bins):
    for ieta, (l_ieta, h_ieta) in enumerate(eta_bins):
        print(f'Processing et bin = [{l_iet}, {h_iet}[ and eta bin = [{l_ieta}, {h_ieta}[')
        t_df.loc[(t_df.cl_et >= l_iet*GeV) & (t_df.cl_et < h_iet*GeV), 'et_bin'] = int(iet)
        t_df.loc[(t_df.cl_eta.abs() >= l_ieta) & (t_df.cl_eta.abs() < h_ieta), 'eta_bin'] = int(ieta)
        l_samples = len(t_df.loc[(t_df.et_bin == iet) & (t_df.eta_bin == ieta)])
        tot += l_samples

Processing et bin = [15.0, 30[ and eta bin = [0.0, 0.8[
Processing et bin = [15.0, 30[ and eta bin = [0.8, 1.37[
Processing et bin = [15.0, 30[ and eta bin = [1.37, 1.54[
Processing et bin = [15.0, 30[ and eta bin = [1.54, 2.37[
Processing et bin = [15.0, 30[ and eta bin = [2.37, 2.5[
Processing et bin = [30.0, 50.0[ and eta bin = [0.0, 0.8[
Processing et bin = [30.0, 50.0[ and eta bin = [0.8, 1.37[
Processing et bin = [30.0, 50.0[ and eta bin = [1.37, 1.54[
Processing et bin = [30.0, 50.0[ and eta bin = [1.54, 2.37[
Processing et bin = [30.0, 50.0[ and eta bin = [2.37, 2.5[
Processing et bin = [50.0, inf[ and eta bin = [0.0, 0.8[
Processing et bin = [50.0, inf[ and eta bin = [0.8, 1.37[
Processing et bin = [50.0, inf[ and eta bin = [1.37, 1.54[
Processing et bin = [50.0, inf[ and eta bin = [1.54, 2.37[
Processing et bin = [50.0, inf[ and eta bin = [2.37, 2.5[


In [7]:
best_sorts = pd.read_csv(os.path.join(repo_path, 'data/processed/best_sorts.csv'))
best_sorts.head()

Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,op_name,max_sp_val,max_sp_pd_val,...,pd_ref_passed,pd_ref_total,pd_val_passed,fa_val_passed,pd_val_total,fa_val_total,pd_op_passed,fa_op_passed,pd_op_total,fa_op_total
0,MLP-5-rings_v0,0,0,0,3,0,/home/micael.verissimo/paper_lzt/exp-connectio...,loose,0.983954,0.989324,...,11129.0,11241,2225,126,2248,5280,11129,519,11241,26399
1,MLP-5-rings_v0,0,0,0,3,0,/home/micael.verissimo/paper_lzt/exp-connectio...,medium,0.983954,0.989324,...,10679.0,11241,2138,44,2248,5280,10687,227,11241,26399
2,MLP-5-rings_v0,0,0,0,3,0,/home/micael.verissimo/paper_lzt/exp-connectio...,tight,0.983954,0.989324,...,10117.0,11241,2020,34,2248,5280,10119,158,11241,26399
3,MLP-5-rings_v0,0,1,0,1,0,/home/micael.verissimo/paper_lzt/exp-connectio...,loose,0.984891,0.991677,...,8326.0,8410,1667,78,1682,3612,8325,360,8410,18057
4,MLP-5-rings_v0,0,1,0,1,0,/home/micael.verissimo/paper_lzt/exp-connectio...,medium,0.984891,0.991677,...,7990.0,8410,1596,28,1682,3612,7989,149,8410,18057


In [8]:
def get_models_from_path(model_path: str):
    """
    Get the models from the given path.
    
    Parameters:
    model_path (str): The path to the model.
    
    Returns:
    list: A list of models.
    """
    with open(model_path, 'rb') as f:
        tuned_file = pickle.load(f)
    
    model = tuned_file['model']
    weights = tuned_file['weights']

    model = model_from_json( json.dumps(model, separators=(',', ':')) ) #custom_objects={'RpLayer':RpLayer} )
    model.set_weights( weights )
    new_model = Model(model.inputs, model.layers[-2].output)
    return model, new_model

def generate_model_dict(train_tag: str) -> Dict[str, Dict[str, Union[Model, float]]]:
    """
    Add classifier information to the dataframe.
    
    Parameters:
    df (dataframe): The input dataframe.
    train_tag (str): The tag of the training.
    
    Returns:
    dataframe: The dataframe with classifier information added.
    """
    model_dict = {}
    model_dict['input_cols'] = input_cols_map[train_tag]
    models_df = best_sorts.loc[best_sorts.train_tag == train_tag].copy()
    for iet, ieta in itertools.product(range(len(et_bins)),
                                       range(len(eta_bins))):
        logger.info(f'Processing et bin = {iet} and eta bin = {ieta}')
        bin_key    = f'et{iet}_eta{ieta}'
        bin_models = models_df.loc[(models_df.et_bin == iet) & (models_df.eta_bin == ieta)]
        model_dict[bin_key] = {}

        sigmoid_model, linear_model = get_models_from_path(bin_models.file_name.unique()[0])
        model_dict[bin_key]['s_model'] = sigmoid_model
        model_dict[bin_key]['l_model'] = linear_model
        for iop in ['tight', 'medium', 'loose']:
            model_dict[bin_key][iop] = bin_models.loc[bin_models.op_name == iop].iloc[0].thr_val

    return model_dict
        

In [9]:
ss_models = generate_model_dict('MLP-5-SS')
v0_models = generate_model_dict('MLP-5-rings_v0')
v1_models = generate_model_dict('MLP-5-rings_v1')

[32m2025-07-11 17:19:10.875[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_model_dict[0m:[36m38[0m - [1mProcessing et bin = 0 and eta bin = 0[0m
[32m2025-07-11 17:19:10.920[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_model_dict[0m:[36m38[0m - [1mProcessing et bin = 0 and eta bin = 1[0m
[32m2025-07-11 17:19:10.941[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_model_dict[0m:[36m38[0m - [1mProcessing et bin = 0 and eta bin = 2[0m
[32m2025-07-11 17:19:10.962[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_model_dict[0m:[36m38[0m - [1mProcessing et bin = 0 and eta bin = 3[0m
[32m2025-07-11 17:19:11.002[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_model_dict[0m:[36m38[0m - [1mProcessing et bin = 0 and eta bin = 4[0m
[32m2025-07-11 17:19:11.022[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_model_dict[0m:[36m38[0m - [1mProcessing et bin = 1 and eta bin = 0[0m
[32m2025-07-11 17:19:11.042[0m |

In [10]:
def norm1(data):
    norms = np.abs(data.sum(axis=1))
    norms[norms == 0] = 1
    return data / norms[:, None]

def add_classifier_information(row: pd.Series, 
                               model_dict: Dict[str, Dict[str, Union[Model, float]]],
                               ):
    """
    Add classifier information to the dataframe.
    
    Parameters:
    df (dataframe): The input dataframe.
    model_dict (dict): The dictionary containing the models and thresholds.
    train_tag (str): The tag of the training.
    
    Returns:
    dataframe: The dataframe with classifier information added.
    """
    et_bin = int(row['et_bin'])
    eta_bin = int(row['eta_bin'])
    bin_key = f'et{et_bin}_eta{eta_bin}'
    
    input_cols = model_dict['input_cols']
    if 'rings' in model_dict['input_cols'][0]:
        # For rings models, we need to normalize the input
        inputs = row[input_cols].values.reshape(1, -1)
        inputs = norm1(inputs).astype(np.float32)
    else:
        # For SS models, we do not need to normalize the input 
        inputs = row[input_cols].values.reshape(1, -1).astype(np.float32)
    # Use the processed inputs for both models
    s_o = model_dict[bin_key]['s_model'].predict(inputs, verbose=0)[0][0]
    l_o = model_dict[bin_key]['l_model'].predict(inputs, verbose=0)[0][0]
    
    tight_passed = int(s_o >= model_dict[bin_key]['tight'])
    medium_passed = int(s_o >= model_dict[bin_key]['medium'])
    loose_passed = int(s_o >= model_dict[bin_key]['loose'])

    return s_o, l_o, tight_passed, medium_passed, loose_passed

In [11]:


# Função otimizada para processamento em lote
def process_batch_vectorized(df_batch, model_dict, train_tag):
    """
    Processa um lote de dados de forma vetorizada
    """
    results = []
    input_cols = model_dict['input_cols']
    df_batch[f'{train_tag}_sigmoid']       = 0
    df_batch[f'{train_tag}_linear']        = 0
    df_batch[f'{train_tag}_tight_passed']  = 0
    df_batch[f'{train_tag}_medium_passed'] = 0
    df_batch[f'{train_tag}_loose_passed']  = 0
    # Agrupar por et_bin e eta_bin para processar em lotes maiores
    for iet, ieta in itertools.product(range(len(et_bins)), range(len(eta_bins))):
        logger.info(f'Processing et bin = {iet} and eta bin = {ieta}')
        # Filtrar o DataFrame para o bin atual       
        bin_filter = (df_batch.et_bin == iet) & (df_batch.eta_bin == ieta) 
        l_df = df_batch.loc[bin_filter]
        bin_key = f'et{int(iet)}_eta{int(ieta)}'
        
        if bin_key not in model_dict:
            continue
            
        # Preparar dados em lote
        if 'rings' in input_cols[0]:
            # Para rings models, normalizar
            batch_inputs = l_df[input_cols].values
            batch_inputs = norm1(batch_inputs).astype(np.float32)
        else:
            # Para SS models, não normalizar
            batch_inputs = l_df[input_cols].values.astype(np.float32)
        
        # Predição em lote (muito mais eficiente)
        s_outputs = model_dict[bin_key]['s_model'].predict(batch_inputs, verbose=0).flatten()
        l_outputs = model_dict[bin_key]['l_model'].predict(batch_inputs, verbose=0).flatten()
        df_batch.loc[bin_filter, f'{train_tag}_sigmoid'] = s_outputs
        df_batch.loc[bin_filter, f'{train_tag}_linear']  = l_outputs
        
        # Aplicar thresholds vetorizadamente
        df_batch.loc[bin_filter, f'{train_tag}_tight_passed']  = (s_outputs >= model_dict[bin_key]['tight']).astype(int)
        df_batch.loc[bin_filter, f'{train_tag}_medium_passed'] = (s_outputs >= model_dict[bin_key]['medium']).astype(int)
        df_batch.loc[bin_filter, f'{train_tag}_loose_passed']  = (s_outputs >= model_dict[bin_key]['loose']).astype(int)
    
    return df_batch
# Aplicar classificadores de forma otimizada
map_dict = {
    'ss' : ss_models,
    'v0' : v0_models,
    'v1' : v1_models,
}

print(f"Processando {len(t_df)} amostras...")

for train_tag, classifiers in map_dict.items():
    logger.info(f'Processing train tag = {train_tag} com processamento vetorizado...')
    start_time = time.time()
    # Método 1: Processamento totalmente vetorizado (mais rápido)
    try:
        results = process_batch_vectorized(t_df, classifiers, train_tag)
            
    except Exception as e:
        logger.error(f"Erro no processamento vetorizado: {e}")
    
    end_time = time.time()
    logger.info(f'Completed {train_tag} in {end_time - start_time:.2f} seconds')
logger.info("Processamento concluído!")

[32m2025-07-11 17:19:12.077[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m55[0m - [1mProcessing train tag = ss com processamento vetorizado...[0m
[32m2025-07-11 17:19:12.079[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_batch_vectorized[0m:[36m15[0m - [1mProcessing et bin = 0 and eta bin = 0[0m


Processando 117259 amostras...


 4.8158825e-03]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_batch.loc[bin_filter, f'{train_tag}_sigmoid'] = s_outputs
 -5.3310084 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_batch.loc[bin_filter, f'{train_tag}_linear']  = l_outputs
[32m2025-07-11 17:19:14.951[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_batch_vectorized[0m:[36m15[0m - [1mProcessing et bin = 0 and eta bin = 1[0m
[32m2025-07-11 17:19:16.853[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_batch_vectorized[0m:[36m15[0m - [1mProcessing et bin = 0 and eta bin = 2[0m
[32m2025-07-11 17:19:17.391[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_batch_vectorized[0m:[36m15[0m - [1mProcessing et bin = 0 and eta bin = 3[0m
[32m2025-07-11 17:19:20.392[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_batch_vectorized[0m:[36m15[0m - [1mProcessing et bin = 0 and eta bin = 4[0m
[32

In [12]:
t_df.head()

Unnamed: 0,EventNumber,RunNumber,avgmu,cl_deta,cl_dphi,cl_e,cl_e0,cl_e1,cl_e2,cl_e233,...,v0_sigmoid,v0_linear,v0_tight_passed,v0_medium_passed,v0_loose_passed,v1_sigmoid,v1_linear,v1_tight_passed,v1_medium_passed,v1_loose_passed
0,40000,0,276.0,0.2,0.2,50539.238281,682.221191,40379.671875,9523.637695,14319.364258,...,1.0,13971.744141,1,1,1,1.0,45603.742188,1,1,1
3,40002,0,316.0,0.2,0.2,92922.898438,-33.155022,41428.152344,50405.484375,34472.34375,...,1.0,22548.0,1,1,1,1.0,55837.890625,1,1,1
5,40040,0,342.0,0.2,0.2,83985.148438,-310.291595,64925.390625,19492.830078,18459.160156,...,1.0,115207.484375,1,1,1,1.0,59005.414062,1,1,1
6,40040,0,342.0,0.2,0.2,69624.15625,-301.798492,50327.515625,19697.140625,14645.37207,...,1.0,115207.484375,1,1,1,1.0,59005.414062,1,1,1
7,40041,0,231.0,0.2,0.2,48442.382812,233.507751,33527.433594,14191.050781,15718.147461,...,1.0,2353.416504,1,1,1,1.0,25257.417969,1,1,1


In [13]:
t_df.duplicated().sum()

0

In [14]:
t_df.to_parquet(os.path.join(repo_path, 'data/processed/lzt_zee_jf17_avg300_50k_classified.parquet'))