In [9]:
# Biblioteca
import polars as pl
import numpy as np
import os
import re

# Função para formatar o nome do arquivo
def formatar_string(s):
    """Split a file name such as ``carbon1E14`` into ``'Carbon 1E14'``.

    The name must start with a run of ASCII letters immediately followed by
    a number in scientific notation (digits, optional decimal part, ``E``/``e``
    and an unsigned integer exponent). Anything after that prefix is ignored.

    Args:
        s: File name to format.

    Returns:
        ``'<Word> <number>'`` with the word capitalized, or ``s`` unchanged
        when it does not start with the expected pattern.
    """
    encontrado = re.match(r'([a-zA-Z]+)([0-9]+(?:\.[0-9]+)?[Ee][0-9]+)', s)
    # Guard clause: names outside the pattern are passed through untouched.
    if encontrado is None:
        return s
    prefixo, valor = encontrado.groups()
    return ' '.join((prefixo.capitalize(), valor))

# Root folder that is walked recursively; every file found is treated as a
# candidate data file.
pasta = 'tratados_3det'
dataframes = []

# Number of consecutive lines assumed to form one simulation block in a file.
# The row index integer-divided by this value is used as the simulation id.
# NOTE(review): presumably one EVTH line plus three TRIG lines - confirm
# against the file format.
LINES_PER_SIMULATION = 4
# Fixed seed so the final shuffle (and the exported dataset order) is
# reproducible after Restart Kernel -> Run All.
SHUFFLE_SEED = 42

for root, dirs, files in os.walk(pasta):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # `dirs` in place steers the traversal and sorting `files` fixes the
    # per-folder order, so the concatenation order is deterministic.
    dirs.sort()
    for arquivo in sorted(files):
        caminho_completo = os.path.join(root, arquivo)

        # Validate the file name before the (expensive) read: composition and
        # energy are taken from the formatted name, so a name outside the
        # expected pattern cannot be labelled and is skipped explicitly
        # instead of failing later with an opaque IndexError.
        name = formatar_string(arquivo)
        partes_nome = name.split()
        if len(partes_nome) < 2:
            print(f'Ignorando arquivo com nome fora do padrão: {arquivo}')
            continue

        print(f'Processando arquivo: {arquivo}')

        try:
            # Read the whole file as a single headerless text column.
            df_total = pl.read_csv(caminho_completo, has_header=False)

            # Tag every line with the id of the fixed-size block it belongs to.
            df_total = df_total.with_columns(
                (pl.arange(0, df_total.height, eager=True) // LINES_PER_SIMULATION)
                .alias("simulation_id")
            )

            # Parse the TRIG lines: label, four particle counts and a time.
            df_trig = df_total.filter(pl.col('column_1').str.contains('TRIG'))
            df_trig = df_trig.with_columns(
                pl.col("column_1").str.split(" ").alias("split_column")
            )
            df_trig = df_trig.with_columns(
                pl.col("split_column").list.get(0).alias("TRIG"),
                pl.col("split_column").list.get(1).cast(pl.Int64).alias("positrons"),
                pl.col("split_column").list.get(2).cast(pl.Int64).alias("electrons"),
                pl.col("split_column").list.get(3).cast(pl.Int64).alias("muons_plus"),
                pl.col("split_column").list.get(4).cast(pl.Int64).alias("muons_minus"),
                pl.col("split_column").list.get(5).cast(pl.Float64).alias("time")
            )
            df_trig = df_trig.with_columns(
                (pl.col("positrons") + pl.col("electrons") +
                 pl.col("muons_plus") + pl.col("muons_minus")).alias("total_particles")
            )

            # A simulation is valid when its smallest per-TRIG particle total
            # is positive, i.e. every TRIG line of the block saw a particle.
            valid_simulations = (
                df_trig.group_by("simulation_id")
                .agg(pl.col("total_particles").min() > 0)
                .filter(pl.col("total_particles"))
                .select("simulation_id")
            )

            # Keep the EVTH lines of the valid simulations only.
            df_evth = df_total.filter(
                (pl.col('column_1').str.contains('EVTH')) &
                (pl.col("simulation_id").is_in(valid_simulations["simulation_id"])))

            # theta and phi are the last two space-separated tokens of the line.
            df_evth = df_evth.with_columns(
                pl.col("column_1").str.split(" ").alias("split_column")
            )
            df_evth = df_evth.with_columns(
                pl.col("split_column").list.tail(2).list.get(0).cast(pl.Float64).alias("theta"),
                pl.col("split_column").list.tail(2).list.get(1).cast(pl.Float64).alias("phi")
            ).select(["simulation_id", "theta", "phi"])

            # Attach the angles to every TRIG row of the same simulation.
            df_trig_valid = df_trig.filter(
                pl.col("simulation_id").is_in(valid_simulations["simulation_id"])
            )
            df_final_file = df_trig_valid.join(df_evth, on="simulation_id", how="left")

            # Metadata taken from the file name (both stored as strings).
            df_final_file = df_final_file.with_columns(
                pl.lit(partes_nome[0]).alias("composition"),
                pl.lit(partes_nome[1]).alias("energy")
            )

            # Keep only the columns used downstream.
            df_final_file = df_final_file.select(
                ["TRIG", "total_particles", "simulation_id", "time", "theta", "phi", "composition", "energy"]
            ).rename({"total_particles": "particles"})

            dataframes.append(df_final_file)

        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f'Erro ao processar {caminho_completo}: {e}')

# Combine the per-file frames and shuffle the rows reproducibly.
if dataframes:
    df_final = pl.concat(dataframes)
    df_final = df_final.sample(fraction=1.0, shuffle=True, seed=SHUFFLE_SEED)
    print(df_final)
else:
    print("Nenhum dado válido encontrado.")

Processando arquivo: carbon1E14
Processando arquivo: carbon1E15
Processando arquivo: carbon3.16E14
Processando arquivo: carbon3.16E15
Processando arquivo: iron1E14
Processando arquivo: iron1E15
Processando arquivo: iron3.16E14
Processando arquivo: iron3.16E15
Processando arquivo: nitrogen1E14
Processando arquivo: nitrogen1E15
Processando arquivo: nitrogen3.16E14
Processando arquivo: nitrogen3.16E15
Processando arquivo: oxygen1E14
Processando arquivo: oxygen1E15
Processando arquivo: oxygen3.16E14
Processando arquivo: oxygen3.16E15
Processando arquivo: photon1E14
Processando arquivo: photon1E15
Processando arquivo: photon3.16E14
Processando arquivo: photon3.16E15
Processando arquivo: proton1E14
Processando arquivo: proton1E15
Processando arquivo: proton3.16E14
Processando arquivo: proton3.16E15
shape: (13_518, 8)
┌───────┬───────────┬───────────────┬──────────┬────────┬──────────┬─────────────┬─────────┐
│ TRIG  ┆ particles ┆ simulation_id ┆ time     ┆ theta  ┆ phi      ┆ composition ┆ e

In [10]:
# Display the combined, shuffled dataset (rendered as the cell's last expression).
df_final

TRIG,particles,simulation_id,time,theta,phi,composition,energy
str,i64,i64,f64,f64,f64,str,str
"""TRIG2""",6,847,95827.3,37.259,5.451,"""Photon""","""1E15"""
"""TRIG2""",4,36,132236.0,25.93,141.853,"""Nitrogen""","""1E15"""
"""TRIG1""",11,153,95358.0,25.005,-161.438,"""Carbon""","""1E15"""
"""TRIG2""",654,228,94215.2,15.905,6.843,"""Photon""","""3.16E15"""
"""TRIG2""",45,560,101724.0,16.438,108.59,"""Photon""","""1E15"""
…,…,…,…,…,…,…,…
"""TRIG2""",1,311,77381.7,11.267,80.708,"""Photon""","""1E14"""
"""TRIG3""",393,229,35777.6,15.505,104.674,"""Proton""","""1E15"""
"""TRIG2""",1,788,141217.0,34.627,-148.253,"""Iron""","""1E15"""
"""TRIG3""",5,400,58698.0,30.571,104.096,"""Proton""","""3.16E14"""


In [3]:
# Inspect the TRIG1 rows of photon showers whose per-file simulation id is 13.
df_final.filter(
    pl.col("TRIG").eq("TRIG1")
    & pl.col("composition").eq('Photon')
    & pl.col("simulation_id").eq(13)
)

TRIG,particles,simulation_id,composition,energy
str,i64,i64,str,str
"""TRIG1""",7,13,"""Photon""","""1E15"""
"""TRIG1""",68,13,"""Photon""","""3.16E15"""


In [4]:
# Inspect every TRIG1 row that belongs to a photon shower.
df_final.filter(
    pl.col("TRIG").eq("TRIG1")
    & pl.col("composition").eq('Photon')
)

TRIG,particles,simulation_id,composition,energy
str,i64,i64,str,str
"""TRIG1""",36,995,"""Photon""","""1E15"""
"""TRIG1""",9,583,"""Photon""","""3.16E14"""
"""TRIG1""",2,352,"""Photon""","""1E14"""
"""TRIG1""",4,216,"""Photon""","""1E15"""
"""TRIG1""",45,589,"""Photon""","""1E15"""
…,…,…,…,…
"""TRIG1""",6,991,"""Photon""","""1E15"""
"""TRIG1""",43,688,"""Photon""","""3.16E14"""
"""TRIG1""",18,612,"""Photon""","""1E15"""
"""TRIG1""",135,941,"""Photon""","""1E15"""


In [11]:
# Min-max normalization of the particle count into 'particles_normalized',
# followed by the export of the resulting frame to CSV.
density_min = df_final['particles'].min()
density_max = df_final['particles'].max()

# With no rows there is no min/max, and with a zero range the division would
# be 0/0 and fill the exported column with NaN; use a constant 0.0 instead.
if density_min is None or density_max == density_min:
    particles_normalized_expr = pl.lit(0.0)
else:
    particles_normalized_expr = (
        (pl.col('particles') - density_min) / (density_max - density_min)
    )

df_normalized = df_final.with_columns(
    particles_normalized_expr.alias('particles_normalized')
)
df_normalized.write_csv('data_neural_network_3det.csv')

In [12]:
# Display the dataset including the 'particles_normalized' column.
df_normalized

TRIG,particles,simulation_id,time,theta,phi,composition,energy,particles_normalized
str,i64,i64,f64,f64,f64,str,str,f64
"""TRIG2""",6,847,95827.3,37.259,5.451,"""Photon""","""1E15""",0.004223
"""TRIG2""",4,36,132236.0,25.93,141.853,"""Nitrogen""","""1E15""",0.002534
"""TRIG1""",11,153,95358.0,25.005,-161.438,"""Carbon""","""1E15""",0.008446
"""TRIG2""",654,228,94215.2,15.905,6.843,"""Photon""","""3.16E15""",0.55152
"""TRIG2""",45,560,101724.0,16.438,108.59,"""Photon""","""1E15""",0.037162
…,…,…,…,…,…,…,…,…
"""TRIG2""",1,311,77381.7,11.267,80.708,"""Photon""","""1E14""",0.0
"""TRIG3""",393,229,35777.6,15.505,104.674,"""Proton""","""1E15""",0.331081
"""TRIG2""",1,788,141217.0,34.627,-148.253,"""Iron""","""1E15""",0.0
"""TRIG3""",5,400,58698.0,30.571,104.096,"""Proton""","""3.16E14""",0.003378


In [7]:
# Read one file and keep only its TRIG lines, parsed into typed columns.

arquivo = 'tratados_3det/photon/photon1E15'

df = (
    pl.read_csv(arquivo, has_header=False)
    .filter(pl.col('column_1').str.contains('TRIG'))
    # Tokenize each line on single spaces.
    .with_columns(pl.col("column_1").str.split(" ").alias("split_column"))
    # Token 0 is the TRIG label; tokens 1-4 are integer particle counts.
    .with_columns(
        pl.col("split_column").list.get(0).alias("TRIG"),
        *[
            pl.col("split_column").list.get(posicao).cast(pl.Int64).alias(coluna)
            for posicao, coluna in enumerate(
                ("positrons", "electrons", "muons_plus", "muons_minus"), start=1
            )
        ],
    )
    .with_columns(
        (
            pl.col("positrons")
            + pl.col("electrons")
            + pl.col("muons_plus")
            + pl.col("muons_minus")
        ).alias("total_particles")
    )
)

# Consecutive groups of three TRIG rows share one simulation id.
df = df.with_columns(
    (pl.arange(0, df.height, eager=True) // 3).alias("simulation_id")
)

# A simulation is valid only when every one of its rows has total_particles > 0.
valid_simulations = (
    df
    .group_by("simulation_id")
    .agg(
        (pl.col("total_particles") > 0).all().alias("all_detectors_positive")
    )
    .filter(pl.col("all_detectors_positive"))
    .select("simulation_id")
)

# Keep only the rows that belong to a valid simulation.
filtered_df = df.filter(
    pl.col("simulation_id").is_in(valid_simulations["simulation_id"])
)

print(filtered_df)

shape: (1_662, 9)
┌────────────┬────────────┬───────┬───────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ column_1   ┆ split_colu ┆ TRIG  ┆ positrons ┆ … ┆ muons_plus ┆ muons_min ┆ total_par ┆ simulatio │
│ ---        ┆ mn         ┆ ---   ┆ ---       ┆   ┆ ---        ┆ us        ┆ ticles    ┆ n_id      │
│ str        ┆ ---        ┆ str   ┆ i64       ┆   ┆ i64        ┆ ---       ┆ ---       ┆ ---       │
│            ┆ list[str]  ┆       ┆           ┆   ┆            ┆ i64       ┆ i64       ┆ i64       │
╞════════════╪════════════╪═══════╪═══════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ TRIG1 16   ┆ ["TRIG1",  ┆ TRIG1 ┆ 16        ┆ … ┆ 0          ┆ 0         ┆ 32        ┆ 5         │
│ 16 0 0     ┆ "16", …    ┆       ┆           ┆   ┆            ┆           ┆           ┆           │
│            ┆ "0"]       ┆       ┆           ┆   ┆            ┆           ┆           ┆           │
│ TRIG2 10 8 ┆ ["TRIG2",  ┆ TRIG2 ┆ 10        ┆ … ┆ 0          ┆ 0       