In [2]:
import os
import csv

In [3]:
from root_manager.settings import FilterParams, ProcessorConfig, ChunkGeneratorConfig
from settings import BatchGeneratorConfig, NormParams, AugmentParams
from ustils_for_stats import StatsCollector
import config_manager as cfgm

## Choose paths and their proportions to create the dataset

In [4]:
# Directories holding the input .root files, one directory per simulated sample
# (presumably atmospheric muons, atmospheric neutrinos and a nue2 sample up to
# 100 PeV, judging by the directory names — TODO confirm).
# NOTE(review): these are absolute, machine-specific paths; adjust them when
# running the notebook elsewhere.
path_mu = "/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/"
path_nuatm = "/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/nuatm/root/all/"
path_nu2 = "/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/nue2_100pev/root/all/"
def explore_paths(p: str, start: int, stop: int):
    """Return full paths of the files in directory ``p``, selected by position.

    The directory listing is sorted by name using plain string order (so
    ``"10206.root"`` comes before ``"1021.root"``) and then sliced as
    ``[start:stop]``.

    Args:
        p: Directory to list, with or without a trailing path separator.
        start: Index of the first entry to take from the sorted listing.
        stop: Index one past the last entry to take; ``None`` takes everything
            up to the end of the listing.

    Returns:
        A list with ``p`` joined to each selected entry name. The list is
        shorter than ``stop - start`` when the directory holds fewer entries.

    Raises:
        OSError: If ``p`` cannot be listed (propagated from ``os.listdir``).
    """
    names = sorted(os.listdir(p))[start:stop]
    # os.path.join adds the separator only when ``p`` does not already end
    # with one, so both "dir" and "dir/" produce valid paths.
    return [os.path.join(p, name) for name in names]

In [5]:
# Training file lists: the first 80 / 100 / 6 entries of each sample's
# sorted directory listing.
mu_paths = explore_paths(path_mu, start=0, stop=80)
nuatm_paths = explore_paths(path_nuatm, start=0, stop=100)
nu2_paths = explore_paths(path_nu2, start=0, stop=6)

# Single flat list, in the order mu -> nuatm -> nu2.
train_paths = [*mu_paths, *nuatm_paths, *nu2_paths]

## Choose processing settings

In [7]:
# Event-processing settings; the same object is passed both to the statistics
# collector and to the chunk generator config further down.
proc_cfg = ProcessorConfig(
    center_times=True,
    calc_tres=False,
    filter_cfg=FilterParams(
        only_signal=True,
        min_hits=5,
        min_strings=2,
        min_Q=0,
        t_threshold=100_000,
    ),
)

## Collect stats of processed events to adjust the proportion of files

In [11]:
# Build the collector from the three per-sample file lists and the processing config.
stats = StatsCollector(mu_paths, nuatm_paths, nu2_paths, proc_cfg)

In [12]:
# Collect statistics over dataset indices 0 to 50 (presumably file indices
# within each sample's list — TODO confirm against StatsCollector), then
# fetch the results as a dict.
stats.get_stats(0, 50)
stats_dict = stats.return_stats()

2024-10-24 12:21:43,570 - INFO - Processing datasets from indices 0 to 50.
2024-10-24 12:23:21,567 - INFO - Calculating mean and std for Q and t
2024-10-24 12:23:21,569 - INFO - Estimating mean and std for field: PulsesAmpl
2024-10-24 12:23:21,731 - INFO - Field 'PulsesAmpl': Estimated Mean = 5.1921, Std = 57.4926
2024-10-24 12:23:21,733 - INFO - Estimating mean and std for field: PulsesTime
2024-10-24 12:23:21,841 - INFO - Field 'PulsesTime': Estimated Mean = -0.0000, Std = 239.8131
2024-10-24 12:23:21,842 - INFO - Collected dataset statistics successfully. Ratios - Mu/Nu: 0.9867, NuAtm/Nu2: 9.3249


In [13]:
# Display the collected statistics (estimated event counts, filter
# coefficients, sample ratios and Q/t mean and std).
stats_dict

{'proc_cfg': ProcessorConfig(center_times=True, calc_tres=False, filter_cfg=FilterParams(only_signal=True, min_hits=5, min_strings=2, min_Q=0, t_threshold=100000)),
 'mu_num_estimated': 1082993.6,
 'nuatm_num_estimated': 991286.0,
 'nu2_num_estimated': 106305.0,
 'mu_filter_koef': 0.4209888967118046,
 'nuatm_filter_koef': 0.25229468476080913,
 'nu2_filter_koef': 0.8313846635122981,
 'mu_nu_ratio': 0.9867005104815911,
 'nuatm_nu2_ratio': 9.32492356897606,
 'Q_mean': 5.192124376403326,
 'Q_std': 57.492594050332194,
 't_mean': -6.81645089946574e-09,
 't_std': 239.81307352095754}

## Create the final config of the batch generator

In [10]:
# Full batch-generator configuration: chunk loading, batching, normalisation
# and augmentation settings in one object.
cfg = BatchGeneratorConfig(
    chunk_generator_cfg=ChunkGeneratorConfig(
        chunk_size=25,
        processor_params=proc_cfg,
        fields=["PulsesAmpl", "PulsesTime", "Xrel", "Yrel", "Zrel", "nu_induced"],
        shuffle_paths=True,
    ),
    batch_size=256,
    features_name=["PulsesAmpl", "PulsesTime", "Xrel", "Yrel", "Zrel"],
    labels_name=["nu_induced"],
    do_norm=True,
    # The PulsesTime / PulsesAmpl pairs match the rounded t_mean/t_std and
    # Q_mean/Q_std values reported by StatsCollector (presumably each pair
    # is [mean, std] — TODO confirm against NormParams).
    norm_params=NormParams(
        PulsesTime=[0, 240.0],
        PulsesAmpl=[5.2, 57.5],
        Xrel=[0, 60],
        Yrel=[0, 60],
        Zrel=[0, 260],
    ),
    do_augment=True,
    augment_params=AugmentParams(
        PulsesTime=5,
        PulsesAmpl=0.1,
        Xrel=2,
        Yrel=2,
        Zrel=5,
    ),
    shuffle=True,
)

## Choose files based on the collected stats

In [11]:
# SMALL
# Each sample's sorted file listing is cut into three consecutive,
# non-overlapping index windows: train, then test, then val.

# Train: 80 mu, 100 nuatm and 6 nu2 files
mu_paths = explore_paths(path_mu, start=0, stop=80)
nuatm_paths = explore_paths(path_nuatm, start=0, stop=100)
nu2_paths = explore_paths(path_nu2, start=0, stop=6)
train_paths = [*mu_paths, *nuatm_paths, *nu2_paths]

# Test: the next 40 mu, 50 nuatm and 3 nu2 files
test_mu_paths = explore_paths(path_mu, start=80, stop=120)
test_nuatm_paths = explore_paths(path_nuatm, start=100, stop=150)
test_nu2_paths = explore_paths(path_nu2, start=6, stop=9)
test_paths = [*test_mu_paths, *test_nuatm_paths, *test_nu2_paths]

# Val: the following 160 mu, 200 nuatm and 12 nu2 files
val_mu_paths = explore_paths(path_mu, start=120, stop=280)
val_nuatm_paths = explore_paths(path_nuatm, start=150, stop=350)
val_nu2_paths = explore_paths(path_nu2, start=9, stop=21)
val_paths = [*val_mu_paths, *val_nuatm_paths, *val_nu2_paths]

# # # BIG
# # # Read train paths
# mu_paths = explore_paths(path_mu, 0, 800)
# nuatm_paths = explore_paths(path_nuatm, 0, 1000)
# nu2_paths = explore_paths(path_nu2, 0, 60)

# train_paths = mu_paths + nuatm_paths + nu2_paths

# # Read test paths
# test_mu_paths = explore_paths(path_mu, 800, 800+40)
# test_nuatm_paths = explore_paths(path_nuatm, 1000, 1000+50)
# test_nu2_paths = explore_paths(path_nu2, 60, 60+3)

# test_paths = test_mu_paths + test_nuatm_paths + test_nu2_paths

# # Read val paths
# val_mu_paths = explore_paths(path_mu, 800+40, None)
# val_nuatm_paths = explore_paths(path_nuatm, 1000+50, None)
# val_nu2_paths = explore_paths(path_nu2, 60+3, None)

# val_paths = val_mu_paths + val_nuatm_paths + val_nu2_paths

## Save chosen dataset: config and paths

In [12]:
# Name of the sub-directory under ./configurations/ that receives this
# dataset's cfg.yaml and path lists.
name_of_config = 'numusep_signal_small'

In [13]:
# exist_ok=False makes os.makedirs raise if the directory already exists, so
# this cell stops before touching a previously saved configuration.
out_dir = f'./configurations/{name_of_config}'
os.makedirs(out_dir, exist_ok=False)

# One YAML file for the batch-generator config ...
cfgm.save_cfg(cfg, path=f'{out_dir}/cfg.yaml')

# ... and one CSV of file paths per split.
for split_name, split_paths in (
    ('train', train_paths),
    ('test', test_paths),
    ('val', val_paths),
):
    cfgm.save_paths(split_paths, f'{out_dir}/{split_name}_paths.csv')

## How to read the configuration

### Read Cfg

In [9]:
# Load the saved batch-generator config back from YAML and show it as a plain dict.
batches_cfg = cfgm.load_cfg(f'./configurations/{name_of_config}/cfg.yaml')
batches_cfg.to_dict()

{'chunk_generator_cfg': {'chunk_size': 50,
  'processor_params': {'center_times': True,
   'calc_tres': False,
   'filter_cfg': {'only_signal': True,
    'min_hits': 5,
    'min_strings': 2,
    'min_Q': 0,
    't_threshold': 100000}},
  'fields': ['PulsesAmpl', 'PulsesTime', 'Xrel', 'Yrel', 'Zrel', 'nu_induced'],
  'shuffle_paths': True},
 'batch_size': 256,
 'features_name': ['PulsesAmpl', 'PulsesTime', 'Xrel', 'Yrel', 'Zrel'],
 'labels_name': ['nu_induced'],
 'do_norm': True,
 'norm_params': {'PulsesTime': [0, 240.0],
  'PulsesAmpl': [6.0, 95.0],
  'Xrel': [0, 60],
  'Yrel': [0, 60],
  'Zrel': [0, 260]},
 'do_augment': True,
 'augment_parmas': {'PulsesTime': 5,
  'PulsesAmpl': 0.1,
  'Xrel': 2,
  'Yrel': 2,
  'Zrel': 5},
 'shuffle': True}

### Read paths

In [10]:
# Load one of the saved path lists and preview its first entries.
# The save step writes train_paths.csv, test_paths.csv and val_paths.csv;
# there is no plain paths.csv, so the training list is read here.
paths = cfgm.read_paths(f'./configurations/{name_of_config}/train_paths.csv')
paths[0:10]

['/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10048.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10091.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10170.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10185.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10206.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/1021.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/1024.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/1025.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10300.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10333.root']

# Test

In [14]:
import config_manager as cfgm
from batch_generator import BatchGenerator

# Smoke test: rebuild the batch generator purely from the files saved on disk
# and pull a single batch from it.
name_of_dataset = "numusep_signal_small"

# Training file list written by the save step
path_to_train_paths = f"./configurations/{name_of_dataset}/train_paths.csv"
train_paths = cfgm.read_paths(path_to_train_paths)

# Batch-generator config written by the save step
path_to_cfg = f"./configurations/{name_of_dataset}/cfg.yaml"
cfg = cfgm.load_cfg(path_to_cfg)

gen = BatchGenerator(train_paths, cfg)
batches = gen.get_batches()
# Only the first batch is inspected; the loop exits right after printing its shapes.
for batch in batches:
    inputs, targets = batch
    print(inputs.shape, targets.shape)
    break

2024-10-24 12:29:16,321 - INFO - Shuffled all file paths. Total paths: 186
2024-10-24 12:29:36,124 - INFO - #0 chunk loaded.


torch.Size([256, 81, 5]) torch.Size([256, 1])
