In [1]:
import os
import csv

In [2]:
from root_manager.settings import FilterParams, ProcessorConfig, ChunkGeneratorConfig
from settings import BatchGeneratorConfig, NormParams, AugmentParams
from ustils_for_stats import StatsCollector
import config_manager as cfgm

## Choose paths and their proportions to create the dataset

In [3]:
# Root directory of the MC_2020 files. Each sample sits in "<sample>/root/all/"
# beneath it, so moving the dataset only requires changing this one line.
MC_2020_DIR = "/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/"

# Per-sample directories. The trailing slash is kept so the resulting strings
# stay identical to the ones already stored in saved path lists.
path_mu = f"{MC_2020_DIR}muatm/root/all/"
path_nuatm = f"{MC_2020_DIR}nuatm/root/all/"
path_nu2 = f"{MC_2020_DIR}nue2_100pev/root/all/"
def explore_paths(p: str, start: int = 0, stop: "int | None" = None):
    """Return full paths of the files in directory ``p``, sliced in sorted order.

    The directory listing is sorted by name (plain string order, so
    "10000.root" sorts before "1001.root") and the slice ``[start:stop]``
    of that sorted list is returned.

    Args:
        p: Directory to list; a trailing path separator is optional.
        start: Index of the first file to take.
        stop: Index one past the last file to take; ``None`` means "to the end".

    Returns:
        List of paths, each one being ``p`` joined with a file name.
    """
    files = sorted(os.listdir(p))[start:stop]
    # os.path.join adds the separator only when ``p`` lacks one, so a directory
    # given without a trailing slash no longer produces "dirfile.root".
    return [os.path.join(p, file) for file in files]

In [14]:
# Count the entries in the muon directory to see how many files can be sliced from.
mu_files = os.listdir(path_mu)
len(mu_files)

20004

In [21]:
# Preliminary selection: the first N files (in sorted-name order) of each sample.
# The per-sample lists are what the stats collection further down receives.
mu_paths = explore_paths(path_mu, start=0, stop=110)
nuatm_paths = explore_paths(path_nuatm, start=0, stop=85)
nu2_paths = explore_paths(path_nu2, start=0, stop=15)

# Single flat list: muon files first, then nuatm, then nu2.
train_paths = [*mu_paths, *nuatm_paths, *nu2_paths]

## Choose processing settings

In [22]:
# Filter settings: no signal-only selection and every minimum cut set to 0.
event_filter = FilterParams(
    only_signal=False,
    min_hits=0,
    min_strings=0,
    min_Q=0,
    t_threshold=100000,
)

# Processing settings, reused by the stats collection and the chunk generator below.
proc_cfg = ProcessorConfig(
    center_times=True,
    calc_tres=False,
    filter_cfg=event_filter,
)

## Collect stats of processed events to adjust proportion of files

In [23]:
# Stats collector over the three per-sample file lists, using the processing settings chosen above.
stats = StatsCollector(mu_paths, nuatm_paths, nu2_paths, proc_cfg)

In [24]:
# NOTE(review): the meaning of the (0, 10) arguments is not visible from this
# notebook — presumably a start/stop range of files per sample; confirm in StatsCollector.
stats.get_stats(0, 10)
# Fetch the collected stats as a dict (displayed in the next cell).
stats_dict = stats.return_stats()

In [25]:
stats_dict

{'proc_cfg': ProcessorConfig(center_times=True, calc_tres=False, filter_cfg=FilterParams(only_signal=False, min_hits=0, min_strings=0, min_Q=0, t_threshold=100000)),
 'mu_num_estimated': 3595427.0,
 'nuatm_num_estimated': 3357789.0,
 'nu2_num_estimated': 349810.5,
 'mu_filter_koef': 1.0169630217327048,
 'nuatm_filter_koef': 1.0042990468878308,
 'nu2_filter_koef': 1.0929750807286907,
 'mu_nu_ratio': 0.9697452489137514,
 'nuatm_nu2_ratio': 9.598879965009626,
 'Q_mean': 1.6307861845694616,
 'Q_std': 41.172829510175546,
 't_mean': 1.0548509105373332e-07,
 't_std': 1371.8097639882328}

## Create final config of batch generator

In [27]:
# Column names used as model inputs and as the target.
feature_names = ["PulsesAmpl", "PulsesTime", "Xrel", "Yrel", "Zrel"]
label_names = ["nu_induced"]

chunk_cfg = ChunkGeneratorConfig(
    chunk_size=25,
    processor_params=proc_cfg,
    # The chunk generator is asked for every feature plus the label.
    fields=feature_names + label_names,
    shuffle_paths=True,
)

# One two-element list per feature. The PulsesTime / PulsesAmpl pairs match the
# rounded (t_mean, t_std) / (Q_mean, Q_std) values from the stats output above.
# NOTE(review): presumably [shift, scale] — confirm against NormParams.
normalization = NormParams(
    PulsesTime=[0, 1372.0],
    PulsesAmpl=[1.6, 41.2],
    Xrel=[0, 60],
    Yrel=[0, 60],
    Zrel=[0, 260],
)

# One number per feature.
# NOTE(review): presumably the augmentation noise magnitude — confirm against AugmentParams.
augmentation = AugmentParams(
    PulsesTime=5,
    PulsesAmpl=0.1,
    Xrel=2,
    Yrel=2,
    Zrel=5,
)

# Final batch-generator configuration assembled from the pieces above.
cfg = BatchGeneratorConfig(
    chunk_generator_cfg=chunk_cfg,
    batch_size=256,
    features_name=feature_names,
    labels_name=label_names,
    do_norm=True,
    norm_params=normalization,
    do_augment=True,
    augment_params=augmentation,
    shuffle=True,
)

## Choose files based on the collected stats

In [28]:
# SMALL preset: number of files per sample in the train and test slices.
# Files are taken in sorted-name order per sample:
#   [0, train) -> train, [train, train + test) -> test, everything after -> val.
n_mu_train, n_mu_test = 55, 8
n_nuatm_train, n_nuatm_test = 44, 6
n_nu2_train, n_nu2_test = 7, 1

# BIG preset (use instead of the three lines above):
# n_mu_train, n_mu_test = 800, 40
# n_nuatm_train, n_nuatm_test = 1000, 50
# n_nu2_train, n_nu2_test = 60, 3

# Train paths
mu_paths = explore_paths(path_mu, 0, n_mu_train)
nuatm_paths = explore_paths(path_nuatm, 0, n_nuatm_train)
nu2_paths = explore_paths(path_nu2, 0, n_nu2_train)
train_paths = mu_paths + nuatm_paths + nu2_paths

# Test paths: the slice directly after the train files.
test_mu_paths = explore_paths(path_mu, n_mu_train, n_mu_train + n_mu_test)
test_nuatm_paths = explore_paths(path_nuatm, n_nuatm_train, n_nuatm_train + n_nuatm_test)
test_nu2_paths = explore_paths(path_nu2, n_nu2_train, n_nu2_train + n_nu2_test)
test_paths = test_mu_paths + test_nuatm_paths + test_nu2_paths

# Val paths: stop=None takes every remaining file of each sample.
# NOTE(review): for the muon sample that is nearly all of the 20004 directory
# entries counted earlier — confirm such a large validation list is intended.
val_mu_paths = explore_paths(path_mu, n_mu_train + n_mu_test, None)
val_nuatm_paths = explore_paths(path_nuatm, n_nuatm_train + n_nuatm_test, None)
val_nu2_paths = explore_paths(path_nu2, n_nu2_train + n_nu2_test, None)
val_paths = val_mu_paths + val_nuatm_paths + val_nu2_paths


## Save chosen dataset: config and paths

In [29]:
# Name of the sub-directory under ./configurations/ that this dataset is saved into.
name_of_config = 'numusep_all_small'

In [30]:
config_dir = f'./configurations/{name_of_config}'

# exist_ok=False: raise instead of writing into an already existing configuration
# directory, so re-running this cell with the same name fails rather than overwrites.
os.makedirs(config_dir, exist_ok=False)

cfgm.save_cfg(cfg, path=f'{config_dir}/cfg.yaml')

# One CSV per split, written in train / test / val order.
for split_name, split_paths in (
    ('train', train_paths),
    ('test', test_paths),
    ('val', val_paths),
):
    cfgm.save_paths(split_paths, f'{config_dir}/{split_name}_paths.csv')

## How to read a saved configuration

### Read Cfg

In [31]:
# Load the saved batch-generator config back from YAML and display it as a plain dict.
batches_cfg = cfgm.load_cfg(f'./configurations/{name_of_config}/cfg.yaml')
batches_cfg.to_dict()

{'chunk_generator_cfg': {'chunk_size': 25,
  'processor_params': {'center_times': True,
   'calc_tres': False,
   'filter_cfg': {'only_signal': False,
    'min_hits': 0,
    'min_strings': 0,
    'min_Q': 0,
    't_threshold': 100000}},
  'fields': ['PulsesAmpl', 'PulsesTime', 'Xrel', 'Yrel', 'Zrel', 'nu_induced'],
  'shuffle_paths': True},
 'batch_size': 256,
 'features_name': ['PulsesAmpl', 'PulsesTime', 'Xrel', 'Yrel', 'Zrel'],
 'labels_name': ['nu_induced'],
 'do_norm': True,
 'norm_params': {'PulsesTime': [0, 1372.0],
  'PulsesAmpl': [1.6, 41.2],
  'Xrel': [0, 60],
  'Yrel': [0, 60],
  'Zrel': [0, 260]},
 'do_augment': True,
 'augment_params': {'PulsesTime': 5,
  'PulsesAmpl': 0.1,
  'Xrel': 2,
  'Yrel': 2,
  'Zrel': 5},
 'shuffle': True}

### Read paths

In [36]:
# Read the saved train file list back; the result is a list of path strings.
paths = cfgm.read_paths(f'./configurations/{name_of_config}/train_paths.csv')
paths

['/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/1000.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10000.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10001.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10002.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10003.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10004.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10005.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10006.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10007.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10008.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/10009.root',
 '/net/62/home3/ivkhar/Baikal/data/initial_data/MC_2020/muatm/root/all/1001.r

# Test

In [14]:
import config_manager as cfgm
from batch_generator import BatchGenerator

# NOTE(review): this loads "numusep_signal_small", not the "numusep_all_small"
# configuration saved earlier in the notebook — that directory must already exist.
name_of_dataset = "numusep_signal_small"
dataset_dir = f"./configurations/{name_of_dataset}"

path_to_train_paths = f"{dataset_dir}/train_paths.csv"
path_to_cfg = f"{dataset_dir}/cfg.yaml"

# Rebinds train_paths and cfg to the values stored on disk.
train_paths = cfgm.read_paths(path_to_train_paths)
cfg = cfgm.load_cfg(path_to_cfg)

gen = BatchGenerator(train_paths, cfg)
batches = gen.get_batches()

# Smoke test: pull only the first batch and show the input / target shapes.
for batch in batches:
    inputs, targets = batch
    print(inputs.shape, targets.shape)
    break

2024-10-24 12:29:16,321 - INFO - Shuffled all file paths. Total paths: 186
2024-10-24 12:29:36,124 - INFO - #0 chunk loaded.


torch.Size([256, 81, 5]) torch.Size([256, 1])
