In [13]:
import numpy as np
import pandas as pd
import os

## Get paths

In [14]:
PART_NAME_ARR = ['nuel', 'numu', 'nutau']
PART_PDG_ARR = [12, 14, 16]

In [15]:
DATA_PATH = dict()
DATA_PATH['nuel']  = "~/snd_data/nue"
DATA_PATH['numu']  = "~/snd_data/numu"
DATA_PATH['nutau'] = "~/snd_data/nutau"

In [16]:
all_files = []

In [17]:
for (part_id, part_type) in enumerate(PART_NAME_ARR):
    dir_name  = os.path.expanduser(DATA_PATH[part_type])
    file_list = os.listdir(dir_name)
    file_num  = len(file_list)
    
    print('Directory {} contains {} files'.format(dir_name, file_num))
    
    for file_name in file_list:
        full_file_path = os.path.join(dir_name, file_name)
        file_id = int(file_name)
        all_files.append([part_id, file_id, full_file_path])

Directory /home/fsergeev/snd_data/nue contains 100 files
Directory /home/fsergeev/snd_data/numu contains 97 files
Directory /home/fsergeev/snd_data/nutau contains 100 files


In [18]:
# [particle type id, file number, full file path]
all_files

[[0, 93, '/home/fsergeev/snd_data/nue/93'],
 [0, 17, '/home/fsergeev/snd_data/nue/17'],
 [0, 28, '/home/fsergeev/snd_data/nue/28'],
 [0, 67, '/home/fsergeev/snd_data/nue/67'],
 [0, 76, '/home/fsergeev/snd_data/nue/76'],
 [0, 68, '/home/fsergeev/snd_data/nue/68'],
 [0, 42, '/home/fsergeev/snd_data/nue/42'],
 [0, 84, '/home/fsergeev/snd_data/nue/84'],
 [0, 91, '/home/fsergeev/snd_data/nue/91'],
 [0, 30, '/home/fsergeev/snd_data/nue/30'],
 [0, 18, '/home/fsergeev/snd_data/nue/18'],
 [0, 57, '/home/fsergeev/snd_data/nue/57'],
 [0, 31, '/home/fsergeev/snd_data/nue/31'],
 [0, 78, '/home/fsergeev/snd_data/nue/78'],
 [0, 13, '/home/fsergeev/snd_data/nue/13'],
 [0, 27, '/home/fsergeev/snd_data/nue/27'],
 [0, 34, '/home/fsergeev/snd_data/nue/34'],
 [0, 1, '/home/fsergeev/snd_data/nue/1'],
 [0, 52, '/home/fsergeev/snd_data/nue/52'],
 [0, 43, '/home/fsergeev/snd_data/nue/43'],
 [0, 61, '/home/fsergeev/snd_data/nue/61'],
 [0, 0, '/home/fsergeev/snd_data/nue/0'],
 [0, 55, '/home/fsergeev/snd_data/nu

In [19]:
dataset_path = os.path.expanduser('~/snd_data/dataset')

## Repack data

We save in separate files following the [webdataset format](https://github.com/webdataset/webdataset/blob/master/notebooks/gettingstarted.ipynb). The events are subsequently archived in shards to multiple `tar` archives.

In [20]:
!mkdir -p ~/snd_data/tar_dataset

In [21]:
import webdataset as wds
from itertools import islice

In [22]:
from random import shuffle

shuffle(all_files)

In [23]:
tar_dataset_name = os.path.expanduser('~/snd_data/tar_dataset/out-%05d.tar')
MAX_COUNT = 10000

file_counter = 0
event_counter = 0
part_type_counter = [0, 0, 0]


with wds.ShardWriter(tar_dataset_name, maxcount=MAX_COUNT) as sink:
    for (part_id, file_id, file_name) in all_files:
        pdg = PART_PDG_ARR[part_id]
        resp_scifi = pd.read_pickle(os.path.join(file_name, "tt_cleared.pkl"))
        resp_muons = pd.read_pickle(os.path.join(file_name, "mu_cleared.pkl"))
        energy     = pd.read_pickle(os.path.join(file_name,  "y_cleared.pkl"))

        assert(len(resp_scifi) == len(resp_muons))
        assert(len(resp_scifi) == len(energy))

        event_num = len(resp_scifi) # number of events recorded in the file

        for i in range(event_num):
            data   = [resp_scifi.iloc[i], resp_muons.iloc[i]]
            target = [pdg, energy.iloc[i]['E']]
            
            event_key  = str(event_counter).zfill(8)
            event_name = 'event_' + event_key
            event_path = os.path.join(dataset_path, event_name)
            
            sample = {'__key__' : event_key,
                      'pyd'     : data,
                      'json'    : target}

            sink.write(sample)
            event_counter  += 1
            
        file_counter  += 1
        part_type_counter[part_id] += event_num

# writing /home/fsergeev/snd_data/tar_dataset/out-00000.tar 0 0.0 GB 0
# writing /home/fsergeev/snd_data/tar_dataset/out-00001.tar 10000 0.3 GB 10000
# writing /home/fsergeev/snd_data/tar_dataset/out-00002.tar 10000 0.4 GB 20000
# writing /home/fsergeev/snd_data/tar_dataset/out-00003.tar 10000 0.3 GB 30000
# writing /home/fsergeev/snd_data/tar_dataset/out-00004.tar 10000 0.4 GB 40000
# writing /home/fsergeev/snd_data/tar_dataset/out-00005.tar 10000 0.4 GB 50000
# writing /home/fsergeev/snd_data/tar_dataset/out-00006.tar 10000 0.3 GB 60000
# writing /home/fsergeev/snd_data/tar_dataset/out-00007.tar 10000 0.4 GB 70000
# writing /home/fsergeev/snd_data/tar_dataset/out-00008.tar 10000 0.3 GB 80000
# writing /home/fsergeev/snd_data/tar_dataset/out-00009.tar 10000 0.5 GB 90000
# writing /home/fsergeev/snd_data/tar_dataset/out-00010.tar 10000 0.2 GB 100000
# writing /home/fsergeev/snd_data/tar_dataset/out-00011.tar 10000 0.4 GB 110000
# writing /home/fsergeev/snd_data/tar_dataset/out-00012.ta

# writing /home/fsergeev/snd_data/tar_dataset/out-00103.tar 10000 0.5 GB 1030000
# writing /home/fsergeev/snd_data/tar_dataset/out-00104.tar 10000 0.4 GB 1040000
# writing /home/fsergeev/snd_data/tar_dataset/out-00105.tar 10000 0.3 GB 1050000
# writing /home/fsergeev/snd_data/tar_dataset/out-00106.tar 10000 0.2 GB 1060000
# writing /home/fsergeev/snd_data/tar_dataset/out-00107.tar 10000 0.3 GB 1070000
# writing /home/fsergeev/snd_data/tar_dataset/out-00108.tar 10000 0.3 GB 1080000
# writing /home/fsergeev/snd_data/tar_dataset/out-00109.tar 10000 0.2 GB 1090000
# writing /home/fsergeev/snd_data/tar_dataset/out-00110.tar 10000 0.3 GB 1100000
# writing /home/fsergeev/snd_data/tar_dataset/out-00111.tar 10000 0.5 GB 1110000
# writing /home/fsergeev/snd_data/tar_dataset/out-00112.tar 10000 0.4 GB 1120000
# writing /home/fsergeev/snd_data/tar_dataset/out-00113.tar 10000 0.4 GB 1130000
# writing /home/fsergeev/snd_data/tar_dataset/out-00114.tar 10000 0.4 GB 1140000
# writing /home/fsergeev/snd

In [24]:
# number of processed pickled files
file_counter

297

In [25]:
# number of processed events
event_counter

1184000

In [26]:
# total number of tar archives
len(os.listdir(os.path.expanduser('~/snd_data/tar_dataset')))

119

In [27]:
# size of the directory
!du -sh ~/snd_data/tar_dataset

40G	/home/fsergeev/snd_data/tar_dataset


## Dataset

In [30]:
import torch
from torch.utils.data import IterableDataset
import webdataset as wds
from itertools import islice

In [32]:
%%bash
curl -s file:///home/fsergeev/snd_data/tar_dataset/out-00000.tar | tar tf - | sed 10q
# tar archive file structure

00000000.json
00000000.pyd
00000001.json
00000001.pyd
00000002.json
00000002.pyd
00000003.json
00000003.pyd
00000004.json
00000004.pyd


In [33]:
# what archived files contain
url = 'file:///home/fsergeev/snd_data/tar_dataset/out-00000.tar'
dataset = wds.WebDataset(url)

for sample in islice(dataset, 0, 3):
    for key, value in sample.items():
        print(key, repr(value)[:50])
    print()

__key__ '00000000'
json b'[14, 327.03418112366097]'
pyd b'\x80\x03]q\x00(cpandas.core.series\nSeries\nq\x0

__key__ '00000001'
json b'[14, 1785.1891117359125]'
pyd b'\x80\x03]q\x00(cpandas.core.series\nSeries\nq\x0

__key__ '00000002'
json b'[14, 438.0602822294983]'
pyd b'\x80\x03]q\x00(cpandas.core.series\nSeries\nq\x0



In [35]:
# unpacking the dataset
dataset = (
    wds.WebDataset(url)
    .shuffle(MAX_COUNT)
    .decode('pil')
    .to_tuple('pyd', 'json')
)

for data, target in islice(dataset, 0, 1):
    scifi_resp, muon_resp = data
    pdg, energy = target
    
    print(scifi_resp)
    print(muon_resp)
    print(pdg)
    print(energy)

PX         [-0.0042868126183748245, -0.0003130360855720937]
PY            [0.000397871364839375, 0.0014051293255761266]
PZ           [0.008921915665268898, 0.00012290953600313514]
X                 [-12.897856712341309, 19.056102752685547]
Y                 [-3.165752410888672, -15.070075988769531]
Z                   [25.988750457763672, 3.707683563232422]
PdgCode                                            [11, 11]
Name: 4, dtype: object
PX         [0.001095384475775063, -0.031043685972690582, ...
PY         [0.0014260945608839393, 0.0341348722577095, 0....
PZ         [0.009616779163479805, 0.00252715777605772, 0....
X          [12.612625122070312, -25.633800268173218, -27....
Y          [-31.92073945999146, -25.74629287719727, -20.5...
Z          [136.96112060546875, 49.26005554199219, 49.149...
PdgCode    [11, -11, 11, 22, -211, 11, 11, -11, -11, -11,...
Name: 4, dtype: object
14
2276.4224677946913


## Digitization

It takes too much space to save digitized detector response, so we digitize the signal in preprocessing.

In [36]:
from utils import DataPreprocess, Parameters
from src.operate_datasets import detector_planes_num

# detector geometry
DET_PARAMS = Parameters("SNDatLHC")
DET_CONFIG = DET_PARAMS.snd_params[DET_PARAMS.configuration]

# number of detector planes
FILT_NUM = detector_planes_num(DET_PARAMS)
FILT_NUM

Welcome to JupyROOT 6.18/00


{'scifi': 5, 'up_mu': 5, 'down_mu': 3}

In [37]:
from net import digitize_signal_scifi          as digit_scifi
from net import digitize_signal_upstream_mu    as digit_up_mu
from net import digitize_signal_downstream_mu  as digit_dn_mu


# detector signal digitization procedure
def digitize(resp):
    resp_scifi, resp_muons = resp
    
    dig_scifi = digit_scifi(resp_scifi, DET_PARAMS, FILT_NUM['scifi'])
    dig_up_mu = digit_up_mu(resp_muons, DET_PARAMS, FILT_NUM['up_mu'])
    dig_dn_mu = digit_dn_mu(resp_muons, DET_PARAMS, FILT_NUM['down_mu'])
    
    return [dig_scifi, dig_up_mu, dig_dn_mu]


# energy normalization
def normalize(cls):
    NORM_CONST = 1. / 4000.
    pdg, energy = cls
    
    return [pdg, energy * NORM_CONST]

In [38]:
dataset = (
    wds.WebDataset(url)
    .shuffle(MAX_COUNT)
    .decode('pil')
    .to_tuple('pyd', 'json')
    .map_tuple(digitize, normalize)
)

for data, target in islice(dataset, 0, 1):
    scifi, up_mu, dn_mu = data
    pdg, energy = target
    
    print(scifi)
    print(up_mu)
    print(dn_mu)
    print(pdg)
    print(energy)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
[[[0.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]]

 [[1.]
  [1.]
  [0.]
  [1.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [1.]
  [1.]
  [1.]
 