In [1]:
from utils import DataPreprocess, Parameters

import numpy as np
import pandas as pd
import os

Welcome to JupyROOT 6.18/00


In [2]:
DETECTOR_PARAMS = Parameters("SNDatLHC")
DETECTOR_CONFIG = DETECTOR_PARAMS.snd_params[DETECTOR_PARAMS.configuration]

from src.process_pickle import *

In [3]:
PART_TYPE_ARR = ['nuel', 'numu', 'nutau']

In [18]:
DATA_PATH = dict()
DATA_PATH['nuel']  = "~/snd_data/nue"
DATA_PATH['numu']  = "~/snd_data/numu"
DATA_PATH['nutau'] = "~/snd_data/nutau"

EVENTS_PER_FILE = 4000 # todo -> read from the files ?
FILES_NUM       = 100  

In [19]:
full_paths = dict()

for part_type in PART_TYPE_ARR:
    full_paths[part_type] = os.path.expandvars(DATA_PATH[part_type])

In [20]:
full_paths

{'nuel': '~/snd_data/nue',
 'numu': '~/snd_data/numu',
 'nutau': '~/snd_data/nutau'}

In [21]:
from src.operate_datasets import detector_planes_num

filt_num = detector_planes_num(DETECTOR_PARAMS)

# number of detector planes - extracted from the detector geometry
filt_num

In [25]:
file_size = 1 #FILES_NUM  
step_size = EVENTS_PER_FILE

n_steps = int(file_size / step_size) # number of chunks


file_id = 99 # file num (pickled file contains multiple events) 

proc_file_path = full_paths['nuel']
particle_type  = 'nuel'


outpath = proc_file_path + "/{}".format(file_id)
chunklist_scifi = pd.read_pickle(os.path.join(outpath, "tt_cleared.pkl"))
chunklist_muons = pd.read_pickle(os.path.join(outpath, "mu_cleared.pkl"))
chunklist_energ = pd.read_pickle(os.path.join(outpath,  "y_cleared.pkl"))

In [26]:
det_params = DETECTOR_PARAMS

In [27]:
from net import digitize_signal_scifi          as digit_scifi
from net import digitize_signal_upstream_mu    as digit_up_mu
from net import digitize_signal_downstream_mu  as digit_dn_mu

# take subset of events in the file to ease computation
events_per_file = 100 #step_size

energy       = []
resp_scifi   = []
resp_up_mu   = []
resp_down_mu = []

# digitize the events
for ev_id in range(events_per_file):
    energy       .append(chunklist_energ['E'][ev_id])
    resp_scifi   .append(digit_scifi(chunklist_scifi.iloc[ev_id], det_params, filt_num['scifi']))
    resp_up_mu   .append(digit_up_mu(chunklist_muons.iloc[ev_id], det_params, filt_num['up_mu']))
    resp_down_mu .append(digit_dn_mu(chunklist_muons.iloc[ev_id], det_params, filt_num['down_mu']))

In [41]:
# construct a dataframe with all info about the event (scifi, mu resp, energy)
file_id_arr  = np.full(events_per_file, file_id)
file_pos_arr = np.arange(events_per_file)
event_id     = file_id * step_size + file_pos_arr
    
df = pd.DataFrame({'file_id' : file_id_arr,
                  'file_pos' : file_pos_arr,
                  'event_id' : event_id,
                  'energy'   : energy,
                  'scifi'    : resp_scifi,
                  'up_mu'    : resp_up_mu,
                  'dn_mu'    : resp_down_mu})

# todo (to lower memory consumption):
# categorical for particle type
# dtypes

In [29]:
df

Unnamed: 0,file_id,file_pos,event_id,energy,scifi,up_mu,dn_mu
0,99,0,396000,539.551934,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
1,99,1,396001,800.003587,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
2,99,2,396002,439.818584,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
3,99,3,396003,612.684584,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4,99,4,396004,498.400765,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
...,...,...,...,...,...,...,...
95,99,95,396095,2044.944618,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [1.0], [1.0], [0.0], [0.0], [1.0], [0...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
96,99,96,396096,361.222199,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [0.0], [0.0], [0.0], [1.0], [1.0], [1...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
97,99,97,396097,940.337260,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
98,99,98,396098,970.424941,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."


In [35]:
# estimate disk usage
keys_arr = df.keys()
memory_usage = df.memory_usage(index=True, deep=True)
total_mem_usage = memory_usage.sum()

In [36]:
total_mem_usage # Byte

171657728

In [38]:
total_mem_usage / (1024**2) # MB

163.70556640625

In [42]:
# 164 MB for 100 events
# we have 3 * 100 * 3000 events

# thus memory
164 / 100 * 3 * 100 * 3000 / 1024 # GB
# too much

1441.40625

In comparison

`(base) [fsergeev@lphepc119 snd_data]$ du -sh *`

| Directory  | Size |
|:----:|:-----------:|
|     nue     |  19G |
|     numu    | 5.7G |
|    nutau    |  12G |


(directory contains pickled files for each of the particle types)