In [1]:
from pathlib import Path
import h5py
from pyarrow import csv

In [2]:
DATA_ROOT = '../train_sample'
#DATA_ROOT = '../training'

HDF5_GROUP_KWARGS = {'track_order': True}
HDF5_DATASET_KWARGS = {'fletcher32': True, 'chunks': True, 'compression': 'lzf'}
#HDF5_DATASET_KWARGS = {'fletcher32': True, 'chunks': True, 'compression': 'gzip', 'compression_opts': 9}  # Use GZIP if the filesystem is slow

## Write hits

In [3]:
hits_files = list(Path(DATA_ROOT).glob('event?????????-hits.csv.gz'))

In [4]:
len(hits_files)

50

In [5]:
%%time
hits_tables = [csv.read_csv(f) for f in hits_files]

CPU times: user 1.56 s, sys: 94.8 ms, total: 1.66 s
Wall time: 824 ms


In [6]:
hits_tables[0].to_pandas().head()

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-71.2343,-0.966963,-1502.5,7,2,1
1,2,-74.3945,-5.98036,-1502.5,7,2,1
2,3,-82.2111,-6.09399,-1502.5,7,2,1
3,4,-71.5644,-6.81087,-1502.5,7,2,1
4,5,-51.3558,-11.9411,-1502.5,7,2,1


In [7]:
hits_tables[0].to_pandas().max()

hit_id       106122.00
x              1024.90
y              1024.96
z              2955.50
volume_id        18.00
layer_id         14.00
module_id      3192.00
dtype: float64

In [8]:
%%time
with h5py.File('test.hdf5', 'w', **HDF5_GROUP_KWARGS) as fp:
    
    hits = fp.create_group('hits', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = hits.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)
    event_length = hits.create_dataset('event_length', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)

    # Individual hits
    hit_id = hits.create_dataset('hit_id', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)
    x = hits.create_dataset('x', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    y = hits.create_dataset('y', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    z = hits.create_dataset('z', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    volume_id = hits.create_dataset('volume_id', (0,), maxshape=(None,), dtype='i1', **HDF5_DATASET_KWARGS)
    layer_id  = hits.create_dataset( 'layer_id', (0,), maxshape=(None,), dtype='i1', **HDF5_DATASET_KWARGS)
    module_id = hits.create_dataset('module_id', (0,), maxshape=(None,), dtype='i2', **HDF5_DATASET_KWARGS)

    current_event_offset = 0
    for table in hits_tables:
        N_hits = table.shape[0]
        
        for group in [event_offset, event_length]:
            group.resize(group.shape[0] + 1, axis=0)
        event_length[-1] = N_hits
        event_offset[-1] = current_event_offset
        current_event_offset += N_hits

        for group in [hit_id, x, y, z, volume_id, layer_id, module_id]:
            group.resize(group.shape[0] + N_hits, axis=0)
        hit_id[-N_hits:] = table['hit_id']
        x[-N_hits:] = table['x']
        y[-N_hits:] = table['y']
        z[-N_hits:] = table['z']
        volume_id[-N_hits:] = table['volume_id']
        layer_id[ -N_hits:] = table[ 'layer_id']
        module_id[-N_hits:] = table['module_id']

CPU times: user 355 ms, sys: 121 ms, total: 476 ms
Wall time: 476 ms


In [9]:
# Validation
with h5py.File('test.hdf5', 'r') as f:

    import pyarrow as pa
    import numpy as np
    all_hits_table = pa.concat_tables(hits_tables)

    for feature in ['hit_id', 'x', 'y', 'z', 'volume_id', 'layer_id', 'module_id']:
        assert np.allclose(f['hits'][feature][:], all_hits_table[feature].to_numpy())

    assert all(f['hits/event_offset'][1:] - f['hits/event_offset'][:-1] == f['hits/event_length'][:-1])

    for idx in range(len(hits_tables)):
        offset = f['hits/event_offset'][idx]
        length = f['hits/event_length'][idx]
        for feature in ['hit_id', 'x', 'y', 'z', 'volume_id', 'layer_id', 'module_id']:
            assert np.allclose(f['hits'][feature][offset:offset+length], hits_tables[idx][feature].to_numpy())

## Write cells

In [10]:
cells_files = list(Path(DATA_ROOT).glob('event?????????-cells.csv.gz'))
assert(len(cells_files) == len(hits_files))

In [11]:
%%time
cells_tables = [csv.read_csv(f) for f in cells_files]

CPU times: user 2.68 s, sys: 121 ms, total: 2.8 s
Wall time: 1.23 s


In [12]:
cells_tables[0].to_pandas().head()

Unnamed: 0,hit_id,ch0,ch1,value
0,1,243,851,0.312499
1,2,108,606,0.280636
2,3,289,717,0.307342
3,4,203,832,0.053623
4,4,203,833,0.263058


In [13]:
cells_tables[0].to_pandas().max()

hit_id    120242.0
ch0         1189.0
ch1         1279.0
value          1.0
dtype: float64

In [14]:
%%time
with h5py.File('test.hdf5', 'r+') as fp:
    
    cells = fp.create_group('cells', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = cells.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)
    event_length = cells.create_dataset('event_length', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)

    # Individual hits
    hit_id = cells.create_dataset('hit_id', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)
    ch0 = cells.create_dataset('ch0', (0,), maxshape=(None,), dtype='i2', **HDF5_DATASET_KWARGS)
    ch1 = cells.create_dataset('ch1', (0,), maxshape=(None,), dtype='i2', **HDF5_DATASET_KWARGS)
    value = cells.create_dataset('value', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)

    current_event_offset = 0
    for table in cells_tables:
        N_hits = table.shape[0]
        
        for group in [event_offset, event_length]:
            group.resize(group.shape[0] + 1, axis=0)
        event_length[-1] = N_hits
        event_offset[-1] = current_event_offset
        current_event_offset += N_hits

        for group in [hit_id, ch0, ch1, value]:
            group.resize(group.shape[0] + N_hits, axis=0)
        hit_id[-N_hits:] = table['hit_id']
        ch0[-N_hits:] = table['ch0']
        ch1[-N_hits:] = table['ch1']
        value[-N_hits:] = table['value']

CPU times: user 733 ms, sys: 233 ms, total: 966 ms
Wall time: 1.11 s


In [15]:
# Validation
with h5py.File('test.hdf5', 'r') as f:

    all_cells_table = pa.concat_tables(cells_tables)

    for feature in ['hit_id', 'ch0', 'ch1', 'value']:
        assert np.allclose(f['cells'][feature][:], all_cells_table[feature].to_numpy())

    assert all(f['cells/event_offset'][1:] - f['cells/event_offset'][:-1] == f['cells/event_length'][:-1])

    for idx in range(len(cells_tables)):
        offset = f['cells/event_offset'][idx]
        length = f['cells/event_length'][idx]
        for feature in ['hit_id', 'ch0', 'ch1', 'value']:
            assert np.allclose(f['cells'][feature][offset:offset+length], cells_tables[idx][feature].to_numpy())

## Write particles

In [16]:
particles_files = list(Path(DATA_ROOT).glob('event?????????-particles.csv.gz'))
assert(len(particles_files) == len(hits_files))

In [17]:
%%time
particles_tables = [csv.read_csv(f) for f in particles_files]

CPU times: user 490 ms, sys: 37 ms, total: 527 ms
Wall time: 404 ms


In [18]:
particles_tables[0].to_pandas().head()

Unnamed: 0,particle_id,particle_type,vx,vy,vz,px,py,pz,q,nhits
0,4503668346847232,-13,-0.011938,-0.0241,1.23557,-43.9463,-27.2878,-391.87,1,14
1,4503874505277440,-211,-0.011938,-0.0241,1.23557,-2.14656,-0.544226,11.9026,-1,2
2,4503943224754176,211,-0.011938,-0.0241,1.23557,-0.197988,-0.004153,4.01512,1,12
3,4504011944230912,321,-0.011938,-0.0241,1.23557,0.322507,-0.703132,8.19269,1,10
4,4504080663707648,-211,-0.011938,-0.0241,1.23557,0.900248,0.886279,10.1678,-1,13


In [19]:
particles_tables[0].to_pandas().max()

particle_id      8.737297e+17
particle_type    2.212000e+03
vx               9.239790e+02
vy               9.246850e+02
vz               2.951500e+03
px               1.367670e+01
py               1.224290e+01
pz               1.090690e+02
q                1.000000e+00
nhits            2.200000e+01
dtype: float64

In [20]:
%%time
with h5py.File('test.hdf5', 'r+') as fp:
    
    particles = fp.create_group('particles', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = particles.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)
    event_length = particles.create_dataset('event_length', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)

    # Individual particles
    particle_id = particles.create_dataset('particle_id', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)
    particle_type = particles.create_dataset('particle_type', (0,), maxshape=(None,), dtype='i2', **HDF5_DATASET_KWARGS)
    vx = particles.create_dataset('vx', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    vy = particles.create_dataset('vy', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    vz = particles.create_dataset('vz', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    px = particles.create_dataset('px', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    py = particles.create_dataset('py', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    pz = particles.create_dataset('pz', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    q = particles.create_dataset('q', (0,), maxshape=(None,), dtype='i1', **HDF5_DATASET_KWARGS)
    nhits = particles.create_dataset('nhits', (0,), maxshape=(None,), dtype='i1', **HDF5_DATASET_KWARGS)
    
    current_event_offset = 0
    for table in particles_tables:
        N_particles = table.shape[0]
        
        for group in [event_offset, event_length]:
            group.resize(group.shape[0] + 1, axis=0)
        event_length[-1] = N_particles
        event_offset[-1] = current_event_offset
        current_event_offset += N_particles

        for group in [particle_id, particle_type, vx, vy, vz, px, py, pz, q, nhits]:
            group.resize(group.shape[0] + N_particles, axis=0)
        particle_id[-N_particles:] = table['particle_id']
        particle_type[-N_particles:] = table['particle_type']
        vx[-N_particles:] = table['vx']
        vy[-N_particles:] = table['vy']
        vz[-N_particles:] = table['vz']
        px[-N_particles:] = table['px']
        py[-N_particles:] = table['py']
        pz[-N_particles:] = table['pz']
        q[-N_particles:] = table['q']
        nhits[ -N_particles:] = table[ 'nhits']

CPU times: user 84.3 ms, sys: 17.9 ms, total: 102 ms
Wall time: 101 ms


In [21]:
# Validation
with h5py.File('test.hdf5', 'r') as f:

    all_particles_table = pa.concat_tables(particles_tables)

    for feature in ['particle_id', 'particle_type', 'vx', 'vy', 'vz', 'px', 'py', 'pz', 'q', 'nhits']:
        assert np.allclose(f['particles'][feature][:], all_particles_table[feature].to_numpy())

    assert all(f['particles/event_offset'][1:] - f['particles/event_offset'][:-1] == f['particles/event_length'][:-1])

    for idx in range(len(particles_tables)):
        offset = f['particles/event_offset'][idx]
        length = f['particles/event_length'][idx]
        for feature in ['particle_id', 'particle_type', 'vx', 'vy', 'vz', 'px', 'py', 'pz', 'q', 'nhits']:
            assert np.allclose(f['particles'][feature][offset:offset+length], particles_tables[idx][feature].to_numpy())

## Write truth

In [22]:
truth_files = list(Path(DATA_ROOT).glob('event?????????-truth.csv.gz'))
assert(len(truth_files) == len(hits_files))

In [23]:
%%time
truth_tables = [csv.read_csv(f) for f in truth_files]

CPU times: user 3.32 s, sys: 139 ms, total: 3.46 s
Wall time: 1.64 s


In [24]:
truth_tables[0].to_pandas().head()

Unnamed: 0,hit_id,particle_id,tx,ty,tz,tpx,tpy,tpz,weight
0,1,657527538460917760,-57.918,2.48939,-1502.5,-0.452228,0.038489,-11.6419,1.1e-05
1,2,252204808948154368,-75.0581,-7.55168,-1502.5,-0.879625,-0.115445,-17.7146,1.5e-05
2,3,837678807820271616,-59.5603,-7.32468,-1502.5,0.155798,-0.000566,-3.72562,1.2e-05
3,4,265717257097707520,-60.2607,2.00153,-1502.5,-0.321307,0.028965,-7.84429,1.2e-05
4,5,256713012780466176,-66.1137,-6.67106,-1502.5,-0.787132,-0.06311,-18.2504,1.5e-05


In [25]:
truth_tables[0].to_pandas().max()

hit_id         9.830200e+04
particle_id    8.917390e+17
tx             1.024690e+03
ty             1.024730e+03
tz             2.955500e+03
tpx            9.999910e+05
tpy            9.999470e+05
tpz            1.000000e+06
weight         2.363910e-04
dtype: float64

In [26]:
%%time
with h5py.File('test.hdf5', 'r+') as fp:
    
    truth = fp.create_group('truth', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = truth.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)
    event_length = truth.create_dataset('event_length', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)

    # Individual hits
    hit_id = truth.create_dataset('hit_id', (0,), maxshape=(None,), dtype='i4', **HDF5_DATASET_KWARGS)
    particle_id = truth.create_dataset('particle_id', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)
    tx = truth.create_dataset('tx', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    ty = truth.create_dataset('ty', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    tz = truth.create_dataset('tz', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    tpx = truth.create_dataset('tpx', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    tpy = truth.create_dataset('tpy', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    tpz = truth.create_dataset('tpz', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    weight = truth.create_dataset('weight', (0,), maxshape=(None,), dtype='f4', **HDF5_DATASET_KWARGS)
    
    current_event_offset = 0
    for table in truth_tables:
        N_hits = table.shape[0]
        
        for group in [event_offset, event_length]:
            group.resize(group.shape[0] + 1, axis=0)
        event_length[-1] = N_hits
        event_offset[-1] = current_event_offset
        current_event_offset += N_hits

        for group in [hit_id, particle_id, tx, ty, tz, tpx, tpy, tpz, weight]:
            group.resize(group.shape[0] + N_hits, axis=0)
        hit_id[-N_hits:] = table['hit_id']
        particle_id[-N_hits:] = table['particle_id']
        tx[-N_hits:] = table['tx']
        ty[-N_hits:] = table['ty']
        tz[-N_hits:] = table['tz']
        tpx[-N_hits:] = table['tpx']
        tpy[-N_hits:] = table['tpy']
        tpz[-N_hits:] = table['tpz']
        weight[-N_hits:] = table['weight']

CPU times: user 743 ms, sys: 160 ms, total: 903 ms
Wall time: 984 ms


In [27]:
# Validation
with h5py.File('test.hdf5', 'r') as f:

    all_truth_table = pa.concat_tables(truth_tables)

    for feature in ['hit_id', 'particle_id', 'tx', 'ty', 'tz', 'tpx', 'tpy', 'tpz', 'weight']:
        assert np.allclose(f['truth'][feature][:], all_truth_table[feature].to_numpy())

    assert all(f['truth/event_offset'][1:] - f['truth/event_offset'][:-1] == f['truth/event_length'][:-1])

    for idx in range(len(truth_tables)):
        offset = f['truth/event_offset'][idx]
        length = f['truth/event_length'][idx]
        for feature in ['hit_id', 'particle_id', 'tx', 'ty', 'tz', 'tpx', 'tpy', 'tpz', 'weight']:
            assert np.allclose(f['truth'][feature][offset:offset+length], truth_tables[idx][feature].to_numpy())

## Check the resulting file

In [28]:
f = h5py.File('test.hdf5', 'r')

In [29]:
f.keys()

<KeysViewHDF5 ['hits', 'cells', 'particles', 'truth']>

In [30]:
f['hits'].keys()

<KeysViewHDF5 ['event_offset', 'event_length', 'hit_id', 'x', 'y', 'z', 'volume_id', 'layer_id', 'module_id']>

In [31]:
f['cells'].keys()

<KeysViewHDF5 ['event_offset', 'event_length', 'hit_id', 'ch0', 'ch1', 'value']>

In [32]:
f['particles'].keys()

<KeysViewHDF5 ['event_offset', 'event_length', 'particle_id', 'particle_type', 'vx', 'vy', 'vz', 'px', 'py', 'pz', 'q', 'nhits']>

In [33]:
f['truth'].keys()

<KeysViewHDF5 ['event_offset', 'event_length', 'hit_id', 'particle_id', 'tx', 'ty', 'tz', 'tpx', 'tpy', 'tpz', 'weight']>

In [34]:
f['hits/hit_id'].shape

(5467050,)

In [35]:
f['cells/hit_id'].shape

(18788818,)

In [36]:
f['particles/particle_id'].shape

(538589,)

In [37]:
f['truth/hit_id'].shape

(5467050,)

In [38]:
f.close()

## Delete the test file

In [39]:
! rm test.hdf5