In [7]:
from pathlib import Path
import h5py
from pyarrow import csv, ArrowInvalid
from tqdm import tqdm
import logging

## Set paths

In [2]:
# Input/output locations -- adjust these to match your environment.
DATA_ROOT = Path('/home/ucloud/data')  # directory searched for the event*.csv.gz inputs
OUTPUT_HDF5_FILE = Path('../data/TrackML/training-small')  # HDF5 file produced by the cells below

# Keyword arguments forwarded to h5py when creating the file/groups ...
HDF5_GROUP_KWARGS = dict(track_order=True)
# ... and when creating every dataset.
HDF5_DATASET_KWARGS = dict(
    fletcher32=True,
    chunks=True,
    compression='lzf',
)
# Use GZIP instead if the filesystem is slow:
#HDF5_DATASET_KWARGS = {'fletcher32': True, 'chunks': True, 'compression': 'gzip', 'compression_opts': 9}

## Handle corrupted or missing files

In [3]:
# Set to False to keep the files on disk (nothing is touched in that case).
DELETE_INVALID_FILES = True
# Files listed as invalid. Every CSV belonging to the same event is removed,
# not only the listed file, so that the per-table file lists stay aligned.
INVALID_FILES = [
    'event000022500-cells.csv.gz',
    'event000023450-cells.csv.gz',
    'event000023161-truth.csv.gz',
    'event000023157-cells.csv.gz',
]

if DELETE_INVALID_FILES:
    for file in INVALID_FILES:
        # 'event000022500-cells.csv.gz' -> 'event000022500'
        event_prefix = file.split('-', 1)[0]
        # Materialise the matches first so the directory is not modified
        # while it is still being scanned.
        for related_file in sorted(DATA_ROOT.glob(f'{event_prefix}-*.csv.gz')):
            try:
                related_file.unlink()
            except FileNotFoundError:
                # Already gone (e.g. removed by a previous run): nothing to do.
                continue
            logging.warning(f'Deleted {related_file} (event listed in INVALID_FILES)')

## Check the number of events

In [4]:
# `Path.glob` yields entries in an unspecified order. The tables written below
# are matched by position (event i of 'hits' must be event i of 'cells', ...),
# so every list is sorted to guarantee a consistent event order.
hits_files = sorted(DATA_ROOT.glob('event?????????-hits.csv.gz'))
cells_files = sorted(DATA_ROOT.glob('event?????????-cells.csv.gz'))
particles_files = sorted(DATA_ROOT.glob('event?????????-particles.csv.gz'))
truth_files = sorted(DATA_ROOT.glob('event?????????-truth.csv.gz'))


def _event_ids(paths):
    """Return the 'eventNNNNNNNNN' prefix of every file name in `paths`."""
    return [path.name.split('-', 1)[0] for path in paths]


assert len(hits_files) == len(cells_files) == len(particles_files) == len(truth_files), (
    f'Mismatching number of files: {len(hits_files)} hits, {len(cells_files)} cells, '
    f'{len(particles_files)} particles, {len(truth_files)} truth'
)
# Same count is not enough: the i-th file of every list must belong to the same event.
assert (
    _event_ids(hits_files)
    == _event_ids(cells_files)
    == _event_ids(particles_files)
    == _event_ids(truth_files)
), 'The hits/cells/particles/truth file lists do not cover the same events'

## Write hits

In [5]:
# Ensure the directory that will hold the HDF5 file exists (no error if it already does).
OUTPUT_HDF5_FILE.parent.mkdir(exist_ok=True, parents=True)

In [8]:
%%time

# Create the output file (truncating any existing one) and fill the 'hits' group.
# Layout: one flat, growable dataset per CSV column, plus `event_offset` /
# `event_length`, which give the slice [offset, offset + length) of each event.
with h5py.File(OUTPUT_HDF5_FILE, 'w', **HDF5_GROUP_KWARGS) as fp:

    hits = fp.create_group('hits', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = hits.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)
    event_length = hits.create_dataset('event_length', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)

    # Individual hits: (CSV column, HDF5 dtype), created in this order
    hit_columns = {
        name: hits.create_dataset(name, (0,), maxshape=(None,), dtype=dtype, **HDF5_DATASET_KWARGS)
        for name, dtype in [
            ('hit_id', 'i4'),
            ('x', 'f4'),
            ('y', 'f4'),
            ('z', 'f4'),
            ('volume_id', 'i1'),
            ('layer_id', 'i1'),
            ('module_id', 'i2'),
        ]
    }

    current_event_offset = 0
    for file in tqdm(hits_files):
        try:
            table = csv.read_csv(file)
        except ArrowInvalid as ex:
            logging.warning(f'Invalid file {file}: {ex}')
            continue
        N_hits = table.shape[0]

        # Append one entry to the event-level datasets.
        event_index = event_offset.shape[0]
        for dataset in (event_offset, event_length):
            dataset.resize(event_index + 1, axis=0)
        event_length[event_index] = N_hits
        event_offset[event_index] = current_event_offset

        # Append the rows using an explicit start index. A `[-N_hits:]` slice
        # would select the *whole* dataset when N_hits == 0.
        if N_hits > 0:
            for name, dataset in hit_columns.items():
                dataset.resize(current_event_offset + N_hits, axis=0)
                dataset[current_event_offset:] = table[name]
        current_event_offset += N_hits

    # Count the events actually written: unreadable files are skipped above,
    # so this can be smaller than len(hits_files).
    fp.attrs['number_of_events'] = event_offset.shape[0]

100%|██████████| 900/900 [00:58<00:00, 15.30it/s]

CPU times: user 1min 34s, sys: 10.9 s, total: 1min 45s
Wall time: 58.9 s





In [None]:
## Write cells

In [9]:
%%time

# Add the 'cells' group to the existing output file.
# Layout: one flat, growable dataset per CSV column, plus `event_offset` /
# `event_length`, which give the slice [offset, offset + length) of each event.
with h5py.File(OUTPUT_HDF5_FILE, 'r+') as fp:

    # Drop a group left over from a previous run so this cell can be re-executed
    # (create_group fails if the name already exists).
    if 'cells' in fp:
        del fp['cells']
    cells = fp.create_group('cells', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = cells.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)
    event_length = cells.create_dataset('event_length', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)

    # Individual cells: (CSV column, HDF5 dtype), created in this order
    cell_columns = {
        name: cells.create_dataset(name, (0,), maxshape=(None,), dtype=dtype, **HDF5_DATASET_KWARGS)
        for name, dtype in [
            ('hit_id', 'i4'),
            ('ch0', 'i2'),
            ('ch1', 'i2'),
            ('value', 'f4'),
        ]
    }

    current_event_offset = 0
    for file in tqdm(cells_files):
        try:
            table = csv.read_csv(file)
        except ArrowInvalid as ex:
            logging.warning(f'Invalid file {file}: {ex}')
            continue
        N_cells = table.shape[0]

        # Append one entry to the event-level datasets.
        event_index = event_offset.shape[0]
        for dataset in (event_offset, event_length):
            dataset.resize(event_index + 1, axis=0)
        event_length[event_index] = N_cells
        event_offset[event_index] = current_event_offset

        # Append the rows using an explicit start index. A `[-N_cells:]` slice
        # would select the *whole* dataset when N_cells == 0.
        if N_cells > 0:
            for name, dataset in cell_columns.items():
                dataset.resize(current_event_offset + N_cells, axis=0)
                dataset[current_event_offset:] = table[name]
        current_event_offset += N_cells

100%|██████████| 900/900 [01:41<00:00,  8.89it/s]

CPU times: user 2min 48s, sys: 13.4 s, total: 3min 2s
Wall time: 1min 41s





## Write particles

In [10]:
%%time

# Add the 'particles' group to the existing output file.
# Layout: one flat, growable dataset per CSV column, plus `event_offset` /
# `event_length`, which give the slice [offset, offset + length) of each event.
with h5py.File(OUTPUT_HDF5_FILE, 'r+') as fp:

    # Drop a group left over from a previous run so this cell can be re-executed
    # (create_group fails if the name already exists).
    if 'particles' in fp:
        del fp['particles']
    particles = fp.create_group('particles', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = particles.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)
    event_length = particles.create_dataset('event_length', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)

    # Individual particles: (CSV column, HDF5 dtype), created in this order
    particle_columns = {
        name: particles.create_dataset(name, (0,), maxshape=(None,), dtype=dtype, **HDF5_DATASET_KWARGS)
        for name, dtype in [
            ('particle_id', 'i8'),
            ('particle_type', 'i2'),
            ('vx', 'f4'),
            ('vy', 'f4'),
            ('vz', 'f4'),
            ('px', 'f4'),
            ('py', 'f4'),
            ('pz', 'f4'),
            ('q', 'i1'),
            ('nhits', 'i1'),
        ]
    }

    current_event_offset = 0
    for file in tqdm(particles_files):
        try:
            table = csv.read_csv(file)
        except ArrowInvalid as ex:
            logging.warning(f'Invalid file {file}: {ex}')
            continue
        N_particles = table.shape[0]

        # Append one entry to the event-level datasets.
        event_index = event_offset.shape[0]
        for dataset in (event_offset, event_length):
            dataset.resize(event_index + 1, axis=0)
        event_length[event_index] = N_particles
        event_offset[event_index] = current_event_offset

        # Append the rows using an explicit start index. A `[-N_particles:]`
        # slice would select the *whole* dataset when N_particles == 0.
        if N_particles > 0:
            for name, dataset in particle_columns.items():
                dataset.resize(current_event_offset + N_particles, axis=0)
                dataset[current_event_offset:] = table[name]
        current_event_offset += N_particles

100%|██████████| 900/900 [00:26<00:00, 33.45it/s]


CPU times: user 32.8 s, sys: 4.95 s, total: 37.8 s
Wall time: 27 s


## Write truth

In [11]:
%%time

# Add the 'truth' group to the existing output file.
# Layout: one flat, growable dataset per CSV column, plus `event_offset` /
# `event_length`, which give the slice [offset, offset + length) of each event.
with h5py.File(OUTPUT_HDF5_FILE, 'r+') as fp:

    # Drop a group left over from a previous run so this cell can be re-executed
    # (create_group fails if the name already exists).
    if 'truth' in fp:
        del fp['truth']
    truth = fp.create_group('truth', **HDF5_GROUP_KWARGS)

    # Event-level data
    event_offset = truth.create_dataset('event_offset', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)
    event_length = truth.create_dataset('event_length', (0,), maxshape=(None,), dtype='i8', **HDF5_DATASET_KWARGS)

    # Individual truth rows: (CSV column, HDF5 dtype), created in this order
    truth_columns = {
        name: truth.create_dataset(name, (0,), maxshape=(None,), dtype=dtype, **HDF5_DATASET_KWARGS)
        for name, dtype in [
            ('hit_id', 'i4'),
            ('particle_id', 'i8'),
            ('tx', 'f4'),
            ('ty', 'f4'),
            ('tz', 'f4'),
            ('tpx', 'f4'),
            ('tpy', 'f4'),
            ('tpz', 'f4'),
            ('weight', 'f4'),
        ]
    }

    current_event_offset = 0
    for file in tqdm(truth_files):
        try:
            table = csv.read_csv(file)
        except ArrowInvalid as ex:
            logging.warning(f'Invalid file {file}: {ex}')
            continue
        N_hits = table.shape[0]

        # Append one entry to the event-level datasets.
        event_index = event_offset.shape[0]
        for dataset in (event_offset, event_length):
            dataset.resize(event_index + 1, axis=0)
        event_length[event_index] = N_hits
        event_offset[event_index] = current_event_offset

        # Append the rows using an explicit start index. A `[-N_hits:]` slice
        # would select the *whole* dataset when N_hits == 0.
        if N_hits > 0:
            for name, dataset in truth_columns.items():
                dataset.resize(current_event_offset + N_hits, axis=0)
                dataset[current_event_offset:] = table[name]
        current_event_offset += N_hits



100%|██████████| 900/900 [01:52<00:00,  8.01it/s]


CPU times: user 3min 7s, sys: 17.5 s, total: 3min 25s
Wall time: 1min 52s
