# Writing a hepfile from Scratch

This tutorial builds a hepfile step by step, which is what the `hepfile.dict_tools.dictlike_to_hepfile` function does under the hood. Working this way may be preferable for users who want more control over the structure of their hepfile.

In [1]:
# imports
import time
import numpy as np
import sys

from hepfile import write as writer
from hepfile import load

Once hepfile.write is imported, we need to start by initializing the empty data structure.

In [2]:
# remember when we started so the time needed to write the file can be reported later
start = time.time()
# the empty data structure that groups and datasets are added to below
data = writer.initialize()

Now that the empty data structure is initialized, we must set up groups and datasets. Groups hold datasets and datasets hold data about that group. You can also just create higher level datasets that are considered "singletons" which do not correspond to a specific group.

In [3]:
# declare each group together with the name of its counter
for group_name, counter_name in (('jet', 'njet'), ('muons', 'nmuon')):
    writer.create_group(data, group_name, counter=counter_name)

# attach datasets to the groups; each call sets one dtype for all the names it lists
writer.create_dataset(data, ['e', 'px', 'py', 'pz'], group='jet', dtype=float)
writer.create_dataset(data, ['algorithm'], group='jet', dtype=int)
writer.create_dataset(data, ['words'], group='jet', dtype=str)
writer.create_dataset(data, ['e', 'px', 'py', 'pz'], group='muons', dtype=float)

# datasets created without a group are the higher level "singletons"
writer.create_dataset(data, ['METpx', 'METpy'], dtype=float)

# add 75 more singletons (test0 ... test74) to make sure we go over the 256 char limit
n_extra_singletons = 75
for idx in range(n_extra_singletons):
    writer.create_dataset(data, f'test{idx}', dtype=str)

To start adding data to the groups and datasets we have to generate an empty bucket.

In [4]:
# an empty bucket built from `data`; it is filled and packed once per entry below
bucket = writer.create_single_bucket(data)

Now that we have the hepfile structure set up, we can generate random data and insert it into the hepfile.

In [5]:
# pool of strings the 'jet/words' dataset is drawn from
rando_words = ["hi", "bye", "ciao", "aloha"]

# every bucket gets the same, fixed number of jets
njet = 17

for i in range(10000):

    # the counter must match the number of values appended to each jet dataset
    bucket['jet/njet'] = njet

    for _ in range(njet):
        bucket['jet/e'].append(np.random.random())
        bucket['jet/px'].append(np.random.random())
        bucket['jet/py'].append(np.random.random())
        bucket['jet/pz'].append(np.random.random())

        # randint excludes the upper bound, so this yields -1 or 0
        bucket['jet/algorithm'].append(np.random.randint(-1,1))

        bucket['jet/words'].append(np.random.choice(rando_words))

    # singleton datasets hold one value per bucket
    bucket['METpx'] = np.random.random()
    bucket['METpy'] = np.random.random()

    # NOTE: nothing is written to the 'muons' group or the test* singletons here

    # hand the filled bucket over to `data`; a non-zero return value is treated as a failure
    return_value = writer.pack(data,bucket,STRICT_CHECKING=True)
    if return_value != 0:
        # sys.exit (rather than the interactive-only exit()) stops the run and
        # reports which bucket failed, with a non-zero exit status
        sys.exit(f'writer.pack returned {return_value} for bucket {i}; aborting')

Finally, we are ready to write this random data to a file called `output_from_scratch.hdf5`!

In [6]:
print("Writing the file...")
# gzip compression with comp_opts=9; verbose=True prints progress for each dataset
hdfile = writer.write_to_file(
    'output_from_scratch.hdf5',
    data,
    comp_type='gzip',
    comp_opts=9,
    verbose=True,
)

Writing the file...
Writing _SINGLETONS_GROUP_/COUNTER to file
	Converting list to array...
	Writing to file...
Writing to file _SINGLETONS_GROUP_/COUNTER as type <class 'int'>
Writing METpx to file
	Converting list to array...
	Converting array to single precision...
	Writing to file...
Writing to file METpx as type <class 'numpy.float32'>
Writing METpy to file
	Converting list to array...
	Converting array to single precision...
	Writing to file...
Writing to file METpy as type <class 'numpy.float32'>
Writing test0 to file
	Converting list to array...
	Converting array to single precision...
	Writing to file...
Writing to file test0 as type <class 'numpy.float32'>
Writing test1 to file
	Converting list to array...
	Converting array to single precision...
	Writing to file...
Writing to file test1 as type <class 'numpy.float32'>
Writing test2 to file
	Converting list to array...
	Converting array to single precision...
	Writing to file...
Writing to file test2 as type <class 'numpy.flo

In [7]:
# report how long it took to build and write the file, measured from `start`
write_time = time.time()
print(f'Time it took to write: {write_time-start}')

Time it took to write: 2.157505750656128


In [8]:
# read the file back in; load() returns two objects, which replace the `data` and `bucket` used for writing
data, bucket = load('output_from_scratch.hdf5')

Building the indices...

Built the indices!
Data is read in and input file is closed.


In [9]:
# time elapsed since the write finished, reported as the time it took to read the file back
read_time = time.time()
print(f'Time it took to read: {read_time-write_time}')

Time it took to read: 0.0899360179901123


In [10]:
# list every key of the loaded data: bookkeeping entries, counters and the datasets themselves
data.keys()

dict_keys(['_MAP_DATASETS_TO_COUNTERS_', '_MAP_DATASETS_TO_INDEX_', '_LIST_OF_COUNTERS_', '_LIST_OF_DATASETS_', '_META_', '_NUMBER_OF_BUCKETS_', '_SINGLETONS_GROUP_', '_SINGLETONS_GROUP_/COUNTER', '_SINGLETONS_GROUP_/COUNTER_INDEX', 'jet/njet', 'jet/njet_INDEX', 'muons/nmuon', 'muons/nmuon_INDEX', 'METpx', 'METpy', 'jet/algorithm', 'jet/e', 'jet/px', 'jet/py', 'jet/pz', 'jet/words', 'muons/e', 'muons/px', 'muons/py', 'muons/pz', 'test0', 'test1', 'test10', 'test11', 'test12', 'test13', 'test14', 'test15', 'test16', 'test17', 'test18', 'test19', 'test2', 'test20', 'test21', 'test22', 'test23', 'test24', 'test25', 'test26', 'test27', 'test28', 'test29', 'test3', 'test30', 'test31', 'test32', 'test33', 'test34', 'test35', 'test36', 'test37', 'test38', 'test39', 'test4', 'test40', 'test41', 'test42', 'test43', 'test44', 'test45', 'test46', 'test47', 'test48', 'test49', 'test5', 'test50', 'test51', 'test52', 'test53', 'test54', 'test55', 'test56', 'test57', 'test58', 'test59', 'test6', 'tes

In [11]:
# names of the datasets that were created without a group (the singletons)
data['_SINGLETONS_GROUP_']

array(['METpx', 'METpy', 'test0', 'test1', 'test2', 'test3', 'test4',
       'test5', 'test6', 'test7', 'test8', 'test9', 'test10', 'test11',
       'test12', 'test13', 'test14', 'test15', 'test16', 'test17',
       'test18', 'test19', 'test20', 'test21', 'test22', 'test23',
       'test24', 'test25', 'test26', 'test27', 'test28', 'test29',
       'test30', 'test31', 'test32', 'test33', 'test34', 'test35',
       'test36', 'test37', 'test38', 'test39', 'test40', 'test41',
       'test42', 'test43', 'test44', 'test45', 'test46', 'test47',
       'test48', 'test49', 'test50', 'test51', 'test52', 'test53',
       'test54', 'test55', 'test56', 'test57', 'test58', 'test59',
       'test60', 'test61', 'test62', 'test63', 'test64', 'test65',
       'test66', 'test67', 'test68', 'test69', 'test70', 'test71',
       'test72', 'test73', 'test74'], dtype='<U6')