In [None]:
import awkward as ak
import numpy as np

import hepfile

# uproot and coffea are only needed to read in a ROOT file and pull out the
# entries used for testing the writing. If you don't have them installed, run:
# !pip install uproot coffea

import uproot


In [None]:
# Down load a file for us to play with
!curl http://opendata.cern.ch/record/12361/files/SMHiggsToZZTo4L.root --output SMHiggsToZZTo4L.root

In [None]:
# Open the downloaded ROOT file directly with uproot.
# NOTE(review): `f` is not referenced again in the visible cells. The events
# are loaded through NanoEventsFactory in the following cell instead, which
# makes awkward arrays that are more like what we expect others to work with.
f = uproot.open('SMHiggsToZZTo4L.root')

In [None]:
# NanoEventsFactory is provided by coffea (see the install note in the first
# cell). Import it here so this cell runs on a fresh kernel.
from coffea.nanoevents import NanoEventsFactory

# Read in the ROOT file
# There will be a bunch of warnings but don't worry about it
nano_events = NanoEventsFactory.from_root('SMHiggsToZZTo4L.root').events()

# Turn the NanoEvents object into an awkward array
events = ak.Array(nano_events)

In [None]:
# events is an awkward array; display the names of its top-level fields
events.fields

In [None]:
# Walk every top-level field and list the sub-fields it contains,
# indenting the sub-field names under their parent
for group in events.fields:
    print(group)
    subfields = events[group].fields
    for name in subfields:
        print(f"\t{name}")

In [None]:
# Inspect the Muon collection: Python types first, then sizes
muons = events['Muon']

print(type(muons))
print(type(muons['pt']))
print()

# Number of events (length along the outermost axis)
print(ak.num(muons, axis=0))

# Number of muons in each event
muons_per_event = ak.num(muons, axis=1)
print(muons_per_event)

# Number of muons in total
print(ak.sum(muons_per_event))

In [None]:
def _get_awkward_type(ak_array: ak.Array) -> type:
    """Work out the element data type of an awkward array.

    Args:
        ak_array: The array to inspect. Its first element is examined to
            decide whether the array is nested (first element is itself an
            ``ak.Record`` or ``ak.Array``) or flat.

    Returns:
        ``str`` when the resolved numpy dtype is a unicode string type,
        otherwise the ``np.dtype`` describing the array's elements.

    Raises:
        IOError: If anything goes wrong while resolving the type (including
            an empty array, where ``ak_array[0]`` cannot be read). The
            original exception is chained as the cause.
    """
    try:
        if isinstance(ak_array[0], (ak.Record, ak.Array)):
            # Nested input: take the type of the content one level down.
            inner_type = ak_array.type.content
            if isinstance(inner_type, ak.types.NumpyType):
                dtype = inner_type.primitive
            else:
                # Fall back to the type's string form and keep whatever
                # follows the last "*" (e.g. "var * float32" -> "float32").
                dtype = str(inner_type).rsplit("*", maxsplit=1)[-1].strip()
        else:
            # Flat input: let numpy infer the dtype.
            dtype = np.array(ak_array).dtype

        # "string" is not a name numpy understands, so map it onto a numpy
        # unicode dtype before building the np.dtype below.
        if dtype == "string":
            dtype = np.dtype("<U1")

        np_dtype = np.dtype(dtype)
        if np_dtype.char == "U":
            # Unicode data is reported as the builtin str type.
            np_dtype = str

    except Exception as exc:
        raise IOError("Cannot convert input value to a numpy data type!") from exc

    return np_dtype

In [None]:
# This is expecting something like the Muon group from events
# But we could change it to perhaps be a list of awkward arrays

def pack_single_awkward_array(d, arr, group_name):
    """Pack one group of awkward arrays into a hepfile data dictionary.

    ``d`` is modified in place: a group entry, a counter and one dataset per
    field of ``arr`` are registered in its bookkeeping maps
    (``_GROUPS_``, ``_MAP_DATASETS_TO_DATA_TYPES_``,
    ``_MAP_DATASETS_TO_COUNTERS_``, ``_LIST_OF_COUNTERS_``) and the flattened
    values are stored under ``"<group_name>/<field>"``.

    Args:
        d: Data dictionary that was already initialized (it must contain the
            bookkeeping keys listed above).
        arr: Awkward array with named fields, e.g. the Muon group from events.
        group_name: Name of the group to create in ``d``.

    Returns:
        None. The dictionary is mutated, so nothing needs to be returned.

    Raises:
        ValueError: If ``arr`` has no fields. This is checked before ``d`` is
            touched so a failed call leaves the dictionary unchanged.
    """
    if not arr.fields:
        raise ValueError(
            f"Cannot pack group '{group_name}': the awkward array has no fields"
        )

    # Map numpy dtype "kind" codes onto the Python types stored in the type
    # map. Kinds not listed here fall back to the numpy dtype itself.
    kind_to_type = {'f': float, 'i': int, 'u': int, 'U': str, 'S': str}

    # The group's member list starts with the bare counter name ...
    counter_name = f"n{group_name}"
    d['_GROUPS_'][group_name] = [counter_name]

    # ... while every other map refers to the counter by its full path.
    counter = f"{group_name}/{counter_name}"
    d['_MAP_DATASETS_TO_DATA_TYPES_'][counter] = int

    d['_MAP_DATASETS_TO_COUNTERS_'][group_name] = counter
    d['_LIST_OF_COUNTERS_'].append(counter)

    for field in arr.fields:

        # build a name for the hepfile entry
        dataset_name = f"{group_name}/{field}"
        print(dataset_name)

        # Get the values
        x = arr[field]

        # _get_awkward_type returns either `str` or a numpy dtype
        dtype = _get_awkward_type(x)

        if x.ndim == 1:
            # Not jagged: exactly one value per entry
            x = ak.to_numpy(x)
            num = np.ones(len(x), dtype=int)
        else:
            # Jagged: remember how many values each entry has, then flatten
            num = ak.num(x)
            x = ak.flatten(x).to_numpy()

        d[dataset_name] = x

        # np.dtype() accepts both `str` and numpy dtypes, so the kind lookup
        # works for everything _get_awkward_type can return.
        d['_MAP_DATASETS_TO_DATA_TYPES_'][dataset_name] = kind_to_type.get(
            np.dtype(dtype).kind, dtype
        )

        d['_MAP_DATASETS_TO_COUNTERS_'][dataset_name] = counter
        d['_GROUPS_'][group_name].append(field)

    # The counter is recomputed for every field; the value from the last
    # field is the one that is stored.
    d[counter] = num
    # We don't need to return the dictionary because in python
    # dictionaries are mutable


# Start from an empty hepfile data dictionary
data = hepfile.initialize()

# Pack each group of awkward arrays in turn; `data` is modified in place
# inside pack_single_awkward_array
for source_field, target_group in (
    ('Muon', 'muon'),
    ('Electron', 'electron'),
    ('MET', 'MET'),
):
    pack_single_awkward_array(data, events[source_field], group_name=target_group)


In [None]:
# Uncomment if you want to see what the data dictionary looks like
#data

In [None]:
# Write the packed data dictionary out to 'awkward_write_test.h5',
# requesting gzip compression with comp_opts=9 and verbose output
hepfile.write_to_file('awkward_write_test.h5', data, verbose=True, comp_type="gzip", comp_opts=9)

# Scratch code

Just a bunch of test code when I was trying to figure this all out. 

In [None]:
# Scratch version of the packing logic: flatten every field of every group
# in events into a plain dict keyed by "<group>/<field>".
d = {}
groups_to_datasets = {}

# One counter name ("n<group>") is collected per group
counters = []

for field in events.fields:
    
    print(field)
    
    # Placeholder entry for the group itself
    d[field] = []
    groups_to_datasets[field] = []
    
    counters.append(f'n{field}')
    
    for v in events[field].fields:
        groups_to_datasets[field].append(v)
        
        key = f"{field}/{v}"
        
        x = events[field][v]
        
        #print(v)
        
        # Show the field name alongside its number of dimensions
        print(f"\t{v}   {x.ndim}")
        
        # NOTE(review): dtype is assigned in both branches but never read
        # within this cell.
        if x.ndim==1:
            dtype = x.layout.format
            x = ak.to_numpy(x)

        else:
            dtype = x.layout.content.format
            x = ak.flatten(x).to_numpy()


        d[key] = x

In [None]:
# Long listing of the working directory sorted by modification time, newest last
!ls -ltr

In [None]:
# Count the muon pt values in each event
ak.num(events['Muon']['pt'])

In [None]:
# Display the luminosityBlock field of events
events.luminosityBlock

In [None]:
# Display the scratch dictionary built by the loop cell above
d

In [None]:
# Compare the number of dimensions of the MET pt and Muon pt arrays
x = events['MET']['pt']

print(x.ndim)
print(events['Muon']['pt'].ndim)
print(events['MET']['pt'].ndim)

In [None]:
# Keep the layout of the Electron pt array for inspection in the next cell
#x.layout
layout = events['Electron']['pt'].layout

In [None]:
# Display the format attribute of the content nested inside the Electron pt layout
layout.content.format

In [None]:
# Display the format attribute read directly from the MET pt layout
# (no .content step, unlike the Electron pt cell above)
x = events['MET']['pt']

x.layout.format

In [None]:
# Display the full layout object of the MET pt array
x = events['MET']['pt']

x.layout

In [None]:
x = events['MET']['pt']

# Only arrays with more than one dimension have a nested axis to flatten.
# Check explicitly instead of swallowing every exception with a bare except.
if x.ndim > 1:
    x = ak.flatten(x)
x = ak.to_numpy(x)

print(type(x))

In [None]:
# Start again from a freshly initialized data dictionary and display it.
# NOTE(review): this rebinds `data`, so the dictionary packed earlier in the
# notebook is no longer reachable under that name.
data = hepfile.initialize()
data

In [None]:
# Register a 'muon' group with counter 'nmuon' and three float datasets in it
hepfile.create_group(data,group_name='muon',counter='nmuon')
hepfile.create_dataset(data,group='muon',datasets=['px','py','pz'],dtype=float)

# Register an int dataset without passing a group
hepfile.create_dataset(data,datasets=['luminosity_block'],dtype=int)

In [None]:
# Display the data dictionary after the group and datasets were created
data

In [None]:
#d.keys()
# Print one key of the scratch dictionary per line
for key in d:
    print(f'{key}')

In [None]:
# Display the '_GROUPS_' entry of the data dictionary
data['_GROUPS_']

In [None]:
# Reset `data` to a freshly initialized dictionary once more and display it
data = hepfile.initialize()
data

In [None]:
# Register the muon group, its three float datasets and an int dataset
# that is created without a group
hepfile.create_group(data,group_name='muon',counter='nmuon')
hepfile.create_dataset(data,group='muon',datasets=['px','py','pz'],dtype=float)

hepfile.create_dataset(data,datasets=['luminosity_block'],dtype=int)

# Dump every entry of the data dictionary, separated by blank lines
for key, value in data.items():
    print(key)
    print(value)
    print()