In [1]:
import pandas as pd
import numpy as np
import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
from lzt_utils.dataset import LztDataset
from pathlib import Path

Module libc not found.


In [2]:
# Base directory of the Lorenzetti V2.1.0 samples, located under the user's home.
lzt_data = Path.home().joinpath('ext_data', 'lorenzetti', 'V2.1.0')

# Per-sample settings, keyed by the dataset directory name: the colour attached
# to the loaded dataset and the short alias used to index it in `datasets`.
datasets_info = {
    'user.joao.pinto.mc25_13TeV.250401.Pythia8EvtGen_Zee': dict(color='blue', alias='zee'),
    'user.joao.pinto.mc25_13TeV.250401.Pythia8EvtGen_JF17': dict(color='red', alias='jf17'),
}

# Load each sample from its directory and store it under its alias.
datasets = {}
for dir_name in datasets_info:
    info = datasets_info[dir_name]
    ds = LztDataset.from_dir(lzt_data / dir_name)
    ds.color = info['color']
    datasets[info['alias']] = ds
datasets

{'zee': LztDataset(path=/root/ext_data/lorenzetti/V2.1.0/user.joao.pinto.mc25_13TeV.250401.Pythia8EvtGen_Zee, label=Z->ee),
 'jf17': LztDataset(path=/root/ext_data/lorenzetti/V2.1.0/user.joao.pinto.mc25_13TeV.250401.Pythia8EvtGen_JF17, label=Jets et > 17GeV)}

In [4]:
# Arrow schema for one AOD event per table row.
# Every top-level field is a container stored as a list of structs; the struct
# member names are the same names used as dictionary keys by the `*_as_python`
# converter functions defined in this notebook.
AOD_ARROW_SCHEMA = pa.schema([
    # Calorimeter cells; `descriptor_link` is an unsigned 64-bit value
    # (presumably the `hash` of the matching detector descriptor — TODO confirm).
    ('CaloCellContainer_Cells', pa.list_(pa.struct([
        ('descriptor_link', pa.uint64()),
        ('deta', pa.float32()),
        ('dphi', pa.float32()),
        ('e', pa.float32()),
        ('et', pa.float32()),
        ('eta', pa.float32()),
        ('phi', pa.float32()),
        ('tau', pa.float32())
    ]))),
    # Clusters; `cell_links` is a variable-length list of uint64 and
    # `seed_link` is a single int32, every other member is float32.
    ('CaloClusterContainer_Clusters', pa.list_(pa.struct([
        ('cell_links', pa.list_(pa.uint64())),
        ('deta', pa.float32()),
        ('dphi', pa.float32()),
        ('e', pa.float32()),
        ('e0', pa.float32()),
        ('e1', pa.float32()),
        ('e2', pa.float32()),
        ('e233', pa.float32()),
        ('e237', pa.float32()),
        ('e277', pa.float32()),
        ('e2tsts1', pa.float32()),
        ('e3', pa.float32()),
        ('ehad1', pa.float32()),
        ('ehad2', pa.float32()),
        ('ehad3', pa.float32()),
        ('emaxs1', pa.float32()),
        ('emaxs2', pa.float32()),
        ('eratio', pa.float32()),
        ('et', pa.float32()),
        ('eta', pa.float32()),
        ('etot', pa.float32()),
        ('f0', pa.float32()),
        ('f1', pa.float32()),
        ('f2', pa.float32()),
        ('f3', pa.float32()),
        ('fracMax', pa.float32()),
        ('lambdaCenter', pa.float32()),
        ('lateralMom', pa.float32()),
        ('longitudinalMom', pa.float32()),
        ('phi', pa.float32()),
        ('reta', pa.float32()),
        ('rhad', pa.float32()),
        ('rhad1', pa.float32()),
        ('rphi', pa.float32()),
        ('secondLambda', pa.float32()),
        ('secondR', pa.float32()),
        ('seed_link', pa.int32()),
        ('weta2', pa.float32())
    ]))),
    # Detector descriptors; `edep_per_bunch`, `pulse` and `tof` are
    # variable-length float32 lists, `hash` is an unsigned 64-bit value.
    ('CaloDetDescriptorContainer_Cells', pa.list_(pa.struct([
        ('bc_duration', pa.float32()),
        ('bcid_end', pa.int32()),
        ('bcid_start', pa.int32()),
        ('deta', pa.float32()),
        ('detector', pa.int32()),
        ('dphi', pa.float32()),
        ('e', pa.float32()),
        ('edep', pa.float32()),
        ('edep_per_bunch', pa.list_(pa.float32())),
        ('eta', pa.float32()),
        ('hash', pa.uint64()),
        ('phi', pa.float32()),
        ('pulse', pa.list_(pa.float32())),
        ('sampling', pa.int32()),
        ('tau', pa.float32()),
        ('tof', pa.list_(pa.float32())),
        ('z', pa.float32())
    ]))),
    # Rings: an int32 `cluster_link` plus a variable-length float32 list.
    ('CaloRingsContainer_Rings', pa.list_(pa.struct([
        ('cluster_link', pa.int32()),
        ('rings', pa.list_(pa.float32()))
    ]))),
    # Electrons; `isEM` is a variable-length list of booleans.
    ('ElectronContainer_Electrons', pa.list_(pa.struct([
        ('cluster_link', pa.int32()),
        ('e', pa.float32()),
        ('et', pa.float32()),
        ('eta', pa.float32()),
        ('phi', pa.float32()),
        ('isEM', pa.list_(pa.bool_())),
    ]))),
    # Event information.
    # NOTE(review): `eventNumber` and `runNumber` are stored as float32, which
    # represents integers exactly only up to 2**24; presumably this mirrors the
    # type of the source branch — confirm before relying on large identifiers.
    ('EventInfoContainer_Events', pa.list_(pa.struct([
        ('avgmu', pa.float32()),
        ('eventNumber', pa.float32()),
        ('runNumber', pa.float32())
    ]))),
    # Seeds; `id` is int32, the remaining members are float32.
    ('SeedContainer_Seeds', pa.list_(pa.struct([
        ('e', pa.float32()),
        ('et', pa.float32()),
        ('eta', pa.float32()),
        ('id', pa.int32()),
        ('phi', pa.float32())
    ]))),
    # Truth particles; `pdgid` and `seedid` are int32, the rest float32.
    ('TruthParticleContainer_Particles', pa.list_(pa.struct([
        ('e', pa.float32()),
        ('et', pa.float32()),
        ('eta', pa.float32()),
        ('pdgid', pa.int32()),
        ('phi', pa.float32()),
        ('px', pa.float32()),
        ('py', pa.float32()),
        ('pz', pa.float32()),
        ('seedid', pa.int32()),
        ('vx', pa.float32()),
        ('vy', pa.float32()),
        ('vz', pa.float32())
    ])))
])

In [5]:
def EventInfoContainer_as_python(data):
    """Convert event-info entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing ``avgmu``, ``eventNumber`` and ``runNumber``
        attributes.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order, with the
        keys ``avgmu``, ``eventNumber`` and ``runNumber``.
    """
    return [
        {
            'avgmu': entry.avgmu,
            'eventNumber': entry.eventNumber,
            'runNumber': entry.runNumber,
        }
        for entry in data
    ]


def CaloCellContainer_Cells_as_python(data):
    """Convert calorimeter-cell entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing the attributes listed in ``cell_fields`` below.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order; keys follow
        the order of ``cell_fields`` and values are the attribute values
        read from the entry unchanged.
    """
    cell_fields = (
        'descriptor_link',
        'deta',
        'dphi',
        'e',
        'et',
        'eta',
        'phi',
        'tau',
    )
    return [
        {name: getattr(cell, name) for name in cell_fields}
        for cell in data
    ]

def CaloClusterContainer_Clusters_as_python(data):
    """Convert cluster entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing a ``cell_links`` sequence plus the attributes listed
        in ``scalar_fields`` below.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order. The first
        key is ``cell_links`` (converted to a Python list through a
        ``numpy.uint64`` array); the remaining keys follow the order of
        ``scalar_fields`` and hold the attribute values unchanged.
    """
    # Attributes copied as-is. `seed_link` is passed through as a single
    # value; AOD_ARROW_SCHEMA declares it as int32 (not as a list).
    scalar_fields = (
        'deta', 'dphi', 'e', 'e0', 'e1', 'e2', 'e233', 'e237', 'e277',
        'e2tsts1', 'e3', 'ehad1', 'ehad2', 'ehad3', 'emaxs1', 'emaxs2',
        'eratio', 'et', 'eta', 'etot', 'f0', 'f1', 'f2', 'f3', 'fracMax',
        'lambdaCenter', 'lateralMom', 'longitudinalMom', 'phi', 'reta',
        'rhad', 'rhad1', 'rphi', 'secondLambda', 'secondR', 'seed_link',
        'weta2',
    )
    clusters = []
    for cluster in data:
        record = {
            'cell_links': np.array(cluster.cell_links, dtype=np.uint64).tolist(),
        }
        for name in scalar_fields:
            record[name] = getattr(cluster, name)
        clusters.append(record)
    return clusters

def CaloDetDescriptorContainer_Cells_as_python(data):
    """Convert detector-descriptor entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing the attributes read below; ``edep_per_bunch``,
        ``pulse`` and ``tof`` must be convertible with ``numpy.array``.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order. The three
        sequence attributes are turned into Python lists; every other
        attribute value is copied unchanged.
    """
    def _as_list(values):
        # Round-trip through NumPy to obtain a plain Python list.
        return np.array(values).tolist()

    return [
        dict(
            bc_duration=desc.bc_duration,
            bcid_end=desc.bcid_end,
            bcid_start=desc.bcid_start,
            deta=desc.deta,
            detector=desc.detector,
            dphi=desc.dphi,
            e=desc.e,
            edep=desc.edep,
            edep_per_bunch=_as_list(desc.edep_per_bunch),
            eta=desc.eta,
            hash=desc.hash,
            phi=desc.phi,
            pulse=_as_list(desc.pulse),
            sampling=desc.sampling,
            tau=desc.tau,
            tof=_as_list(desc.tof),
            z=desc.z,
        )
        for desc in data
    ]

def CaloRingsContainer_Rings_as_python(data):
    """Convert rings entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing ``cluster_link`` and a ``rings`` sequence that
        ``numpy.array`` can convert.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order, holding
        ``cluster_link`` unchanged and ``rings`` as a Python list.
    """
    converted = []
    for ring_set in data:
        ring_values = np.array(ring_set.rings).tolist()
        converted.append(dict(cluster_link=ring_set.cluster_link, rings=ring_values))
    return converted

def ElectronContainer_Electrons_as_python(data):
    """Convert electron entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing ``cluster_link``, ``e``, ``et``, ``eta``, ``phi``
        and an ``isEM`` sequence that ``numpy.array`` can convert.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order. Scalar
        attributes are copied unchanged; ``isEM`` comes last and is a Python
        list.
    """
    scalar_fields = ('cluster_link', 'e', 'et', 'eta', 'phi')
    electrons = []
    for electron in data:
        record = {name: getattr(electron, name) for name in scalar_fields}
        record['isEM'] = np.array(electron.isEM).tolist()
        electrons.append(record)
    return electrons

def SeedContainer_Seeds_as_python(data):
    """Convert seed entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing ``e``, ``et``, ``eta``, ``id`` and ``phi``
        attributes.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order, with the
        keys ``e``, ``et``, ``eta``, ``id`` and ``phi``.
    """
    return [
        dict(e=seed.e, et=seed.et, eta=seed.eta, id=seed.id, phi=seed.phi)
        for seed in data
    ]

def TruthParticleContainer_Particles_as_python(data):
    """Convert truth-particle entries into plain dictionaries.

    Parameters
    ----------
    data : iterable
        Objects exposing the attributes listed in ``particle_fields`` below.

    Returns
    -------
    list of dict
        One dictionary per entry of ``data``, in iteration order; keys follow
        the order of ``particle_fields`` and values are the attribute values
        read from the entry unchanged.
    """
    particle_fields = (
        'e', 'et', 'eta', 'pdgid', 'phi',
        'px', 'py', 'pz',
        'seedid',
        'vx', 'vy', 'vz',
    )
    particles = []
    for particle in data:
        particles.append({name: getattr(particle, name) for name in particle_fields})
    return particles

In [6]:
# Maps each AOD column name to the function that converts that column's
# entries into plain Python dictionaries.
aod_columns = dict(
    EventInfoContainer_Events=EventInfoContainer_as_python,
    CaloCellContainer_Cells=CaloCellContainer_Cells_as_python,
    CaloClusterContainer_Clusters=CaloClusterContainer_Clusters_as_python,
    CaloDetDescriptorContainer_Cells=CaloDetDescriptorContainer_Cells_as_python,
    CaloRingsContainer_Rings=CaloRingsContainer_Rings_as_python,
    ElectronContainer_Electrons=ElectronContainer_Electrons_as_python,
    SeedContainer_Seeds=SeedContainer_Seeds_as_python,
    TruthParticleContainer_Particles=TruthParticleContainer_Particles_as_python,
)

In [7]:
# One (initially empty) list of converted events per AOD column.
data = {col_name: list() for col_name in aod_columns}
for event in datasets['zee'].aod_tchain:
    # Only the first event is converted: the loop is left right after it.
    for col_name in aod_columns:
        col_cast = aod_columns[col_name]
        data[col_name].append(col_cast(getattr(event, col_name)))
    break

In [8]:
# Convert every column's Python records into an Arrow array typed after the
# matching AOD_ARROW_SCHEMA field. The arrays are collected in a separate
# dict so that `data` keeps holding the plain Python lists: the cell can then
# be re-run safely and `data` does not silently change from lists of records
# to Arrow arrays.
arrow_columns = {
    col_name: pa.array(col_values, type=AOD_ARROW_SCHEMA.field(col_name).type)
    for col_name, col_values in data.items()
}
# Assemble the table; the explicit schema fixes column order and types.
pa_table = pa.Table.from_pydict(arrow_columns, schema=AOD_ARROW_SCHEMA)
pa_table

pyarrow.Table
CaloCellContainer_Cells: list<item: struct<descriptor_link: uint64, deta: float, dphi: float, e: float, et: float, eta: float, phi: float, tau: float>>
  child 0, item: struct<descriptor_link: uint64, deta: float, dphi: float, e: float, et: float, eta: float, phi: float, tau: float>
      child 0, descriptor_link: uint64
      child 1, deta: float
      child 2, dphi: float
      child 3, e: float
      child 4, et: float
      child 5, eta: float
      child 6, phi: float
      child 7, tau: float
CaloClusterContainer_Clusters: list<item: struct<cell_links: list<item: uint64>, deta: float, dphi: float, e: float, e0: float, e1: float, e2: float, e233: float, e237: float, e277: float, e2tsts1: float, e3: float, ehad1: float, ehad2: float, ehad3: float, emaxs1: float, emaxs2: float, eratio: float, et: float, eta: float, etot: float, f0: float, f1: float, f2: float, f3: float, fracMax: float, lambdaCenter: float, lateralMom: float, longitudinalMom: float, phi: float, reta: f

In [9]:
# Persist the Arrow table as a gzip-compressed Parquet file at the relative
# path 'test.parquet'.
pq.write_table(pa_table, 'test.parquet', compression='gzip')

In [10]:
# Build a Polars object from the Arrow table and display it.
pl_df = pl.from_arrow(pa_table)
pl_df

CaloCellContainer_Cells,CaloClusterContainer_Clusters,CaloDetDescriptorContainer_Cells,CaloRingsContainer_Rings,ElectronContainer_Electrons,EventInfoContainer_Events,SeedContainer_Seeds,TruthParticleContainer_Particles
list[struct[8]],list[struct[38]],list[struct[17]],list[struct[2]],list[struct[6]],list[struct[3]],list[struct[5]],list[struct[12]]
"[{4396,0.025,0.0982,-85.08432,-84.286293,0.1375,1.2272,-4.283576}, {4397,0.025,0.0982,73.614868,72.924416,0.1375,1.3254,-50.43824}, … {70000367,0.2,0.0982,-18.619766,-16.512362,0.5,1.5217,-24.864124}]","[{[4396, 4397, … 70000367],0.2,0.2,18146.101562,-852.562012,2055.025635,16601.990234,15471.248047,15967.717773,16324.734375,295.941467,341.648163,-111.577888,-47.671463,-59.524895,1672.30127,9196.731445,0.699284,17294.738281,0.3125,17927.326172,-0.046983,0.113249,0.914907,0.018828,3.0624e-41,3.0624e-41,2.3680e-9,3.0624e-41,1.3622,0.97813,-0.012056,-0.006149,0.968908,2.3680e-9,2.3680e-9,0,0.010053}]","[{25.0,3,-21,0.025,2,0.0982,-85.08432,0.0,[0.0, 0.0, … 0.0],0.1375,4396,1.2272,[-109.544975, 88.401695, … -38.63628],0,-4.283576,[3.0624e-41, 1.7161e-9, … 0.0],0.0}, {25.0,3,-21,0.025,2,0.0982,73.614868,0.0,[3.0624e-41, 1.6888e-9, … 0.0],0.1375,4397,1.3254,[56.893921, -57.297031, … -144.101685],0,-50.43824,[0.0, 0.0, … 0.0],0.0}, … {25.0,4,-6,0.2,1,0.0982,-18.619766,0.0,[0.0, 0.0, … 0.0],0.5,70000367,1.5217,[-17.914425, 21.05797, … 36.983963],7,-24.864124,[0.0, 0.0, … 0.0],0.0}]","[{0,[138.731491, -262.7677, … 0.0]}]","[{0,18146.101562,17294.738281,0.3125,1.3622,[true, true, … true]}]","[{0.0,455.0,250401.0}]","[{19.627337,18.65774,0.321009,0,1.351124}]","[{19.627337,18.65774,0.321009,0,1.351124,4.065711,18.209373,6.092706,11,-1.1246e-15,1.1100e-15,-27.00013}]"


In [14]:
import awkward as ak

In [15]:
# Read 'test.parquet' (the same relative path given to pq.write_table in this
# notebook) back with Awkward Array and display the result.
ak_arr = ak.from_parquet('test.parquet')
ak_arr

In [19]:
# Display the `pulse` field of the CaloDetDescriptorContainer_Cells records.
ak_arr.CaloDetDescriptorContainer_Cells.pulse