# Work with Large ARGO File

In [1]:
# Download dataset
!wget -nc 'https://pivarski-princeton.s3.amazonaws.com/argo-floats-expert.parquet'

File ‘argo-floats-expert.parquet’ already there; not retrieving.



In [2]:
# imports
import cProfile
import awkward as ak
import hepfile as hf

In [3]:
# read in the parquet file with awkward
imax = 200
awk = ak.from_parquet('argo-floats-expert.parquet', row_groups=[0,imax])

In [4]:
awk.fields

['latitude',
 'longitude',
 'time',
 'levels',
 'config_mission_number',
 'cycle_number',
 'data_centre',
 'data_mode',
 'data_state_indicator',
 'dc_reference',
 'direction',
 'firmware_version',
 'float_serial_no',
 'pi_name',
 'platform_number',
 'platform_type',
 'positioning_system',
 'position_qc',
 'profile_pres_qc',
 'profile_psal_qc',
 'profile_temp_qc',
 'project_name',
 'time_location',
 'time_qc',
 'vertical_sampling_scheme',
 'wmo_inst_type']

In [5]:
# test with the new method to pack awkward ararys into hepfiles
# this seems to be *relatively* fast but still kinda slow (~100s)
# the major slow downs seem to occur in awkward array functions
# which means that there isn't a ton we can do about this...
# I tried to look into how we can remove the ak.to_array calls since
# those seem to be the biggest slowdowns but I don't see how we can
hepfile_from_awkward = 'argo-awkward-to-hepfile.h5'
with cProfile.Profile() as p:
    data = hf.awkward_tools.awkward_to_hepfile(awk, hepfile_from_awkward)
    p.print_stats()

         357543395 function calls (357541045 primitive calls) in 112.627 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       40    0.000    0.000    0.041    0.001 <__array_function__ internals>:177(amax)
       40    0.000    0.000    0.043    0.001 <__array_function__ internals>:177(amin)
       25    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(copyto)
       40    0.000    0.000    0.006    0.000 <__array_function__ internals>:177(count_nonzero)
       40    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(reshape)
      338    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1033(_handle_fromlist)
       79    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:398(parent)
      120    0.000    0.000    0.001    0.000 __init__.py:14(to_nplike)
       12    0.000    0.000    0.000    0.000 __init__.py:1424(debug)
       12    0.000    0.000    0.000  

In [6]:
data.keys()

dict_keys(['_GROUPS_', '_MAP_DATASETS_TO_COUNTERS_', '_LIST_OF_COUNTERS_', '_SINGLETONS_GROUP_/COUNTER', '_MAP_DATASETS_TO_DATA_TYPES_', '_META_', 'latitude', 'longitude', 'time', 'levels/pres', 'levels/nlevels', 'levels/pres_adjusted', 'levels/pres_adjusted_error', 'levels/pres_adjusted_qc', 'levels/pres_qc', 'levels/psal', 'levels/psal_adjusted', 'levels/psal_adjusted_error', 'levels/psal_adjusted_qc', 'levels/psal_qc', 'levels/temp', 'levels/temp_adjusted', 'levels/temp_adjusted_error', 'levels/temp_adjusted_qc', 'levels/temp_qc', 'config_mission_number', 'cycle_number', 'data_centre', 'data_mode', 'data_state_indicator', 'dc_reference', 'direction', 'firmware_version', 'float_serial_no', 'pi_name', 'platform_number', 'platform_type', 'positioning_system', 'position_qc', 'profile_pres_qc', 'profile_psal_qc', 'profile_temp_qc', 'project_name', 'time_location', 'time_qc', 'vertical_sampling_scheme', 'wmo_inst_type'])

In [7]:
# try using the classic write_to_file and see how the speed goes
# not terrible!
hepfile_path = 'argo-hepfile.h5'
with cProfile.Profile() as p:
    hf.write_to_file(hepfile_path, data)
    p.print_stats()

         3455 function calls in 28.516 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       79    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1033(_handle_fromlist)
       79    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:398(parent)
        5    0.000    0.000    0.000    0.000 _internal.py:523(_view_is_safe)
        8    0.000    0.000    0.001    0.000 attrs.py:111(create)
        8    0.000    0.000    0.000    0.000 attrs.py:47(__init__)
        8    0.000    0.000    0.001    0.000 attrs.py:96(__setitem__)
       52    0.000    0.000    0.001    0.000 base.py:102(array_for_new_object)
       42    0.000    0.000    0.000    0.000 base.py:165(_lapl)
        2    0.000    0.000    0.000    0.000 base.py:171(_lcpl)
       96    0.000    0.000    0.000    0.000 base.py:177(_e)
        2    0.000    0.000    0.000    0.000 base.py:187(get_lcpl)
      167    0.000    0.000    0.000   

In [8]:
# how about reading either of these files into the classic format?
with cProfile.Profile() as p:
    hf.load(hepfile_path)
    p.print_stats()

         7283 function calls (7235 primitive calls) in 27.417 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3    0.000    0.000    0.004    0.001 <__array_function__ internals>:177(unique)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(where)
      248    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1033(_handle_fromlist)
      316    0.001    0.000    0.001    0.000 <frozen importlib._bootstrap>:398(parent)
        1    0.000    0.000    0.000    0.000 __init__.py:1424(debug)
        1    0.000    0.000    0.000    0.000 __init__.py:1689(isEnabledFor)
       42    0.000    0.000    0.000    0.000 _collections_abc.py:802(__init__)
       42    0.000    0.000    0.001    0.000 _collections_abc.py:822(__contains__)
        3    0.000    0.000    0.000    0.000 arraysetops.py:125(_unpack_tuple)
        3    0.000    0.000    0.000    0.000 arraysetops.py:133(_unique

In [9]:
# Now how about into an awkward array?
# this is okay, could probably still be sped up though
# the problem is that unflattening the dictionary inherently takes some amount of time
with cProfile.Profile() as p:
    out_awk, bucket = hf.load(hepfile_path, return_type='awkward')
    p.print_stats()

         89106805 function calls (89105503 primitive calls) in 69.585 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        6    0.000    0.000    0.077    0.013 <__array_function__ internals>:177(array_equal)
       15    0.000    0.000    0.001    0.000 <__array_function__ internals>:177(cumsum)
       25    0.000    0.000   31.060    1.242 <__array_function__ internals>:177(encode)
       15    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(reshape)
       15    0.000    0.000    0.001    0.000 <__array_function__ internals>:177(searchsorted)
       25    0.000    0.000    3.510    0.140 <__array_function__ internals>:177(str_len)
        3    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(unique)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(where)
      455    0.000    0.000    0.001    0.000 <frozen importlib._bootstrap>:1033(_handle_fromli

In [None]:
# and finally into a pandas dataframe?
# this is really fast! Somehow even faster than just plain reading it in...
with cProfile.Profile() as p:
    dfs, bucket = hf.load(hepfile_path, return_type='pandas')
    p.print_stats()

In [None]:
# let's try getting the file meta
# very fast!
with cProfile.Profile() as p:
    meta = hf.get_file_metadata(hepfile_path)
    p.print_stats()
print(meta)

In [None]:
# let's try getting the file header
# It throws an error just like it should!
with cProfile.Profile() as p:
    hdr = hf.get_file_header(hepfile_path)
    p.print_stats()
print(hdr)