# Create index for interpolated data

The appetizer creates one file per time step, which we want to open as a single xarray dataset. Opening the files directly is VERY slow if you decode times and still quite slow if you don't, in which case you also have to recreate the time axis by hand afterwards. Instead, you can create an index that makes the files look like a single (fake) zarr store to xarray; it loads instantly and is even faster to process.

Created by: 
Tobias Kölling (@d70-t)

In [1]:
import json
import glob

import tqdm
import kerchunk.hdf
import fsspec

In [2]:
# Directory that holds the per-time-step NetCDF files; the JSON index is
# written into the same directory further down.
path_to_output = "/work/ab0995/a270088/NextGems_public/appetizer/IFS2.5/2t/netcdf/"
# Glob pattern for the input files. Its last five characters ("_*.nc") are
# sliced off later (files[:-5]) to derive the base name of the index file.
files = "tco3999-ng5_pc_*.nc"

In [3]:
# All input files matching the pattern, sorted by path.
urls = sorted(glob.glob(f"{path_to_output}/{files}"))

# Build one kerchunk reference set per input file; chunks smaller than
# 100 bytes are stored inline in the references.
singles = []
for url in tqdm.tqdm(urls):
    with fsspec.open(url) as infile:
        translator = kerchunk.hdf.SingleHdf5ToZarr(infile, url, inline_threshold=100)
        reference = translator.translate()
    singles.append(reference)

100%|██████████| 985/985 [02:19<00:00,  7.05it/s]


This is a hack to replace the times within the single reference objects such that they use common units. That way, MultiZarrToZarr doesn't get confused.

In [7]:
import base64
import xarray as xr

def encode_value(v):
    """Turn a bytes value into a string that can be stored in a JSON reference.

    Pure-ASCII bytes are returned as the decoded string. Anything else is
    base64-encoded and returned with a ``"base64:"`` prefix.
    """
    try:
        text = v.decode("ascii")
    except UnicodeDecodeError:
        # Not representable as ASCII text: fall back to a base64 payload.
        payload = base64.b64encode(v)
        return "base64:" + payload.decode("ascii")
    else:
        return text
    
def fix_time(single):
    """Re-encode the ``time`` variable of one reference set with common units.

    The ``time`` variable is read through the reference filesystem, written
    to an in-memory zarr store as uncompressed 32-bit integers in
    "seconds since 1990-01-01", and the resulting ``time/*`` entries replace
    the corresponding entries of ``single["refs"]``.

    Args:
        single: Reference dict with ``"version"`` and ``"refs"`` keys and,
            optionally, a ``"templates"`` key.

    Returns:
        A new reference dict with ``"version"``, ``"templates"`` (only when
        present in ``single``) and the patched ``"refs"``.
    """
    # Open via a context manager so the dataset is closed again once the
    # time values have been loaded into memory.
    with xr.open_dataset(
        "reference://", engine="zarr",
        backend_kwargs={
            "storage_options": {
                "fo": single,
            },
            "consolidated": False
        }
    ) as ds:
        t = ds[["time"]].compute()

    # Clear the encoding read from the file so that only the encoding passed
    # to ``to_zarr`` below is applied.
    t.time.encoding = {}
    m = {}
    t.to_zarr(m, encoding={"time": {"units": "seconds since 1990-01-01", "dtype": "i4", "compressor": None}})

    fixed = {"version": single["version"]}
    # "templates" is optional in reference sets; only carry it over when the
    # input has it instead of failing with a KeyError.
    if "templates" in single:
        fixed["templates"] = single["templates"]
    fixed["refs"] = {
        **single["refs"],
        # Entries of the freshly written store win over the original ones.
        **{k: encode_value(v) for k, v in m.items() if k.startswith("time/")},
    }
    return fixed

In [8]:
from kerchunk.combine import MultiZarrToZarr
# Give every single-file reference set the common time encoding first, then
# concatenate all of them along "time" into one combined reference set.
fixed_singles = [fix_time(single) for single in singles]
mzz = MultiZarrToZarr(fixed_singles, concat_dims=["time"])

out = mzz.translate()

In [9]:
import xarray as xr
# Open the combined in-memory reference set through the reference filesystem
# to check the result before writing it to disk.
reference_options = {"fo": out}
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={"storage_options": reference_options, "consolidated": False},
)

In [10]:
# Show the combined dataset as the cell result.
ds

In [12]:
# Check the base name used for the index file: the glob pattern without its
# trailing five characters ("_*.nc").
files[:-5]

'tco3999-ng5_pc'

In [13]:
# Store the combined references as JSON next to the input files.
index_path = f"{path_to_output}/{files[:-5]}.json"
with open(index_path, "w") as index_file:
    json.dump(out, index_file)

In [15]:
!pwd

/home/a/a270088/PYTHON/nextgems/NextGems_Cycle2/FESOM


In [14]:
# Re-open the dataset from the JSON index written above and show it as the
# cell result.
index_url = f"reference::/{path_to_output}/{files[:-5]}.json"
xr.open_zarr(index_url, consolidated=False)

Unnamed: 0,Array,Chunk
Bytes,107.00 GiB,111.24 MiB
Shape,"(985, 2700, 5400)","(1, 2700, 5400)"
Count,986 Tasks,985 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 107.00 GiB 111.24 MiB Shape (985, 2700, 5400) (1, 2700, 5400) Count 986 Tasks 985 Chunks Type float64 numpy.ndarray",5400  2700  985,

Unnamed: 0,Array,Chunk
Bytes,107.00 GiB,111.24 MiB
Shape,"(985, 2700, 5400)","(1, 2700, 5400)"
Count,986 Tasks,985 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,107.00 GiB,111.24 MiB
Shape,"(985, 2700, 5400)","(1, 2700, 5400)"
Count,986 Tasks,985 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 107.00 GiB 111.24 MiB Shape (985, 2700, 5400) (1, 2700, 5400) Count 986 Tasks 985 Chunks Type float64 numpy.ndarray",5400  2700  985,

Unnamed: 0,Array,Chunk
Bytes,107.00 GiB,111.24 MiB
Shape,"(985, 2700, 5400)","(1, 2700, 5400)"
Count,986 Tasks,985 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,53.50 GiB,13.90 MiB
Shape,"(985, 2700, 5400)","(1, 1350, 2700)"
Count,3941 Tasks,3940 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 53.50 GiB 13.90 MiB Shape (985, 2700, 5400) (1, 1350, 2700) Count 3941 Tasks 3940 Chunks Type float32 numpy.ndarray",5400  2700  985,

Unnamed: 0,Array,Chunk
Bytes,53.50 GiB,13.90 MiB
Shape,"(985, 2700, 5400)","(1, 1350, 2700)"
Count,3941 Tasks,3940 Chunks
Type,float32,numpy.ndarray
