# Building Prototype Dataset Hierarchy
## MKDG
This notebook constructs a hierarchy of datasets from full MOM6 outputs:
- Full (944 GB)
- Mini (87 GB)
- Micro (few GBs)
- Nano (100–200 MB)
- Nano one-year (30 MB)

Purpose:
- Enable fast prototyping and LLM-assisted code development on small subsets
- Ensure realistic tests on micro/mini before full-scale training


In [None]:
# notebooks/01_build_dataset_hierarchy.ipynb

import xarray as xr
import pathlib

# Output directory for every prototype subset (created if it does not exist)
PROTOTYPE_DIR = pathlib.Path("../../prototypes_dataset/")
PROTOTYPE_DIR.mkdir(exist_ok=True, parents=True)

# Open every file of the full training set (~1 TB) as a single dataset
path = "../../9km_monthly_gyre/train/"
ds = xr.open_mfdataset(f"{path}*", decode_times=True, combine="by_coords")

# Keep only the z_l levels up to 1000 (the first 1000 meters)
ds = ds.sel({"z_l": slice(None, 1000)})

# NOTE(review): an earlier comment here stated that AOU is in mol/kg and that
# DIC and O2 should be expressed in matching units; no unit conversion is
# performed in this cell -- confirm whether one is still required.

# Net heat flux, intended as positive INTO the ocean
# (sign conventions of the individual flux terms are not verified here)
ds["Qnet"] = ds["SW"] - ds["LW"] - ds["latent"] - ds["sensible"]


# --- Mini dataset: interior window only (boundary regions dropped) ---
ds_mini = ds.sel(
    {
        "yh": slice(25.0, 55.0),
        "xh": slice(-45.0, -25.0),
        "z_l": slice(0, 500),  # depth range; adjust as desired
    }
)
ds_mini.to_netcdf(PROTOTYPE_DIR.joinpath("ds_mini.nc"))

# Shallowest level of the mini window, saved as its own file
ds_mini_surf = ds_mini.isel({"z_l": 0})
ds_mini_surf.to_netcdf(PROTOTYPE_DIR.joinpath("ds_mini_surf.nc"))

# --- Micro dataset: smaller cutout taken from the mini window ---
ds_micro = ds_mini.sel({"xh": slice(-35, -30), "yh": slice(35, 50)})
ds_micro.to_netcdf(PROTOTYPE_DIR.joinpath("ds_micro.nc"))

# Shallowest level of the micro cutout
ds_micro_surf = ds_micro.isel({"z_l": 0})
ds_micro_surf.to_netcdf(PROTOTYPE_DIR.joinpath("ds_micro_surf.nc"))

# --- Nano dataset: smallest spatial extract, also cut from the mini window ---
ds_nano = ds_mini.sel({"xh": slice(-34, -33), "yh": slice(43, 44)})
ds_nano.to_netcdf(PROTOTYPE_DIR.joinpath("ds_nano.nc"))

# Shallowest level of the nano extract
ds_nano_surf = ds_nano.isel({"z_l": 0})
ds_nano_surf.to_netcdf(PROTOTYPE_DIR.joinpath("ds_nano_surf.nc"))

# --- Temporal subsets of the nano surface dataset ---
# Time steps are selected with a boolean mask on the calendar year.

# Three years: 2016-2018
ds_nano_surf_three_years = ds_nano_surf.sel(
    {"time": ds_nano_surf.time.dt.year.isin([2016, 2017, 2018])}
)
ds_nano_surf_three_years.to_netcdf(
    PROTOTYPE_DIR.joinpath("ds_nano_surf_three_years.nc")
)

# One year: 2016
ds_nano_surf_one_year = ds_nano_surf.sel(
    {"time": ds_nano_surf.time.dt.year.isin([2016])}
)
ds_nano_surf_one_year.to_netcdf(
    PROTOTYPE_DIR.joinpath("ds_nano_surf_one_year.nc")
)


In [6]:
import yaml, numpy as np, math
from pathlib import Path

def _to_python(obj):
    """Recursively convert NumPy values to plain Python and scrub NaNs.

    Conversion rules:
    - ``np.ndarray`` (any rank, including 0-d) -> nested lists / scalar
    - NumPy scalars (``np.generic``) -> the matching Python scalar
    - float NaN -> ``None``; float +/-inf -> its string form
    - dict -> dict with ``str`` keys and converted values
    - list / tuple / set -> list of converted items
    - anything else is returned unchanged

    Args:
        obj: Arbitrary (possibly nested) value.

    Returns:
        The converted value, built from plain Python types wherever one of
        the rules above applies.
    """
    if isinstance(obj, np.ndarray):
        # tolist() also copes with 0-d arrays (it returns a bare scalar),
        # which cannot be iterated element by element.
        obj = obj.tolist()
    elif isinstance(obj, np.generic):
        obj = obj.item()

    if isinstance(obj, float):
        if math.isnan(obj):
            return None
        if math.isinf(obj):
            return str(obj)
        return obj
    if isinstance(obj, dict):
        return {str(k): _to_python(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set)):
        return [_to_python(x) for x in obj]
    return obj

def _describe_array(da):
    """Return dims, dtype and the non-empty attributes of *da* as a dict."""
    meta = {"dims": list(da.dims), "dtype": str(da.dtype)}
    for key, value in da.attrs.items():
        # Use identity / str checks rather than `value in (None, "")`:
        # attributes may be NumPy arrays, for which `==` is elementwise and
        # the membership test raises ValueError.
        if value is None or (isinstance(value, str) and value == ""):
            continue
        meta[key] = value
    return meta


def write_schema(ds, out_path, notes=None):
    """
    Generate a schema YAML file for a given xarray.Dataset.

    The schema lists, for every coordinate and data variable, its dims,
    dtype and non-empty attributes, plus I/O hints (chunk sizes capped at
    128 per dimension and zlib compression settings).

    Units are overridden for selected variables:
    - "o2", "dic", "O2sat", "AOU" -> "mol kg-1"
    - "CT" -> "degC", with a note on the TEOS-10 computation

    Args:
        ds: Dataset to describe.
        out_path: Destination of the YAML file; missing parent directories
            are created.
        notes: Optional value stored under the top-level "notes" key
            (omitted when falsy).
    """
    schema = {
        "coords": {},
        "variables": {},
        "io_hints": {
            "suggested_chunks": {dim: min(size, 128) for dim, size in ds.sizes.items()},
            "compression": {"zlib": True, "complevel": 4}
        }
    }

    for c in ds.coords:
        schema["coords"][c] = _describe_array(ds[c])

    for v in ds.data_vars:
        meta = _describe_array(ds[v])
        # Standardize key biogeochemical variables
        if v in ["o2", "dic", "O2sat", "AOU"]:
            meta["units"] = "mol kg-1"
        if v == "CT":
            meta["units"] = "degC"
            meta["notes"] = "Computed using TEOS-10: gsw.CT_from_t(SA, temp, pressure)"

        schema["variables"][v] = meta

    if notes:
        schema["notes"] = notes

    # Save schema
    Path(out_path).parent.mkdir(exist_ok=True, parents=True)
    with open(out_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(_to_python(schema), f, sort_keys=False)
    print(f"Schema written to {out_path}")


In [None]:
# Write schemas on all datasets
# Mini dataset
write_schema(ds_mini, PROTOTYPE_DIR / "schema_ds_mini.yaml")

# Micro dataset
write_schema(ds_micro, PROTOTYPE_DIR / "schema_ds_micro.yaml")

# Nano dataset
# (the notebook defines `ds_nano*`, not `ds_pico*`; the schema file names
# follow the `ds_nano*.nc` files written by the dataset-building cell)
write_schema(ds_nano, PROTOTYPE_DIR / "schema_ds_nano.yaml")

# Nano surf one-year
write_schema(ds_nano_surf_one_year, PROTOTYPE_DIR / "schema_ds_nano_surf_one_year.yaml")
