# Building Prototype Dataset Hierarchy
@MKDG

This notebook constructs a hierarchy of datasets from full MOM6-Cobalt v2 outputs:

* 3D datasets (0-1000 m)
  - Full (944 GB)
  - Mini (282 GB)
  - Micro (35 GB)
  - Nano (500 MB)
  
* 2D datasets (surface only)
  - Mini_surf (10 GB)
  - Micro_surf (1.6 GB)
  - Nano_surf (59 MB)
  - Nano_surf_three_year (34 MB) (available on Git)
  - Nano_surf_one_year (11 MB) (available on Git; for code testing only, since it cannot resolve the seasonal cycle)


Purpose:
- Enable fast prototyping and code development on small subsets
- Ensure realistic tests on micro/mini before full-scale training


In [6]:
import sys, os, glob

# xarray is imported here because this is the first cell of the notebook and
# the only other `import xarray as xr` sits in a later cell; without it a
# fresh "Restart Kernel -> Run All" fails on the first `xr.` call.
import xarray as xr

# Directory holding the unprocessed model output files.
# NOTE(review): absolute, cluster-specific path -- adjust when running elsewhere.
data_dir = "/scratch/gpfs/GEOCLIM/LRGROUP/maximek/INMOS/9km_gyre_data/"

# One file per output stream (2016_01): cobalt_3d, dynamics3d and dynamics2d.
bio_path = os.path.join(data_dir, "hist_control_cobalt_3d_yearly__2016_01.nc")
phy_path = os.path.join(data_dir, "hist_control_dynamics3d_yearly__2016_01.nc")
bc_path = os.path.join(data_dir, "hist_control_dynamics2d_yearly__2016_01.nc")

# Open each file with the netCDF4 backend.
ds_bio = xr.open_dataset(bio_path, engine="netcdf4")
ds_phy = xr.open_dataset(phy_path, engine="netcdf4")
ds_bc = xr.open_dataset(bc_path, engine="netcdf4")

# Combine the three streams into a single unprocessed dataset.
ds_unproc = xr.merge([ds_bio, ds_phy, ds_bc])




In [7]:
# Display the merged, unprocessed dataset as the cell output for inspection.
ds_unproc

In [1]:
import xarray as xr
import pathlib

# --- Output directory for every prototype subset (created if missing) ---
PROTOTYPE_DIR = pathlib.Path("../../prototypes_dataset/")
PROTOTYPE_DIR.mkdir(parents=True, exist_ok=True)

# --- Full dataset (~1 TB): every file found in the training directory ---
path = "../../9km_monthly_gyre/train/"
ds = xr.open_mfdataset(f"{path}*", combine="by_coords", decode_times=True)

# Keep only the levels down to 1000 on the z_l axis (first 1000 meters).
ds = ds.sel({"z_l": slice(None, 1000)})

# Net heat flux, positive INTO the ocean: Qnet = SW - LW - latent - sensible.
ds = ds.assign(Qnet=ds["SW"] - ds["LW"] - ds["latent"] - ds["sensible"])

# --- 3D subsets, from largest to smallest ---
# Mini: interior box that leaves out the boundary regions.
ds_mini = ds.sel(
    {
        "yh": slice(25.0, 55.0),
        "xh": slice(-45.0, -25.0),
        "z_l": slice(0, 500),  # depth range; adjust as desired
    }
)
# Micro: smaller cutout taken from Mini.
ds_micro = ds_mini.sel({"xh": slice(-35, -30), "yh": slice(35, 50)})
# Nano: smallest extract taken from Mini.
ds_nano = ds_mini.sel({"xh": slice(-34, -33), "yh": slice(43, 44)})

# --- Surface-only counterparts: first z_l level of each 3D subset ---
ds_mini_surf = ds_mini.isel({"z_l": 0})
ds_micro_surf = ds_micro.isel({"z_l": 0})
ds_nano_surf = ds_nano.isel({"z_l": 0})

# --- Temporal subsets of the Nano surface dataset, selected by calendar year ---
ds_nano_surf_three_years = ds_nano_surf.sel(
    time=ds_nano_surf.time.dt.year.isin([2016, 2017, 2018])
)
ds_nano_surf_one_year = ds_nano_surf.sel(
    time=ds_nano_surf.time.dt.year.isin([2016])
)



In [None]:
# --- Write every subset to PROTOTYPE_DIR as "<name>.nc", largest first ---
for subset_name, subset_ds in [
    ("ds_mini", ds_mini),
    ("ds_mini_surf", ds_mini_surf),
    ("ds_micro", ds_micro),
    ("ds_micro_surf", ds_micro_surf),
    ("ds_nano", ds_nano),
    ("ds_nano_surf", ds_nano_surf),
    ("ds_nano_surf_three_years", ds_nano_surf_three_years),
    ("ds_nano_surf_one_year", ds_nano_surf_one_year),
]:
    subset_ds.to_netcdf(PROTOTYPE_DIR / f"{subset_name}.nc")

In [2]:
import yaml, numpy as np, math
from pathlib import Path

def _to_python(obj):
    """Recursively convert NumPy/Pandas objects to pure Python and scrub NaNs."""
    if isinstance(obj, np.generic):
        obj = obj.item()
    if isinstance(obj, float):
        if math.isnan(obj):
            return None
        if math.isinf(obj):
            return str(obj)
        return obj
    if isinstance(obj, dict):
        return {str(k): _to_python(v) for k,v in obj.items()}
    if isinstance(obj, (list, tuple, set, np.ndarray)):
        return [_to_python(x) for x in obj]
    return obj

def _drop_blank_attrs(attrs):
    """Return a copy of ``attrs`` without entries whose value is None or ""."""
    # Explicit checks instead of `value not in (None, "")`: the membership test
    # compares with `==`, which raises ValueError for multi-element arrays.
    return {
        key: value
        for key, value in attrs.items()
        if not (value is None or (isinstance(value, str) and value == ""))
    }


def write_schema(ds, out_path, notes=None, title="full_9km_monthly"):
    """
    Generate schema.yaml for a given xarray.Dataset.
    Includes: coords, variables, io_hints, and dataset summary (grid + resolution).

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset to describe; its ``sizes``, ``coords``, ``data_vars`` and each
        array's ``dims``, ``dtype`` and ``attrs`` are read.
    out_path : str or path-like
        Destination YAML file; missing parent directories are created.
    notes : optional
        If truthy, stored under the top-level ``"notes"`` key.
    title : str, optional
        Value written to ``dataset.title``. Defaults to ``"full_9km_monthly"``,
        the value previously hard-coded for every dataset.

    Returns
    -------
    None
        The schema is written to ``out_path`` and a confirmation is printed.
    """
    schema = {
        "conventions": {"cf_version": "CF-1.x"},
        "dataset": {
            "title": title,
            "grid_type": "regular",
            "spatial_resolution": "9 km",
            "temporal_resolution": "monthly",
            "grid_shape": {dim: int(size) for dim, size in ds.sizes.items()}
        },
        "coords": {},
        "variables": {},
        "io_hints": {
            # Cap each suggested chunk at 128 elements per dimension.
            "suggested_chunks": {dim: min(size, 128) for dim, size in ds.sizes.items()},
            "compression": {"zlib": True, "complevel": 4}
        }
    }

    # Coordinates: dims, dtype, then every non-blank attribute
    # (an attribute named "dims" or "dtype" would override those keys).
    for c in ds.coords:
        da = ds[c]
        schema["coords"][c] = {
            "dims": list(da.dims),
            "dtype": str(da.dtype),
            **_drop_blank_attrs(da.attrs)
        }

    # Variables: same layout as coordinates, plus standardized overrides.
    for v in ds.data_vars:
        da = ds[v]
        meta = {
            "dims": list(da.dims),
            "dtype": str(da.dtype),
            **_drop_blank_attrs(da.attrs)
        }
        # Standardize units/names (these replace whatever the file's attrs said)
        if v in ["o2", "dic", "O2sat", "AOU"]:
            meta["units"] = "mol kg-1"
        if v == "CT":
            meta["units"] = "degC"
            meta["long_name"] = "Conservative Temperature"
            meta["notes"] = ("Computed using TEOS-10: gsw.CT_from_t(SA, temp, pressure)")
        if v == "Qnet":
            meta["notes"] = "Qnet = SW - LW - latent - sensible"

        schema["variables"][v] = meta

    if notes:
        schema["notes"] = notes

    # Save schema (converted to plain Python first so yaml.safe_dump accepts it)
    Path(out_path).parent.mkdir(exist_ok=True, parents=True)
    with open(out_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(_to_python(schema), f, sort_keys=False)
    print(f"Schema written to {out_path}")


In [8]:
# Write one schema file per dataset level into the config directory.
SCHEMA_DIR = pathlib.Path("../config/")

for schema_source, schema_file in [
    (ds_unproc, "schema_ds_unproc.yaml"),  # merged unprocessed files
    (ds, "schema_ds_full.yaml"),  # full dataset
    (ds_mini, "schema_ds_mini.yaml"),  # Mini dataset
    (ds_micro, "schema_ds_micro.yaml"),  # Micro dataset
    (ds_nano, "schema_ds_nano.yaml"),  # Nano dataset
    (ds_nano_surf_one_year, "schema_ds_nano_surf_one_year.yaml"),  # Nano surface, one year
]:
    write_schema(schema_source, SCHEMA_DIR / schema_file)


Schema written to ../config/schema_ds_unproc.yaml
Schema written to ../config/schema_ds_full.yaml
Schema written to ../config/schema_ds_mini.yaml
Schema written to ../config/schema_ds_micro.yaml
Schema written to ../config/schema_ds_nano.yaml
Schema written to ../config/schema_ds_nano_surf_one_year.yaml
