In [1]:

import sys, os, glob
from pathlib import Path
import pandas as pd
import xarray as xr
import gsw
import numpy as np
from xgcm import Grid
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

In [2]:
#!/usr/bin/env python3
"""
Generate schema.yaml for MOM6 + COBALT outputs.
- Extracts coordinate and variable metadata from a NetCDF file
- Supplements known metadata fixes (o2, dic, O2sat, AOU in mol kg-1; Qnet in W m-2)
- Describes Conservative Temperature (CT) as per TEOS-10 (gsw.CT_from_t)
- Writes the schema to ../config/schema.yaml by default
"""

import xarray as xr
import yaml
import numpy as np
import math
from pathlib import Path

def _to_python(obj):
    """Recursively convert NumPy/Pandas objects to pure Python and scrub NaNs."""
    if isinstance(obj, np.generic):
        obj = obj.item()
    if isinstance(obj, float):
        if math.isnan(obj):
            return None
        if math.isinf(obj):
            return str(obj)
        return obj
    if isinstance(obj, dict):
        return {str(k): _to_python(v) for k,v in obj.items()}
    if isinstance(obj, (list, tuple, set, np.ndarray)):
        return [_to_python(x) for x in obj]
    return obj

def safe_dump_yaml(obj, outpath):
    """Serialize ``obj`` as YAML into the file at ``outpath``.

    The object is first passed through ``_to_python`` and then written with
    ``yaml.safe_dump``, keeping mapping keys in insertion order.
    The target file is opened in write mode, replacing existing content.
    """
    # Convert before opening so a conversion error leaves the file untouched.
    plain = _to_python(obj)
    with open(outpath, "w") as stream:
        yaml.safe_dump(plain, stream, sort_keys=False)

# Metadata overrides keyed by variable name. Each entry is merged into the
# metadata read from the file, replacing "units"/"long_name"/"notes" if present.
_KNOWN_VARIABLE_FIXES = {
    "o2": {
        "units": "mol kg-1",
        "long_name": "Dissolved Oxygen",
        "notes": "Standardized to mol/kg for consistency across biogeochemical tracers",
    },
    "dic": {
        "units": "mol kg-1",
        "long_name": "Dissolved Inorganic Carbon",
        "notes": "Standardized to mol/kg for consistency across biogeochemical tracers",
    },
    "O2sat": {
        "units": "mol kg-1",
        "long_name": "Oxygen Saturation",
        "notes": "Standardized to mol/kg for consistency across biogeochemical tracers",
    },
    "AOU": {
        "units": "mol kg-1",
        "long_name": "Apparent Oxygen Utilization",
        "notes": "Standardized to mol/kg for consistency across biogeochemical tracers",
    },
    "Qnet": {
        "units": "W m-2",
        "long_name": "Net Surface Heat Flux (positive into ocean)",
        "notes": "Qnet = SW - LW - latent - sensible",
    },
    "CT": {
        "units": "degC",
        "long_name": "Conservative Temperature",
        "notes": (
            "Computed using TEOS-10: gsw.CT_from_t(SA, in-situ temperature, pressure). "
            "In practice, derived via xr.apply_ufunc(gsw.CT_from_t, SA, temp, pressure, "
            "input_core_dims=[['z_l','yh','xh']]*3, output_core_dims=[['z_l','yh','xh']], "
            "vectorize=True, dask='parallelized', output_dtypes=[float])"
        ),
    },
}


def _clean_attrs(attrs):
    """Return a copy of ``attrs`` without values that are None or an empty string."""
    # An identity/type check is used instead of `value in (None, "")` because
    # that form compares elementwise for NumPy-array attributes and raises
    # ValueError when the array has more than one element.
    return {
        key: value
        for key, value in attrs.items()
        if not (value is None or (isinstance(value, str) and value == ""))
    }


def _describe(da):
    """Build the schema entry (dims, dtype, non-empty attrs) for one DataArray."""
    return {
        "dims": list(da.dims),
        "dtype": str(da.dtype),
        **_clean_attrs(da.attrs),
    }


def infer_schema(nc_path, out_path="../config/schema.yaml"):
    """Infer a metadata schema from a NetCDF file and write it as YAML.

    Parameters
    ----------
    nc_path : str or path-like
        NetCDF file opened with ``xr.open_dataset``.
    out_path : str or path-like, optional
        Destination YAML file. Missing parent directories are created.

    Returns
    -------
    dict
        The schema with the sections ``conventions``, ``dataset``, ``coords``,
        ``variables`` and ``io_hints``. Values are returned as read from the
        file; conversion to plain Python happens only for the YAML written
        to disk.

    Notes
    -----
    Only metadata (attributes, dims, dtypes, sizes) is read. Variables listed
    in ``_KNOWN_VARIABLE_FIXES`` get their units, long name and notes
    overridden.
    """
    # The context manager closes the file handle even if schema building fails.
    with xr.open_dataset(nc_path) as ds:
        schema = {
            "conventions": {"cf_version": ds.attrs.get("Conventions", "CF-1.x")},
            "dataset": {
                "title": ds.attrs.get("title", ""),
                "source_file_example": Path(nc_path).name,
                "associated_files": ds.attrs.get("associated_files", ""),
                "grid_type": ds.attrs.get("grid_type", ""),
                "grid_tile": ds.attrs.get("grid_tile", "")
            },
            "coords": {},
            "variables": {},
            "io_hints": {
                # Cap every suggested chunk length at 128 along each dimension.
                "suggested_chunks": {dim: min(size, 128) for dim, size in ds.sizes.items()},
                "compression": {"zlib": True, "complevel": 4}
            }
        }

        # Coordinates
        for name in ds.coords:
            schema["coords"][name] = _describe(ds[name])

        # Variables
        for name in ds.data_vars:
            meta = _describe(ds[name])
            # update() keeps existing key positions and appends new keys in
            # units -> long_name -> notes order.
            meta.update(_KNOWN_VARIABLE_FIXES.get(name, {}))
            schema["variables"][name] = meta

    # Save; parents=True so a nested, not-yet-existing output directory works.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    safe_dump_yaml(schema, out_path)
    print(f"Schema written to {out_path}")
    return schema

# Entry point: runs when executed as a script, and also when this notebook
# cell is executed (the cell output shows the schema being written).
if __name__ == "__main__":
    # Example input dataset, given relative to the working directory.
    nc_file = "../../prototypes_dataset/ds_mini.nc"  # update this path for your environment
    infer_schema(nc_file)


Schema written to ../config/schema.yaml
