
## Set up and imports

In [7]:
# --- 1) Setup & Imports ---
import os, json, importlib, textwrap
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# Render nicely in notebooks
%matplotlib inline
plt.rcParams["figure.dpi"] = 120


## Load the dataset catalog
Read datasets.json (data catalog) into memory so we can browse datasets and their metadata.

In [8]:
# --- 2) Load the Dataset Catalog (datasets.json) ---
# Path to the catalog file; a relative path is resolved against the current working directory.
CATALOG_PATH = "datasets.json"

# Stop early with a readable message when the catalog file is missing.
assert os.path.exists(CATALOG_PATH), f"Catalog file not found: {CATALOG_PATH}"

with open(CATALOG_PATH, "r", encoding="utf-8") as catalog_file:
    catalog_data = json.load(catalog_file)

# Entries are expected as a non-empty list under the top-level "datasets" key.
catalog_list = catalog_data.get("datasets", [])
assert isinstance(catalog_list, list) and bool(catalog_list), "No datasets found in catalog."

# Index the entries by their "name" field so later cells can look them up directly.
CATALOG = dict((entry["name"], entry) for entry in catalog_list)

# One header line followed by one bullet line per dataset name.
print(
    "Datasets available:",
    *(f" • {dataset_name}" for dataset_name in CATALOG),
    sep="\n",
)


Datasets available:
 • Indian Ocean grid
 • GHRSST Level 4 MUR Global Foundation Sea Surface Temperature Analysis (v4.1)
 • ERA5 Atmospheric Surface Analysis


## Helper to show a dataset's description, time range, spatial bounds, and variables

In [9]:
# --- 3) Explore Dataset Metadata (Description, Bounds, Variables) ---

def describe_dataset(ds_meta: dict):
    """Pretty-print a single catalog entry.

    Prints the entry's name, description, time range and spatial bounds, then
    renders its variable list as a table with ``display``.

    A section that is missing from the entry, or present but set to ``None``
    (JSON ``null``), is treated as empty: its fields print as ``None`` rather
    than raising ``AttributeError``.

    Parameters
    ----------
    ds_meta : dict
        One catalog entry. Keys read: ``name``, ``description``,
        ``temporal_bounds`` (``start_time`` / ``end_time``),
        ``spatial_bounds`` (``min_lat`` / ``max_lat`` / ``min_lon`` /
        ``max_lon``) and ``variables`` (a mapping holding a ``variables``
        list).

    Returns
    -------
    None
    """
    print(f"# {ds_meta.get('name','<unnamed>')}\n")
    print(ds_meta.get("description","(no description)"))
    print()

    # `or {}` (not a .get default) so an explicit null in the catalog is
    # handled the same way as a missing key.
    tb = ds_meta.get("temporal_bounds") or {}
    sb = ds_meta.get("spatial_bounds") or {}

    print("Time range:", tb.get("start_time"), "→", tb.get("end_time"))
    print("Spatial bounds:",
          f"lat[{sb.get('min_lat')},{sb.get('max_lat')}],",
          f"lon[{sb.get('min_lon')},{sb.get('max_lon')}]")
    print()

    # Variables table: the list lives one level down, under variables.variables.
    variables = (ds_meta.get("variables") or {}).get("variables") or []
    if variables:
        print("Variables:")
        df_vars = pd.DataFrame(variables)
        display(df_vars)
    else:
        print("(no variables listed)")

# Choose one dataset for inspection (hardcoded here; swap as needed).
# The value is used as a key into CATALOG, which is indexed by each entry's "name",
# so it must match the catalog name exactly.
INSPECT_NAME = "GHRSST Level 4 MUR Global Foundation Sea Surface Temperature Analysis (v4.1)"
describe_dataset(CATALOG[INSPECT_NAME])


# GHRSST Level 4 MUR Global Foundation Sea Surface Temperature Analysis (v4.1)

The GHRSST MUR Level 4 sea surface temperature dataset provides global 0.01° analyses using wavelet-based optimal interpolation, combining nighttime SST observations from multiple satellite instruments and in situ sources, with retrospective (four-day latency) and near-real-time (one-day latency) products. It also includes ice concentration data for high-latitude SST improvements, SST anomalies, and the temporal distance to the nearest IR measurement for each pixel.

Time range: 2002-05-31 → present
Spatial bounds: lat[-90.0,90.0], lon[-180.0,180.0]

Variables:


Unnamed: 0,standard_name,description,units
0,analysed_sst,Analysed sea surface temperature,kelvin
1,analysis_error,Estimated error standard deviation of analysed...,kelvin
2,lat,Latitude,degrees_north
3,lon,Longitude,degrees_east
4,mask,Sea/land field composite mask,
5,sea_ice_fraction,Sea ice area fraction,fraction (0 to 1)
6,time,Reference time of SST field,seconds since 1981-01-01 00:00:00 UTC


## Loader registry (bridges catalog → dataset.py)

In [10]:
# --- 4) Dataset Loader Registry (link datasets.json → dataset.py) ---

# Import the loader-functions module by name (dataset.py in the same folder).
# load_from_catalog() below looks up each dataset's access function as an attribute of this module.
loaders = importlib.import_module("dataset")  # this is dataset.py in the same folder

def load_from_catalog(ds_name: str, **kwargs):
    """
    Load a dataset by dispatching to the loader named in its catalog entry.

    The entry's ``access`` section supplies ``access_function`` (the name of
    an attribute of the ``loaders`` module, i.e. dataset.py), ``platform``,
    ``path`` and optional ``other_args``. The loader is called with keyword
    arguments only, assembled in this order: ``path``, ``platform``, ``meta``
    (the whole catalog entry), then ``other_args``, then ``**kwargs``. Later
    sources win, so a caller can override any catalog-supplied argument.

    Parameters
    ----------
    ds_name : str
        Key into ``CATALOG``.
    **kwargs
        Extra or overriding keyword arguments forwarded to the loader.

    Returns
    -------
    Whatever the loader function returns.

    Raises
    ------
    KeyError
        If ``ds_name`` is not in ``CATALOG``.
    ValueError
        If the entry has no usable ``access_function`` (missing, null or
        blank), e.g. ERA5, which is not used at the moment.
    AttributeError
        If the ``loaders`` module has no attribute with that name.
    """
    meta = CATALOG[ds_name]

    # `or {}` / `or ""` so an explicit null in the catalog behaves like a
    # missing key and ends in the ValueError below rather than an
    # AttributeError on None.
    access = meta.get("access") or {}
    fn_name = str(access.get("access_function") or "").strip()

    if not fn_name:
        raise ValueError(
            f"Dataset '{ds_name}' has no 'access_function' in catalog; "
            "skipping (ERA5/Arraylake not used)."
        )

    if not hasattr(loaders, fn_name):
        raise AttributeError(
            f"Loader function '{fn_name}' not found in dataset.py for dataset '{ds_name}'."
        )

    fn = getattr(loaders, fn_name)

    # Standard arguments taken from the catalog.
    call_args = {
        "path": access.get("path"),
        "platform": access.get("platform"),
        "meta": meta,
    }
    call_args.update(access.get("other_args") or {})

    # Caller overrides go last. Merging into a single dict (instead of
    # unpacking two mappings into the call) is what makes an override
    # possible: a repeated keyword would otherwise raise TypeError.
    call_args.update(kwargs)

    return fn(**call_args)


## Pick a dataset and preview it.

In [12]:
# --- 5) Select & Preview a Dataset (quick peek) ---

PREVIEW_NAME = INSPECT_NAME  # reuse the one we inspected above; change if needed

# Load through the catalog dispatcher; this raises ValueError when the chosen
# entry has no access_function in the catalog.
obj = load_from_catalog(PREVIEW_NAME)

def quick_summary(obj):
    """Print a short summary of a loaded object.

    - ``xarray.Dataset`` / ``xarray.DataArray``: print the object's text repr.
    - ``pandas.DataFrame``: print the ``DataFrame.info()`` report, then
      display the first rows.
    - anything else: print the type and the first 500 characters of
      ``repr(obj)``.

    Returns
    -------
    None
    """
    if isinstance(obj, (xr.Dataset, xr.DataArray)):
        print(obj)
    elif isinstance(obj, pd.DataFrame):
        # info() writes its report to stdout itself and returns None, so it
        # must not be wrapped in print() -- that would add a stray "None" line.
        obj.info()
        display(obj.head())
    else:
        print(type(obj))
        print(repr(obj)[:500])

# Print the summary for the object loaded above.
quick_summary(obj)

def peek(obj, var: str | None = None, time_sel=None):
    """
    Minimal peek:
    - xarray.Dataset: print dims/coords; if var given, show one slice (isel 0 along non-time dims)
    - pandas.DataFrame: head()
    """
    if isinstance(obj, xr.Dataset):
        if var is not None and var in obj:
            da = obj[var]
        else:
            # pick first data variable if not provided
            first = next(iter(obj.data_vars)) if len(obj.data_vars) else None
            da = obj[first] if first else None

        if da is None:
            print("(No variables to peek)")
            return

        if time_sel is not None and "time" in da.dims:
            da = da.sel(time=time_sel)
        else:
            # show first index along dims > 1
            sel = {d: 0 for d, sz in da.sizes.items() if sz > 1}
            da = da.isel(**sel) if sel else da

        display(da)
    elif isinstance(obj, pd.DataFrame):
        display(obj.head())
    else:
        print("(peek) unsupported type:", type(obj))

# Example peek:
# For MUR, 'analysed_sst' is one of the variables listed in the catalog entry shown above.
peek(obj, var="analysed_sst")


<xarray.Dataset> Size: 117TB
Dimensions:           (time: 6443, lat: 17999, lon: 36000)
Coordinates:
  * lat               (lat) float32 72kB -89.99 -89.98 -89.97 ... 89.98 89.99
  * lon               (lon) float32 144kB -180.0 -180.0 -180.0 ... 180.0 180.0
  * time              (time) datetime64[ns] 52kB 2002-06-01T09:00:00 ... 2020...
Data variables:
    analysed_sst      (time, lat, lon) float64 33TB dask.array<chunksize=(5, 1799, 3600), meta=np.ndarray>
    analysis_error    (time, lat, lon) float64 33TB dask.array<chunksize=(5, 1799, 3600), meta=np.ndarray>
    mask              (time, lat, lon) float32 17TB dask.array<chunksize=(5, 1799, 3600), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float64 33TB dask.array<chunksize=(5, 1799, 3600), meta=np.ndarray>
Attributes: (12/47)
    Conventions:                CF-1.7
    Metadata_Conventions:       Unidata Observation Dataset v1.0
    acknowledgment:             Please acknowledge the use of these data with...
    cdm_data

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,(),()
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
Array Chunk Bytes 8 B 8 B Shape () () Dask graph 1 chunks in 3 graph layers Data type float64 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,(),()
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


## LLM → plotting code
Let users ask “plot monthly mean SST over the Arabian Sea for 2015” and have the notebook generate runnable code that uses the already-loaded object.

## Plot helpers for maps: time-series & snapshot
Hand-written fallbacks to use if the LLM-generated plotting code does not work.

In [None]:
# Example helpers
# 1) makes a time series by averaging the variable across all non-time dimensions
def plot_timeseries_mean(ds, var, time_slice=None, ax=None):
    """Plot the mean of ``ds[var]`` over every non-time dimension against time.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset holding the variable.
    var : str
        Variable name.
    time_slice : sequence, optional
        Arguments for ``slice(...)`` (e.g. ``(start, stop)``), applied with
        ``.sel(time=slice(*time_slice))`` before averaging.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new figure is created, so successive
        calls in the same cell do not draw over each other.

    Returns
    -------
    The Axes that was drawn on.
    """
    da = ds[var]
    if time_slice is not None:
        da = da.sel(time=slice(*time_slice))

    # Collapse everything except time, leaving a series over time.
    ts = da.mean(dim=[d for d in da.dims if d != "time"])

    if ax is None:
        _, ax = plt.subplots()
    ts.plot(ax=ax)
    # Set after plotting so it replaces any title the plot call put there.
    ax.set_title(f"{var} global mean over time")
    ax.figure.tight_layout()
    return ax

# 2) plots a single time slice as a 2-D map
def plot_snapshot_map(ds, var, time=None, method=None, ax=None):
    """Plot one time step of ``ds[var]``.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset holding the variable.
    var : str
        Variable name.
    time : optional
        Label passed to ``.sel(time=...)``. When omitted and the variable has
        a ``time`` dimension, the first time step is used, so the plot is a
        single slice rather than the whole multi-time array.
    method : str, optional
        Forwarded to ``.sel``; pass ``"nearest"`` to accept a timestamp that
        is not exactly present. The default (``None``) requires an exact
        match.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted a new figure is created, so successive
        calls in the same cell do not draw over each other.

    Returns
    -------
    The Axes that was drawn on.
    """
    da = ds[var]
    if time is not None:
        da = da.sel(time=time, method=method)
    elif "time" in da.dims:
        da = da.isel(time=0)

    if ax is None:
        _, ax = plt.subplots()
    da.plot(ax=ax)
    # Set after plotting so it replaces any title the plot call put there.
    ax.set_title(f"{var} snapshot")
    ax.figure.tight_layout()
    return ax
