# Explore NOAA NODD CDR SST
* use VirtualiZarr to create a cloud-optimized virtual dataset from multiple remote NetCDF files
* explore the virtual dataset using HoloViz tools
* compute in parallel using Dask

In [None]:

import fsspec

In [None]:
# fsspec filesystem handle for S3, created with anonymous access (anon=True).
fs = fsspec.filesystem("s3", anon=True)

In [None]:
# Collect the OISST NetCDF files under the `202503` prefix of the bucket.
# Each glob result gets the "s3://" scheme prepended so that every entry is a
# full URL, and the final list is sorted.
oisst_files = sorted(
    f"s3://{key}"
    for key in fs.glob(
        "s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202503/oisst-avhrr-v02r01.*.nc"
    )
)

In [None]:
# Sanity check: number of files matched by the glob.
print(len(oisst_files))

In [None]:
from virtualizarr import open_virtual_dataset

In [None]:
# Storage options passed to the S3 readers in this notebook: anonymous access.
so = {"anon": True}

In [None]:
%%time
# Open every file as a virtual dataset, one after another (serial version).
# `indexes={}` and the anonymous storage options are forwarded unchanged to
# `open_virtual_dataset` for each URL.
virtual_datasets = [
    open_virtual_dataset(
        nc_url,
        indexes={},
        reader_options={"storage_options": so},
    )
    for nc_url in oisst_files
]

In [None]:
import xarray as xr

In [None]:
# Concatenate the per-file virtual datasets along "time". The combined
# Dataset still wraps the virtual ManifestArray objects directly.
virtual_ds = xr.concat(
    virtual_datasets, dim="time",
    coords="minimal", compat="override", combine_attrs="override",
)

# Cache the combined reference pattern on disk as JSON, reusing the existing
# kerchunk specification for reference files.
virtual_ds.virtualize.to_kerchunk("combined.json", format="json")

In [None]:
# Re-open the cached references through the kerchunk engine. The result is a
# normal xarray.Dataset wrapping dask/numpy arrays; the anonymous S3 options
# are handed over as `remote_options`.
ds = xr.open_dataset(
    "combined.json",
    engine="kerchunk",
    backend_kwargs={"storage_options": {"remote_options": so}},
    chunks={},
)


In [None]:
# Show the dataset summary as the cell output.
ds

In [None]:
import hvplot.xarray

In [None]:
# Size of the `sst` variable in decimal gigabytes (nbytes / 1e9).
ds["sst"].nbytes / 1e9

In [None]:
# Show the `sst` DataArray summary as the cell output.
ds['sst']

In [None]:
# Wrap longitude values into the half-open range [-180, 180).
# NOTE(review): the coordinate is not re-sorted afterwards (a trailing
# `.sortby('lon')` was left commented out in the original), so if the input
# longitudes run 0..360 the wrapped coordinate is no longer monotonic.
# Confirm this is intended before relying on it for plotting or slicing.
ds = ds.assign_coords(lon=((ds.lon + 180) % 360) - 180)

In [None]:
# Plot the 2-D slice of `sst` taken at index 0 of each of its first two
# dimensions; the `.hvplot()` accessor is presumably registered by the
# `hvplot.xarray` import earlier in the notebook.
ds['sst'][0,0,:,:].hvplot()

In [None]:
import dask

In [None]:
# Dask cluster backend to start; the cells that follow check for
# "Gateway" or "Coiled".
cluster_type = "Coiled"

In [None]:
# Start a Dask Gateway cluster, but only when that backend was selected.
# Defines the notebook-level names `gateway`, `options`, `cluster`, `client`.
if cluster_type == 'Gateway':
    from dask_gateway import Gateway
    gateway = Gateway()  # instantiate Dask gateway
    options = gateway.cluster_options()
    cluster = gateway.new_cluster(options)
    client = cluster.get_client()
    # adaptive scaling with bounds of 4 and 30
    cluster.adapt(minimum=4, maximum=30)

In [None]:
# Start a Coiled-managed Dask cluster, but only when that backend was selected.
# Defines the notebook-level names `cluster` and `client`.
if cluster_type == 'Coiled':
    import coiled
    cluster = coiled.Cluster(
        region="us-west-2",
        arm=True,   # run on ARM to save energy & cost
        worker_vm_types=["t4g.small"],  # cheap, small ARM instances, 2cpus, 2GB RAM
        worker_options={'nthreads':2},
        n_workers=4,
        wait_for_workers=False,  # presumably: return without waiting for workers to come up
        compute_purchase_option="spot_with_fallback",
        name='hackhours_rps',   # Dask cluster name
        software='hackhours-arm',  # Conda environment name
        workspace='esip-lab',
        # NOTE(review): this was originally annotated as "leave cluster running
        # for 3 min in case we want to use it again". In Coiled's documented API
        # `timeout` appears to be the number of seconds to wait for the cluster
        # to start, while `idle_timeout` controls how long an idle cluster is
        # kept alive. Confirm which behaviour is wanted here.
        timeout=180
    )

    client = cluster.get_client()

In [None]:
%%time
# Parallel version of the per-file virtualization: every
# `open_virtual_dataset` call is wrapped in `dask.delayed`, and all of them
# are evaluated together by a single `dask.compute` call.
so = {"anon": True}

virtual_datasets = dask.compute(
    *(
        dask.delayed(open_virtual_dataset)(
            nc_url,
            indexes={},
            reader_options={"storage_options": so},
        )
        for nc_url in oisst_files
    )
)

In [None]:
# Concatenate the per-file virtual datasets along "time". The combined
# Dataset still wraps the virtual ManifestArray objects directly.
virtual_ds = xr.concat(
    virtual_datasets, dim="time",
    coords="minimal", compat="override", combine_attrs="override",
)

# Cache the combined reference pattern on disk as JSON, reusing the existing
# kerchunk specification for reference files (overwrites 'combined.json').
virtual_ds.virtualize.to_kerchunk("combined.json", format="json")

In [None]:
# Open a combined-references JSON hosted as a GitHub gist (rather than the
# local 'combined.json') through the kerchunk engine. The result is a normal
# xarray.Dataset wrapping dask/numpy arrays.
# NOTE(review): presumably a pre-generated copy of the same references, so the
# notebook can be run without rebuilding them. Confirm.
url = "https://gist.githubusercontent.com/rsignell/dd6a8d6fafcea40dfd23dd3e887fcc1e/raw/ba366821704a648ff71aa669fe07ccf503cbfd1d/sst_combined_refs.json"
ds = xr.open_dataset(
    url,
    engine="kerchunk",
    backend_kwargs={"storage_options": {"remote_options": so}},
    chunks={},
)


In [None]:
%%time
# Load a small subset into memory: the first three entries along the first
# dimension of `sst`, at index 0 of the second dimension.
da = ds["sst"][:3, 0, :, :].load()

In [None]:
# Load every entry along the first dimension of `sst` into memory, at index 0
# of the second dimension.
da = ds["sst"][:, 0, :, :].load()

In [None]:
# Show the dataset summary as the cell output.
ds