# Dataset Log

In this project, we are using the following datasets:


### 1. Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)

In [15]:
# Set up
import s3fs
import xarray as xr
import re

In [6]:
# Connect anonymously so that no AWS keys or tokens are required.
s3 = s3fs.S3FileSystem(anon=True)

# Sanity check: list what sits under the MUR SST Zarr prefix.
MUR_SST_PREFIX = "mur-sst/zarr-v1/"
sst_files = s3.ls(MUR_SST_PREFIX)
sst_files

['mur-sst/zarr-v1/',
 'mur-sst/zarr-v1/.zattrs',
 'mur-sst/zarr-v1/.zgroup',
 'mur-sst/zarr-v1/.zmetadata',
 'mur-sst/zarr-v1/analysed_sst',
 'mur-sst/zarr-v1/analysis_error',
 'mur-sst/zarr-v1/lat',
 'mur-sst/zarr-v1/lon',
 'mur-sst/zarr-v1/mask',
 'mur-sst/zarr-v1/sea_ice_fraction',
 'mur-sst/zarr-v1/time']

In [7]:
# Open the MUR SST Zarr store (variables come back as dask-backed arrays).
# The store root is written out explicitly instead of being read from
# sst_files[0]: whether an S3 listing returns the directory entry itself, and
# in which position, is not something to depend on.
ds = xr.open_zarr(
    store=s3fs.S3Map(root="s3://mur-sst/zarr-v1/", s3=s3, check=False)
)

ds

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 30.38 TiB 247.06 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 30.38 TiB 247.06 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,123.53 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 15.19 TiB 123.53 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,123.53 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 30.38 TiB 247.06 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


### 2. NSF NCAR Curated ECMWF Reanalysis 5 (ERA5)

In [8]:
# Sanity check: list the top-level entries of the NSF NCAR ERA5 bucket.
ERA5_BUCKET = "nsf-ncar-era5/"
files = s3.ls(ERA5_BUCKET)
files

['nsf-ncar-era5/e5.oper.an.pl',
 'nsf-ncar-era5/e5.oper.an.sfc',
 'nsf-ncar-era5/e5.oper.an.vinteg',
 'nsf-ncar-era5/e5.oper.fc.sfc.accumu',
 'nsf-ncar-era5/e5.oper.fc.sfc.instan',
 'nsf-ncar-era5/e5.oper.fc.sfc.meanflux',
 'nsf-ncar-era5/e5.oper.fc.sfc.minmax',
 'nsf-ncar-era5/e5.oper.invariant',
 'nsf-ncar-era5/index.html']

In [16]:
# 2) Find NetCDF files for a variable (here: 2m temperature  -  "2t")
# Note: NetCDF files organized by variable, not a single multi-var Zarr.
def find_var_files(prefix: str, var_keys=("2t",), limit=24, maxdepth=5, fs=None):
    """Return ``s3://`` URLs of NetCDF objects under ``prefix`` that mention a variable.

    Parameters
    ----------
    prefix : str
        Bucket/key prefix to search, without the ``s3://`` scheme.
    var_keys : str or iterable of str
        Variable short names to look for. Each one is matched literally and
        case-insensitively anywhere in the object key. A single string is
        treated as one key (not as a sequence of characters).
    limit : int or None
        Maximum number of URLs to return. ``None`` disables the cap; a value
        of zero or less yields an empty list.
    maxdepth : int
        Forwarded to ``fs.find`` as ``maxdepth``.
    fs : object, optional
        Filesystem exposing ``find(prefix, maxdepth=...)``. Defaults to the
        notebook-level ``s3`` filesystem.

    Returns
    -------
    list of str
        Matching keys, each prefixed with ``s3://``, in the order ``fs.find``
        yields them.
    """
    # Nothing requested: return before touching the filesystem at all.
    if limit is not None and limit <= 0:
        return []

    # A bare string would otherwise be iterated character by character,
    # turning "2t" into the pattern "2|t".
    if isinstance(var_keys, str):
        var_keys = (var_keys,)

    if fs is None:
        fs = s3  # notebook-level anonymous S3 filesystem

    pat = re.compile("|".join(map(re.escape, var_keys)), re.I)
    hits = []
    for key in fs.find(prefix, maxdepth=maxdepth):
        if not key.endswith(".nc") or not pat.search(key):
            continue
        hits.append(f"s3://{key}")
        if limit is not None and len(hits) >= limit:
            break
    return hits

In [18]:
# Collect at most twelve 2m-temperature ("2t") NetCDF files from the
# surface-analysis product.
paths = find_var_files(
    prefix="nsf-ncar-era5/e5.oper.an.sfc",
    var_keys=("2t",),
    limit=12,
)
paths

['s3://nsf-ncar-era5/e5.oper.an.sfc/194001/e5.oper.an.sfc.128_167_2t.ll025sc.1940010100_1940013123.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194002/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194003/e5.oper.an.sfc.128_167_2t.ll025sc.1940030100_1940033123.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194004/e5.oper.an.sfc.128_167_2t.ll025sc.1940040100_1940043023.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194005/e5.oper.an.sfc.128_167_2t.ll025sc.1940050100_1940053123.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194006/e5.oper.an.sfc.128_167_2t.ll025sc.1940060100_1940063023.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194007/e5.oper.an.sfc.128_167_2t.ll025sc.1940070100_1940073123.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194008/e5.oper.an.sfc.128_167_2t.ll025sc.1940080100_1940083123.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194009/e5.oper.an.sfc.128_167_2t.ll025sc.1940090100_1940093023.nc',
 's3://nsf-ncar-era5/e5.oper.an.sfc/194010/e5.oper.an.sfc.128_16

In [20]:
# Combine the selected NetCDF files on S3 into one xarray dataset.
anonymous_s3 = {"storage_options": {"anon": True}}  # unsigned requests, no credentials
ds = xr.open_mfdataset(
    paths,
    engine="h5netcdf",             # works with remote S3 objects
    combine="by_coords",
    chunks={},                     # pass explicit dask chunks if desired, e.g. {"time": 24}
    backend_kwargs=anonymous_s3,
)
ds

Unnamed: 0,Array,Chunk
Bytes,33.97 GiB,3.97 MiB
Shape,"(8784, 721, 1440)","(27, 139, 277)"
Dask graph,58256 chunks in 37 graph layers,58256 chunks in 37 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 33.97 GiB 3.97 MiB Shape (8784, 721, 1440) (27, 139, 277) Dask graph 58256 chunks in 37 graph layers Data type float32 numpy.ndarray",1440  721  8784,

Unnamed: 0,Array,Chunk
Bytes,33.97 GiB,3.97 MiB
Shape,"(8784, 721, 1440)","(27, 139, 277)"
Dask graph,58256 chunks in 37 graph layers,58256 chunks in 37 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.31 kiB,2.91 kiB
Shape,"(8784,)","(744,)"
Dask graph,12 chunks in 25 graph layers,12 chunks in 25 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 34.31 kiB 2.91 kiB Shape (8784,) (744,) Dask graph 12 chunks in 25 graph layers Data type int32 numpy.ndarray",8784  1,

Unnamed: 0,Array,Chunk
Bytes,34.31 kiB,2.91 kiB
Shape,"(8784,)","(744,)"
Dask graph,12 chunks in 25 graph layers,12 chunks in 25 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray


### 3. Indian Ocean Data ([source notebook](https://ocean-satellite-tools.github.io/mind-the-chl-gap/notebooks/IO_Zarr.html))

In [10]:
# Open the Indian Ocean Zarr store on Google Cloud Storage with an anonymous token.
io_zarr_url = "gcs://nmfs_odp_nwfsc/CB/mind_the_chl_gap/IO.zarr"
ds = xr.open_dataset(
    io_zarr_url,
    engine="zarr",
    consolidated=True,
    backend_kwargs={"storage_options": {"token": "anon"}},
)
ds