# Set up Arraylake Dataset Management 

Upload the following datasets to the Arraylake repository:

    - Indian Ocean Dataset
    - ERA5 Arraylake
    - MUR SST (use S3)

In [None]:
# Setup & Initialization
# Build the Arraylake client that the following cells use to create the
# repository and to open it for writing.
# NOTE(review): Client() is called with no arguments, so it presumably relies
# on credentials configured outside this notebook — confirm before running.
from arraylake import Client

client = Client()

In [None]:
# Create the Icechunk-kind repository on Arraylake: .../oceanhackweek/dashboard_llm
# Storage is selected through the named bucket configuration.
# NOTE(review): this is a one-time step; re-running it against an existing
# repository presumably raises — confirm, and skip this cell on later runs.
client.create_repo(
    "oceanhackweek/dashboard_llm",
    bucket_config_nickname="nmfs-openscapes-persistent-workshop",
    kind="icechunk",
)

In [5]:
# Open the repo for writing
# `repo` is the handle to the repository created above; `session` is a
# writable session on its "main" branch. Later cells write datasets into
# `session.store` and then call `session.commit(...)` to record the upload.
repo = client.get_repo("oceanhackweek/dashboard_llm")
session = repo.writable_session("main")

### Dataset Upload

In [10]:
# Setup
import xarray as xr

# Open the public Indian Ocean Zarr store on GCS with anonymous access.
# The source has consolidated metadata, and chunks="auto" keeps the data
# dask-backed so it is streamed chunk by chunk instead of loaded eagerly.
# unify_chunks() then gives every variable the same chunk structure along
# the dimensions they share.
indian = xr.open_zarr(
    "gcs://nmfs_odp_nwfsc/CB/mind_the_chl_gap/IO.zarr",
    storage_options={"token": "anon"},
    consolidated=True,
    chunks="auto",
).unify_chunks()

### Indian Ocean Dataset

In [12]:
# Work on a shallow copy of `indian` so the encoding edits below are made on
# `ds` rather than on the dataset opened from the source store. The source is
# Zarr v2 and the target is written as Zarr v3, so encoding entries carried
# over from the source have to be removed first.
ds = indian.copy(deep=False)

# Encoding keys inherited from the source store:
#   - v2 codec settings: "compressor", "compressors", "filters"
#   - chunk hints: "chunks", plus "chunksizes" / "preferred_chunks" when present
stale_encoding_keys = (
    "compressor",
    "compressors",
    "filters",
    "chunks",
    "chunksizes",
    "preferred_chunks",
)

for var_name in ds.variables:
    var_encoding = ds[var_name].encoding
    for stale_key in stale_encoding_keys:
        # A default of None makes the removal a no-op when the key is absent.
        var_encoding.pop(stale_key, None)

# Optional: choose the on-disk chunking before writing, e.g. 100-day time
# chunks via ds.chunk({"time": 100}).

# Write into the "indian_ocean" group of the writable session's store. With
# the chunk hints stripped, the Dask chunking drives the layout, and
# align_chunks=True lets xarray align to those chunks if needed. safe_chunks
# is left at its default; overriding it is not usually needed alongside
# align_chunks=True.
ds.to_zarr(
    session.store,
    group="indian_ocean",
    mode="w",
    zarr_format=3,
    consolidated=False,
    align_chunks=True,
    write_empty_chunks=False,
)


<xarray.backends.zarr.ZarrStore at 0x7f0f1b213100>

In [None]:
# Commit upload
# Records everything written through `session` so far (the "indian_ocean"
# group) with the commit message below.
# NOTE(review): with Icechunk, a session presumably becomes read-only once it
# is committed — open a fresh writable session before uploading another
# dataset; confirm against the Icechunk docs.
session.commit('Initial Commit')

### MUR SST dataset (using S3)

In [14]:
# Setup
import s3fs

# Anonymous (unsigned) access, so no AWS tokens or keys are needed.
s3 = s3fs.S3FileSystem(anon=True)

# Verify that we're in the right place. The listing is printed explicitly
# because a bare expression in the middle of a cell is not displayed.
sst_files = s3.ls("mur-sst/zarr-v1/")
print(sst_files)

# Fail with a clear message instead of an IndexError on sst_files[0] below.
if not sst_files:
    raise FileNotFoundError("No objects found under s3://mur-sst/zarr-v1/")

# Open the first listed entry as a Zarr store through an S3 key/value mapping.
# check=False is passed through to S3Map unchanged from the original cell.
# NOTE: this rebinds `ds`, which previously held the Indian Ocean dataset.
# NOTE(review): relying on sst_files[0] depends on the listing order — confirm
# the printed listing shows the intended store root first.
ds = xr.open_zarr(
    store=s3fs.S3Map(root=f"s3://{sst_files[0]}", s3=s3, check=False)
)

# Last expression of the cell: shows the dataset summary.
ds

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 30.38 TiB 247.06 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 30.38 TiB 247.06 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,123.53 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 15.19 TiB 123.53 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,15.19 TiB,123.53 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 30.38 TiB 247.06 MiB Shape (6443, 17999, 36000) (5, 1799, 3600) Dask graph 141790 chunks in 2 graph layers Data type float64 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,30.38 TiB,247.06 MiB
Shape,"(6443, 17999, 36000)","(5, 1799, 3600)"
Dask graph,141790 chunks in 2 graph layers,141790 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
