In [1]:
import xarray as xr
import numpy as np
from numpy import pi, sin, cos, arccos, clip, deg2rad
import numpy.ma as ma
from datetime import datetime
import dask
import time
import zarr

#### 30 equally structured datasets stored in 30 zarr-stores.
All datasets have 5 columns,  `latitude`, `longitude`, `time`, `mean`, `anomalies`, each with shape = (2105319,) 

In [2]:
import json
import gcsfs

with open('pangeo-181919-e7bc5bdaf4d5.json') as f:
    token = json.load(f)
gcs = gcsfs.GCSFileSystem(token=token)

plevel = 4 # Choosing an arbitrary dataset out of the 30
dspath='pangeo-argo-eke/global/mean_and_anomalies/with_pressure_coordinate/readable_ws_and_NHarm/mean_and_anomalies_global_ws500_plevel'+str(plevel)+'.zarr'
mapper_alt0 = gcs.get_mapper(dspath)
d = xr.open_zarr(mapper_alt0, consolidated=True, decode_cf=False )

In [3]:
import fsspec.utils
fsspec.utils.setup_logging(logger_name="gcsfs")

<Logger gcsfs (DEBUG)>

### Storing data

In [None]:
# Mean and anomalies for pressurelevel is saved to output-path:
ds = xr.Dataset(
    data_vars = dict(
        Mean = ( ["i", "pressure"], d.Mean.load().values ),
        Anomalies = ( ["i", "pressure"], d.Anomalies.load().values ),
    ),
    coords=dict(
        time = ( ["i"], d.time.load().values ),
        latitude = ( ["i"], d.latitude.load().values ),
        longitude = ( ["i"], d.longitude.load().values ),
        pressure = (["pressure"], d.pressure.load().values ),
    ),
    attrs=dict(
        description = 'Estimated mean dynamic height on profile-coordinates, and anomalies by subtracting estimated mean from observations',
        pressureindex = d.pressureindex,
        number_of_harmonics = d.number_of_harmonics,
        window_size = d.window_size,
        creation_date = str( datetime.now() )
    ),
)

ds.time.attrs["standard_name"] = 'time'
ds.time.attrs["units"] = 'days since 1970-01-01 00:00:00'
ds.latitude.attrs["standard_name"] = 'latitude'
ds.latitude.attrs["units"] = 'degrees_north'
ds.longitude.attrs["standard_name"] = 'longitude'
ds.longitude.attrs["units"] = 'degrees_east'
ds.pressure.attrs["standard_name"] = 'pressure'
ds.pressure.attrs["units"] = 'decibar'
ds.Mean.attrs["standard_name"] = 'Estimated mean dynamic height'
ds.Mean.attrs["units"] = 'm^2/s^2'
ds.Anomalies.attrs["standard_name"] = 'dynamic height anomalies'
ds.Anomalies.attrs["units"] = 'm^2/s^2'

dsc = ds.chunk()
outfile = 'pangeo-argo-eke/chunk_alternatives/global_mean_and_anomalies_plevel'+str(plevel)+'_current_zarr-storing-code.zarr'
mapper_alt1 = gcs.get_mapper(outfile)
dsc.to_zarr(mapper_alt1, consolidated=True)

#### Size of each dataset in megabytes

In [7]:
dsc.nbytes/1e6

84.212768

### Connect to a dask-cluster and set cluster-options

In [8]:
from dask_gateway import GatewayCluster, Gateway
from distributed import Client

g = Gateway()
g.list_clusters()

[]

In [9]:
#cluster = g.connect(g.list_clusters()[0].name)

In [10]:
options = g.cluster_options()
options.worker_cores = 2; options.worker_memory = 4
# Create a cluster with those options
cluster = g.new_cluster(options)

In [50]:
#g.list_clusters()

In [11]:
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: /services/dask-gateway/clusters/prod.5a5ed91bd69c4708a888ddff55b6a356/status,


In [12]:
cluster.scale(1)

### Loading of data from cloud-storage<br>
Want to load 4 columns of data, `latitude`, `longitude`, `time`, `anomalies`, to several dask-workers memory and do embarrasingly parallel computations ( ~1e6  ).

Loading data only:

In [188]:
@dask.delayed
def load_chunksauto(mapper):
    """Load data from zarr-store into worker-memory"""
    
    ds = xr.open_zarr(mapper, consolidated=True, chunks='auto') 
    data0 = ds.Anomalies.sel(pressure=50).load().values # shape=(n,) from shape=(n,1)
    ii = ~xr.apply_ufunc(np.isnan, data0)
    data, lat, lon, time = data0[ii], ds.latitude.load().values[ii], ds.longitude.load().values[ii], ds.time.load().values[ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [190]:
@dask.delayed
def load_zarr(mapper):
    """Load data from zarr-store into worker-memory"""
    
    z = zarr.open_consolidated(mapper)
    data0 = np.squeeze(z.Anomalies[:]) # shape=(n,) from shape=(n,1)
    ii = ~np.isnan(data0)
    data, lat, lon, time = data0[ii], z.latitude[:][ii], z.longitude[:][ii], z.time[:][ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [228]:
@dask.delayed
def load_zarr_convenience(mapper):
    """Load data from zarr-store into worker-memory"""
    
    z = zarr.convenience.open_consolidated(mapper)
    data0 = np.squeeze(z.Anomalies[:]) # shape=(n,) from shape=(n,1)
    ii = ~np.isnan(data0)
    data, lat, lon, time = data0[ii], z.latitude[:][ii], z.longitude[:][ii], z.time[:][ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

### Timing loading of data:<br>

<br>
ds = xr.open_zarr(mapper, consolidated=True, chunks='auto'); ...  

In [226]:
%timeit dask.compute( load_chunksauto(mapper_alt1) )[0]

1.15 s ± 157 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
ds = zarr.open_consolidated(mapper); ... 

In [234]:
%timeit dask.compute( load_zarr(mapper_alt1) )[0]

1.15 s ± 190 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
ds = zarr.convenience.open_consolidated(mapper); ... 

In [232]:
%timeit dask.compute( load_zarr_convenience(mapper_alt1) )[0]

1.16 s ± 53.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Scaling down and closing cluster

In [235]:
cluster.scale(0)

In [236]:
cluster.close()

In [237]:
cluster.shutdown()

2023-07-27 19:42:09,769 - distributed.client - ERROR - 
Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/distributed/utils.py", line 742, in wrapper
    return await func(*args, **kwargs)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/distributed/client.py", line 1298, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/distributed/client.py", line 1328, in _ensure_connected
    comm = await connect(
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/distributed/comm/core.py", line 291, in connect
    comm = await asyncio.wait_for(
  File "/srv/conda/envs/notebook/lib/python3.10/asyncio/tasks.py", line 432, in wait_for
    await waiter
asyncio.exceptions.CancelledError

Traceback (most recent call last):
  File "/srv/conda/envs/notebook/lib/python3.10/site-packages/distributed/utils.py", line 742, in wrapper
    return await func(*args, **k

### The reports from the logger
(I am not sure if these look the same for the dask-workers):

In [203]:
ds = xr.open_zarr(mapper_alt1, consolidated=True, chunks='auto') 
data0 = ds.Anomalies.sel(pressure=50).load().values # shape=(n,) from shape=(n,1)
ii = ~xr.apply_ufunc(np.isnan, data0)
data, lat, lon, time = data0[ii], ds.latitude.load().values[ii], ds.longitude.load().values[ii], ds.time.load().values[ii]

2023-07-27 19:26:24,732 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2F.zmetadata?alt=media, (), {}
2023-07-27 19:26:24,775 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-27 19:26:24,919 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-27 19:26:25,014 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2Fpressure%2F0?alt=media, (), {}


In [206]:
z = zarr.open_consolidated(mapper_alt1);
zanom0= np.squeeze(z.Anomalies[:]);
ii= ~np.isnan(zanom0);
zanom= zanom0[ii]; 
zlat = z.latitude[:][ii]; 
zlon=z.longitude[:][ii]; 
zt = z.time[:][ii]; 

2023-07-27 19:26:35,849 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2F.zmetadata?alt=media, (), {}
2023-07-27 19:26:35,896 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2FAnomalies%2F0.0?alt=media, (), {}
2023-07-27 19:26:36,044 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2Flatitude%2F0?alt=media, (), {}
2023-07-27 19:26:36,200 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2Flongitude%2F0?alt=media, (), {}
2023-07-27 19:

In [207]:
z = zarr.convenience.open_consolidated(mapper_alt1)
data0 = np.squeeze(z.Anomalies[:]) # shape=(n,) from shape=(n,1)
ii = ~np.isnan(data0)
data, lat, lon, time = data0[ii], z.latitude[:][ii], z.longitude[:][ii], z.time[:][ii]

2023-07-27 19:26:39,607 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2F.zmetadata?alt=media, (), {}
2023-07-27 19:26:39,651 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2FAnomalies%2F0.0?alt=media, (), {}
2023-07-27 19:26:39,787 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2Flatitude%2F0?alt=media, (), {}
2023-07-27 19:26:39,989 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-argo-eke/o/chunk_alternatives%2Fglobal_mean_and_anomalies_plevel4_current_zarr-storing-code.zarr%2Flongitude%2F0?alt=media, (), {}
2023-07-27 19: