In [8]:
import xarray as xr
import numpy as np
from numpy import pi, sin, cos, arccos, clip, deg2rad
import numpy.ma as ma
from datetime import datetime
import dask
import time
import zarr

In [64]:
import gcsfs
gcs = gcsfs.GCSFileSystem()
import os
SCRATCH_BUCKET = os.environ['PANGEO_SCRATCH']

#### Equally structured datasets stored in several zarr-stores.
All datasets have 5 columns (like the example below), each with shape = (~2105319,)

### Storing data

In [None]:
n = 2105319
ds = xr.Dataset(
    data_vars = dict(
        Mean = ( ["i", "pressure"], np.random.uniform(0, 2, n).reshape(n,1) ),
        Anomalies = ( ["i", "pressure"], np.random.uniform(-1, 5, n).reshape(n,1) ),
    ),
    coords=dict(
        time = ( ["i"], np.random.uniform(13000, 18500, n) ),
        latitude = ( ["i"], np.random.uniform(-80, 80, n) ),
        longitude = ( ["i"], np.random.uniform(-180, 180, n) ),
        pressure = (["pressure"], np.array([50]) ),
    ),
    attrs=dict(
        description = 'Description',
        pindex = 4,
        number_of_harmonics = 2,
        window_size = 500e3,
        creation_date = str( datetime.now() )
    ),
)

ds.time.attrs["standard_name"] = 'time'
ds.time.attrs["units"] = 'days since 1970-01-01 00:00:00'
ds.latitude.attrs["standard_name"] = 'latitude'
ds.latitude.attrs["units"] = 'degrees_north'
ds.longitude.attrs["standard_name"] = 'longitude'
ds.longitude.attrs["units"] = 'degrees_east'
ds.pressure.attrs["standard_name"] = 'pressure'
ds.pressure.attrs["units"] = 'decibar'
ds.Mean.attrs["standard_name"] = 'Estimated mean dynamic height'
ds.Mean.attrs["units"] = 'm^2/s^2'
ds.Anomalies.attrs["standard_name"] = 'dynamic height anomalies'
ds.Anomalies.attrs["units"] = 'm^2/s^2'

dsc = ds.chunk()
path_to_zarrstore = f'{SCRATCH_BUCKET}/dataset.zarr'
mapper_alt1 = gcs.get_mapper(path_to_zarrstore)
dsc.to_zarr(mapper_alt1, consolidated=True)

In [128]:
dsc.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:    (i: 2105319, pressure: 1)
Coordinates:
    time       (i) float64 dask.array<chunksize=(2105319,), meta=np.ndarray>
    latitude   (i) float64 dask.array<chunksize=(2105319,), meta=np.ndarray>
    longitude  (i) float64 dask.array<chunksize=(2105319,), meta=np.ndarray>
  * pressure   (pressure) int64 50
Dimensions without coordinates: i
Data variables:
    Mean       (i, pressure) float64 dask.array<chunksize=(2105319, 1), meta=np.ndarray>
    Anomalies  (i, pressure) float64 dask.array<chunksize=(2105319, 1), meta=np.ndarray>
Attributes:
    description:          Description
    pindex:               4
    number_of_harmonics:  2
    window_size:          500000.0
    creation_date:        2023-07-28 10:52:19.703490>

#### Size of each dataset in megabytes

In [67]:
dsc.nbytes/1e6

84.212768

### Connect to a dask-cluster and set cluster-options

In [163]:
from dask_gateway import GatewayCluster, Gateway
from distributed import Client

g = Gateway()
g.list_clusters()

[]

In [164]:
#cluster = g.connect(g.list_clusters()[0].name)

In [165]:
options = g.cluster_options()
options.worker_cores = 2; options.worker_memory = 4
# Create a cluster with those options
cluster = g.new_cluster(options)

In [166]:
#g.list_clusters()

In [167]:
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: /services/dask-gateway/clusters/prod.8c7a16b83dc849059cafaef8edcd6dcd/status,


In [168]:
cluster.scale(1)

### Loading of data from cloud-storage<br>
Want to load 4 out of 5 columns of data, into several dask-workers memory, and do embarrasingly parallel computations ( ~1e6  ).

Loading data only:

In [183]:
@dask.delayed
def load_chunksauto(mapper):
    """Load data from zarr-store into worker-memory"""
    
    ds = xr.open_zarr(mapper, consolidated=True, chunks='auto') 
    data0 = ds.Anomalies.sel(pressure=50).load() # to shape=(n,) from shape=(n,1)
    ii = ~xr.apply_ufunc(np.isnan, data0)
    data, lat, lon, time = data0.values[ii], ds.latitude.load().values[ii], ds.longitude.load().values[ii], ds.time.load().values[ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [194]:
@dask.delayed
def load_zarr(mapper):
    """Load data from zarr-store into worker-memory"""
    
    z = zarr.open_consolidated(mapper)
    data0 = np.squeeze(z.Anomalies[:]) # to shape=(n,) from shape=(n,1)
    ii = ~np.isnan(data0)
    data, lat, lon, time = data0[ii], z.latitude[:][ii], z.longitude[:][ii], z.time[:][ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [195]:
@dask.delayed
def load_zarr_convenience(mapper): 
    """Load data from zarr-store into worker-memory"""
    
    z = zarr.convenience.open_consolidated(mapper)
    data0 = np.squeeze(z.Anomalies[:]) # to shape=(n,) from shape=(n,1)
    ii = ~np.isnan(data0)
    data, lat, lon, time = data0[ii], z.latitude[:][ii], z.longitude[:][ii], z.time[:][ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

### Timing loading of data:<br>

<br>
ds = xr.open_zarr(mapper, consolidated=True, chunks='auto'); ...  

In [97]:
%timeit dask.compute( load_chunksauto(mapper_alt1) )[0]

1.07 s ± 67.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
ds = zarr.open_consolidated(mapper); ... 

In [98]:
%timeit dask.compute( load_zarr(mapper_alt1) )[0]

1.01 s ± 270 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
ds = zarr.convenience.open_consolidated(mapper); ... 

In [90]:
%timeit dask.compute( load_zarr_convenience(mapper_alt1) )[0]

987 ms ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Scaling down and closing cluster

In [138]:
cluster.scale(0)

In [139]:
cluster.close()

In [140]:
cluster.shutdown()

### The reports from the logger
(I am not sure if these look the same for the dask-workers):

In [152]:
import fsspec.utils
fsspec.utils.setup_logging(logger_name="gcsfs")

<Logger gcsfs (DEBUG)>

In [182]:
ds = xr.open_zarr(mapper_alt1, consolidated=True, chunks='auto') 
data0 = ds.Anomalies.sel(pressure=50).load() # to shape=(n,) from shape=(n,1)
ii = ~xr.apply_ufunc(np.isnan, data0)
data, lat, lon, time = data0.values[ii], ds.latitude.load().values[ii], ds.longitude.load().values[ii], ds.time.load().values[ii]

2023-07-28 11:38:11,422 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2F.zmetadata?alt=media, (), {}
2023-07-28 11:38:11,487 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-28 11:38:11,633 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-28 11:38:11,757 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2Fpressure%2F0?alt=media, (), {}


In [160]:
z = zarr.open_consolidated(mapper_alt1);
zanom0= np.squeeze(z.Anomalies[:]);
ii= ~np.isnan(zanom0);
zanom= zanom0[ii]; 
zlat = z.latitude[:][ii]; 
zlon=z.longitude[:][ii]; 
zt = z.time[:][ii]; 

2023-07-28 11:28:31,766 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2F.zmetadata?alt=media, (), {}
2023-07-28 11:28:31,839 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2FAnomalies%2F0.0?alt=media, (), {}
2023-07-28 11:28:32,133 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2Flatitude%2F0?alt=media, (), {}
2023-07-28 11:28:32,310 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2Flongitude%2F0?alt=media, (), {}
2023-07-28 11:28:32,543 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scrat

In [161]:
z = zarr.convenience.open_consolidated(mapper_alt1)
data0 = np.squeeze(z.Anomalies[:]) # shape=(n,) from shape=(n,1)
ii = ~np.isnan(data0)
data, lat, lon, time = data0[ii], z.latitude[:][ii], z.longitude[:][ii], z.time[:][ii]

2023-07-28 11:28:33,806 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2F.zmetadata?alt=media, (), {}
2023-07-28 11:28:33,838 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2FAnomalies%2F0.0?alt=media, (), {}
2023-07-28 11:28:33,959 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2Flatitude%2F0?alt=media, (), {}
2023-07-28 11:28:34,186 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset.zarr%2Flongitude%2F0?alt=media, (), {}
2023-07-28 11:28:34,307 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scrat