In [1]:
import xarray as xr
import numpy as np
from numpy import pi, sin, cos, arccos, clip, deg2rad
import numpy.ma as ma
from datetime import datetime
import dask
import time
import zarr

In [2]:
import os
import gcsfs

# Filesystem handle used below for bucket operations (rm, ls, get_mapper).
gcs = gcsfs.GCSFileSystem()
# Scratch location, taken from the PANGEO_SCRATCH environment variable
# (raises KeyError if the variable is not set).
SCRATCH_BUCKET = os.environ['PANGEO_SCRATCH']

In [15]:
# Remove a store left over from a previous run so the initial write starts clean.
# The path is derived from SCRATCH_BUCKET (the same location the write cells use)
# instead of a hard-coded user bucket, and the removal is skipped when no store
# exists yet, so a first run does not fail with FileNotFoundError.
path_to_zarrstore = f'{SCRATCH_BUCKET}/dataset_alternative_3.zarr'
if gcs.exists(path_to_zarrstore):
    gcs.rm(path_to_zarrstore, recursive=True)
#gcs.ls(path_to_zarrstore)

In [16]:
# Synthetic sample set: n random observation points.
# NOTE: `time` shadows the `time` module imported in the first cell; the name is
# kept because the dataset cells below reference it.
n = 2105319
# Fixed seed so the synthetic values (here and in later np.random calls that
# share the global generator) are the same on every run.
np.random.seed(0)
time = np.random.uniform(13000, 18500, n)      # days since 1970-01-01 (see the units set on the time coordinate)
latitude = np.random.uniform(-80, 80, n)       # degrees north
longitude = np.random.uniform(-180, 180, n)    # degrees east

#### Equally structured datasets stored in one zarr-store

### Storing data

In [17]:
# First dataset: random Anomalies/Mean values for all n points at a single
# pressure level (5), indexed by the sample dimension "i".
ds1 = xr.Dataset(
    data_vars={
        "Anomalies": (["i", "pressure"], np.random.uniform(-1, 5, n).reshape(-1, 1)),
        "Mean": (["i", "pressure"], np.random.uniform(0, 2, n).reshape(-1, 1)),
    },
    coords={
        "i": (["i"], np.arange(n)),
        "time": (["i"], time),
        "latitude": (["i"], latitude),
        "longitude": (["i"], longitude),
        "pressure": (["pressure"], np.array([5])),
    },
    attrs={
        "description": 'Description',
        "pindex": 4,
        "number_of_harmonics": 2,
        "window_size": 500e3,
        "creation_date": str(datetime.now()),
    },
)

# Descriptive metadata (standard_name, units) for each coordinate and variable.
ds1["time"].attrs.update(standard_name='time', units='days since 1970-01-01 00:00:00')
ds1["latitude"].attrs.update(standard_name='latitude', units='degrees_north')
ds1["longitude"].attrs.update(standard_name='longitude', units='degrees_east')
ds1["pressure"].attrs.update(standard_name='pressure', units='decibar')
ds1["Mean"].attrs.update(standard_name='Estimated mean dynamic height', units='m^2/s^2')
ds1["Anomalies"].attrs.update(standard_name='dynamic height anomalies', units='m^2/s^2')

# Convert the variables to dask-backed arrays before writing.
dsc = ds1.chunk()
# Target store lives under the scratch bucket; the mapper exposes it as a key-value store.
path_to_zarrstore = f'{SCRATCH_BUCKET}/dataset_alternative_3.zarr'
mapper_alt3 = gcs.get_mapper(path_to_zarrstore)

# Initial write of the store (with consolidated metadata), timed with %time.
%time dsc.to_zarr(mapper_alt3, consolidated=True)

CPU times: user 701 ms, sys: 149 ms, total: 851 ms
Wall time: 7.05 s


<xarray.backends.zarr.ZarrStore at 0x7fa8927ac430>

In [19]:
# Second dataset: same structure and coordinates as ds1, fresh random values,
# at pressure level 10.
ds2 = xr.Dataset(
    data_vars={
        "Anomalies": (["i", "pressure"], np.random.uniform(-1, 5, n).reshape(-1, 1)),
        "Mean": (["i", "pressure"], np.random.uniform(0, 2, n).reshape(-1, 1)),
    },
    coords={
        "i": (["i"], np.arange(n)),
        "time": (["i"], time),
        "latitude": (["i"], latitude),
        "longitude": (["i"], longitude),
        "pressure": (["pressure"], np.array([10])),
    },
    attrs={
        "description": 'Description',
        "pindex": 4,
        "number_of_harmonics": 2,
        "window_size": 500e3,
        "creation_date": str(datetime.now()),
    },
)

# Descriptive metadata (standard_name, units) for each coordinate and variable.
ds2["time"].attrs.update(standard_name='time', units='days since 1970-01-01 00:00:00')
ds2["latitude"].attrs.update(standard_name='latitude', units='degrees_north')
ds2["longitude"].attrs.update(standard_name='longitude', units='degrees_east')
ds2["pressure"].attrs.update(standard_name='pressure', units='decibar')
ds2["Mean"].attrs.update(standard_name='Estimated mean dynamic height', units='m^2/s^2')
ds2["Anomalies"].attrs.update(standard_name='dynamic height anomalies', units='m^2/s^2')



# Convert to dask-backed arrays, then append to the existing store along "pressure".
# NOTE: re-running this cell appends the same pressure level again.
dsc2 = ds2.chunk()
path_to_zarrstore = f'{SCRATCH_BUCKET}/dataset_alternative_3.zarr'
mapper_alt3 = gcs.get_mapper(path_to_zarrstore)
%time dsc2.to_zarr(mapper_alt3, consolidated=True, append_dim="pressure")

CPU times: user 1.02 s, sys: 149 ms, total: 1.17 s
Wall time: 12.3 s


<xarray.backends.zarr.ZarrStore at 0x7fa8ce290b30>

In [20]:
# Third dataset: same structure and coordinates as ds1/ds2, fresh random values,
# at pressure level 20.
ds3 = xr.Dataset(
    data_vars={
        "Anomalies": (["i", "pressure"], np.random.uniform(-1, 5, n).reshape(-1, 1)),
        "Mean": (["i", "pressure"], np.random.uniform(0, 2, n).reshape(-1, 1)),
    },
    coords={
        "i": (["i"], np.arange(n)),
        "time": (["i"], time),
        "latitude": (["i"], latitude),
        "longitude": (["i"], longitude),
        "pressure": (["pressure"], np.array([20])),
    },
    attrs={
        "description": 'Description',
        "pindex": 4,
        "number_of_harmonics": 2,
        "window_size": 500e3,
        "creation_date": str(datetime.now()),
    },
)

# Descriptive metadata (standard_name, units) for each coordinate and variable.
ds3["time"].attrs.update(standard_name='time', units='days since 1970-01-01 00:00:00')
ds3["latitude"].attrs.update(standard_name='latitude', units='degrees_north')
ds3["longitude"].attrs.update(standard_name='longitude', units='degrees_east')
ds3["pressure"].attrs.update(standard_name='pressure', units='decibar')
ds3["Mean"].attrs.update(standard_name='Estimated mean dynamic height', units='m^2/s^2')
ds3["Anomalies"].attrs.update(standard_name='dynamic height anomalies', units='m^2/s^2')

# Convert to dask-backed arrays, then append to the existing store along "pressure".
# NOTE: re-running this cell appends the same pressure level again.
dsc3 = ds3.chunk()
path_to_zarrstore = f'{SCRATCH_BUCKET}/dataset_alternative_3.zarr'
mapper_alt3 = gcs.get_mapper(path_to_zarrstore)
%time dsc3.to_zarr(mapper_alt3, consolidated=True, append_dim="pressure")

CPU times: user 1.08 s, sys: 121 ms, total: 1.2 s
Wall time: 11.9 s


<xarray.backends.zarr.ZarrStore at 0x7fa8b55b73e0>

In [21]:
# Re-open the store via its consolidated metadata to inspect the combined result
# of the initial write and the two appends.
ds = xr.open_zarr(mapper_alt3, consolidated=True)
ds

Unnamed: 0,Array,Chunk
Bytes,16.06 MiB,16.06 MiB
Shape,"(2105319,)","(2105319,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 16.06 MiB 16.06 MiB Shape (2105319,) (2105319,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2105319  1,

Unnamed: 0,Array,Chunk
Bytes,16.06 MiB,16.06 MiB
Shape,"(2105319,)","(2105319,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.06 MiB,16.06 MiB
Shape,"(2105319,)","(2105319,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 16.06 MiB 16.06 MiB Shape (2105319,) (2105319,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2105319  1,

Unnamed: 0,Array,Chunk
Bytes,16.06 MiB,16.06 MiB
Shape,"(2105319,)","(2105319,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.06 MiB,16.06 MiB
Shape,"(2105319,)","(2105319,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 16.06 MiB 16.06 MiB Shape (2105319,) (2105319,) Dask graph 1 chunks in 2 graph layers Data type datetime64[ns] numpy.ndarray",2105319  1,

Unnamed: 0,Array,Chunk
Bytes,16.06 MiB,16.06 MiB
Shape,"(2105319,)","(2105319,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,48.19 MiB,16.06 MiB
Shape,"(2105319, 3)","(2105319, 1)"
Dask graph,3 chunks in 2 graph layers,3 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 48.19 MiB 16.06 MiB Shape (2105319, 3) (2105319, 1) Dask graph 3 chunks in 2 graph layers Data type float64 numpy.ndarray",3  2105319,

Unnamed: 0,Array,Chunk
Bytes,48.19 MiB,16.06 MiB
Shape,"(2105319, 3)","(2105319, 1)"
Dask graph,3 chunks in 2 graph layers,3 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,48.19 MiB,16.06 MiB
Shape,"(2105319, 3)","(2105319, 1)"
Dask graph,3 chunks in 2 graph layers,3 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 48.19 MiB 16.06 MiB Shape (2105319, 3) (2105319, 1) Dask graph 3 chunks in 2 graph layers Data type float64 numpy.ndarray",3  2105319,

Unnamed: 0,Array,Chunk
Bytes,48.19 MiB,16.06 MiB
Shape,"(2105319, 3)","(2105319, 1)"
Dask graph,3 chunks in 2 graph layers,3 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [22]:
# Library versions (xarray, gcsfs, zarr, dask) used for the timings in this notebook.
xr.__version__, gcsfs.__version__, zarr.__version__,dask.__version__

('2022.12.0', '2022.11.0', '2.13.3', '2022.12.0')

In [23]:
# List the top-level entries of the store. Uses the path derived from
# SCRATCH_BUCKET in the write cells rather than a hard-coded user bucket.
gcs.ls(path_to_zarrstore)

['pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/.zattrs',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/.zgroup',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/.zmetadata',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/Anomalies',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/Mean',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/i',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/latitude',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/longitude',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/pressure',
 'pangeo-integration-te-3eea-prod-scratch-bucket/ofk123/dataset_alternative_3.zarr/time']

#### Size of dataset in megabytes

In [24]:
# nbytes is reported in bytes; dividing by 1e6 gives megabytes.
ds.nbytes/1e6

168.425544

In [25]:
#ds.sortby("pressure").info

### Connect to a dask-cluster and set cluster-options

In [4]:
from dask_gateway import GatewayCluster, Gateway
from distributed import Client

# Client for the Dask Gateway server (no explicit address given, so defaults are used).
g = Gateway()
# Clusters currently known to the gateway (none at this point, per the output below).
g.list_clusters()

[]

In [5]:
# Start from the options the gateway exposes and request 2 cores per worker and
# a worker memory of 4 (unit is defined by the gateway deployment -- presumably GiB, confirm).
options = g.cluster_options()
options.worker_cores = 2
options.worker_memory = 4
# Launch a new cluster configured with these options.
cluster = g.new_cluster(options)

In [6]:
# Confirm the newly created cluster is now listed.
g.list_clusters()

[ClusterReport<name=prod.66058e7ee2894e428b4a268e3492461d, status=RUNNING>]

In [7]:
# Attach a distributed Client to the cluster so that dask.compute calls below run on it.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: /services/dask-gateway/clusters/prod.66058e7ee2894e428b4a268e3492461d/status,


In [8]:
# Request a single worker for the timing runs.
cluster.scale(1)

### Loading of data from cloud-storage<br>
Goal: load 4 out of the 5 columns of data into the memory of several dask workers, and run embarrassingly parallel computations ( ~1e6 ) on them.

Loading data only:

In [9]:
@dask.delayed
def load_chunksauto(mapper, plevel):
    """Load one pressure level from the zarr store through xarray.

    Opens the store with ``xr.open_zarr(..., chunks='auto')``, loads the
    ``Anomalies`` values at positional index ``plevel`` of the ``pressure``
    dimension, and keeps only the entries (with the matching time, latitude
    and longitude values) whose anomaly is not NaN.

    Parameters
    ----------
    mapper
        Store handed to ``xr.open_zarr`` (the notebook passes a gcsfs mapper).
    plevel : int
        Positional index along the ``pressure`` dimension.

    Returns
    -------
    None
        The arrays are only loaded; the computation on them is a placeholder.
    """

    ds = xr.open_zarr(mapper, consolidated=True, chunks='auto')
    data0 = ds.Anomalies.isel(pressure=plevel).load().values
    # data0 is already a plain NumPy array, so np.isnan is applied directly
    # (the same way the zarr-based loaders build their mask) instead of going
    # through xr.apply_ufunc.
    ii = ~np.isnan(data0)
    data = data0[ii]
    time = ds.time.load().values[ii]
    lat = ds.latitude.load().values[ii]
    lon = ds.longitude.load().values[ii]

    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [10]:
@dask.delayed
def load_zarr(mapper, plevel):
    """Load one pressure level straight from the zarr store (no xarray).

    Opens the store with ``zarr.open_consolidated``, reads column ``plevel``
    of ``Anomalies``, and keeps only the non-NaN entries together with the
    matching time, latitude and longitude values. Returns ``None``; the
    computation on the loaded arrays is a placeholder.
    """

    root = zarr.open_consolidated(mapper)
    anomalies = root["Anomalies"][:, plevel]
    keep = ~np.isnan(anomalies)

    data = anomalies[keep]
    time = root["time"][:][keep]
    lat = root["latitude"][:][keep]
    lon = root["longitude"][:][keep]

    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [11]:
@dask.delayed
def load_zarr_convenience(mapper, plevel):
    """Load one pressure level via ``zarr.convenience.open_consolidated``.

    Same steps as the plain-zarr loader, but the store is opened through the
    ``zarr.convenience`` entry point: read column ``plevel`` of ``Anomalies``,
    mask out NaNs, and select the matching time, latitude and longitude
    values. Returns ``None``; the computation itself is a placeholder.
    """

    group = zarr.convenience.open_consolidated(mapper)
    column = group["Anomalies"][:, plevel]
    not_nan = ~np.isnan(column)

    data = column[not_nan]
    time = group["time"][:][not_nan]
    lat = group["latitude"][:][not_nan]
    lon = group["longitude"][:][not_nan]

    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

### Timing loading of data:<br>

In [12]:
# Positional index along "pressure": 2 is the third level written (pressure = 20).
plevel = 2
# Re-create the mapper for the store so this section can run on its own.
path_to_zarrstore = f'{SCRATCH_BUCKET}/dataset_alternative_3.zarr'
mapper_alt3 = gcs.get_mapper(path_to_zarrstore)

<br>
ds = xr.open_zarr(mapper, consolidated=True, chunks='auto'); ...  

In [90]:
# 15 loops per run; dask.compute returns a tuple, so [0] picks the single result.
%timeit -n 15 dask.compute( load_chunksauto(mapper_alt3, plevel) )[0]

2.24 s ± 40.4 ms per loop (mean ± std. dev. of 7 runs, 15 loops each)


<br>
ds = zarr.open_consolidated(mapper); ... 

In [88]:
# 15 loops per run; dask.compute returns a tuple, so [0] picks the single result.
%timeit -n 15 dask.compute( load_zarr(mapper_alt3, plevel) )[0]

801 ms ± 39.5 ms per loop (mean ± std. dev. of 7 runs, 15 loops each)


<br>
ds = zarr.convenience.open_consolidated(mapper); ... 

In [89]:
# 15 loops per run; dask.compute returns a tuple, so [0] picks the single result.
%timeit -n 15 dask.compute( load_zarr_convenience(mapper_alt3, plevel) )[0]

848 ms ± 250 ms per loop (mean ± std. dev. of 7 runs, 15 loops each)


#### Scaling down and closing cluster

In [60]:
# Release all workers before closing the cluster.
cluster.scale(0)

In [61]:
# Close this notebook's handle on the cluster.
cluster.close()

In [63]:
# Explicitly ask for the cluster itself to be shut down.
cluster.shutdown()

### The reports from the logger
(I am not sure if these look the same for the dask-workers):

In [13]:
import fsspec.utils
# Turn on logging for the "gcsfs" logger (DEBUG level, per the output below) so that
# every request issued by the following cells is printed.
fsspec.utils.setup_logging(logger_name="gcsfs")

<Logger gcsfs (DEBUG)>

In [21]:
%%time
# xarray path: open lazily, then load the selected pressure level and the coordinates.
ds = xr.open_zarr(mapper_alt3, consolidated=True, chunks='auto')
data0 = ds.Anomalies.isel(pressure=plevel).load().values
# data0 is already a plain NumPy array, so np.isnan is applied directly
# (as in the zarr cells) instead of going through xr.apply_ufunc.
ii = ~np.isnan(data0)
xrdata = data0[ii]
xrtime = ds.time.load().values[ii]
xrlat = ds.latitude.load().values[ii]
xrlon = ds.longitude.load().values[ii]

2023-07-31 20:46:15,520 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2F.zmetadata?alt=media, (), {}
2023-07-31 20:46:15,586 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-31 20:46:15,853 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-31 20:46:16,001 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2Fi%2F0?alt=media, (), {}
2023-07-31 20:46:16,041 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/

CPU times: user 357 ms, sys: 85 ms, total: 442 ms
Wall time: 2.91 s


In [22]:
%%time
# Plain zarr path: read the selected Anomalies column, then the coordinates,
# keeping only the entries where the anomaly is not NaN.
z = zarr.open_consolidated(mapper_alt3)
data0 = z["Anomalies"][:, plevel]
ii = ~np.isnan(data0)
z1data = data0[ii]
z1time = z["time"][:][ii]
z1lat = z["latitude"][:][ii]
z1lon = z["longitude"][:][ii]

2023-07-31 20:46:27,500 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2F.zmetadata?alt=media, (), {}
2023-07-31 20:46:27,544 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2FAnomalies%2F0.2?alt=media, (), {}
2023-07-31 20:46:27,845 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-31 20:46:27,986 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2Flatitude%2F0?alt=media, (), {}
2023-07-31 20:46:28,203 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download

CPU times: user 256 ms, sys: 82.5 ms, total: 339 ms
Wall time: 833 ms


In [23]:
%%time
# zarr.convenience path: same reads as the plain zarr cell, opened through
# the convenience entry point.
z = zarr.convenience.open_consolidated(mapper_alt3)
data0 = z["Anomalies"][:, plevel]
ii = ~np.isnan(data0)
z2data = data0[ii]
z2time = z["time"][:][ii]
z2lat = z["latitude"][:][ii]
z2lon = z["longitude"][:][ii]

2023-07-31 20:46:35,269 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2F.zmetadata?alt=media, (), {}
2023-07-31 20:46:35,309 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2FAnomalies%2F0.2?alt=media, (), {}
2023-07-31 20:46:35,458 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2Ftime%2F0?alt=media, (), {}
2023-07-31 20:46:35,599 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_3.zarr%2Flatitude%2F0?alt=media, (), {}
2023-07-31 20:46:35,759 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download

CPU times: user 300 ms, sys: 101 ms, total: 401 ms
Wall time: 704 ms


In [59]:
# Cross-check the three loading paths: each pair must hold identical arrays.
# This replaces a commented-out comparison of match counts, which could not
# prove element-wise equality. The same pairs are checked; xrtime is left out
# as in the original check (the xarray repr above shows datetime64 coordinates,
# while the zarr cells read the stored values directly).
for left, right in [
    (z1data, z2data), (z2data, xrdata),
    (z1time, z2time),
    (z1lat, z2lat), (z2lat, xrlat),
    (z1lon, z2lon), (z2lon, xrlon),
]:
    np.testing.assert_array_equal(left, right)