In [1]:
import xarray as xr
import numpy as np
from numpy import pi, sin, cos, arccos, clip, deg2rad
import numpy.ma as ma
from datetime import datetime
import dask
import time
import zarr

In [2]:
import gcsfs
gcs = gcsfs.GCSFileSystem()
import os
SCRATCH_BUCKET = os.environ['PANGEO_SCRATCH']

In [25]:
n = 2105319
Anomalies = np.random.uniform(-1, 5, n).reshape(n,1)
time = np.random.uniform(13000, 18500, n).reshape(n,1)
latitude =  np.random.uniform(-80, 80, n).reshape(n,1)
longitude = np.random.uniform(-180, 180, n).reshape(n,1)

In [47]:
#AnomArray = np.concatenate((Anomalies, time, latitude, longitude), axis=1)

In [33]:
cols = ['Anomalies', 'time', 'latitude', 'longitude']

#### Equally structured datasets stored in several zarr-stores.
All datasets have 5 columns (like the example below), each with shape = (~2105319,)

### Storing data

In [97]:
ds = xr.Dataset(
    data_vars = dict(
        AnomArray = ( ["i", "columns"], AnomArray ),
    ),
    coords=dict(
        columns = (["columns"],np.arange(4) ),
        i = (["i"], np.arange(n) ),
    ),
    attrs=dict(
        description = 'Description',
        pindex = 4,
        number_of_harmonics = 2,
        window_size = 500e3,
        creation_date = str( datetime.now() )
    ),
)

ds.AnomArray.attrs["standard_name"] = 'Array of anomalies, time, latitude, longitude'
ds.AnomArray.attrs["units"] = 'm^2/s^2'

dsc = ds.chunk()
path_to_zarrstore = f'{SCRATCH_BUCKET}/dataset_alternative_2.zarr'
mapper_alt1 = gcs.get_mapper(path_to_zarrstore)
dsc.to_zarr(mapper_alt1, consolidated=True)

2023-07-28 18:53:59,364 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2F.zarray?alt=media, (), {}
2023-07-28 18:53:59,418 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2F.zgroup?alt=media, (), {}
2023-07-28 18:53:59,443 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2F.zarray?alt=media, (), {}
2023-07-28 18:53:59,468 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2F.zgroup?alt=media, (), {}
2023-07-28 18:53:59,509 - gcsfs - DEBUG - _call -- POST: https://storage.googleapis.com/upload/storage/v1/b/pang

<xarray.backends.zarr.ZarrStore at 0x7f6fc0055620>

In [98]:
dsc.info

<bound method Dataset.info of <xarray.Dataset>
Dimensions:    (i: 2105319, columns: 4)
Coordinates:
  * columns    (columns) int64 0 1 2 3
  * i          (i) int64 0 1 2 3 4 5 ... 2105314 2105315 2105316 2105317 2105318
Data variables:
    AnomArray  (i, columns) float64 dask.array<chunksize=(2105319, 4), meta=np.ndarray>
Attributes:
    description:          Description
    pindex:               4
    number_of_harmonics:  2
    window_size:          500000.0
    creation_date:        2023-07-28 18:53:59.269384>

#### Size of each dataset in megabytes

In [99]:
dsc.nbytes/1e6

84.212792

### Connect to a dask-cluster and set cluster-options

In [41]:
from dask_gateway import GatewayCluster, Gateway
from distributed import Client

g = Gateway()
g.list_clusters()

[]

In [42]:
#cluster = g.connect(g.list_clusters()[0].name)

In [43]:
options = g.cluster_options()
options.worker_cores = 2; options.worker_memory = 4
# Create a cluster with those options
cluster = g.new_cluster(options)

In [44]:
#g.list_clusters()

In [45]:
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: /services/dask-gateway/clusters/prod.a620d415e99a4baeb11b99022a40a5bc/status,


In [64]:
cluster.scale(1)

### Loading of data from cloud-storage<br>
Want to load 4 out of 5 columns of data, into several dask-workers memory, and do embarrasingly parallel computations ( ~1e6  ).

Loading data only:

In [128]:
@dask.delayed
def load_chunksauto(mapper):
    """Load data from zarr-store into worker-memory"""
    
    ds = xr.open_zarr(mapper, consolidated=True, chunks='auto') 
    data0 = ds.AnomArray.load().values
    ii = ~xr.apply_ufunc(np.isnan, data0[:,0])
    data, time, lat, lon = data0[ii,0], data0[ii,1],data0[ii,2], data0[ii,3]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [129]:
@dask.delayed
def load_zarr(mapper):
    """Load data from zarr-store into worker-memory"""
    
    z = zarr.open_consolidated(mapper)
    data0 = z.AnomArray[:] 
    ii = ~np.isnan(data0[:,0])
    data, time, lat, lon = data0[ii,0], data0[ii,1],data0[ii,2], data0[ii,3]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [130]:
@dask.delayed
def load_zarr_convenience(mapper): 
    """Load data from zarr-store into worker-memory"""
    
    z = zarr.convenience.open_consolidated(mapper)
    data0 = z.AnomArray[:] 
    ii = ~np.isnan(data0[:,0])
    data, time, lat, lon = data0[ii,0], data0[ii,1],data0[ii,2], data0[ii,3]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

### Timing loading of data:<br>

<br>
ds = xr.open_zarr(mapper, consolidated=True, chunks='auto'); ...  

In [131]:
%timeit dask.compute( load_chunksauto(mapper_alt1) )[0]

1.82 s ± 99.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
ds = zarr.open_consolidated(mapper); ... 

In [132]:
%timeit dask.compute( load_zarr(mapper_alt1) )[0]

713 ms ± 23.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
ds = zarr.convenience.open_consolidated(mapper); ... 

In [133]:
%timeit dask.compute( load_zarr_convenience(mapper_alt1) )[0]

711 ms ± 34.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Scaling down and closing cluster

In [138]:
cluster.scale(0)

In [139]:
cluster.close()

In [140]:
cluster.shutdown()

### The reports from the logger
(I am not sure if these look the same for the dask-workers):

In [82]:
import fsspec.utils
fsspec.utils.setup_logging(logger_name="gcsfs")

<Logger gcsfs (DEBUG)>

In [134]:
ds = xr.open_zarr(mapper_alt1, consolidated=True, chunks='auto') 
data0 = ds.AnomArray.load().values
ii = ~xr.apply_ufunc(np.isnan, data0)
ii = ii[:,0]
data, time, lat, lon = data0[ii,0], data0[ii,1],data0[ii,2], data0[ii,3]

2023-07-28 19:03:03,098 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2F.zmetadata?alt=media, (), {}
2023-07-28 19:03:03,203 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2Fcolumns%2F0?alt=media, (), {}
2023-07-28 19:03:03,251 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2Fi%2F0?alt=media, (), {}
2023-07-28 19:03:03,334 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2Fi%2F1?alt=media, (), {}
2023-07-28 19:03:03,373 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/

In [135]:
z = zarr.open_consolidated(mapper_alt1)
data0 = z.AnomArray[:] 
ii = ~np.isnan(data0[:,0])
data, time, lat, lon = data0[ii,0], data0[ii,1],data0[ii,2], data0[ii,3]

2023-07-28 19:03:08,002 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2F.zmetadata?alt=media, (), {}
2023-07-28 19:03:08,027 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2FAnomArray%2F0.0?alt=media, (), {}


In [137]:
z = zarr.convenience.open_consolidated(mapper_alt1)
data0 = z.AnomArray[:] 
ii = ~np.isnan(data0[:,0])
data, time, lat, lon = data0[ii,0], data0[ii,1],data0[ii,2], data0[ii,3]

2023-07-28 19:03:15,018 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2F.zmetadata?alt=media, (), {}
2023-07-28 19:03:15,053 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pangeo-integration-te-3eea-prod-scratch-bucket/o/ofk123%2Fdataset_alternative_2.zarr%2FAnomArray%2F0.0?alt=media, (), {}
