In [1]:
import xarray as xr
import numpy as np
from numpy import pi, sin, cos, arccos, clip, deg2rad
import numpy.ma as ma
from datetime import datetime
import dask
import time
import zarr

#### Currently 30 equally structured datasets stored in 30 zarr-stores.
##### All dataset has the same length of tabular data,
All 30 datasets has 5 columns,  `latitude`, `longitude`, `time`, `mean`, `anomalies`, each with shape = (2105319,), where mean and anomalies is the only value different between the datasets.

In [130]:
import json
import gcsfs

with open('pangeo-181919-e7bc5bdaf4d5.json') as f:
    token = json.load(f)
gcs = gcsfs.GCSFileSystem(token=token)

plevel = 4 # Choosing an arbitrary dataset out of the 30
dspath='pangeo-argo-eke/global/mean_and_anomalies/with_pressure_coordinate/readable_ws_and_NHarm/mean_and_anomalies_global_ws500_plevel'+str(plevel)+'.zarr'
mapper_alt0 = gcs.get_mapper(dspath)

In [131]:
d = xr.open_zarr(mapper_alt0, consolidated=True)

In [129]:
#import fsspec.utils
#fsspec.utils.setup_logging(logger_name="gcsfs")
z.time[:].size

2105319

### Storing

In [285]:
# Mean and anomalies for pressurelevel is saved to output-path:
ds = xr.Dataset(
    data_vars = dict(
        Mean = ( ["i", "pressure"], d.Mean.load().values ),
        Anomalies = ( ["i", "pressure"], d.Anomalies.load().values ),
    ),
    coords=dict(
        time = ( ["i"], z.time[:] ),
        latitude = ( ["i"], d.latitude.load().values ),
        longitude = ( ["i"], d.longitude.load().values ),
        pressure = (["pressure"], d.pressure.load().values ),
    ),
    attrs=dict(
        description = 'Estimated mean dynamic height on profile-coordinates, and anomalies by subtracting estimated mean from observations',
        pressureindex = d.pressureindex,
        number_of_harmonics = d.number_of_harmonics,
        window_size = d.window_size,
        creation_date = str( datetime.now() )
    ),
)

ds.time.attrs["standard_name"] = 'time'
ds.time.attrs["units"] = 'days since 1970-01-01 00:00:00'
ds.latitude.attrs["standard_name"] = 'latitude'
ds.latitude.attrs["units"] = 'degrees_north'
ds.longitude.attrs["standard_name"] = 'longitude'
ds.longitude.attrs["units"] = 'degrees_east'
ds.pressure.attrs["standard_name"] = 'pressure'
ds.pressure.attrs["units"] = 'decibar'
ds.Mean.attrs["standard_name"] = 'Estimated mean dynamic height'
ds.Mean.attrs["units"] = 'm^2/s^2'
ds.Anomalies.attrs["standard_name"] = 'dynamic height anomalies'
ds.Anomalies.attrs["units"] = 'm^2/s^2'

dsc = ds.chunk()
outfile = 'pangeo-argo-eke/chunk_alternatives/global_mean_and_anomalies_plevel'+str(plevel)+'_current_zarr-storing-code.zarr'
mapper_alt1 = gcs.get_mapper(outfile)
dsc.to_zarr(mapper_alt1, consolidated=True)

<xarray.backends.zarr.ZarrStore at 0x7fe3b3839460>

#### Size of each dataset in megabytes

In [286]:
dsc.nbytes/1e6

84.212768

### Connect to a dask-cluster and set cluster-options

In [107]:
from dask_gateway import GatewayCluster, Gateway
from distributed import Client

g = Gateway()
g.list_clusters()

[]

In [108]:
#cluster = g.connect(g.list_clusters()[0].name)

In [109]:
options = g.cluster_options()
options.worker_cores = 2; options.worker_memory = 4
# Create a cluster with those options
cluster = g.new_cluster(options)

In [110]:
#g.list_clusters()

In [111]:
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: /services/dask-gateway/clusters/prod.67fa51d02c424ff390b3f20b65d081af/status,


In [112]:
cluster.scale(1)

### Loading of data from cloud-storage<br>
Want to load 4 columns of data, `latitude`, `longitude`, `time`, `anomalies`, into memory and do a computation. Which is done embarrasingly parallel ( x 1e6  ).

Loading data only:

In [287]:
@dask.delayed
def load_chunksauto(mapper):
    """Load data from zarr-store into worker-memory"""
    
    ds = xr.open_zarr(mapper, consolidated=True, chunks='auto') 
    data0 = ds.Anomalies.sel(pressure=50).load().values
    ii = ~xr.apply_ufunc(np.isnan, data0)
    data, lat, lon, time = data0[ii], ds.latitude.load().values[ii], ds.longitude.load().values[ii], ds.time.load().values[ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [288]:
@dask.delayed
def load_chunksNone(mapper):
    """Load data from zarr-store into worker-memory"""
    
    ds = xr.open_zarr(mapper, consolidated=True, chunks=None) 
    data0 = ds.Anomalies.sel(pressure=50).values
    ii = ~xr.apply_ufunc(np.isnan, data0)
    data, lat, lon, time = data0[ii], ds.latitude.values[ii], ds.longitude.values[ii], ds.time.values[ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

In [289]:
@dask.delayed
def zarr_open(mapper):
    """Load data from zarr-store into worker-memory"""
    
    z = zarr.open_consolidated(mapper)
    data0 = np.squeeze(z.Anomalies[:])
    ii = ~np.isnan(data0)
    data, lat, lon, time = data0[ii], z.latitude[:][ii], z.longitude[:][ii], z.time[:][ii]
    
    # Calculation using data-, lat-, lon-, and time-arrays
    #
    #
    #
    return None

<br>
<br>

When using zarr, time-array has datatype float instead of datetime:

In [300]:
ds = xr.open_zarr(mapper_alt1, consolidated=True, chunks='auto') 
data0 = ds.Anomalies.sel(pressure=50).load().values
ii = ~xr.apply_ufunc(np.isnan, data0)
data, lat, lon, time = data0[ii], ds.latitude.load().values[ii], ds.longitude.load().values[ii], ds.time.load().values[ii]

In [299]:
z = zarr.open_consolidated(mapper_alt1)
zdata0 = np.squeeze(z.Anomalies[:])
zii = ~np.isnan(zdata0)
zdata, zlat, zlon, ztime = zdata0[zii], z.latitude[:][zii], z.longitude[:][zii], z.time[:][zii]

In [301]:
time

array(['2004-01-06T07:15:02.000002560', '2004-01-21T07:25:17.000006144',
       '2004-02-05T07:36:52.000018304', ...,
       '2021-11-04T12:43:31.999994624', '2021-11-14T10:28:53.000001280',
       '2021-11-24T09:04:17.000014080'], dtype='datetime64[ns]')

In [302]:
ztime

array([12423.30210648, 12438.30922454, 12453.31726852, ...,
       18935.53023148, 18945.43672454, 18955.37797454])

<br>
<br>
<br>

### Timing loading of dataset:<br>

<br>
ds = xr.open_zarr(mapper, consolidated=True, chunks='auto'); ...  

In [311]:
%timeit dask.compute( load_chunksauto(mapper_alt1) )[0]

1.05 s ± 39.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
ds = xr.open_zarr(mapper, consolidated=True, chunks=None); ...

In [312]:
%timeit dask.compute( load_chunksNone(mapper_alt1) )[0]

1.29 s ± 70.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<br>
z = zarr.open_consolidated(mapper); ... 

In [313]:
%timeit dask.compute( zarr_open(mapper_alt1) )[0]

822 ms ± 77.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Scaling down and closing cluster

In [314]:
cluster.scale(0)

In [315]:
cluster.close()

In [316]:
cluster.shutdown()

2023-07-27 14:02:28,338 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
