In [None]:
import os
import json
import numpy as np
import pandas as pd

import zarr
import xarray as xr
import hvplot.xarray
import holoviews as hv
hv.extension('bokeh')
from dask.diagnostics import ProgressBar

import uuid
import fsspec
from dotenv import load_dotenv

load_dotenv()

True

In [20]:
z = zarr.create(shape=(20, 30), chunks=(10, 10), dtype='f8', store='data.zarr')
z

<zarr.core.Array (20, 30) float64>

In [21]:
z.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(20, 30)"
Chunk shape,"(10, 10)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,4800 (4.7K)
No. bytes stored,337


In [22]:
z[:] = 2.0
z.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(20, 30)"
Chunk shape,"(10, 10)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,4800 (4.7K)
No. bytes stored,589


In [23]:
z.attrs['units'] = 'meters'
z.attrs['standard_name'] = 'altitude'
print(dict(z.attrs))

{'units': 'meters', 'standard_name': 'altitude'}


In [24]:
!tree -a data.zarr | head

[1;36mdata.zarr[0m
├── .zarray
├── .zattrs
├── 0.0
├── 0.1
├── 0.2
├── 1.0
├── 1.1
└── 1.2



In [25]:
with open('data.zarr/.zarray') as f:
    print(json.load(f))

with open('data.zarr/.zattrs') as f:
    print(json.load(f))

{'chunks': [10, 10], 'compressor': {'blocksize': 0, 'clevel': 5, 'cname': 'lz4', 'id': 'blosc', 'shuffle': 1}, 'dtype': '<f8', 'fill_value': 0.0, 'filters': None, 'order': 'C', 'shape': [20, 30], 'zarr_format': 2}
{'standard_name': 'altitude', 'units': 'meters'}


### choosing chunks:

In [26]:
c = zarr.create(shape=(400, 400, 400), chunks=(1, 200, 200), dtype='f8', store='c.zarr')
c[:] = np.random.rand(*c.shape)

In [27]:
%time _ = c[:, 0, 0]

CPU times: user 72.5 ms, sys: 44.4 ms, total: 117 ms
Wall time: 162 ms


In [28]:
c.resize(400, 200, 200)
c.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(400, 200, 200)"
Chunk shape,"(1, 200, 200)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,128000000 (122.1M)
No. bytes stored,112090816 (106.9M)


### groups:

In [29]:
group = zarr.group(store='group.zarr')
group.create(name='d1', shape=(150, 150), chunks=(20, 20), dtype='i4')
group.create(name='d2', shape=(200, 200), chunks=(30, 30), dtype='f4')

<zarr.core.Array '/d2' (200, 200) float32>

In [30]:
group.tree

<bound method Group.tree of <zarr.hierarchy.Group '/'>>

### zarr with aws S3

In [36]:
acc_key = os.getenv('AWS_ACCESS_KEY_ID')
sec_key = os.getenv('AWS_SECRET_ACCESS_KEY')
region = 'eu-north-1'

storage_kwargs = {
    'key': acc_key,
    'secret': sec_key,
    'client_kwargs': {
        'region_name': region
    }
}

my_folder = f"s3://zarrtesting/zarr/{uuid.uuid4().hex}"
target = f"{my_folder}/data.zarr"
store = zarr.storage.FSStore(target, **storage_kwargs)

In [37]:
ds = xr.tutorial.open_dataset('air_temperature')
ds

In [38]:
plot = ds.air.hvplot(x='lon', y='lat', cmap='magma')
display(plot)

BokehModel(combine_events=True, render_bundle={'docs_json': {'517bfe7d-cb11-4700-8c9a-3f317b546237': {'version…

In [39]:
ds_chunked = ds.chunk({'time': 100})
ds_chunked

Unnamed: 0,Array,Chunk
Bytes,29.52 MiB,1.01 MiB
Shape,"(2920, 25, 53)","(100, 25, 53)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 29.52 MiB 1.01 MiB Shape (2920, 25, 53) (100, 25, 53) Dask graph 30 chunks in 2 graph layers Data type float64 numpy.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,29.52 MiB,1.01 MiB
Shape,"(2920, 25, 53)","(100, 25, 53)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [40]:
path = f"{my_folder}/air_temp.zarr"
with ProgressBar():
    ds_chunked.to_zarr(path, storage_options=storage_kwargs)

  ds_chunked.to_zarr(path, storage_options=storage_kwargs)


[########################################] | 100% Completed | 1.57 sms


In [41]:
ds_from_s3 = xr.open_dataset(path, engine='zarr', storage_options=storage_kwargs)
ds_from_s3

In [42]:
ds_from_s3.air.hvplot(x='lon', y='lat', cmap='magma')

BokehModel(combine_events=True, render_bundle={'docs_json': {'5dda4c02-e0ee-4856-8553-d8a3d96a0350': {'version…

In [44]:
ds_from_s3_chunked = xr.open_dataset(path, engine='zarr', storage_options=storage_kwargs, chunks={})
ds_from_s3_chunked

Unnamed: 0,Array,Chunk
Bytes,29.52 MiB,1.01 MiB
Shape,"(2920, 25, 53)","(100, 25, 53)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 29.52 MiB 1.01 MiB Shape (2920, 25, 53) (100, 25, 53) Dask graph 30 chunks in 2 graph layers Data type float64 numpy.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,29.52 MiB,1.01 MiB
Shape,"(2920, 25, 53)","(100, 25, 53)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [45]:
with ProgressBar():
    air_mean = ds_from_s3_chunked.air.mean(('lon', 'lat')).compute()

[########################################] | 100% Completed | 628.37 ms


In [46]:
air_mean.hvplot()

In [51]:
g = zarr.open(zarr.storage.FSStore(path, **storage_kwargs))
g.tree()

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, icon='table', name='air (2920, 25, 53) in…

In [52]:
dict(g.air.attrs)

{'GRIB_id': 11,
 'GRIB_name': 'TMP',
 '_ARRAY_DIMENSIONS': ['time', 'lat', 'lon'],
 'actual_range': [185.16000366210938, 322.1000061035156],
 'dataset': 'NMC Reanalysis',
 'level_desc': 'Surface',
 'long_name': '4xDaily Air temperature at sigma level 995',
 'parent_stat': 'Other',
 'precision': 2,
 'scale_factor': 0.01,
 'statistic': 'Individual Obs',
 'units': 'degK',
 'var_desc': 'Air temperature'}

### CMIP6 cloud data

In [54]:
cloud = pd.read_csv("https://cmip6-pds.s3-us-west-2.amazonaws.com/pangeo-cmip6.csv") 
cloud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522217 entries, 0 to 522216
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   activity_id     522217 non-null  object 
 1   institution_id  522217 non-null  object 
 2   source_id       522217 non-null  object 
 3   experiment_id   522217 non-null  object 
 4   member_id       522217 non-null  object 
 5   table_id        522217 non-null  object 
 6   variable_id     522217 non-null  object 
 7   grid_label      522217 non-null  object 
 8   zstore          522217 non-null  object 
 9   dcpp_init_year  141466 non-null  float64
 10  version         522217 non-null  int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 43.8+ MB


In [55]:
res = cloud.query("table_id=='day' & variable_id=='pr' & experiment_id=='historical' & source_id=='GFDL-CM4'")
res

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
5485,CMIP,NOAA-GFDL,GFDL-CM4,historical,r1i1p1f1,day,pr,gr1,s3://cmip6-pds/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/h...,,20180701
5487,CMIP,NOAA-GFDL,GFDL-CM4,historical,r1i1p1f1,day,pr,gr2,s3://cmip6-pds/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/h...,,20180701


In [56]:
url = res.zstore.values[0]
url

's3://cmip6-pds/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/historical/r1i1p1f1/day/pr/gr1/v20180701/'

In [57]:
ds = xr.open_dataset(url, engine='zarr', backend_kwargs={'storage_options': {'anon': True}})
ds

In [58]:
ds.pr.hvplot(x='lon', y='lat', cmap='viridis', dynamic=True, clim=(0, 2e-4))

BokehModel(combine_events=True, render_bundle={'docs_json': {'bc43c439-fe59-46ff-b1c3-1f6e84399eb5': {'version…

In [59]:
dsc = ds.chunk(chunks=ds.pr.encoding['preferred_chunks'])
dsc

Unnamed: 0,Array,Chunk
Bytes,2.81 kiB,2.81 kiB
Shape,"(180, 2)","(180, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 2.81 kiB 2.81 kiB Shape (180, 2) (180, 2) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2  180,

Unnamed: 0,Array,Chunk
Bytes,2.81 kiB,2.81 kiB
Shape,"(180, 2)","(180, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.50 kiB,4.50 kiB
Shape,"(288, 2)","(288, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.50 kiB 4.50 kiB Shape (288, 2) (288, 2) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2  288,

Unnamed: 0,Array,Chunk
Bytes,4.50 kiB,4.50 kiB
Shape,"(288, 2)","(288, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.92 MiB,9.38 kiB
Shape,"(60225, 2)","(600, 2)"
Dask graph,101 chunks in 2 graph layers,101 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 0.92 MiB 9.38 kiB Shape (60225, 2) (600, 2) Dask graph 101 chunks in 2 graph layers Data type object numpy.ndarray",2  60225,

Unnamed: 0,Array,Chunk
Bytes,0.92 MiB,9.38 kiB
Shape,"(60225, 2)","(600, 2)"
Dask graph,101 chunks in 2 graph layers,101 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.63 GiB,118.65 MiB
Shape,"(60225, 180, 288)","(600, 180, 288)"
Dask graph,101 chunks in 2 graph layers,101 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.63 GiB 118.65 MiB Shape (60225, 180, 288) (600, 180, 288) Dask graph 101 chunks in 2 graph layers Data type float32 numpy.ndarray",288  180  60225,

Unnamed: 0,Array,Chunk
Bytes,11.63 GiB,118.65 MiB
Shape,"(60225, 180, 288)","(600, 180, 288)"
Dask graph,101 chunks in 2 graph layers,101 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [60]:
with ProgressBar():
    pr_std = dsc.pr.std(dim='time').compute()

[########################################] | 100% Completed | 358.68 s


In [61]:
pr_std.hvplot(x='lon', y='lat', cmap='viridis', clim=(0, 1e-4))