# NetCDF Zarr Sequential Recipe
### NOTE - all files and catalogs here are temporary and not ready for general use!!!

- Can go to https://aws-cloudnode.esgfed.org/thredds/fileServer/CMIP6 to get a sample netcdf file

In [1]:
import pandas as pd
import xarray as xr
from cftime import DatetimeNoLeap

In [2]:
# the versions are fine for cftime:
import zarr
import cftime
xr.__version__, zarr.__version__, cftime.__version__, pd.__version__

('0.16.2', '2.6.1', '1.3.1', '1.2.1')

In [10]:
#import s3fs   - don't use, not very reliable with open_dataset (for CNRM models, in particular)
#fs = s3fs.S3FileSystem(anon=True)

In [4]:
df_s3 = pd.read_csv('http://fletcher.ldeo.columbia.edu/catalogs/s3-world.csv.gz')
tests = ['CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/Amon/tas/gn/v20190710/'
        ,'CMIP/NOAA-GFDL/GFDL-CM4/historical/r1i1p1f1/Omon/tos/gn/v20180701/'
        ,'CMIP/NASA-GISS/GISS-E2-1-G/historical/r1i1p1f1/Amon/ua/gn/v20180827/'
        ,'CMIP/THU/CIESM/piControl/r1i1p1f1/Amon/zg/gr/v20191202/'
        ,'CMIP/CNRM-CERFACS/CNRM-ESM2-1/piControl/r1i1p1f2/Amon/co2/gr/v20181115/'
        ]
files_per_chunk = [10,1,1,1,1]

In [32]:
# pick a dataset to save as zarr:
test_number = 4
vstore = tests[test_number]
inputs_per_chunk = files_per_chunk[test_number]

df_vstore = df_s3[df_s3.vstore == vstore]
var = df_vstore.variable_id.unique()[0]
# make sure we are looking at the last available version:
last_version = sorted(df_vstore.version.unique())[-1]
dze = df_vstore[df_vstore.version == last_version].reset_index(drop=True)

print(f"number of files = {len(dze)}")
input_urls = dze.url.to_list()
input_urls

number of files = 5


['https://aws-cloudnode.esgfed.org/thredds/fileServer/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/piControl/r1i1p1f2/Amon/co2/gr/v20181115/co2_Amon_CNRM-ESM2-1_piControl_r1i1p1f2_gr_185001-194912.nc',
 'https://aws-cloudnode.esgfed.org/thredds/fileServer/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/piControl/r1i1p1f2/Amon/co2/gr/v20181115/co2_Amon_CNRM-ESM2-1_piControl_r1i1p1f2_gr_195001-204912.nc',
 'https://aws-cloudnode.esgfed.org/thredds/fileServer/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/piControl/r1i1p1f2/Amon/co2/gr/v20181115/co2_Amon_CNRM-ESM2-1_piControl_r1i1p1f2_gr_205001-214912.nc',
 'https://aws-cloudnode.esgfed.org/thredds/fileServer/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/piControl/r1i1p1f2/Amon/co2/gr/v20181115/co2_Amon_CNRM-ESM2-1_piControl_r1i1p1f2_gr_215001-224912.nc',
 'https://aws-cloudnode.esgfed.org/thredds/fileServer/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/piControl/r1i1p1f2/Amon/co2/gr/v20181115/co2_Amon_CNRM-ESM2-1_piControl_r1i1p1f2_gr_225001-234912.nc']

In [33]:
# Choose a Recipe:
from pangeo_forge.recipe import NetCDFtoZarrSequentialRecipe

In [34]:
import tempfile
from fsspec.implementations.local import LocalFileSystem
from pangeo_forge.storage import FSSpecTarget, CacheFSSpecTarget

fs_local = LocalFileSystem()

cache_target = FSSpecTarget(fs_local, root_path='netcdf-tmp')

target = FSSpecTarget(fs_local, root_path='zarr-tmp')

In [36]:
# Look at the first file from the dataset, using the OPeNDAP url:
ncfile = input_urls[0].replace('fileServer','dodsC')
ds = xr.open_dataset(ncfile)
ntimes = len(ds.time)
print(f"number of time slices in first file = {ntimes}")
print(f"Dataset size is {ds.nbytes/1e6} MB")  # Too large - but don't know how to split
ds.coords, ds.data_vars

number of time slices in first file = 1200
Dataset size is 2988.473548 MB


(Coordinates:
   * lat      (lat) float64 -88.93 -87.54 -86.14 -84.74 ... 86.14 87.54 88.93
   * lon      (lon) float64 0.0 1.406 2.812 4.219 ... 354.4 355.8 357.2 358.6
   * plev     (plev) float32 1e+05 9.25e+04 8.5e+04 7e+04 ... 1e+03 500.0 100.0
   * time     (time) datetime64[ns] 1850-01-16T12:00:00 ... 1949-12-16T12:00:00,
 Data variables:
     time_bounds  (time, axis_nbounds) datetime64[ns] ...
     co2          (time, plev, lat, lon) float32 ...)

In [45]:
if test_number == 1:
    recipe = NetCDFtoZarrSequentialRecipe(
        input_urls=input_urls,
        #xarray_open_kwargs={'preprocess':set_bnds_as_coords},
        xarray_open_kwargs={'drop_variables':'height'},
        sequence_dim="time",
        inputs_per_chunk=inputs_per_chunk,
        nitems_per_input=ntimes
    )
else:
    recipe = NetCDFtoZarrSequentialRecipe(
        input_urls=input_urls,
        sequence_dim="time",
        xarray_open_kwargs={'use_cftime':True, 'decode_coords':False},
        #xarray_open_kwargs={'decode_times':False},  # this makes a problem with 365_day calendar in recipe.prepare_target()
        #xarray_open_kwargs={'use_cftime':True,'decode_times':False}, # recipe.prepare_target() fails when using decode_times
        inputs_per_chunk=inputs_per_chunk,
        require_cache=False,
        nitems_per_input=ntimes
    )
recipe

NetCDFtoZarrSequentialRecipe(sequence_dim='time', inputs_per_chunk=1, nitems_per_input=1200, target=<pangeo_forge.storage.UninitializedTarget object at 0x7f486724d290>, input_cache=<pangeo_forge.storage.UninitializedTarget object at 0x7f4870c741d0>, require_cache=False, consolidate_zarr=True, xarray_open_kwargs={'use_cftime': True, 'decode_coords': False}, xarray_concat_kwargs={}, delete_input_encoding=True)

In [49]:
# set the cache and target location
recipe.input_cache = cache_target
recipe.target = target
recipe

NetCDFtoZarrSequentialRecipe(sequence_dim='time', inputs_per_chunk=1, nitems_per_input=1200, target=FSSpecTarget(fs=<fsspec.implementations.local.LocalFileSystem object at 0x7f486b8688d0>, root_path='zarr-tmp'), input_cache=FSSpecTarget(fs=<fsspec.implementations.local.LocalFileSystem object at 0x7f486b8688d0>, root_path='netcdf-tmp'), require_cache=False, consolidate_zarr=True, xarray_open_kwargs={'use_cftime': True, 'decode_coords': False}, xarray_concat_kwargs={}, delete_input_encoding=True)

In [50]:
# Must cache the first chunk as well as any others you want to look at
all_chunks = list(recipe.iter_chunks())

#for input_file in recipe.inputs_for_chunk(all_chunks[0]):
#    print(input_file)
#    recipe.cache_input(input_file)

In [51]:
# put basic info in target directory
recipe.prepare_target()  # This needs to be done on the FIRST chunk

TypeError: Invalid value for attr 'coordinates': Empty(dtype=dtype('S1')) must be a number, a string, an ndarray or a list/tuple of numbers/strings for serialization to netCDF files

In [None]:

for input_file in recipe.inputs_for_chunk(all_chunks[0]):
    print(input_file)
    recipe.cache_input(input_file)

In [None]:
# Now lets look at the last one:
for input_file in recipe.inputs_for_chunk(all_chunks[-1]):
    print(input_file)
    recipe.cache_input(input_file)

In [None]:
ds_chunk = recipe.open_chunk(all_chunks[0])   
print(f'Total chunk size: {ds_chunk.nbytes / 1e6} MB')  

In [None]:
# store first chunk
zgroup = zarr.open(target_dir.name)
print(zgroup.tree())

In [None]:
from cftime import DatetimeNoLeap
recipe.store_chunk(all_chunks[0])
zgroup[var].info

In [None]:
recipe.store_chunk(all_chunks[-1])



In [None]:
# check first chunk
ds = xr.open_zarr(target_dir.name)
ds[var][0,0].plot()

In [None]:
hurl = recipe.inputs_for_chunk(all_chunks[-1])[0]
url = hurl.replace('fileServer','dodsC')
print('source URL = ',url)
print('open netcdf dataset')
ds = xr.open_dataset(url)
print(f"first time = {ds.time[0]}")

zbdir = 'zarr-tmp'
print('save zarr dataset')
ds.to_zarr(zbdir, consolidated=True, mode='w')
ds2 = xr.open_zarr(zbdir)
print(f"first time = {ds2.time[0]}")