### To be used after netcdf files are downloaded locally 
- netcdf files should be in the default subdirectory 'nctemp'
- zarr files can go into 'ztemp'
- once the problem has been identified, add an exception to `csv/exceptions.csv` and re-run the
original request

In [8]:
from glob import glob
import os
import xarray as xr
import cftime
import numpy as np
import time 
import datetime

In [9]:
def set_bnds_as_coords(ds):
    """Turn bounds variables into coordinates.

    Every entry of ``ds.data_vars`` whose name contains ``'bnds'`` or
    ``'bounds'`` is handed to ``ds.set_coords``, and the result of that call
    is returned.  This notebook passes the function as the ``preprocess``
    callback of ``xr.open_mfdataset``.
    """
    bound_names = []
    for name in ds.data_vars:
        if 'bnds' in name or 'bounds' in name:
            bound_names.append(name)
    return ds.set_coords(bound_names)

In [10]:
# Working directories, relative to where the notebook is run.
# sdrive: where the downloaded netcdf files are globbed from.
# zdrive: root for the zarr output (see the notes at the top of the notebook).
sdrive, zdrive = 'nctemp/', 'ztemp/'

In [10]:
# Dataset being debugged.  The three pieces are combined with the variable id:
#   netcdf glob : sdrive + variable_id + string + '_*.nc'
#   zarr store  : <zarr root> + tpath + variable_id + tgrid
tpath = 'CMIP/IPSL/IPSL-CM6A-LR/abrupt-4xCO2/r1i1p1f1/Omon/'
tgrid = '/gn'
string = '_Omon_IPSL-CM6A-LR_abrupt-4xCO2_r1i1p1f1_gn'

In [None]:
# Convert the locally downloaded netcdf files of each variable into one zarr
# store, running the same sanity checks on the time axis along the way.
table_id = 'Omon'
variable_ids = ['chl']
for variable_id in variable_ids:

    pattern = sdrive + variable_id + string + '_*.nc'
    print(pattern)

    # Sorted so the files are handed to open_mfdataset in name order
    # (presumably chronological for the usual date-range file suffixes).
    gfiles = sorted(glob(pattern))
    print(gfiles)
    if not gfiles:
        # Fail with a clear message instead of an IndexError on gfiles[0].
        raise FileNotFoundError('no netcdf files match ' + pattern)

    # Optional per-file inspection, enable when hunting a bad file:
    #    for gfile in gfiles:
    #        print('checking ',gfile)
    #        ds = xr.open_dataset(gfile,decode_times=True)
    #        print(ds.coords)

    # Zarr store location under the zarr root directory.
    zbdir = zdrive + tpath + variable_id + tgrid

    # Use the first file to size the chunks; only metadata is read here, and
    # the file is closed again when the with-block exits.
    nc_size = os.path.getsize(gfiles[0])
    with xr.open_dataset(gfiles[0]) as ds:
        svar = ds.variable_id
        nt = ds[svar].shape[0]
        has_time = 'time' in ds.coords

    # Number of leading-axis steps whose on-disk size is about
    # chunksize_optimal bytes, never less than one step.
    chunksize_optimal = 2e8
    chunksize = max(int(nt*chunksize_optimal/nc_size),1)

    print('nt:',nt,'netcdf size:', nc_size/1e6, 'Mb')
    print('suggested chunksize:', chunksize)

    if has_time:   # please use cftime - piControl cannot use datetime64
        df7 = xr.open_mfdataset(gfiles, preprocess=set_bnds_as_coords, data_vars='minimal', chunks={'time': chunksize},
                                use_cftime=True, combine='nested', concat_dim='time') # combine='nested'
        print(df7.time.values[0],df7.time.values[-1])
    else: # fixed in time, no time grid
        df7 = xr.open_mfdataset(gfiles, preprocess=set_bnds_as_coords, combine='by_coords', data_vars='minimal')

    if has_time:
        # For sorted unique years the summed differences equal last - first,
        # so comparing against len(year) detects missing years.
        year = sorted(set(df7.time.dt.year.values))
        year_span = np.diff(year).sum()
        print(year_span, len(year))
        if '3hr' in table_id:
            assert year_span == len(year)-1 or year_span == len(year)-2, \
                'unexpected set of years on the 3hr time axis'
        elif 'dec' in table_id:
            assert year_span/10 == len(year) or year_span/10 == len(year)-1, \
                'unexpected set of years on the decadal time axis'
        else:
            assert year_span == len(year)-1, \
                'years on the time axis are not contiguous'

    # Collect the tracking_id of every source file, one per line, closing
    # each file as soon as its attribute has been read.
    tracking_ids = []
    for gfile in gfiles:
        with xr.open_dataset(gfile) as dsl:
            tracking_ids.append(dsl.tracking_id)
    df7.attrs['tracking_id'] = '\n'.join(tracking_ids)

    date = datetime.datetime.now().strftime("%Y-%m-%d")
    nstatus = date + ';created; by gcs.cmip6.ldeo@gmail.com'
    df7.attrs['status'] = nstatus

    # Rechunk once more before writing (yes, do it again); skipped for fixed
    # fields, which have no 'time' dimension to chunk along.
    if 'time' in df7.dims:
        df7 = df7.chunk(chunks={'time' : chunksize})
    #del df7[svar].encoding['missing_value']
    print(zbdir)
    df7.to_zarr(zbdir, consolidated=True, mode='w')

In [5]:
# Deliberate failure: this cell always raises AssertionError, presumably so
# that running the whole notebook stops here before any later cells.
assert False  # further checks can be done below

AssertionError: 