In [None]:
! !pip install git+https://github.com/rabernat/pystac.git@optional-timezone-check

In [1]:
from datetime import datetime, timezone
from tqdm.auto import trange, tqdm
import pystac
import cftime
import zarr

import concurrent.futures

In [2]:
# Inject CSS that zeroes the padding and border of empty output-prompt elements.
# Workaround taken from:
# https://github.com/bstriner/keras-tqdm/issues/21#issuecomment-443019223

# HTML is imported from the public IPython.display module rather than the
# internal IPython.core.display path.
from IPython.display import HTML
HTML("""
<style>
.p-Widget.jp-OutputPrompt.jp-OutputArea-prompt:empty {
  padding: 0;
  border: 0;
}
</style>
""")

In [3]:
import gcsfs
# Shared Google Cloud Storage filesystem handle. The helper functions defined
# later read it as a module-level global (fs.get_mapper, fs.ls).
# NOTE(review): constructed with default arguments, so authentication is
# whatever gcsfs selects by default — confirm that suits the bucket being read.
fs = gcsfs.GCSFileSystem()

In [4]:
# Spatial extent holding a single bounding box, [-180, -90, 180, 90].
# It is applied to every generated collection; no per-dataset bounds are computed.
_GLOBAL_EXTENT = pystac.SpatialExtent([[-180, -90, 180, 90]])

In [5]:
# Fallback extent returned when a store has no usable 'time' array.
_DEFAULT_TEMPORAL_EXTENT = pystac.TemporalExtent([[datetime.min, datetime.max]])


def get_temporal_extent_from_zarr_group(path):
    """Derive a ``pystac.TemporalExtent`` from the ``time`` array of a Zarr store.

    The store at ``path`` is opened through its consolidated metadata using the
    module-level ``fs`` filesystem. The first and last values of ``time`` are
    decoded with ``cftime.num2date`` using the array's ``units`` attribute and
    its ``calendar`` attribute.

    Parameters
    ----------
    path : str
        Bucket path of the Zarr group (passed to ``fs.get_mapper``).

    Returns
    -------
    pystac.TemporalExtent
        ``[first, last]`` decoded time values, or ``_DEFAULT_TEMPORAL_EXTENT``
        when the group has no ``time`` array or that array is empty.

    Raises
    ------
    KeyError
        If the ``time`` array has no ``units`` attribute.
    """
    zgroup = zarr.open_consolidated(fs.get_mapper(path))
    try:
        time_arr = zgroup['time']
    except KeyError:
        return _DEFAULT_TEMPORAL_EXTENT

    # A zero-dimensional or zero-length time array has no first/last element
    # to index, so fall back to the default instead of raising.
    if not time_arr.shape or time_arr.shape[0] == 0:
        return _DEFAULT_TEMPORAL_EXTENT

    units = time_arr.attrs['units']
    # The calendar attribute is optional; 'standard' is the default calendar
    # cftime.num2date itself uses when none is given.
    calendar = time_arr.attrs.get('calendar', 'standard')

    # NOTE(review): num2date may return cftime datetime objects rather than
    # stdlib datetimes; presumably that is why the notebook installs a pystac
    # branch with an optional timezone check — confirm.
    date_min, date_max = (
        cftime.num2date(num, units, calendar=calendar)
        for num in (time_arr[0], time_arr[-1])
    )

    return pystac.TemporalExtent([[date_min, date_max]])

In [6]:
# Providers attached to every generated collection, declared as
# (name, roles, url) records and turned into pystac.Provider objects.
providers = [
    pystac.Provider(name, roles=roles, url=url)
    for name, roles, url in (
        (
            "World Climate Research Programme",
            ["producer", "licensor"],
            "https://www.wcrp-climate.org/wgcm-cmip/wgcm-cmip6",
        ),
        ("Pangeo", ["processor"], "https://pangeo.io/"),
        ("Google Cloud", ["host"], "https://cloud.google.com/storage"),
    )
]
providers

[<pystac.collection.Provider at 0x7feca7247a00>,
 <pystac.collection.Provider at 0x7feca7247a90>,
 <pystac.collection.Provider at 0x7feca7247790>]

In [7]:
# STAC extension identifiers declared on every generated collection.
extensions = ["collection-assets"]

# "license" link pointing at the CMIP6 terms of use; added to each collection.
_LICENSE_URL = "https://pcmdi.llnl.gov/CMIP6/TermsOfUse/TermsOfUse6-1.html"
license_link = pystac.Link(
    "license", _LICENSE_URL, media_type="text/html", title="CMIP6: Terms of Use"
)

def make_asset_path(path,
                    prefix="https://storage.googleapis.com",
                    suffix="/.zmetadata"):
    """Build the public URL of a file inside a bucket directory.

    Parameters
    ----------
    path : str
        Bucket-relative directory, with or without a leading/trailing slash
        (e.g. ``"cmip6/CMIP/AWI"`` as returned by the filesystem listing).
    prefix : str
        Base URL of the storage host.
    suffix : str
        Text appended after ``path``; by default the consolidated-metadata
        file name with its leading slash.

    Returns
    -------
    str
        ``prefix`` and ``path`` joined by exactly one ``/``, followed by
        ``suffix``.
    """
    if not prefix:
        # Nothing to join against: keep plain concatenation.
        return path + suffix

    relative = path.lstrip("/")
    if suffix.startswith("/"):
        # Avoid a doubled slash when both path and suffix supply one.
        relative = relative.rstrip("/")
    # Bucket paths in this notebook carry no leading slash, so the separator
    # between host and path must be inserted explicitly.
    return prefix.rstrip("/") + "/" + relative + suffix


def collection_from_zarr_group(path):
    """Create a ``pystac.Collection`` describing the Zarr store at ``path``.

    The collection id is the last ``/``-separated segment of ``path``. Its
    extent combines ``_GLOBAL_EXTENT`` with the temporal extent read from the
    store, and it carries one ``zmetadata`` asset (passed via ``extra_fields``)
    pointing at the store's consolidated metadata file. The module-level
    ``extensions``, ``providers`` and ``license_link`` are attached as-is.

    Parameters
    ----------
    path : str
        Bucket path of the Zarr group.

    Returns
    -------
    pystac.Collection
    """
    zmetadata_asset = dict(
        href=make_asset_path(path),
        description="Consolidated metadata file for Zarr store",
        type="application/json",
        roles=["metadata", "zarr-v2-consolidated-metadata"],
    )

    collection = pystac.Collection(
        path.split('/')[-1],
        "Auto-generated description",
        pystac.Extent(_GLOBAL_EXTENT, get_temporal_extent_from_zarr_group(path)),
        stac_extensions=extensions,
        providers=providers,
        extra_fields={"assets": {"zmetadata": zmetadata_asset}},
    )
    collection.add_link(license_link)
    return collection

def catalog(path):
    """Return a new self-contained ``pystac.Catalog`` for a directory.

    The catalog id is the last ``/``-separated segment of ``path``; the
    description is a fixed placeholder string.
    """
    *_, last_segment = path.split('/')
    return pystac.Catalog(
        last_segment,
        "Auto-generated description",
        catalog_type=pystac.CatalogType.SELF_CONTAINED,
    )

In [8]:
def list_files_and_subdirs(path):
    """List the immediate contents of ``path`` on the module-level ``fs``.

    Parameters
    ----------
    path : str
        Directory to list (passed to ``fs.ls(path, detail=True)``).

    Returns
    -------
    tuple of (list of str, list of str)
        First the file names (last path segment only), then the full paths of
        the subdirectories. Both keep listing order without duplicates. An
        entry whose name equals ``path`` itself is reported as a file with the
        empty name ``""``.
    """
    # Dicts are used as insertion-ordered sets.
    file_names = {}
    subdir_paths = {}

    for entry in fs.ls(path, detail=True):
        full_name = entry["name"].rstrip("/")
        if full_name == path:
            # The listing returned the given path itself; never treat it as
            # its own subdirectory.
            file_names[""] = entry
        elif entry["type"] == "directory":
            subdir_paths[full_name] = entry
        else:
            file_names[full_name.rsplit("/", 1)[-1]] = entry

    return list(file_names), list(subdir_paths)

In [14]:
def walk_recursive(path, depth=0, max_depth=None, use_tqdm=True):
    """Recursively turn a bucket directory tree into a STAC catalog tree.

    A directory that contains a ``.zmetadata`` file is treated as a Zarr store
    and becomes a collection. Any other directory becomes a catalog whose
    children are the results of walking its subdirectories.

    Parameters
    ----------
    path : str
        Directory to walk.
    depth : int
        Current recursion depth (0 for the initial call).
    max_depth : int or None
        Stop descending once ``depth >= max_depth``. ``None`` (or 0) means no
        limit.
    use_tqdm : bool
        Show a progress bar for directories with more than one subdirectory.
        The setting applies to every level of the walk.

    Returns
    -------
    pystac.Collection, pystac.Catalog or None
        ``None`` when ``max_depth`` is reached at a directory that is not a
        Zarr store.
    """
    files, subdirs = list_files_and_subdirs(path)
    if '.zmetadata' in files:
        return collection_from_zarr_group(path)
    if max_depth and depth >= max_depth:
        return None

    if use_tqdm and len(subdirs) > 1:
        subdirs = tqdm(
            subdirs,
            # Last path segment, derived the same way catalog() derives its id.
            desc=path.split('/')[-1],
            # Only the outermost bar stays on screen after finishing.
            leave=(depth < 1)
        )
    cat = catalog(path)
    children = [
        walk_recursive(
            sd,
            depth=(depth + 1),
            max_depth=max_depth,
            use_tqdm=use_tqdm
        )
        for sd in subdirs
    ]
    # Branches cut off by max_depth come back as None and are skipped.
    cat.add_children([child for child in children if child is not None])
    return cat

In [15]:
def walk_recursive_threaded(path, depth=0, max_depth=None, executor=None):
    """Variant of the recursive walk that fans out the top levels to an executor.

    A directory that contains a ``.zmetadata`` file becomes a collection; any
    other directory becomes a catalog of the results for its subdirectories.
    Subdirectories at depth 0 and 1 are dispatched through ``executor.map``;
    deeper levels are walked serially inside the calling thread.

    Parameters
    ----------
    path : str
        Directory to walk.
    depth : int
        Current recursion depth (0 for the initial call).
    max_depth : int or None
        Stop descending once ``depth >= max_depth``. ``None`` (or 0) means no
        limit.
    executor : concurrent.futures.Executor or None
        Executor used for the fan-out. When ``None`` the whole walk runs
        serially.

    Returns
    -------
    pystac.Collection, pystac.Catalog or None
        ``None`` when ``max_depth`` is reached at a directory that is not a
        Zarr store.
    """
    files, subdirs = list_files_and_subdirs(path)
    if '.zmetadata' in files:
        return collection_from_zarr_group(path)
    if max_depth and depth >= max_depth:
        return None

    cat = catalog(path)

    def walk_thunk(item):
        return walk_recursive_threaded(item, depth=(depth + 1), max_depth=max_depth, executor=executor)

    if executor is not None and depth <= 1:
        # NOTE(review): depth-1 calls run inside pool workers and block on the
        # depth-2 tasks they submit to the same pool. If all workers are
        # occupied by depth-1 calls, the pool can deadlock — size the pool
        # above the number of top-level subdirectories, or confirm this
        # cannot happen for the trees being walked.
        items = executor.map(walk_thunk, subdirs)
    else:
        items = [walk_thunk(s) for s in subdirs]
    # Branches cut off by max_depth come back as None and are skipped.
    cat.add_children([item for item in items if item is not None])
    return cat

In [16]:
# NOTE(review): presumably drops cached filesystem instances so the timed run
# in the next cell is not served from an earlier session's state — confirm
# against the fsspec documentation.
fs.clear_instance_cache()

In [17]:
%%time

# Walk the 'cmip6/CMIP/AWI' subtree serially (with progress bars) and keep the
# resulting catalog tree in `cat` for the cells that follow.
basepath = 'cmip6/CMIP'
path = basepath + '/AWI'

cat = walk_recursive(path)

HBox(children=(HTML(value='AWI'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='AWI-CM-1-1-MR'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='r1i1p1f1'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=32.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='fx'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='r1i1p1f1'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=31.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='fx'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='historical'), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value='r1i1p1f1'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=29.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=17.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='day'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='fx'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='r2i1p1f1'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=19.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=17.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='day'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='r3i1p1f1'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=17.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='day'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='r4i1p1f1'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='day'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='r5i1p1f1'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=24.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='day'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='r1i1p1f1'), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=36.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=18.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(HTML(value='day'), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value='fx'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='AWI-ESM-1-1-LR'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='r1i1p1f1'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=26.0), HTML(value='')))

HBox(children=(HTML(value='Eday'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Lmon'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='day'), FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(HTML(value='fx'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='r1i1p1f1'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='Amon'), FloatProgress(value=0.0, max=33.0), HTML(value='')))

HBox(children=(HTML(value='Omon'), FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(HTML(value='SImon'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='fx'), FloatProgress(value=0.0, max=6.0), HTML(value='')))


CPU times: user 10.6 s, sys: 1.09 s, total: 11.7 s
Wall time: 12.4 s


In [25]:
# NOTE(review): presumably assigns hrefs to every object in the tree rooted at
# `path`, then validates each object against its STAC schema — confirm the
# exact semantics (and failure behaviour) in the pystac docs.
cat.normalize_hrefs(path)
cat.validate_all()

In [26]:
# Print the catalog hierarchy.
# NOTE(review): the saved output starts with <Catalog id=TaiESM1>, but the walk
# cell builds 'cmip6/CMIP/AWI'. The output appears to come from a different
# run — re-execute from a fresh kernel before sharing.
cat.describe()

* <Catalog id=TaiESM1>
    * <Catalog id=1pctCO2>
        * <Catalog id=r1i1p1f1>
            * <Catalog id=Amon>
                * <Catalog id=clivi>
                    * <Collection id=gn>
                * <Catalog id=clt>
                    * <Collection id=gn>
                * <Catalog id=clwvi>
                    * <Collection id=gn>
                * <Catalog id=co2mass>
                    * <Collection id=gm>
                * <Catalog id=evspsbl>
                    * <Collection id=gn>
                * <Catalog id=hfls>
                    * <Collection id=gn>
                * <Catalog id=hur>
                    * <Collection id=gn>
                * <Catalog id=hus>
                    * <Collection id=gn>
                * <Catalog id=pr>
                    * <Collection id=gn>
                * <Catalog id=prc>
                    * <Collection id=gn>
                * <Catalog id=prsn>
                    * <Collection id=gn>
                * <Catalog id=prw>
  

In [27]:
# !rm -rf cmip6/
# cat.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)