In [1]:
import os
import glob
import intake
import xarray as xr

from makecatalog_utils import src_header
from makecatalog_utils import src_footer

#from bs4 import BeautifulSoup
#import codecs
#import argparse

In [2]:
# Eliminate Warnings
import warnings
warnings.filterwarnings("ignore")


In [3]:
def generate_catalog(file_path_name, dataset_sub_name, parent_page, top_dir,tags, concat):
    """
    FILE_NAME: If there are more than one file, FILE_NAME is the pattern for the NetCDF files, otherwise, Name of the NetCDF file. e.g.: 'air.mon.mean.nc' 

    DATASET_SUB_NAME: Name of the directory containing the NetCDf data files, e.g.: 'GHCN_CAMS'. If there is subdirectory like monthly, daily, etc., it should also be included and separated by "_".

    PARENT_PAGE: Name of the parent directory in the dataset type hierarchy, e.g.: Temperature

    TAG: A dataset may need to be catalogued into multiple child catalogs, e.g.: "Atmosphere", "Temperature". Please keep the format consistent
    """
    file_path_name = file_path_name.strip('""')
    path, fileName = os.path.split(file_path_name)
#    print("1 :"+ file_path_name)
#    print("2 :"+ dataset_sub_name)
#    print("3 :"+ parent_page)
#    print("4 :"+ tags)
    nfiles = len(glob.glob(file_path_name))
    
    # Set is_combine based on number of files
    if (nfiles > 1):
        is_combine= True
    else:
        is_combine= False

    temp = dataset_sub_name

    if int(is_combine) == True:
        # Read with xarray
        source = xr.open_mfdataset(file_path_name,combine='nested',concat_dim=concat)
        src = source
        # Use intake with xarray kwargs
        source = intake.open_netcdf(file_path_name,concat_dim=concat,xarray_kwargs={'combine':'nested','decode_times':True})
    else:
        source = intake.open_netcdf(file_path_name)
        src = xr.open_dataset(file_path_name)
        source.discover()
        
    yamlPath='../intake-catalogs/'+top_dir+'/'+parent_page+'/'
    if (not os.path.isdir(yamlPath)):
        os.makedirs(yamlPath)
        
    yamlFile=dataset_sub_name.strip('""')
    dataset_sub_name = open(yamlPath+'/'+yamlFile+ '.yaml', 'w')
    dataset_sub_name.write(source.yaml())
    dataset_sub_name.close()
    
    print('YAML FILE CREATED: '+ yamlPath+'/'+yamlFile)
    
    #############################################

    # CATALOG_DIR: Github repository containing the master catalog
    # NOTE: It will be more accurate later
    #catalog_dir = "https://raw.githubusercontent.com/ou-esplab/SoM-ESPLab-DATASETS-CATALOG/main/intake-catalogs/"
    catalog_dir='.'        
    open_catalog = catalog_dir + '/'+temp +".yaml"

    # Look for attributes in the data
    try:
        title = src.attrs['title'] 
    except:
        title = dataset_sub_name
    try:
        url = src.attrs['References']
    except:
        url =""
        
    # Here url roles as the location
    url = path
    tags =tags.split(',')
    
    # Make HTML src code
    html_repr =xr.core.formatting_html.dataset_repr(src).replace('\\n', '\n')
    _header = src_header(title, parent_page, open_catalog, url, tags, open_catalog)
    _footer = src_footer()
    html_src = _header + html_repr + _footer
    
    # Write HTML src code a file
    
    htmlPath='../'+top_dir+'/'+parent_page+'/'
    if (not os.path.isdir(htmlPath)):
        os.makedirs(htmlPath)
    #page_name = fileName.replace('*','').replace('..','.')
    page_name=yamlFile
    html_page = htmlPath+page_name +".html" 
    with open(html_page , "w") as file:
        file.write(html_src)

    print('HTML PAGE CREATED: ', html_page)

In [4]:
l1=['/data/esplab/shared/reanalysis/era5/daily/z200/z.*.nc','ERA5-daily-z200','ERA5-daily','reanalysis','reanalysis,geopotential,daily,gridded','']
l2=['/data/esplab/shared/reanalysis/era5/daily/z500/z.*.nc','ERA5-daily-z500','ERA5-daily','reanalysis','reanalysis,geopotential,daily,gridded','']
l3=['/data/esplab/shared/reanalysis/era5/daily/z850/z.*.nc','ERA5-daily-z850','ERA5-daily','reanalysis','reanalysis,geopotentail,daily,gridded','']

datasets=[l1,l2,l3]

In [5]:
for file,dataset,parent,topdir,tags,concat in datasets:
    print(file,dataset,parent,topdir,tags,concat)
    generate_catalog(file,dataset,parent,topdir,tags,concat)

/data/esplab/shared/reanalysis/era5/daily/z200/z.*.nc ERA5-daily-z200 ERA5-daily reanalysis reanalysis,geopotential,daily,gridded 
YAML FILE CREATED: ../intake-catalogs/reanalysis/ERA5-daily//ERA5-daily-z200
HTML PAGE CREATED:  ../reanalysis/ERA5-daily/ERA5-daily-z200.html
/data/esplab/shared/reanalysis/era5/daily/z500/z.*.nc ERA5-daily-z500 ERA5-daily reanalysis reanalysis,geopotential,daily,gridded 
YAML FILE CREATED: ../intake-catalogs/reanalysis/ERA5-daily//ERA5-daily-z500
HTML PAGE CREATED:  ../reanalysis/ERA5-daily/ERA5-daily-z500.html
/data/esplab/shared/reanalysis/era5/daily/z850/z.*.nc ERA5-daily-z850 ERA5-daily reanalysis reanalysis,geopotentail,daily,gridded 
YAML FILE CREATED: ../intake-catalogs/reanalysis/ERA5-daily//ERA5-daily-z850
HTML PAGE CREATED:  ../reanalysis/ERA5-daily/ERA5-daily-z850.html
