### This notebook shows how to find URLs from ESGF
- Choose an experiment_id, variable_id and table_id
- Saves the URLs in a dataframe

In [1]:
import numpy as np
import pandas as pd
import os
import gcsfs
import xarray as xr
from functools import partial
from IPython.display import display
from glob import glob
import warnings
import datetime
import pprint

import configparser
config = configparser.ConfigParser()

In [2]:
from esgf import esgf_search_sites, search, esgf_search

### Load specified `zcs_id` from config file as well as I/O and ESGF specs

In [3]:
# edit config.cfg to set local path (zarr_local) for saving search results

config.read('config.cfg')
zcs_dict = {}
zcs_dict['activity_id'] = (config.get('zcs_id', 'activity_id'))
zcs_dict['institution_id'] = config.get("zcs_id", "institution_id")
zcs_dict['source_id'] = config.get("zcs_id", "source_id")
zcs_dict['experiment_id'] = config.get("zcs_id", "experiment_id")
zcs_dict['member_id'] = config.get("zcs_id", "member_id")
zcs_dict['table_id'] = config.get("zcs_id", "table_id")
zcs_dict['variable_id'] = config.get("zcs_id", "variable_id")
zcs_dict['grid_label'] = config.get("zcs_id", "grid_label")

zarr_local = config.get("FILEPATHS", "zarr_local")
if not os.path.exists(zarr_local):
    print('no such path')
update_ESGF = config.getboolean("ESGF", "update_ESGF")
local_node = config.getboolean("ESGF", "local_node")

In [4]:
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')
dtype = esgf_search_sites()
print('possible ESGF API search nodes: ',list(dtype.keys()))
node = 'llnl'
print("setting current ESGF node to %s" %node)
ESGF_site = dtype[node]

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'gfdl', 'dkrz']
setting current ESGF node to llnl


### Get what exists in GCS as zarr stores already 

In [5]:
dGC = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype={'version': 'unicode'})

In [6]:
for key in zcs_dict:
    if zcs_dict[key] != 'All':
        dzLocal = dGC.loc[(dGC[key] == zcs_dict[key])]

pprint.pprint(zcs_dict)
print("there are currently %.0f models in GCS for this specification" %len(dzLocal))

{'activity_id': 'All',
 'experiment_id': 'ssp370',
 'grid_label': 'All',
 'institution_id': 'All',
 'member_id': 'All',
 'source_id': 'All',
 'table_id': 'day',
 'variable_id': 'tasmax'}
there are currently 8461 models in GCS for this specification


### create a dataframe of all matching ESGF datasets 

In [7]:
if update_ESGF:
    experiment_id = zcs_dict['experiment_id']
    variable_id = zcs_dict['variable_id']
    table_id = zcs_dict['table_id']
    print(experiment_id,variable_id,table_id)
    
    dESGF = esgf_search(server=ESGF_site, mip_era='CMIP6', variable_id=variable_id, 
                           table_id=table_id, experiment_id=experiment_id, 
                           page_size=500, verbose=False, local_node=False)

    dESGF.to_csv(os.path.join(zarr_local, 'ESGF_specific.csv'),index=False)
else:
    dESGF = pd.read_csv('csv/ESGF_specific.csv')
    dESGF = dESGF.drop_duplicates(subset =["file_name","version","checksum"])

ssp370 tasmax day


In [8]:
dESGF['zcs_id'] = [s.split('|')[0]for s in dESGF.dataset_id]
zcs_ids = dESGF.zcs_id.unique()

print('Found', dESGF.zcs_id.nunique(), 'datasets, consisting of',len(dESGF),'netcdf files:')

Found 255 datasets, consisting of 7297 netcdf files:


In [9]:
#url_type is one of: Globus_url, GridFTP_url, HTTPServer_url, OPENDAP_url
    
def get_dataset_urls(dESGF,zcs_id,url_type='HTTPServer_url'):
    df = dESGF[dESGF.zcs_id==zcs_id]
    return sorted(df[url_type].values)    

In [10]:
# Choose one with not too many netcdf files:

zcs_id = zcs_ids[3]
urls = get_dataset_urls(dESGF,zcs_id,url_type='OPENDAP_url')
print(urls)

['https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp370/r1i1p1f1/day/tasmax/gn/v20210323/tasmax_day_TaiESM1_ssp370_r1i1p1f1_gn_20150101-20241231.nc', 'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp370/r1i1p1f1/day/tasmax/gn/v20210323/tasmax_day_TaiESM1_ssp370_r1i1p1f1_gn_20250101-20341231.nc', 'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp370/r1i1p1f1/day/tasmax/gn/v20210323/tasmax_day_TaiESM1_ssp370_r1i1p1f1_gn_20350101-20441231.nc', 'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp370/r1i1p1f1/day/tasmax/gn/v20210323/tasmax_day_TaiESM1_ssp370_r1i1p1f1_gn_20450101-20541231.nc', 'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp370/r1i1p1f1/day/tasmax/gn/v20210323/tasmax_day_TaiESM1_ssp370_r1i1p1f1_gn_20550101-20641231.nc', 'https://esgf-data1.llnl.gov/thredds/dodsC/css03_data/

## So now we have:

- *zcs_ids*: list of all datasets corresponding to a given (experiment_id, variable_id, table_id) in the ESGF repository
- *get_dataset_urls()*: list of urls for each dataset

### To finish the pipeline, now we loop over zcs_ids:

- if same dataset is already in GC, then skip (find_dataset()==True)  
- download and cache the netcdfs needed for the dataset
- pre-process and concatenate the dataset, fixing known problems, flagging new problems
- if all is good, save the zarr to target location
- update progress log (currently a Google Sheet)


In [11]:
# Check if the dataset is already in GC

def find_dataset(dGC,zid):
    zstore = '/'.join(zid.split('.'))[:-1] 
    dfz = dGC[dGC.zstore.str.contains(zstore)]
    if len(dfz) >=1:
        #print(zid,'same version already exists')
        return True
    vstore = '/'.join(zid.split('.')[:-1]) 
    dfv = dGC[dGC.zstore.str.contains(vstore)]
    if len(dfv) >=1:
        print(zid,'different version exists - maybe update?')
        return True
    return False

In [12]:
for zid in zcs_ids:
    if find_dataset(dGC,zid):
        continue
    print(zid,'Dataset is needed! Proceed with recipe')

CMIP6.ScenarioMIP.AS-RCEC.TaiESM1.ssp370.r1i1p1f1.day.tasmax.gn.v20210323 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r11i1p1f1.day.tasmax.gn.v20210525 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r12i1p1f1.day.tasmax.gn.v20210525 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r13i1p1f1.day.tasmax.gn.v20210525 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r14i1p1f1.day.tasmax.gn.v20210525 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r15i1p1f1.day.tasmax.gn.v20210525 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r16i1p1f1.day.tasmax.gn.v20210525 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r17i1p1f1.day.tasmax.gn.v20210525 Dataset is needed! Proceed with recipe
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp370.r18i1p1f1.day.ta