In [1]:
import pandas as pd
import gcsfs
import xarray as xr
from datetime import datetime
import itertools

import myconfig
from mytasks import Check, Download, ReadFiles, SaveAsZarr, Upload, Cleanup
from mysearch import esgf_search

In [2]:
# ESGF search configuration -- edit this cell to change the search target.
# Both the node preference ranks and the site table are taken from myconfig.
node_pref = myconfig.node_pref
dtype = myconfig.dtype

# To omit a particular data node, reset its preference rank, e.g.:
# node_pref['esgf-data1.llnl.gov'] = 999

# Site entry handed to esgf_search as its server; 'dkrz' is the alternative.
ESGF_site = dtype['llnl']
# ESGF_site = dtype['dkrz']

print('zarrs will be written to: ', myconfig.local_target_prefix)

zarrs will be written to:  /h111/naomi/zarr-minimal/


In [3]:
# GCS configuration: an anonymous, read-only filesystem handle plus the
# catalog of zarr stores, with every catalog column read as text.
GCS_CATALOG_URL = 'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv'

fs = gcsfs.GCSFileSystem(
    token='anon',
    access='read_only',
    cache_timeout=-1,
)
df_GCS = pd.read_csv(GCS_CATALOG_URL, dtype='unicode')

# Attach both objects to myconfig so other modules can reach them.
myconfig.fs = fs
myconfig.df_GCS = df_GCS

In [4]:
# Convenience lists of keyword values for building a search.
# core_experiments holds experiment_id values; the *_vars lists hold
# variable_id values (presumably for the Amon and Omon tables respectively).
core_experiments = [
    'ssp126', 'ssp245', 'ssp370', 'ssp585',
    'ssp119', 'ssp434', 'ssp460', 'ssp534-over',
    '1pctCO2', 'abrupt-4xCO2', 'amip', 'historical', 'piControl',
    'esm-hist', 'esm-piControl', 'esm-piControl-spinup',
    'piControl-spinup', 'amip-hist',
]
core_Amon_vars = [
    'ts', 'psl', 'hfls', 'ps', 'sfcWind', 'uas', 'vas',
    'tas', 'pr', 'prc', 'evspsbl',
]
core_Omon_vars = ['tos', 'sos', 'zos']

In [5]:
# Search keywords. Each key maps to the list of values to search for; a
# keyword that is left out is not constrained, so the search finds all values.
# Key order matters: it fixes the order in which combinations are generated.
all_search = {
    'table_id': ['Omon'],
    'experiment_id': ['1pctCO2', 'abrupt-4xCO2', 'amip', 'historical', 'piControl'],
    'variable_id': core_Omon_vars,
    # 'member_id': ['r1i1p1f1'],
    # 'source_id': ['CESM2-WACCM'],
}

# Label used in the csv/ and logs/ file names: "<tables>-<experiments>",
# with multiple values joined by '-'.
tab1 = '-'.join(all_search['table_id'])
exp1 = '-'.join(all_search['experiment_id'])
label = '-'.join([tab1, exp1])

In [6]:
# Either query ESGF afresh (one request per keyword combination) and cache
# the combined result as CSV, or reload the CSV cached by an earlier run.
update_ESGF = True
if update_ESGF:
    # Cartesian product of the value lists, in all_search key order.
    searches = list(itertools.product(*all_search.values()))

    dESGF = []
    for combo in searches:
        search = dict(zip(all_search.keys(), combo))
        print(search)
        df = esgf_search(search, server=ESGF_site)
        # Only non-empty responses are kept for the concatenation.
        if len(df) > 0:
            dESGF.append(df)

    df_ESGF = pd.concat(dESGF)
    df_ESGF.to_csv(f'csv/ESGF_{label}.csv', index=False)
else:
    df_ESGF = pd.read_csv(f'csv/ESGF_{label}.csv', dtype='unicode')

# (number of rows, number of distinct ds_dir values)
len(df_ESGF), len(df_ESGF.ds_dir.unique())

{'table_id': 'Omon', 'experiment_id': '1pctCO2', 'variable_id': 'tos'}
{'table_id': 'Omon', 'experiment_id': '1pctCO2', 'variable_id': 'sos'}
{'table_id': 'Omon', 'experiment_id': '1pctCO2', 'variable_id': 'zos'}
{'table_id': 'Omon', 'experiment_id': 'abrupt-4xCO2', 'variable_id': 'tos'}
{'table_id': 'Omon', 'experiment_id': 'abrupt-4xCO2', 'variable_id': 'sos'}
{'table_id': 'Omon', 'experiment_id': 'abrupt-4xCO2', 'variable_id': 'zos'}
{'table_id': 'Omon', 'experiment_id': 'amip', 'variable_id': 'tos'}
{'table_id': 'Omon', 'experiment_id': 'amip', 'variable_id': 'sos'}
{'table_id': 'Omon', 'experiment_id': 'amip', 'variable_id': 'zos'}
empty search response
{'table_id': 'Omon', 'experiment_id': 'historical', 'variable_id': 'tos'}
{'table_id': 'Omon', 'experiment_id': 'historical', 'variable_id': 'sos'}
{'table_id': 'Omon', 'experiment_id': 'historical', 'variable_id': 'zos'}
{'table_id': 'Omon', 'experiment_id': 'piControl', 'variable_id': 'tos'}
{'table_id': 'Omon', 'experiment_id': 

(52337, 2710)

In [7]:
# Build the table of files that ESGF lists but the GCS catalog does not,
# or reload the table cached by an earlier run.
NewNeeded = True
if NewNeeded:
    # Outer merge on the columns the two tables share; indicator=True adds a
    # '_merge' column, and 'left_only' marks rows present only in df_ESGF.
    df = pd.merge(df_ESGF,df_GCS, how='outer', indicator=True)
    df_needed = df[df._merge == 'left_only']

    # Remove everything the merge added (GCS-only columns and '_merge').
    # The axis must be given by keyword: DataFrame.drop no longer accepts it
    # positionally as of pandas 2.0.
    keep_keys = df_ESGF.keys()
    all_keys = df.keys()
    drop_keys = list(set(all_keys) - set(keep_keys))
    df_needed = df_needed.drop(columns=drop_keys)

    num_stores = 0
    if len(df_needed) > 0:
        num_stores = df_needed.ds_dir.nunique()
        print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    else:
        # The original had a bare `exit` here, which is a no-op expression.
        # Execution has always continued, and the statements below are
        # harmless on an empty frame, so it is simply removed.
        print('no new data available')

    # Integer found between the last 'r' and the following 'i' of member_id,
    # e.g. 'r7i1p3f1' -> 7.
    df_needed['member'] = [int(s.split('r')[-1].split('i')[0]) for s in df_needed['member_id']]

    # Total file_size per ds_dir, repeated on every row of that ds_dir.
    # file_size may arrive as text (the cached ESGF CSV is read with
    # dtype='unicode'), and summing text would concatenate it, so convert to
    # numbers first. One grouped pass replaces the per-row re-filtering.
    file_bytes = pd.to_numeric(df_needed['file_size'])
    df_needed['zsize'] = file_bytes.groupby(df_needed['ds_dir']).transform('sum')

    # Smallest stores first.
    df_needed = df_needed.sort_values(by=['zsize'])

    df_needed.to_csv(f'csv/needed_{label}.csv',index=False)
else:
    df_needed = pd.read_csv(f'csv/needed_{label}.csv')

# (number of files, number of distinct ds_dir values)
len(df_needed), len(df_needed.ds_dir.unique())

needed: nfiles=3486, nstores=78


(3486, 78)

In [8]:
# Attach the needed-files table to the shared myconfig module so that code in
# other modules can reach it (which modules read it is not visible here).
myconfig.df_needed = df_needed

In [9]:
# Distinct ds_dir values in first-appearance order, and how many there are.
ds_dirs = df_needed['ds_dir'].unique()
numdsets = len(ds_dirs)

In [10]:
# Run identifier; a timestamp-based alternative is kept in the trailing comment.
request_id = 'test'  # datetime.now().strftime('%Y%m%d-%H%M')

# Log files, all named after the search label and appended to by write_log.
progress_log = f'logs/progress_{label}.log'
failure_log = f'logs/failure_{label}.log'
success_log = f'logs/success_{label}.log'

In [11]:
def write_log(file,str,verbose=True):
    """Append one timestamped line to a log file, optionally echoing it.

    Parameters
    ----------
    file : path of the log file. It is opened in append mode, and its parent
        directory is created first if it does not exist yet.
    str : the message text. The name shadows the builtin ``str`` inside this
        function; it is kept so existing keyword callers still work.
    verbose : when true, the bare message is also printed to stdout.

    Returns
    -------
    None

    The line written is ``"<YYYYmmdd-HHMM>: <message>\\n"`` using the current
    local time.
    """
    import os

    # A fresh checkout may not have the log directory yet; create it rather
    # than failing on the first message.
    log_dir = os.path.dirname(file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # The context manager closes the handle even if print or write raises.
    with open(file, 'a') as f:
        if verbose:
            print(str)
        f.write(f'{datetime.now().strftime("%Y%m%d-%H%M")}: {str}\n')
    return

In [None]:
# Run every needed ds_dir through the stages Check -> Download -> ReadFiles ->
# SaveAsZarr -> Upload -> Cleanup. Each stage reports an error code and a
# message; a code above zero logs the message and moves on to the next ds_dir.
# A Check failure goes to the progress log, later failures to the failure log.
for item, ds_dir in enumerate(ds_dirs):

    print(f'\n{item}/{numdsets-1}',ds_dir)

    ierr, exc = Check(ds_dir)
    if ierr > 0:
        write_log(progress_log, f'{ds_dir} with error {ierr}: {exc}')
        continue

    gfiles, ierr, exc = Download(ds_dir)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir} with error {ierr}: {exc}')
        continue

    ds, ierr, exc = ReadFiles(ds_dir, gfiles)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir} with error {ierr}: {exc}')
        continue

    ierr, exc = SaveAsZarr(ds_dir, ds)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir} with error {ierr}: {exc}')
        continue

    gsurl, ierr, exc = Upload(ds_dir)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir} with error {ierr}: {exc}')
        continue

    ierr, exc = Cleanup(ds_dir, gfiles)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir} with error {ierr}: {exc}')
        continue

    # Every stage succeeded: record where the store ended up.
    write_log(success_log, f'{item}/{numdsets-1}: {ds_dir} saved to {gsurl}')


0/77 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Omon/tos/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Omon/tos/gn with error 1: noUse in codes

1/77 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Omon/sos/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Omon/sos/gn with error 1: noUse in codes

2/77 CMIP/NASA-GISS/GISS-E2-1-G/historical/r10i1p5f1/Omon/tos/gn
call Check:
call Download:
netcdfs/tos_Omon_GISS-E2-1-G_historical_r10i1p5f1_gn_185001-190012.nc
netcdfs/tos_Omon_GISS-E2-1-G_historical_r10i1p5f1_gn_190101-195012.nc
netcdfs/tos_Omon_GISS-E2-1-G_historical_r10i1p5f1_gn_195101-200012.nc
netcdfs/tos_Omon_GISS-E2-1-G_historical_r10i1p5f1_gn_200101-201412.nc
call ReadFiles:
164 165
call SaveAsZarr:
call Upload:
successfully uploaded as gs://cmip6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r10i1p5f1/Omon/tos/gn
call Cleanup:
2/77: CMIP/NASA-GISS/GISS-E2-1-G/historical/r10i1p5f1/Omon/tos/gn saved to gs://cmip6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r10i1p5f1