In [1]:
import pandas as pd
import gcsfs
import xarray as xr
from datetime import datetime

import myconfig
from mytasks import Check, Download, ReadFiles, SaveAsZarr, Upload, Cleanup
from mysearch import esgf_search

In [2]:
# ESGF search configuration

import myconfig

# Both tables are defined in myconfig; `dtype` is indexed by a short site key
# and the selected entry is later passed as the search server.
dtype = myconfig.dtype
node_pref = myconfig.node_pref

# To omit a node, reset its preference rank, e.g.:
# node_pref['esgf-data1.llnl.gov'] = 999

# Site to query; use dtype['dkrz'] instead to switch.
ESGF_site = dtype['llnl']

print('zarrs will be written to: ',myconfig.local_target_prefix)

zarrs will be written to:  /h111/naomi/zarr-minimal/


In [3]:
# Google Cloud Storage setup: an anonymous, read-only filesystem handle and
# the table read from the public CSV at the URL below (all columns as text).
fs = gcsfs.GCSFileSystem(
    token='anon',
    access='read_only',
    cache_timeout=-1,
)
df_GCS = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)

# Share both objects with the other modules through myconfig.
myconfig.fs, myconfig.df_GCS = fs, df_GCS

In [4]:
# Experiment ids that can be plugged into the search specification.
core_experiments = [
    'ssp126', 'ssp245', 'ssp370', 'ssp585',
    'ssp119', 'ssp434', 'ssp460', 'ssp534-over',
    '1pctCO2', 'abrupt-4xCO2', 'amip', 'historical', 'piControl',
    'esm-hist', 'esm-piControl', 'esm-piControl-spinup',
    'piControl-spinup', 'amip-hist',
]

# Variable ids grouped by the table they belong to (Amon / Omon).
core_Amon_vars = [
    'ts', 'psl', 'hfls', 'ps', 'sfcWind', 'uas',
    'vas', 'tas', 'pr', 'prc', 'evspsbl',
]
core_Omon_vars = ['tos', 'sos', 'zos']

In [10]:
# Search specification: each key is a search facet and each value the list of
# accepted values. A facet that is left out is not constrained at all.
# Key order matters: it is the order in which the per-query dicts are built.
all_search = {
    'table_id': ['Amon'],
    'experiment_id': ['historical'],
    'variable_id': core_Amon_vars,
    # 'member_id': ['r1i1p1f1'],
    # 'source_id': ['CESM2-WACCM'],
}

# The label used in csv/log file names is built from the first experiment
# and the first table of the specification.
tab1 = all_search['table_id'][0]
exp1 = all_search['experiment_id'][0]
label = '-'.join([exp1, tab1])


In [9]:
import itertools


# Set to False to reuse the CSV written by a previous run instead of
# querying ESGF again.
update_ESGF = True
if update_ESGF:
    # Issue one query per combination of the facet values in all_search
    # (cartesian product, facets taken in the dict's key order).
    facet_names = list(all_search.keys())
    searches = list(itertools.product(*all_search.values()))

    dESGF = []
    for s in searches:
        search = dict(zip(facet_names, s))
        print(search)
        df = esgf_search(search, server=ESGF_site)
        # Keep only queries that returned at least one row.
        if len(df)>0:
            dESGF.append(df)

    # pd.concat raises an opaque "No objects to concatenate" on an empty
    # list; fail with a message that names the search instead.
    if not dESGF:
        raise ValueError(f'ESGF search returned no results for {label}')

    df_ESGF = pd.concat(dESGF)
    df_ESGF.to_csv(f'csv/ESGF_{label}.csv',index=False)
else:
    # `label` is already derived from all_search, so nothing needs to be
    # recomputed before reading the cached results.
    df_ESGF = pd.read_csv(f'csv/ESGF_{label}.csv', dtype='unicode')

# Cell output: (number of rows, number of distinct ds_dir values)
len(df_ESGF), len(df_ESGF.ds_dir.unique())

{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'ts'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'psl'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'hfls'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'ps'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'sfcWind'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'uas'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'vas'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'tas'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'pr'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'prc'}
{'table_id': 'Amon', 'experiment_id': 'historical', 'variable_id': 'evspsbl'}


(101282, 5689)

In [11]:
# Build the table of files that ESGF has but the GCS table does not.
# Set NewNeeded to False to reload the CSV written by a previous run.
NewNeeded = True
if NewNeeded:
    # Outer merge on the shared columns; the indicator column `_merge` marks
    # rows present only in df_ESGF as 'left_only'.
    df = pd.merge(df_ESGF,df_GCS, how='outer', indicator=True)

    # Keep only the columns that came from df_ESGF (this also drops `_merge`).
    # `columns=` replaces the positional axis argument, which pandas 2.0
    # no longer accepts. The explicit copy makes the column assignments
    # below operate on an independent frame rather than a slice of `df`.
    drop_keys = [key for key in df.columns if key not in df_ESGF.columns]
    df_needed = df[df._merge == 'left_only'].drop(columns=drop_keys).copy()

    num_stores = 0
    if len(df_needed) > 0:
        num_stores = df_needed.ds_dir.nunique()
        print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    else:
        # The original had a bare `exit` here, which is a no-op expression.
        # The cell simply continues with an empty frame.
        print('no new data available')

    # Integer between the last 'r' and the following 'i' of member_id,
    # e.g. 'r7i1p3f1' -> 7.
    df_needed['member'] = [int(s.split('r')[-1].split('i')[0]) for s in df_needed['member_id']]
    # Total file_size of each ds_dir, repeated on every row of that ds_dir.
    # One grouped pass instead of rescanning the whole frame per row.
    # NOTE(review): assumes file_size is numeric and ds_dir has no missing values -- confirm.
    df_needed['zsize'] = df_needed.groupby('ds_dir')['file_size'].transform('sum')
    df_needed = df_needed.sort_values(by=['zsize'])

    df_needed.to_csv(f'csv/needed_{label}.csv',index=False)
else:
    df_needed = pd.read_csv(f'csv/needed_{label}.csv')

# Cell output: (number of rows, number of distinct ds_dir values)
len(df_needed), len(df_needed.ds_dir.unique())

needed: nfiles=1352, nstores=19


(1352, 19)

In [12]:
# Publish the table of still-needed files on myconfig so that the other
# modules can read it from there.
myconfig.df_needed = df_needed

In [13]:
# Distinct ds_dir values to process, and how many there are.
ds_dirs = df_needed['ds_dir'].unique()
numdsets = len(ds_dirs)

In [14]:
# Fixed id for now; a timestamped alternative would be
# datetime.now().strftime('%Y%m%d-%H%M')
request_id = 'test'

# One log file per outcome, all under txt/ and tagged with the search label.
progress_log, failure_log, success_log = (
    f'txt/{outcome}_{label}.log'
    for outcome in ('progress', 'failure', 'success')
)

In [15]:
def write_log(file,str,verbose=True):
    """Append a timestamped message to a log file, optionally echoing it.

    Parameters
    ----------
    file : path of the log file; opened in append mode.
    str : message text. The name shadows the builtin but is kept so that
        existing keyword callers keep working.
    verbose : when true, the bare message (no timestamp) is also printed.

    Returns
    -------
    None
    """
    # The context manager closes the file even if printing or writing raises.
    with open(file,'a') as f:
        if verbose:
            print(str)
        # Timestamp has minute resolution: YYYYMMDD-HHMM
        f.write(f'{datetime.now().strftime("%Y%m%d-%H%M")}: {str}\n')
    return

In [None]:
# Run every ds_dir through the pipeline: Check -> Download -> ReadFiles ->
# SaveAsZarr -> Upload -> Cleanup. Each step reports an error code and a
# message; a code above zero is logged and the loop moves on to the next ds_dir.
for item, ds_dir in enumerate(ds_dirs):

    print(f'\n{item}/{numdsets-1}',ds_dir)

    # A Check failure goes to the progress log; every later failure goes
    # to the failure log.
    ierr, exc = Check(ds_dir)
    if ierr > 0:
        write_log(progress_log,f'{ds_dir} with error {ierr}: {exc}')
        continue

    # gfiles is handed on to ReadFiles and, at the end, to Cleanup.
    gfiles, ierr, exc = Download(ds_dir)
    if ierr > 0:
        write_log(failure_log,f'{ds_dir} with error {ierr}: {exc}')
        continue

    ds, ierr, exc = ReadFiles(ds_dir, gfiles)
    if ierr > 0:
        write_log(failure_log,f'{ds_dir} with error {ierr}: {exc}')
        continue

    ierr, exc = SaveAsZarr(ds_dir, ds)
    if ierr > 0:
        write_log(failure_log,f'{ds_dir} with error {ierr}: {exc}')
        continue

    # gsurl is only used for the success message below.
    gsurl, ierr, exc = Upload(ds_dir)
    if ierr > 0:
        write_log(failure_log,f'{ds_dir} with error {ierr}: {exc}')
        continue

    ierr, exc = Cleanup(ds_dir, gfiles)
    if ierr > 0:
        write_log(failure_log,f'{ds_dir} with error {ierr}: {exc}')
        continue

    write_log(success_log,f'{item}/{numdsets-1}: {ds_dir} saved to {gsurl}')


0/18 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/psl/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/psl/gn with error 1: noUse in codes

1/18 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/pr/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/pr/gn with error 1: noUse in codes

2/18 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/ts/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/ts/gn with error 1: noUse in codes

3/18 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/prc/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/prc/gn with error 1: noUse in codes

4/18 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/ps/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/ps/gn with error 1: noUse in codes

5/18 CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/evspsbl/gn
call Check:
CMIP/NASA-GISS/GISS-E2-1-G/historical/r7i1p3f1/Amon/evspsbl/gn with error 1: noUse in codes

6/18 CM