In [None]:
import glob
import itertools
import os
from datetime import datetime

import gcsfs
import pandas as pd
import xarray as xr

import myconfig
from mydataset import id2dict_, dir2url_, dir2dict
from mytasks import Check, Download, ReadFiles, SaveAsZarr, Upload, Cleanup
from mysearch import esgf_search

In [None]:
# ESGF search configuration: edit the values in this cell.
# Both settings below are read from the shared myconfig module.
node_pref = myconfig.node_pref
dtype = myconfig.dtype

# Entry of `dtype` passed as `server=` to the ESGF search.
# Use dtype['dkrz'] here to query the other configured site instead.
ESGF_site = dtype['llnl']

# Prefix under which zarr stores are written locally; it is stored on myconfig
# and also used to build the dir2local helper via dir2url_.
myconfig.local_target_prefix = '/h113/naomi/zarr-minimal/'
dir2local = dir2url_(myconfig.local_target_prefix)

print('zarrs will be written to: ',myconfig.local_target_prefix)

In [None]:
# Google Cloud Storage configuration.
# Anonymous, read-only filesystem handle (cache_timeout=-1 as configured here).
fs = gcsfs.GCSFileSystem(
    token='anon',
    access='read_only',
    cache_timeout=-1,
)

# Catalog of zarr stores already in the cloud; every column is read as text.
df_GCS = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)

# Publish both objects on myconfig so that code in other modules that imports
# myconfig can reach them.
myconfig.fs = fs
myconfig.df_GCS = df_GCS

In [None]:
# Building blocks for the predefined searches: activities, tables,
# experiments and variable groups.

# --- activities ---
core_mips = ['CMIP', 'ScenarioMIP', 'DAMIP']

# --- tables ---
fx_tables = ['AERfx', 'Efx', 'IfxAnt', 'IfxGre', 'Ofx', 'fx']
yr_tables = ['Eyr', 'IyrAnt', 'IyrGre', 'Oyr']
other_tables = ['Odec', 'E1hrClimMon', 'Oclim']

# --- experiments ---
core_experiments = [
    '1pctCO2',
    'abrupt-4xCO2',
    'historical',
    'piControl',
    'ssp119',
    'ssp126',
    'ssp245',
    'ssp370',
    'ssp434',
    'ssp460',
    'ssp534-over',
    'ssp585',
]
more_experiments = [
    'piControl-spinup',
    'amip-hist',
    'esm-hist',
    'esm-piControl',
    'esm-piControl-spinup',
    '1pctCO2-bgc',
    'lgm',
    'past1000',
    'amip',
]

# --- 2-d variables ---
core_Amon_2dvars = [
    'evspsbl', 'hfls', 'pr', 'prc', 'ps', 'psl', 'sfcWind',
    'tas', 'ts', 'uas', 'vas', 'huss', 'hurs',
]
flux_Amon_2dvars = [
    'rlds', 'rlus', 'rsds', 'rsus', 'hfds', 'hfls', 'hfss', 'tauu', 'tauv',
]
core_Omon_2dvars = ['tos', 'sos', 'zos']
flux_Omon_2dvars = ['tauuo', 'tauvo']

# --- 3-d variables and ocean tracers ---
core_Amon_3dvars = ['ta', 'ua', 'va', 'zg', 'wap', 'hur', 'hus']
core_Omon_3dvars = [
    'masscello', 'so', 'thetao', 'umo', 'uo', 'vmo', 'vo', 'wmo', 'wo',
]
core_Omon_tracers = [
    'chl', 'chlos', 'dfe', 'dfeos', 'epc100', 'fgco2', 'intpp',
    'no3', 'no3os', 'phyc', 'phycos', 'phydiat', 'phydiatos',
    'si', 'sios', 'spco2', 'zooc', 'zoocos',
]

In [None]:
# Pick keyword values to specify your search here. Not specifying a particular
# keyword means the search will find all values of that keyword.
#
# `all_search` as written here is the custom search. It is used only when
# `search` (below) is not one of the predefined labels; pick any other label,
# e.g. 'test', to run it.
all_search = {
     'table_id'      : ['SImon']
    ,'experiment_id' : ['ssp119', 'ssp126']
    ,'variable_id'   : ["sithick", "siconc", "sisnthick", "sisnconc"]
    #,'member_id'     : ['r1i1p1f1']
    #,'source_id'     : ['CESM2-WACCM']
}

# Label of this run: names the lock file, the logs and the csv files, and
# selects one of the common searches when it matches a key below.
search = 'O2d-1f'

# define some common searches:
common_searches = {
    # atmosphere, monthly
    'A2d-1c': {'table_id': ['Amon'], 'experiment_id': core_experiments, 'variable_id': core_Amon_2dvars},
    'A2d-1f': {'table_id': ['Amon'], 'experiment_id': core_experiments, 'variable_id': flux_Amon_2dvars},
    'A3d-1c': {'table_id': ['Amon'], 'experiment_id': core_experiments, 'variable_id': core_Amon_3dvars},
    # ocean, monthly
    'O2d-1f': {'table_id': ['Omon'], 'experiment_id': core_experiments, 'variable_id': flux_Omon_2dvars},
    'O2d-1c': {'table_id': ['Omon'], 'experiment_id': core_experiments, 'variable_id': core_Omon_2dvars},
    'O3d-1c': {'table_id': ['Omon'], 'experiment_id': core_experiments, 'variable_id': core_Omon_3dvars},
    'O3d-1t': {'table_id': ['Omon'], 'experiment_id': core_experiments, 'variable_id': core_Omon_tracers},
    # whole tables for the core activities
    'other-coremips': {'table_id': other_tables, 'activity_id': core_mips},
    'fx-coremips': {'table_id': fx_tables, 'activity_id': core_mips},
    'yr-coremips': {'table_id': yr_tables, 'activity_id': core_mips},
}
# Unknown labels fall back to the custom `all_search` defined above.
all_search = common_searches.get(search, all_search)

# check if ANOTHER notebook is doing the same search
os.makedirs('logs', exist_ok=True)  # a fresh checkout may not have logs/ yet
lock_file = f'logs/{search}.lock'
resp = 'n'
if os.path.exists(lock_file):
    resp = input('clear matching logs? (y/n)')
if resp == 'y':
    # Remove every file in logs/ whose name contains the label (this includes
    # the old lock file, which is rewritten just below).
    for stale in glob.glob(f'logs/*{glob.escape(search)}*'):
        if os.path.isfile(stale):
            print('removing', stale)
            os.remove(stale)

# (Re)write the lock file with the start time of this run.
date = datetime.now().strftime("%Y%m%d-%H%M")
with open(lock_file, 'w') as f:
    f.write(f'started {search} at {date}')

# `label` is what the following cells use to name csv and log files.
label = search
all_search, label

In [None]:
# Query ESGF for every combination of the requested keyword values, or reuse
# the csv written by a previous run when update_ESGF is False.
update_ESGF = True
if update_ESGF:
    facets = list(all_search.keys())

    dESGF = []
    # One query per element of the cartesian product of the keyword values.
    # The loop variable is deliberately NOT called `search`, so the label
    # string chosen in the previous cell is not overwritten by a dict.
    for combo in itertools.product(*all_search.values()):
        query = dict(zip(facets, combo))
        print(query)
        df_hits = esgf_search(query, server=ESGF_site)
        if len(df_hits) > 0:
            dESGF.append(df_hits)

    if not dESGF:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f'ESGF search {label!r} returned no results')

    df_ESGF = pd.concat(dESGF)
    os.makedirs('csv', exist_ok=True)  # a fresh checkout may not have csv/ yet
    df_ESGF.to_csv(f'csv/ESGF_{label}.csv',index=False)
else:
    df_ESGF = pd.read_csv(f'csv/ESGF_{label}.csv', dtype='unicode')

# (number of rows, number of distinct dataset directories)
len(df_ESGF), len(df_ESGF.ds_dir.unique())

In [None]:
# make df of all needed: rows found on ESGF that are not in the GCS catalog.
# With NewNeeded False the table is re-read from the csv of a previous run.
NewNeeded = True
if NewNeeded:
    # The merge joins on all columns the two frames share; rows marked
    # 'left_only' exist in df_ESGF but have no match in df_GCS.
    df = pd.merge(df_ESGF,df_GCS, how='outer', indicator=True)
    df_needed = df[df._merge == 'left_only']

    # Keep only the columns that came from df_ESGF (drops the catalog-only
    # columns and the '_merge' indicator).
    keep_keys = set(df_ESGF.keys())
    drop_keys = [key for key in df.keys() if key not in keep_keys]
    # `columns=` instead of a positional axis: pandas >= 2.0 no longer accepts
    # drop(labels, 1).
    df_needed = df_needed.drop(columns=drop_keys)

    num_stores = 0
    if len(df_needed) > 0:
        num_stores = df_needed.ds_dir.nunique()
        print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    else:
        # Nothing to do. Execution simply continues with an empty table (the
        # former bare `exit` expression here never stopped anything either).
        print('no new data available')

    # Integer taken from the text between the last 'r' and the following 'i'
    # of member_id (e.g. 'r10i1p1f1' -> 10), used to order the rows.
    # .assign builds a new frame instead of writing into a filtered one.
    df_needed = df_needed.assign(
        member=[int(s.split('r')[-1].split('i')[0]) for s in df_needed['member_id']]
    )
    df_needed = df_needed.sort_values(by=['member'])
    # Alternative ordering by total store size (disabled):
    #df_needed['zsize'] = [df_needed[df_needed.ds_dir==zs]['file_size'].sum() for zs in df_needed['ds_dir']]
    #df_needed = df_needed.sort_values(by=['zsize'])

    os.makedirs('csv', exist_ok=True)  # a fresh checkout may not have csv/ yet
    df_needed.to_csv(f'csv/needed_{label}.csv',index=False)
else:
    df_needed = pd.read_csv(f'csv/needed_{label}.csv')

# Summary: number of distinct dataset directories per variable / experiment.
print('Variables')
for var in df_needed.variable_id.unique():
    print(var,df_needed[df_needed.variable_id==var].ds_dir.nunique())

print('\nExperiments')
for exp in df_needed.experiment_id.unique():
    print(exp,df_needed[df_needed.experiment_id==exp].ds_dir.nunique())

In [None]:
# Publish the table of needed files on the shared myconfig module, so code in
# other modules that imports myconfig can reach it (presumably the task
# functions called below - verify against mytasks).
myconfig.df_needed = df_needed

In [None]:
# Distinct dataset directories still to be processed, and how many there are.
ds_dirs = df_needed['ds_dir'].unique()
numdsets = len(ds_dirs)

In [None]:
# One log file per outcome, each tagged with the label of the current search.
progress_log = f'logs/progress_{label}.log'
failure_log = f'logs/failure_{label}.log'
success_log = f'logs/success_{label}.log'

# Maps a non-zero ierr code (1, 2, 3) to the log file it is reported in.
logs = dict(enumerate((progress_log, failure_log, success_log), start=1))

### `ierr` codes returned by each task:
- 0 : proceed with the next task
- 1 : write to progress_log, go to next dataset (finished or try again)
- 2 : write to failure_log,  go to next dataset (mark as unusable - do not try again until the problem is solved)
- 3 : write to success_log,  go to next dataset (dataset added to cloud)

In [None]:
def write_log(file,str,verbose=True):
    """Append one line to a log file, optionally echoing it to stdout.

    Parameters
    ----------
    file : path of the log file; it is opened in append mode, so it is
        created if missing and existing content is kept.
    str : the message to record; a newline is added after it.
        (The name shadows the builtin inside this function only; it is kept
        so existing keyword callers continue to work.)
    verbose : when true (the default) the message is also printed.

    Returns
    -------
    None
    """
    if verbose:
        print(str)
    # The context manager closes the file even if the write raises.
    with open(file,'a') as f:
        f.write(f'{str}\n')

In [None]:
# reload the catalog, and publish the fresh copy on myconfig: myconfig is how
# this notebook shares the catalog with other modules, so without this the
# reload would only rebind a local name that nothing below reads.
# NOTE(review): assumes the task functions consult myconfig.df_GCS - confirm.
df_GCS = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')
myconfig.df_GCS = df_GCS

# refresh the gcsfs
fs.invalidate_cache()

# Set to a substring of ds_dir (e.g. 'EC-Earth3') to skip matching datasets;
# None disables skipping.
skip = None

# Run the task chain for each needed dataset. Every task returns an error code
# `ierr` and a message `exc`: 0 means carry on with the next task, any other
# value selects the log file (via `logs`) and ends work on this dataset.
for item, ds_dir in enumerate(ds_dirs):

    print(f'\n{item}/{numdsets-1}',ds_dir)

    if skip and skip in ds_dir:
        write_log(progress_log,f'{ds_dir} skipping {skip}')
        continue

    (ierr, exc) = Check(ds_dir, dir2local)
    if ierr > 0:
        write_log(logs[ierr],f'{ds_dir} {exc}')
        continue

    (gfiles, ierr, exc) = Download(ds_dir)
    if ierr > 0:
        write_log(logs[ierr],f'{ds_dir}, {exc}')
        continue

    (ds,ierr,exc) = ReadFiles(ds_dir, gfiles, dir2dict)
    if ierr > 0:
        write_log(logs[ierr],f'{ds_dir}, {exc}')
        continue

    (ierr,exc) = SaveAsZarr(ds_dir, ds, dir2local)
    if ierr > 0:
        write_log(logs[ierr],f'{ds_dir}, {exc}')
        continue

    (ierr,exc) = Upload(ds_dir, dir2local)
    if ierr > 0:
        write_log(logs[ierr],f'{ds_dir}, {exc}')
        continue

    (ierr,exc) = Cleanup(ds_dir, gfiles, dir2local)
    if ierr > 0:
        write_log(logs[ierr],f'{ds_dir}, {exc}')
        continue

    # Every task returned 0: record the store as saved.
    write_log(success_log,f'{dir2local(ds_dir)} saved to GCS')