In [1]:
import pandas as pd
import gcsfs
import xarray as xr
from datetime import datetime
import itertools

import myconfig
from mydataset import id2dict_, dir2url_, dir2dict
from mytasks import Check, Download, ReadFiles, SaveAsZarr, Upload, Cleanup
from mysearch import esgf_search

In [3]:
# CONFIGURE ESGF Search here
# Shared settings are read from the project-local `myconfig` module.
node_pref = myconfig.node_pref
dtype = myconfig.dtype
# Local directory prefix for zarr output, stored as an attribute on `myconfig`
# (presumably so the task modules can read it -- TODO confirm).
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
myconfig.local_target_prefix = '/h112/naomi/zarr-minimal/'
# `dir2local` is whatever `dir2url_` builds from that prefix; later cells call it
# as dir2local(ds_dir) and hand it to the Check/SaveAsZarr/Upload/Cleanup tasks.
dir2local = dir2url_(myconfig.local_target_prefix)

# `dtype` is indexed by a site key; the selected entry is later passed to
# esgf_search as its `server` argument.
ESGF_site = dtype['llnl']
#ESGF_site = dtype['dkrz']

print('zarrs will be written to: ',myconfig.local_target_prefix)

zarrs will be written to:  /h112/naomi/zarr-minimal/


In [4]:
# CONFIGURE GCS
# Anonymous, read-only filesystem handle; also stored on myconfig so that
# code importing myconfig can reach the same object.
fs = gcsfs.GCSFileSystem(
    token='anon',
    access='read_only',
    cache_timeout=-1,
)
myconfig.fs = fs

# Catalog CSV of the consolidated zarr stores, read with every column as text;
# stored on myconfig alongside the filesystem handle.
df_GCS = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)
myconfig.df_GCS = df_GCS

In [5]:
# Experiment and variable groups that the named search presets are built from.
# Each list is used as one keyword's set of values; every value in a list
# results in its own ESGF query, so a list should not contain duplicates.
core_experiments = [
    '1pctCO2', 'abrupt-4xCO2', 'historical', 'piControl',
    'ssp119', 'ssp126', 'ssp245', 'ssp370', 'ssp434', 'ssp460', 'ssp534-over', 'ssp585',
]
more_experiments = [
    'piControl-spinup', 'amip-hist', 'esm-hist', 'esm-piControl', 'esm-piControl-spinup',
    '1pctCO2-bgc', 'lgm', 'past1000', 'amip',
]

# NOTE(review): 'hfls' appears in both the core and the flux Amon lists below.
core_Amon_2dvars = ['evspsbl', 'hfls', 'pr', 'prc', 'ps', 'psl', 'sfcWind', 'tas', 'ts', 'uas', 'vas', 'huss', 'hurs']
flux_Amon_2dvars = ['rlds', 'rlus', 'rsds', 'rsus', 'hfds', 'hfls', 'hfss', 'tauu', 'tauv']
core_Omon_2dvars = ['tos', 'sos', 'zos']
# Ocean surface stress, x and y components (mirrors 'tauu'/'tauv' in the Amon
# flux list). The list previously repeated 'tauuo' and never requested 'tauvo'.
flux_Omon_2dvars = ['tauuo', 'tauvo']
core_Amon_3dvars = ['ta', 'ua', 'va', 'zg', 'wap', 'hur', 'hus']
core_Omon_3dvars = ['masscello', 'so', 'thetao', 'umo', 'uo', 'vmo', 'vo', 'wmo', 'wo']
core_Omon_tracers = [
    'chl', 'chlos', 'dfe', 'dfeos', 'epc100', 'fgco2', 'intpp', 'no3', 'no3os',
    'phyc', 'phycos', 'phydiat', 'phydiatos', 'si', 'sios', 'spco2', 'zooc', 'zoocos',
]

In [6]:
# Pick keyword values to specify your search here. Not specifying a particular keyword means it will find all.
# This ad-hoc dict is what gets searched when `search` below names no preset.
all_search = {
    'table_id': ['Omon', 'Amon', 'AERmon', 'SImon'],
    'experiment_id': ['historical', 'ssp585', 'ssp370'],
    'variable_id': ["uas", "vas", "chl", "clt", "sithick", "siconc", "sisnthick", "sisnconc", "tas", "toz"],
    #'member_id'     : ['r1i1p1f1'],
    #'source_id'     : ['CESM2-WACCM'],
}

search = 'A2d-1'

# Common searches: preset name -> (keyword dict, label).
# The label is embedded in the csv/ and logs/ file names used by later cells.
# NOTE(review): the 'A2d-1f' label says "core_experiments" although that preset
# only searches 'historical' -- confirm before renaming, since cached files use it.
_named_searches = {
    'A2d-1': (
        {'table_id': ['Amon'], 'experiment_id': core_experiments, 'variable_id': core_Amon_2dvars},
        'Amon-core_experiments-core_2dvars',
    ),
    'A2d-1f': (
        {'table_id': ['Amon'], 'experiment_id': ['historical'], 'variable_id': flux_Amon_2dvars},
        'Amon-core_experiments-flux_2dvars',
    ),
    'O2d-1f': (
        {'table_id': ['Omon'], 'experiment_id': core_experiments, 'variable_id': flux_Omon_2dvars},
        'Omon-core_experiments-flux_Omon_2dvars',
    ),
    'O2d-1c': (
        {'table_id': ['Omon'], 'experiment_id': core_experiments, 'variable_id': core_Omon_2dvars},
        'Omon-core_experiments-core_Omon_2dvars',
    ),
    'hiba': (
        {'table_id': ['day'],
         'experiment_id': ['ssp370', 'ssp126', 'ssp585', 'ssp245', 'historical'],
         'variable_id': ['tas', 'pr', 'tasmax', 'tasmin', 'vas', 'uas']},
        'hiba',
    ),
    'trond': (
        {'table_id': ['Omon'],
         'experiment_id': ['ssp370', 'ssp585', 'historical'],
         'variable_id': ["thetao", "o2"]},
        'trond',
    ),
}

# A recognised name replaces the ad-hoc dict and sets `label`; an unrecognised
# name leaves `all_search` as written above and `label` unassigned.
if search in _named_searches:
    all_search, label = _named_searches[search]


all_search, label

({'table_id': ['Amon'],
  'experiment_id': ['1pctCO2',
   'abrupt-4xCO2',
   'historical',
   'piControl',
   'ssp119',
   'ssp126',
   'ssp245',
   'ssp370',
   'ssp434',
   'ssp460',
   'ssp534-over',
   'ssp585'],
  'variable_id': ['evspsbl',
   'hfls',
   'pr',
   'prc',
   'ps',
   'psl',
   'sfcWind',
   'tas',
   'ts',
   'uas',
   'vas',
   'huss',
   'hurs']},
 'Amon-core_experiments-core_2dvars')

In [7]:
# Set True to re-query ESGF and rewrite the cached CSV for this label;
# False loads the CSV written by an earlier run (all columns as text).
update_ESGF = False
if update_ESGF:
    # One ESGF request per combination of keyword values (cartesian product
    # of the lists in all_search).
    keywords = list(all_search.keys())

    dESGF = []
    for combo in itertools.product(*all_search.values()):
        # Named `query`, not `search`, so the preset name chosen in the
        # previous cell is not overwritten by a dict.
        query = dict(zip(keywords, combo))
        print(query)
        df = esgf_search(query, server=ESGF_site)
        if len(df) > 0:
            dESGF.append(df)

    # pd.concat rejects an empty list with an unhelpful message; fail with
    # the same exception type but say what actually happened.
    if not dESGF:
        raise ValueError(f'ESGF search returned no results for {label}')

    df_ESGF = pd.concat(dESGF)
    df_ESGF.to_csv(f'csv/ESGF_{label}.csv', index=False)
else:
    df_ESGF = pd.read_csv(f'csv/ESGF_{label}.csv', dtype='unicode')

# (number of rows, number of distinct dataset directories)
len(df_ESGF), len(df_ESGF.ds_dir.unique())

(447124, 25459)

In [8]:
# make df of all needed
# Set True to recompute which ESGF rows are missing from the GCS catalog and
# rewrite the cached CSV; False loads the CSV written by an earlier run.
NewNeeded = False
if NewNeeded:
    # Outer merge on the columns the two frames share; the indicator column
    # marks rows found only in df_ESGF as 'left_only'.
    df = pd.merge(df_ESGF, df_GCS, how='outer', indicator=True)
    df_needed = df[df._merge == 'left_only']

    # Drop everything the merge added (catalog-only columns and `_merge`).
    # `columns=` is used because pandas 2.x no longer accepts the axis as a
    # positional argument to DataFrame.drop.
    drop_keys = list(set(df.keys()) - set(df_ESGF.keys()))
    df_needed = df_needed.drop(columns=drop_keys).copy()

    num_stores = 0
    if len(df_needed) > 0:
        num_stores = df_needed.ds_dir.nunique()
        print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    else:
        # Nothing is halted here (a bare `exit` expression would not stop the
        # cell either); an empty frame simply flows through the steps below.
        print('no new data available')

    # Realization number parsed from member_id, e.g. 'r4i1p5f1' -> 4.
    df_needed['member'] = [int(s.split('r')[-1].split('i')[0]) for s in df_needed['member_id']]

    # Total size of each store, repeated on every one of its rows.
    # file_size is text when df_ESGF was loaded from the cached CSV, so it is
    # converted first: summing strings would concatenate them. A single
    # grouped transform replaces the per-row scan of the whole frame.
    file_sizes = pd.to_numeric(df_needed['file_size'])
    df_needed['zsize'] = file_sizes.groupby(df_needed['ds_dir']).transform('sum')
    df_needed = df_needed.sort_values(by=['zsize'])

    df_needed.to_csv(f'csv/needed_{label}.csv', index=False)
else:
    df_needed = pd.read_csv(f'csv/needed_{label}.csv')

# Number of distinct stores still needed, per variable and per experiment.
print('Variables')
for var in df_needed.variable_id.unique():
    print(var,df_needed[df_needed.variable_id==var].ds_dir.nunique())

print('\nExperiments')
for exp in df_needed.experiment_id.unique():
    print(exp,df_needed[df_needed.experiment_id==exp].ds_dir.nunique())

Variables
huss 1036
hurs 1180
evspsbl 5
ps 2
tas 1
vas 3
psl 2
ts 1
hfls 2
prc 3
sfcWind 2
pr 3
uas 2

Experiments
ssp585 343
ssp370 172
ssp245 221
ssp126 269
1pctCO2 68
abrupt-4xCO2 59
historical 607
piControl 51
ssp460 52
ssp434 144
ssp534-over 128
ssp119 128


In [9]:
# make available to all modules
# Stored as an attribute on `myconfig`; presumably read by the task functions
# imported from `mytasks` -- TODO confirm.
myconfig.df_needed = df_needed

In [10]:
# Distinct dataset directories to process, in order of first appearance,
# and how many of them there are.
ds_dirs = df_needed['ds_dir'].unique()
numdsets = len(ds_dirs)

In [11]:
# One log file per outcome, each named after the current search label.
progress_log, failure_log, success_log = (
    f'logs/{outcome}_{label}.log'
    for outcome in ('progress', 'failure', 'success')
)
success_log

'logs/success_Amon-core_experiments-core_2dvars.log'

In [12]:
def write_log(file,str,verbose=True):
    """Append one line to a log file, optionally echoing it to stdout.

    Parameters
    ----------
    file : path of the log file; opened in append mode, so it is created
        when missing and earlier entries are kept.
    str : message to record; it is formatted into the line and a newline is
        appended. (The name shadows the builtin inside this function; it is
        kept so existing keyword callers continue to work.)
    verbose : when true, the message is also printed.

    Returns
    -------
    None
    """
    # The context manager closes the handle even if printing or writing raises.
    with open(file, 'a') as f:
        if verbose:
            print(str)
        f.write(f'{str}\n')
    return

In [13]:
# reload the catalog
df_GCS = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')
# Publish the refreshed catalog on myconfig as well; otherwise myconfig.df_GCS
# keeps the copy assigned in the GCS configuration cell and this reload has no
# effect outside the notebook namespace.
myconfig.df_GCS = df_GCS

# refresh the gcsfs
fs.invalidate_cache()

# Resume point: positions in ds_dirs below this index are skipped.
first_item = 1870
# Stores whose path contains any of these substrings are skipped
# (other sources, e.g. 'IITM-ESM', can be added here when needed).
skip_substrings = ('EC-Earth3',)

for item, ds_dir in enumerate(ds_dirs):
    if item < first_item:
        continue
    print(f'\n{item}/{numdsets-1}',ds_dir)
    if any(pattern in ds_dir for pattern in skip_substrings):
        continue

    # Every task returns an error code and a message as the last two items of
    # its result; a code > 0 ends the work on this store. A failing Check is
    # recorded in the progress log, failures of later steps in the failure log.
    (ierr, exc) = Check(ds_dir, dir2local)
    if ierr > 0:
        write_log(progress_log, f'{ds_dir} {ierr}: {exc}')
        continue

    (gfiles, ierr, exc) = Download(ds_dir)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}')
        continue

    (ds, ierr, exc) = ReadFiles(ds_dir, gfiles, dir2dict)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}')
        continue

    (ierr, exc) = SaveAsZarr(ds_dir, ds, dir2local)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}')
        continue

    (ierr, exc) = Upload(ds_dir, dir2local)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}')
        continue

    (ierr, exc) = Cleanup(ds_dir, gfiles, dir2local)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}')
        continue

    write_log(success_log, f'{dir2local(ds_dir)} saved to GCS')


1870/2241 CMIP/NASA-GISS/GISS-E2-1-G/historical/r4i1p5f1/Amon/hurs/gn
CMIP/NASA-GISS/GISS-E2-1-G/historical/r4i1p5f1/Amon/hurs/gn 3: store already in cloud

1871/2241 CMIP/NASA-GISS/GISS-E2-1-G/historical/r6i1p3f1/Amon/hurs/gn
CMIP/NASA-GISS/GISS-E2-1-G/historical/r6i1p3f1/Amon/hurs/gn 3: store already in cloud

1872/2241 CMIP/NASA-GISS/GISS-E2-1-H/historical/r3i1p1f2/Amon/hurs/gn
CMIP/NASA-GISS/GISS-E2-1-H/historical/r3i1p1f2/Amon/hurs/gn 3: store already in cloud

1873/2241 CMIP/NASA-GISS/GISS-E2-1-H/historical/r4i1p1f2/Amon/hurs/gn
CMIP/NASA-GISS/GISS-E2-1-H/historical/r4i1p1f2/Amon/hurs/gn 3: store already in cloud

1874/2241 CMIP/NASA-GISS/GISS-E2-1-H/historical/r5i1p1f2/Amon/hurs/gn
CMIP/NASA-GISS/GISS-E2-1-H/historical/r5i1p1f2/Amon/hurs/gn 3: store already in cloud

1875/2241 CMIP/NASA-GISS/GISS-E2-1-H/historical/r5i1p3f1/Amon/hurs/gn
CMIP/NASA-GISS/GISS-E2-1-H/historical/r5i1p3f1/Amon/hurs/gn 3: store already in cloud

1876/2241 CMIP/NASA-GISS/GISS-E2-1-H/historical/r3i1p3f1/

  decode_timedelta=decode_timedelta,


259 260


  decode_timedelta=decode_timedelta,


successfully uploaded to gs://cmip6/ScenarioMIP/NCAR/CESM2-WACCM/ssp534-over/r1i1p1f1/Amon/huss/gn
/h112/naomi/zarr-minimal/ScenarioMIP/NCAR/CESM2-WACCM/ssp534-over/r1i1p1f1/Amon/huss/gn saved to GCS

1915/2241 ScenarioMIP/MIROC/MIROC6/ssp585/r40i1p1f1/Amon/hurs/gn
netcdfs/hurs_Amon_MIROC6_ssp585_r40i1p1f1_gn_201501-210012.nc
85 86
successfully uploaded to gs://cmip6/ScenarioMIP/MIROC/MIROC6/ssp585/r40i1p1f1/Amon/hurs/gn
/h112/naomi/zarr-minimal/ScenarioMIP/MIROC/MIROC6/ssp585/r40i1p1f1/Amon/hurs/gn saved to GCS

1916/2241 ScenarioMIP/MIROC/MIROC6/ssp585/r30i1p1f1/Amon/hurs/gn
netcdfs/hurs_Amon_MIROC6_ssp585_r30i1p1f1_gn_201501-210012.nc
85 86
successfully uploaded to gs://cmip6/ScenarioMIP/MIROC/MIROC6/ssp585/r30i1p1f1/Amon/hurs/gn
/h112/naomi/zarr-minimal/ScenarioMIP/MIROC/MIROC6/ssp585/r30i1p1f1/Amon/hurs/gn saved to GCS

1917/2241 ScenarioMIP/MIROC/MIROC6/ssp585/r23i1p1f1/Amon/hurs/gn
netcdfs/hurs_Amon_MIROC6_ssp585_r23i1p1f1_gn_201501-210012.nc
85 86
successfully uploaded to gs://