# To Handle New Data Requests Automatically
- The beginning of the notebook is assumed to be run interactively, until the requests have been checked
- Progress and exception logging is done only in the main loop
- Copy and paste the e-mail response and send it from the gcs.cmip6.ldeo@gmail.com account

In [None]:
import pandas as pd
import gcsfs
import xarray as xr
from datetime import datetime
import itertools

### Local modules

In [None]:
import myconfig
from mytasks import Check, Download, ReadFiles, SaveAsZarr, Upload, Cleanup
from mysearch import esgf_search
from myrequest import requests, set_request_id
from myresponse import response, get_details, dict_to_dfcat

In [None]:
# CONFIGURE ESGF Search here
# Local aliases for two settings held on the shared myconfig module.
node_pref = myconfig.node_pref
dtype = myconfig.dtype
# Set the shared local_target_prefix attribute (echoed by the print at the end of this cell).
myconfig.local_target_prefix = '/h112/naomi/zarr-minimal/'

# reset the preference rank to omit a particular data node
# node_pref['esgf-data1.llnl.gov'] = 999

# Pick the entry of dtype that is later passed to esgf_search as `server`;
# uncomment the alternative line to switch to the other entry.
ESGF_site = dtype['llnl']
#ESGF_site = dtype['dkrz']

print('zarrs will be written to: ',myconfig.local_target_prefix)

In [None]:
# CONFIGURE GCS
# Anonymous, read-only gcsfs file system handle.
fs     = gcsfs.GCSFileSystem(token='anon', access='read_only',cache_timeout=-1)
# Catalog CSV of the existing zarr stores; dtype='unicode' reads every column as text.
df_GCS = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')

# make available to all modules
myconfig.fs = fs
myconfig.df_GCS = df_GCS

### Get new Google Sheet requests
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [None]:
# Requests recorded so far; handed to requests() below together with the optional filters.
df_prior = pd.read_csv('csv/requests.csv')

# Optional filters: sheet rows and/or requester e-mails whose (older) entries should be included.
rows = []
emails = []
rows = [3] #range(0,5) #[0,5,9]
#emails = ['c.wang@princeton.edu']

df_request_new, dtrouble = requests(df_prior,rows=rows,emails=emails)
request_id = set_request_id()

# Check for mal-formed requests (non-existent variables, etc)
if len(dtrouble) >= 1:
    print(dtrouble)

# print all active requests:
display(df_request_new)

# How to narrow the active requests down:
#   'input' - prompt for a single row index
#   'last'  - keep the rows sharing the first timestamp returned by unique()
#             NOTE(review): this is only the latest request if requests() orders newest first - confirm
# Deliberately not called `type`: that name would shadow the builtin for every later cell.
#select_mode = 'input'
select_mode = 'last'

if select_mode == 'input':
    resp = input('index?  (after entering number, click on next cell to advance)')
    # double brackets keep the selection a one-row DataFrame rather than a Series
    df_request_new = df_request_new.loc[[int(resp)]]
else:
    timestamps = df_request_new.Timestamp.unique()
    df_request_new = df_request_new[df_request_new.Timestamp == timestamps[0]]

df_request_new

In [None]:
# Request-sheet column name -> ESGF search facet name
search_keys = {
    'table': 'table_id',
    'experiments': 'experiment_id',
    'variables': 'variable_id',
    'models': 'source_id',
    'members': 'member_id',
}
all_search = {}

for item, row in df_request_new.iterrows():
    request = item
    # the table is always a single value, wrapped in a list
    all_search['table_id'] = [row.table]
    for key, facet in search_keys.items():
        if key == 'table':
            continue
        klist = row[key]
        # 'All' / 'One' mean "do not constrain this facet"
        # Note, we no longer get just one member_id without specifying which
        if 'All' in klist or 'One' in klist:
            continue
        all_search[facet] = klist

# the label is built from the index of the last request row iterated over
label = f'request-{request}'

label, all_search

In [None]:
# Set update_ESGF to True to run the ESGF searches again and rewrite the saved CSV;
# when False, the results saved for this request label are loaded instead.
update_ESGF = False
if not update_ESGF:
    df_ESGF = pd.read_csv(f'csv/ESGF_{label}.csv', dtype='unicode')
else:
    # one search per combination of the requested facet values
    searches = list(itertools.product(*all_search.values()))

    dESGF = []
    for s in searches:
        search = dict(zip(all_search.keys(), s))
        print(search)
        df = esgf_search(search, server=ESGF_site)
        # keep only the searches that returned something
        if len(df) > 0:
            dESGF.append(df)

    df_ESGF = pd.concat(dESGF)
    df_ESGF.to_csv(f'csv/ESGF_{label}.csv',index=False)

len(df_ESGF), len(df_ESGF.ds_dir.unique())

In [None]:
# make df of all needed
# Set NewNeeded to True to recompute the table from df_ESGF and df_GCS and save it;
# when False, the table saved for this request label is loaded instead.
NewNeeded = False
if NewNeeded:
    # Outer merge with an indicator column, then keep the rows found only in df_ESGF
    df = pd.merge(df_ESGF,df_GCS, how='outer', indicator=True)
    df_needed = df[df._merge == 'left_only']

    # Drop every column that did not come from df_ESGF (this includes the _merge indicator)
    keep_keys = df_ESGF.keys()
    all_keys = df.keys()
    drop_keys = list(set(all_keys) - set(keep_keys))
    # columns= keyword: passing the axis positionally is deprecated and rejected by pandas 2.x
    df_needed = df_needed.drop(columns=drop_keys)

    num_stores = 0
    if len(df_needed) > 0:
        num_stores = df_needed.ds_dir.nunique()
        print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    else:
        # Nothing to fetch; execution continues and the (empty) table is still written below.
        print('no new data available')

    # integer between the last 'r' and the following 'i' of member_id, e.g. 'r3i1p1f1' -> 3
    df_needed['member'] = [int(s.split('r')[-1].split('i')[0]) for s in df_needed['member_id']]
    # for each row, the file_size sum over all rows sharing that row's ds_dir
    df_needed['zsize'] = [df_needed[df_needed.ds_dir==zs]['file_size'].sum() for zs in df_needed['ds_dir']]
    # ascending zsize
    df_needed = df_needed.sort_values(by=['zsize'])

    df_needed.to_csv(f'csv/needed_{label}.csv',index=False)
else:
    df_needed = pd.read_csv(f'csv/needed_{label}.csv')

len(df_needed), len(df_needed.ds_dir.unique())

In [None]:
# make available to all modules
# (stored as an attribute on the shared myconfig module)
myconfig.df_needed = df_needed

In [None]:
# Distinct ds_dir values of the needed-files table, and how many there are.
# The main loop iterates over ds_dirs and uses numdsets in its progress messages.
ds_dirs = df_needed.ds_dir.unique()
numdsets = len(ds_dirs)

In [None]:
# NOTE: this replaces the request_id obtained from set_request_id() with a fixed placeholder;
# the commented-out expression would build a timestamp-based id instead.
request_id = 'test' #datetime.now().strftime('%Y%m%d-%H%M')
# Log files for this request label; the main loop appends to them through write_log.
progress_log  = f'logs/progress_{label}.log'
failure_log  = f'logs/failure_{label}.log'
success_log  = f'logs/success_{label}.log'

In [None]:
def write_log(file,str,verbose=False):
    """Append one line of text to a log file.

    Parameters
    ----------
    file : path of the log file; it is opened in append mode, so it is
        created if missing and existing content is kept.
    str : the message; written followed by a newline.
    verbose : when true, the message is also printed before it is written.

    Returns
    -------
    None
    """
    # `with` guarantees the handle is closed even if print() or write() raises
    with open(file,'a') as f:
        if verbose:
            print(str)
        f.write(f'{str}\n')
    return

In [None]:
# Main loop: run every needed dataset directory through
# Check -> Download -> ReadFiles -> SaveAsZarr -> Upload -> Cleanup.
# Each step reports an error code (ierr) and an exception/message (exc); a positive
# ierr is logged and the loop moves on to the next dataset.

# reload the catalog
# NOTE(review): only the notebook-level df_GCS is refreshed here; myconfig.df_GCS keeps
# the copy assigned earlier - confirm that is intended.
df_GCS = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')

# refresh the gcsfs
fs.invalidate_cache()

zdict = {}          # loop index -> details of each successfully processed dataset
verbose = False     # True echoes every log line to the notebook output as well

for item, ds_dir in enumerate(ds_dirs):
    #if item > 0:
    #    continue
    print(f'\n{item}/{numdsets-1}',ds_dir)

    # a positive code from Check goes to the progress log, not the failure log
    ierr, exc = Check(ds_dir)
    if ierr > 0:
        write_log(progress_log, f'{ds_dir} {ierr}: {exc}', verbose=verbose)
        continue

    gfiles, ierr, exc = Download(ds_dir)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}', verbose=verbose)
        continue

    ds, ierr, exc = ReadFiles(ds_dir, gfiles)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}', verbose=verbose)
        continue

    ierr, exc = SaveAsZarr(ds_dir, ds)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}', verbose=verbose)
        continue

    gsurl, ierr, exc = Upload(ds_dir)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}', verbose=verbose)
        continue

    ierr, exc = Cleanup(ds_dir, gfiles)
    if ierr > 0:
        write_log(failure_log, f'{ds_dir},noUse, {ierr}: {exc}', verbose=verbose)
        continue

    # every step succeeded: remember the dataset details and log the upload location
    vlist = get_details(ds_dir, ds)
    zdict[item] = vlist

    write_log(success_log, f'{item}/{numdsets-1}: {ds_dir} saved to {gsurl}', verbose=verbose)

In [None]:
# Always raises AssertionError - presumably a deliberate stop so that running all cells
# halts here, before the e-mail section below.
assert False

### Make a table of acquired data to send in email to requestor

In [None]:
# Turn the per-dataset details collected by the main loop into a catalog-style table.
if zdict:
    dz = dict_to_dfcat(zdict)
else:
    # No dataset was processed successfully: dz is left undefined and execution continues
    # (a bare `exit` name is only an expression and never stopped anything).
    print('nothing else to do')

In [None]:
# Display the zstore column of the new-stores table (raises NameError if dz was never created)
dz.zstore.values

In [None]:
# Catalog as it will look once the new stores are included.
try:
    df_master_new = pd.concat([df_GCS, dz],sort=True)
except Exception:
    # Best effort: if dz does not exist (nothing new was stored) or cannot be concatenated,
    # fall back to the existing catalog. `Exception` rather than a bare except, so that
    # KeyboardInterrupt and SystemExit are not swallowed.
    df_master_new = df_GCS

In [None]:
# Print, for every active request row, the text of the e-mail reply followed by the
# request itself, the number of new stores and the table built by response().
ldict = []      # request dicts (without the 'response status' entry), one per row
names = ""      # requester names, concatenated as-is
print('Re: CMIP6 GCS Data Request (Responses)')
for row in df_request_new.values:
    # column name -> value for this request row
    rdict = dict(zip(df_request_new.keys(),row))
    #print(rdict)
    name = rdict['requester']
    timestamp = rdict['Timestamp']
    names += name
    # the status entry is removed before the request is shown to the requester
    del rdict['response status']
    ldict += [rdict]
    # all request rows sharing this row's timestamp
    dfr = df_request_new[df_request_new.Timestamp == timestamp]

    print('Dear',name+':')
    print('\n  Here are the results from your recent CMIP6 data request(s).  The master catalog, https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv, will be updated with the nightly build.')
    #if len(dtrouble)>=1:
    #    print('\n '+dtrouble)
    print('\n  Please note: ')
    print('      - Data for some models (e.g., CAS/FGOALS-f3-L and NUIST/NESM3) must be obtained directly from servers which are too slow or unresponsive. ')
    print('      - We exclude data with known errors (as reported at ES-DOC) from the official listing at https://errata.es-doc.org/ .')
    print('        However, data labelled status=resolved or severity=low are included in the master catalog.')

    print('      - Some data we have not been able to clean up enough to get it concatenated and save to zarr. Other datasets are only available for disjointed time periods.')
    print('\n  See the sample Jupyter Notebook at https://gist.github.com/naomi-henderson/ed1801d8ee8b992dda252f8b126876a5 for a quick introduction to accessing the data.')
    print('\nFrom the folks at:\n  The Climate Data Science Lab\n  Division of Ocean and Climate Physics\n  LDEO/Columbia University')
    print('\n--------------------------')

    print('\nrequest:')
    display(rdict)

    print('\nresponse:')
    try:
        print('new stores added:\n',len(dz),'\n')
    except NameError:
        # dz was never created, i.e. no dataset was stored in this run
        print(f'no new data available at ESGF API search node {ESGF_site}')

    #print('\n',dfr,len(df_master_new))
    table = response(dfr,df_master_new)

    print("\navailable data:\n  this includes your new stores but does not include datasets marked 'onhold', 'wontfix' or 'new' in the ES-DOC ERRATA")
    display(table)
    print('\n\n')