# Handle New Data Requests Automatically
- the first part of the notebook is meant to be run interactively, until the requests have been checked
- progress and exceptions are logged only in the main loop
- copy and paste the generated e-mail response and send it from the gcs.cmip6.ldeo@gmail.com account

In [None]:
import numpy as np
import pandas as pd
import os
import gcsfs
import xarray as xr
from functools import partial
from IPython.display import display
from glob import glob

### Local modules

In [None]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from netcdf import get_ncfiles, concatenate
from identify import needed
from response import response, dict_to_dfcat, get_details

In [None]:
def getFolderSize(p):
    """Return the total size in bytes of all files found below directory `p`.

    Every entry of `p` that is a regular file contributes its size; every other
    entry is treated as a directory and descended into recursively.
    """
    total = 0
    for entry in os.listdir(p):
        path = os.path.join(p, entry)
        if os.path.isfile(path):
            total += os.path.getsize(path)
        else:
            # not a regular file: recurse (same treatment for every non-file entry)
            total += getFolderSize(path)
    return total

### Initialization

In [None]:
# Where to write local zarr stores:
# (absolute, machine-specific path; each store's zstore path is appended to it in the main loop)
zarr_local = '/d1/naomi/zarrs'

# Anonymous, read-only handle on Google Cloud Storage; it is re-created with
# cache_timeout=-1 right before the main loop.
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')

### Choose basic configuration parameters

In [None]:
# Dict-like lookup of ESGF search nodes, keyed by short name ('llnl', 'dkrz', ...).
# NOTE(review): the name `dtype` is unrelated to the pandas `dtype=` keyword used further down.
dtype = esgf_search_sites()

# Default; overridden on the line that selects the search node below.
local_node = False

print('possible ESGF API search nodes: ',list(dtype.keys()))

# Select exactly one search node by leaving one of the following lines uncommented.
ESGF_site = dtype['llnl'];local_node = True
#ESGF_site = dtype['dkrz']
#ESGF_site = dtype['ipsl']
#ESGF_site = dtype['ceda'];local_node = False  # CEDA doesn't allow local-only searches

# List data nodes to skip for acquiring new netcdf files: broken or slow sites
skip_sites = ['esg.lasg.ac.cn','esgf-data2.diasjp.net','esgf-cnr.hpc.cineca.it'] #['dist.nmlab.snu.ac.kr']

The complete archive of CMIP6 output is made available for search and download via any one of the following portals:

USA, PCMDI/LLNL (California) - https://esgf-node.llnl.gov/search/cmip6/

France, IPSL - https://esgf-node.ipsl.upmc.fr/search/cmip6-ipsl/

Germany, DKRZ - https://esgf-data.dkrz.de/search/cmip6-dkrz/

UK, CEDA - https://esgf-index1.ceda.ac.uk/search/cmip6-ceda/

If you encounter slow responses from one search interface, you might try one of the other portals (perhaps one near you). Also note that the datasets themselves are stored (and partially replicated) on a federated system of data nodes, and again you may find differences from node to node in download speeds.

### Get prior Google Sheet requests

In [None]:
# Requests handled in earlier sessions (local copy of the request sheet).
df_prior = pd.read_csv('csv/requests.csv')
# Only the last expression of a cell is echoed, so the column names have to be
# displayed explicitly to be visible.
display(df_prior.keys())
df_prior.tail()

### Get new Google Sheet requests
https://docs.google.com/spreadsheets/d/1SGTSK_h4xWX3gdgpeWeCpL_vhzf6tnGPmxetO1gOlQc/edit?usp=sharing
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [None]:
# Defaults: with both lists empty only the new rows of the sheet are considered.
rows = []   
emails = []

# modify here:
# (listing sheet rows and/or requester e-mails adds older entries to the selection)
rows = [184]  # GithubTest
#emails = ['neil.swart@canada.ca']

# Returns the selected requests plus a report of problematic ones.
df_request_new, dtrouble = requests(df_prior,rows=rows,emails=emails)
# Identifier used below to name the catalog and log files of this session.
request_id = set_request_id()

# Print malformed requests (non-existent variables, etc)
if len(dtrouble)>=1:
    print(dtrouble)

df_request_new

In [None]:
# choose a new request to process:
timestamps = df_request_new.Timestamp.unique()
print(timestamps)
df_request_new = df_request_new[df_request_new.science == 'GithubTest']

df_request_new

In [None]:
# Show the free-text `comments` column of the selected request rows.
df_request_new.comments.values

### Search ESGF for the availability of requested data

In [None]:
# Query the chosen ESGF search node for the requested datasets.
# local_node is passed through from the configuration cell (True for the 'llnl' node as configured there).
print(ESGF_site)
df_ESGF = search(ESGF_site,df_request_new,local_node=local_node)

### Get the master list of existing zarr stores
- df_avail includes all stores, EVEN THOSE with known ES-DOC issues 

In [None]:
# Master list of existing zarr stores (the "noQC" catalog); `version` is read as text.
df_avail = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype={'version': 'unicode'})
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# two sizes are printed explicitly.
print(f'df_avail: {len(df_avail)} rows, df_ESGF: {len(df_ESGF)} rows')

# Peek at the first two download URLs returned by the ESGF search.
df_ESGF.HTTPServer_url.values[0:2]

### Check the new requests:
- already exists in df_avail (what we have) - not needed
- does not exist in df_ESGF (what is available) - cannot be downloaded, so it is not needed either

In [None]:
# Compare the request against what we already hold (df_avail) and what the
# ESGF search returned (df_ESGF); the result lists what still has to be fetched.
df_needed = needed(df_avail, df_request_new, df_ESGF)

if len(df_needed) > 0:
    num_stores = df_needed.zstore.nunique() 
    print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    #print(df_needed.zstore.unique())
else:
    # The original bare `exit` here only evaluated the name and stopped nothing.
    # It is dropped, and num_stores is set so that it is never undefined or
    # left over from a previous request when the later cells run.
    num_stores = 0
    print('no new data available')

In [None]:
#df_needed.zstore.unique()
# Print the distinct table / experiment / variable ids in the form  name = ['a','b'].
# Each line is a single print call: chaining prints with commas builds a tuple of
# None values that the notebook would echo as cell output.
print("table_id = '",*df_needed.table_id.unique(),"'",sep = "")
print("exps = ['" + "','".join(map(str, df_needed.experiment_id.unique())) + "']")
print("variables = ['" + "','".join(map(str, df_needed.variable_id.unique())) + "']")
#print("members = ['" + "','".join(map(str, df_needed.member_id.unique())) + "']")

In [None]:
# Deliberate stop: always raises AssertionError so that "Run All" halts here and the
# request summary above can be checked by hand before the download cells are run.
assert False 

### Start logging the progress and exceptions

In [None]:
# Output files of this session, both named after request_id.
cat_file = 'csv/cmip6_'+request_id+'.csv'   # NOTE(review): not referenced again in this notebook
log_file = 'txt/request_'+request_id+'.log'  # appended to by write_log in the main loop
print(log_file)

In [None]:
# open and close for each write in case of kernel interrupt
def write_log(file,str,verbose=True):
    """Append one line to a log file, optionally echoing it to stdout.

    Parameters
    ----------
    file : path of the log file; it is opened in append mode on every call.
    str : text of the message; a newline is appended when writing.
        (The name shadows the builtin but is kept for existing callers.)
    verbose : when True the message is also printed.

    Returns None.  A non-string message raises TypeError from the concatenation.
    """
    # The context manager closes the handle even when print or write raises,
    # which the manual open/close pair did not guarantee.
    with open(file,'a') as f:
        if verbose:
            print(str)
        f.write(str+'\n')

In [None]:
# Integer taken from member_id: the text after the last 'r' up to the next 'i'
# (e.g. 'r12i1p1f1' -> 12).
realization = [int(s.split('r')[-1].split('i')[0]) for s in df_needed['member_id']]

# One two-key sort instead of two successive single-key sorts: the default sort
# is not stable, so the earlier source_id ordering was not guaranteed to survive
# the second sort.  assign() also avoids writing a column into a possible slice.
df_needed = df_needed.assign(member=realization).sort_values(by=['member', 'source_id'])
#df_needed

### The real work is done in this next loop 
- could be done in parallel except for the writing to the log file

In [None]:
# reload the catalog
df_GCS = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')

# refresh the gcsfs
fs = gcsfs.GCSFileSystem(token='anon', access='read_only',cache_timeout=-1)

# One iteration per distinct zarr store, in the order they appear in df_needed
new_zarrs = df_needed.zstore.unique()

verbose = True

zdict = {}  # construct dictionary for new rows to add to master catalog
for item,zarr in enumerate(new_zarrs):

    zbdir  = zarr_local  + zarr

    write_log(log_file,f"\n>>{item+1}/{num_stores}:<< local file: {zbdir}",verbose=verbose)

    # is zarr already in cloud?
    gsurl = 'gs://cmip6' + zarr
    try:
        contents = fs.ls(gsurl)
    except FileNotFoundError:
        # a store that has never been uploaded has no listing at all
        contents = []
    if any("zmetadata" in s for s in contents):
        write_log(log_file,'store already in cloud',verbose=verbose)
        continue

    cstore = df_GCS[df_GCS.zstore == gsurl]

    if len(cstore) > 0:
        # or on a shelf drive which is already in cloud
        write_log(log_file,'store already in cloud catalog',verbose=verbose)
        continue

    # does zarr exist on active drives?
    zstrs = glob('/h*/naomi/zarr-minimal' + zarr + '/.zmetadata')
    if len(zstrs) > 0 :
        write_log(log_file,'store already exists locally, but not in cloud',verbose=verbose)
        continue

    # Download the needed netcdf files - reading the known trouble codes from database
    gfiles,troubles,codes,okay = get_ncfiles(zarr,df_needed,skip_sites)

    write_log(log_file,troubles,verbose=verbose)

    # explicit comparison kept on purpose: only a value equal to False skips the store
    if okay == False:
        continue

    if len(gfiles) == 0:
        write_log(log_file,'no files available',verbose=verbose)
        continue

    # concatenate in time with mfdataset
    gfiles = sorted(gfiles)
    status, ds, dstr = concatenate(zarr,gfiles,codes)

    if status == 'failure':
        write_log(log_file,status+dstr,verbose=verbose)
        continue
    else:
        write_log(log_file,dstr)

    # convert to zarr, with consolidated metadata
    ds.to_zarr(zbdir, consolidated=True, mode='w')

    if not os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,'to_zarr failure: ',verbose=verbose)
        continue

    vlist = get_details(ds,zbdir,zarr)

    # upload to cloud
    # [:-1] drops the last character of both paths - presumably the trailing '/'
    # of the zstore path; TODO confirm
    command = '/usr/bin/gsutil -m cp -r '+ zbdir[:-1] + ' ' + gsurl[:-1]
    write_log(log_file,command,verbose=verbose)
    # uncomment next line to really upload to GC
    # os.system(command)

    # Compare sizes in both directions: without abs() a truncated upload
    # (remote smaller than local) gave a negative difference and passed.
    size_remote = fs.du(gsurl)
    size_local = getFolderSize(zbdir)
    assert abs(size_remote - size_local) < 100, \
        f'size mismatch for {gsurl}: remote={size_remote}, local={size_local}'
    write_log(log_file,f'uploaded {zbdir} correctly',verbose=verbose)

    # Final check: the store must open from the cloud before it is catalogued.
    try:
        ds = xr.open_zarr(fs.get_mapper(gsurl), consolidated=True)
        zdict[item] = vlist
        write_log(log_file,f'successfully saved as {zbdir}')
    except Exception as err:
        # Exception (not a bare except) so that a kernel interrupt still stops the loop
        write_log(log_file,'store did not get saved to GCS properly')
        write_log(log_file,f'  reason: {err!r}')
        continue

    # The netcdf inputs are no longer needed; delete them without going through a shell.
    for gfile in gfiles:
        try:
            os.remove(gfile)
        except FileNotFoundError:
            pass  # already gone, same outcome as `rm -f`
        except OSError as err:
            write_log(log_file,f'could not remove {gfile}: {err!r}')


In [None]:
if len(zdict) == 0 :
    print('nothing else to do')
    # The original bare `exit` here was a no-op expression and is dropped.
    # dz is reset so that a frame left over from an earlier run cannot be
    # reported as new stores; pd.concat skips None entries and the e-mail cell
    # guards len(dz), so the following cells still run.
    dz = None
else:
    # Turn the per-store details collected in the main loop into catalog rows.
    dz = dict_to_dfcat(zdict)

In [None]:
# Deliberate stop: always raises AssertionError so that "Run All" does not continue
# into the e-mail cells and the final `mv` of the request database unattended.
assert False

### Make a table of acquired data to send in email to requester

In [None]:
# Catalog used for the e-mail table: the existing stores plus this session's new rows.
try:
    df_master_new = pd.concat([df_avail, dz],sort=True)
except NameError:
    # dz was never created: no new stores in this session
    df_master_new = df_avail
except Exception as err:
    # Still best effort, but no longer silent: a failed merge means the e-mail
    # table will not list the new stores.
    print(f'WARNING: could not merge new stores into the catalog ({err!r}); using df_avail only')
    df_master_new = df_avail

In [None]:
# Number of new stores, or None when the main loop produced none (dz missing or None).
# Narrow exception types replace the bare `except:` that previously wrapped the print.
try:
    num_new = len(dz)
except (NameError, TypeError):
    num_new = None

ldict = []
names = ""
print('Re: CMIP6 GCS Data Request (Responses)')
# One e-mail body is printed per row of the selected requests.
for row in df_request_new.values:
    rdict = dict(zip(df_request_new.keys(),row))
    #print(rdict)
    name = rdict['requester']
    timestamp = rdict['Timestamp']
    names += name
    # internal bookkeeping column, not shown to the requester (tolerate its absence)
    rdict.pop('response status', None)
    ldict += [rdict]
    # all request rows submitted with the same timestamp
    dfr = df_request_new[df_request_new.Timestamp == timestamp]
    
    print('Dear',name+':')
    print('\n  Here are the results from your recent CMIP6 data request(s).  The master catalog, https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv, will be updated with the nightly build.')
    #if len(dtrouble)>=1:
    #    print('\n '+dtrouble)
    print('\n  Please note: ')
    print('      - Data for some models (e.g., CAS/FGOALS-f3-L and NUIST/NESM3) must be obtained directly from servers which are too slow or unresponsive. ')
    print('      - We exclude data with known errors (as reported at ES-DOC) from the official listing at https://errata.es-doc.org/ .')
    print('        However, data labelled status=resolved or severity=low are included in the master catalog.')
    

    print('      - Some data we have not been able to clean up enough to get it concatenated and save to zarr. Other datasets are only available for disjointed time periods.')
    print('\n  See the sample Jupyter Notebook at https://gist.github.com/naomi-henderson/ed1801d8ee8b992dda252f8b126876a5 for a quick introduction to accessing the data.')
    print('\nFrom the folks at:\n  The Climate Data Science Lab\n  Division of Ocean and Climate Physics\n  LDEO/Columbia University')
    print('\n--------------------------')

    print('\nrequest:')
    display(rdict)

    print('\nresponse:')
    if num_new is None:
        print(f'no new data available at ESGF API search node {ESGF_site}')
    else:
        print('new stores added:\n',num_new,'\n')

    #print('\n',dfr,len(df_master_new))
    table = response(dfr,df_master_new)

    print("\navailable data:\n  this includes your new stores but does not include datasets marked 'onhold', 'wontfix' or 'new' in the ES-DOC ERRATA")
    display(table)
    print('\n\n')

In [None]:
# Replace request database with new database
# NOTE: this overwrites csv/requests.csv, the file loaded into df_prior at the top of
# the notebook.  csv/request_new.csv is presumably written by requests() - confirm
# that it exists and is complete before running this cell.
! mv csv/request_new.csv csv/requests.csv