# To Handle New Data Requests Automatically
- beginning of notebook is assumed to be interactive until the requests have been checked
- all progress and exception logging is done only for main loop
- copy and paste the e-mail response and send from gcs.cmip6.ldeo@gmail.com account

In [1]:
import numpy as np
import pandas as pd
import os
import gcsfs
import xarray as xr

### Local modules

In [2]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from netcdf import get_ncfiles, concatenate
from identify import needed
from response import response, dict_to_dfcat, get_details

### Initialization

In [3]:
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')

# ONLY WHEN NECESSARY: Re-initialize cmip6-master.csv with enhanced csv file
#url_cloud = 'https://storage.googleapis.com/cmip6/cmip6.csv'  
#df = pd.read_csv(url_cloud)
#df.to_csv('csv/cmip6-master.csv',index=False)

zarr_local = '/h36/naomi/zarr-minimal'

### Choose basic configuration parameters

In [4]:
dtype = esgf_search_sites()
print('possible ESGF API search nodes: ',list(dtype.keys()))
ESGF_site = dtype['llnl']

# Skip the following sites for getting netcdf files: broken or slow sites
skip_sites = ['dist.nmlab.snu.ac.kr','esg.lasg.ac.cn','esgf-data2.diasjp.net']

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'jpl', 'gfdl', 'dkrz']


### Get new Google Sheet requests

In [5]:
df_prior = pd.read_csv('csv/requests.csv')
#df_prior

### Get new Google Sheet requests
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [6]:
Debug = False
# new, but also some older rows:
df_request_new, dtrouble = requests(df_prior,rows = [67])

# new, but also some older emails:
#df_request_new, dtrouble = requests(df_prior,emails=['henrifdrake@gmail.com'])

# only new:
#df_request_new, dtrouble = requests(df_prior)

request_id = set_request_id()
if len(dtrouble)>=1:
    print(dtrouble)

df_request_new

Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester
67,11/10/2019 11:05:14,naomi@ldeo.columbia.edu,,[All],[All],[All],"[hfgeoubed, lithk, topg]",IfxGre,Naomi Henderson


### Search ESGF for the availability of requested data

In [7]:
df_ESGF = search(ESGF_site,df_request_new)


naomi@ldeo.columbia.edu
IfxGre hfgeoubed ['All'] ['All']
IfxGre lithk ['All'] ['All']
IfxGre topg ['All'] ['All']


In [8]:
if Debug:
    source_ids = df_ESGF.source_id.unique()
    df_ESGF = df_ESGF[df_ESGF.source_id==source_ids[1]]
    df_ESGF

### Get the master list of existing zarr stores

In [9]:
df_master = pd.read_csv('gs://cmip6/cmip6-zarr-consolidated-stores-noQC.csv')

### Check the new requests:
- already exists in df_master (what we have) - not needed
- exists in df_ESGF (what is available) - if not available, then not needed

In [10]:
df_needed = needed(df_master, df_request_new, df_ESGF)

if len(df_needed) > 0:
    num_stores = df_needed.zstore.nunique() 
    print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
else:
    print('no new data available')
    exit
    
print(df_needed.zstore.unique())

needed: nfiles=6, nstores=6
['/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/hfgeoubed/gn'
 '/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/hfgeoubed/gn'
 '/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/lithk/gn'
 '/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/lithk/gn'
 '/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/topg/gn'
 '/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/topg/gn']


### Start logging the progress and exceptions

In [11]:
cat_file = 'csv/cmip6_'+request_id+'.csv'
log_file = 'txt/request_'+request_id+'.log'

In [12]:
# open and close for each write in case of kernel interrupt
def write_log(file,str):
    f = open(file,'a')
    print(str)
    f.write(str+'\n')
    f.close()
    return

### The real work is done in this next loop - can be done in parallel

In [13]:
new_zarrs = df_needed.zstore.unique()

zdict = {}  # construct dictionary for new rows to add to master catalog
for item,zarr in enumerate(new_zarrs):
    #if item > 20:
    #    continue
    #zbdir  = 'ztemp'  + zarr
    zbdir  = zarr_local  + zarr
    
    write_log(log_file,f"\n{item+1}/{num_stores}: local file: {zbdir}")
    
    gsurl = 'gs://cmip6' + zarr
    contents = fs.ls(gsurl+'/.zmetadata')
    if any("zmetadata" in s for s in contents):
        write_log(log_file,'store already in cloud')
        continue

    # does it exist locally? 
    if os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,f'already exists: {zbdir}')
        continue

    gfiles,troubles = get_ncfiles(zarr,df_needed,skip_sites)
    if len(gfiles) == 0: 
        write_log(log_file,'no files available')
        continue
    
    variable_id = zarr.split('/')[-2]
    for gfile in gfiles:   # changes file sizes!!
        command = '/usr/bin/ncatted -h -O -a missing_value\,'+variable_id+',d,, '+gfile
        os.system(command)
    
    # concatenate in time with mfdataset
    print(gfiles)
    status, ds, dstr = concatenate(zarr,gfiles)  

    if status == 'failure':
        print(status,dstr)
        write_log(log_file,dstr)
        continue
    else:
        write_log(log_file,dstr)

    ds.to_zarr(zbdir, consolidated=True, mode='w')  

    if not os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,'to_zarr failure')
        continue
   
    vlist = get_details(ds,zbdir,zarr)
    
    # upload to cloud
    command = '/usr/bin/gsutil -m cp -r '+ zbdir + ' ' + gsurl
    write_log(log_file,command)
    os.system(command) 
        
    try:
        ds = xr.open_zarr(fs.get_mapper(gsurl), consolidated=True)
        zdict[item] = vlist
        write_log(log_file,f'successfully saved as {zbdir}') 
        for gfile in gfiles:
            os.system('rm -f '+ gfile)
    except:
        write_log(log_file,'store did not get saved to GCS properly')
    


1/6: local file: /h36/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/hfgeoubed/gn
curl http://esgf-data.ucar.edu/thredds/fileServer/esg_dataroot/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/hfgeoubed/gn/v20191120/hfgeoubed_IfxGre_CESM2-WACCM-FV2_historical_r1i1p1f1_gn.nc -o nctemp/hfgeoubed_IfxGre_CESM2-WACCM-FV2_historical_r1i1p1f1_gn.nc
['nctemp/hfgeoubed_IfxGre_CESM2-WACCM-FV2_historical_r1i1p1f1_gn.nc']

/usr/bin/gsutil -m cp -r /h36/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/hfgeoubed/gn gs://cmip6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/hfgeoubed/gn
successfully saved as /h36/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/hfgeoubed/gn

2/6: local file: /h36/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/hfgeoubed/gn
curl http://esgf-data.ucar.edu/thredds/fileServer/esg_dataroot/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/hfgeoubed/gn/v20

### Make a table of aquired data to send in email to requestor

In [15]:
if len(zdict) == 0 :
    print('nothing else to do')
    exit
else:
    dz = dict_to_dfcat(zdict)

In [16]:
df_master_new = pd.concat([df_master, dz])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [26]:
from IPython.display import display

ldict = []
names = ""
for row in df_request_new.values:
    rdict = dict(zip(df_request_new.keys(),row))
    names += rdict['requester']
    del rdict['response status']
    ldict += [rdict]
    
print('Dear',names+':')
print('\n  Here are the results from your recent CMIP6 data request(s).  The master catalog will be updated with the nightly build.')
print('\nFrom the folks at:\n  The Climate Data Science Lab\n  Division of Ocean and Climate Physics\n  LDEO/Columbia University')

print('\n--------------------------')

print('\nrequest:')
display(ldict[0])

print('\nnew stores added:\n',dz.zstore.values,'\n')

table = response(df_request_new,df_master_new)

print('available data (including the new stores):')
display(table)

Dear Naomi Henderson:

  Here are the results from your recent CMIP6 data request(s).  The master catalog will be updated with the nightly build.

From the folks at:
  The Climate Data Science Lab
  Division of Ocean and Climate Physics
  LDEO/Columbia University

--------------------------

request:


{'Timestamp': '11/10/2019 11:05:14',
 'E-mail': 'naomi@ldeo.columbia.edu',
 'members': ['All'],
 'experiments': ['All'],
 'models': ['All'],
 'variables': ['hfgeoubed', 'lithk', 'topg'],
 'table': 'IfxGre',
 'requester': 'Naomi Henderson'}


new stores added:
 ['gs://cmip6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/hfgeoubed/gn'
 'gs://cmip6/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/hfgeoubed/gn'
 'gs://cmip6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/lithk/gn'
 'gs://cmip6/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/lithk/gn'
 'gs://cmip6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/IfxGre/topg/gn'
 'gs://cmip6/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/IfxGre/topg/gn'] 

available data (including the new stores):


Unnamed: 0_level_0,variable_id,hfgeoubed,lithk,topg
experiment_id,source_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1pctCO2,CESM2,1,1,1
1pctCO2,CESM2-WACCM,1,1,1
amip,CESM2,3,3,3
amip,CESM2-WACCM,3,3,2
esm-hist,CESM2,2,0,1
esm-piControl,CESM2,1,1,1
historical,CESM2,11,11,11
historical,CESM2-WACCM,3,3,3
historical,CESM2-WACCM-FV2,1,1,1
lig127k,CESM2,1,1,1


In [18]:
! mv csv/request_new.csv csv/requests.csv