# To Handle New Data Requests Automatically
- beginning of notebook is assumed to be interactive until the requests have been checked
- all progress and exception logging is done only for main loop

In [1]:
import os
import subprocess

import gcsfs
import numpy as np
import pandas as pd
import xarray as xr

### Local modules

In [2]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from netcdf import get_ncfiles, concatenate
from identify import needed
from response import response, dict_to_dfcat, get_details

### Initialization

In [3]:
# Anonymous, read-only handle on Google Cloud Storage; used further down to
# look inside the gs://cmip6 bucket and to re-open uploaded stores.
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')

# ONLY WHEN NECESSARY: Re-initialize cmip6-master.csv with enhanced csv file
#url_cloud = 'https://storage.googleapis.com/cmip6/cmip6.csv'  
#df = pd.read_csv(url_cloud)
#df.to_csv('csv/cmip6-master.csv',index=False)

### Choose basic configuration parameters

In [4]:
# Mapping of ESGF search-node names to the value handed to search(); show the
# available names, then select the node to query.
dtype = esgf_search_sites()
print('possible ESGF API search nodes: ', [*dtype.keys()])
ESGF_site = dtype['llnl']

# Broken or slow data nodes: never fetch netcdf files from these hosts
skip_sites = [
    'dist.nmlab.snu.ac.kr',
    'esg.lasg.ac.cn',
    'esgf-data2.diasjp.net',
]

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'jpl', 'gfdl', 'dkrz']


### Load the previously processed requests

In [5]:
# Requests recorded by earlier runs; passed to requests() below so that it can
# tell which sheet rows are new.
df_prior = pd.read_csv('csv/requests.csv')
#df_prior

### Get new Google Sheet requests
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [6]:
# new, but also some older rows:
df_request_new, dtrouble = requests(df_prior,rows = [63])

# new, but also some older emails:
#df_request_new, dtrouble = requests(df_prior,emails=['henrifdrake@gmail.com'])

# only new:
#df_request_new, dtrouble = requests(df_prior)

request_id = set_request_id()
if len(dtrouble)>=1:
    print(dtrouble)

df_request_new

Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester
63,10/15/2019 23:45:29,dbalwada@uw.edu,,[],"[historical, piControl]",[All],[vsf],Omon,Dhruv Balwada


### Search ESGF for the availability of requested data

In [7]:
# Ask the selected ESGF search node what is available for the new requests
df_ESGF = search(ESGF_site,df_request_new)
# Number of rows the search returned
len(df_ESGF)


dbalwada@uw.edu
Omon vsf ['All'] historical
Omon vsf ['All'] piControl


88

### Get the master list of existing zarr stores

In [8]:
# Local copy of the master catalog ("what we have"); compared against the
# requests by needed() and extended with the new stores at the end.
df_master = pd.read_csv('csv/cmip6-master.csv')

### Check the new requests
- a requested dataset that already exists in df_master (what we have) is not needed
- a requested dataset must exist in df_ESGF (what is available); if it is not available, it is not needed either

In [9]:
# Reduce the requests to what still has to be fetched: available on ESGF
# (df_ESGF) and not already present in the master catalog (df_master).
df_needed = needed(df_master, df_request_new, df_ESGF)

if len(df_needed) == 0:
    # Define num_stores on this path too, so later cells never hit a NameError.
    num_stores = 0
    print('no new data available')
    # A bare `exit` is only a name lookup and stops nothing.  Raising
    # SystemExit ends this cell with an error status, which halts "Run All"
    # while leaving the kernel (and all variables) alive for inspection.
    raise SystemExit('stopping: nothing needs to be downloaded')

num_stores = df_needed.zstore.nunique()
print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')

#df_needed.zstore.values

needed: nfiles=12, nstores=5


### Start logging the progress and exceptions

In [10]:
# Per-request output files, both named after this run's request_id:
# the catalog csv and the progress/exception log.
cat_file = f'csv/cmip6_{request_id}.csv'
log_file = f'txt/request_{request_id}.log'

In [11]:
# open and close for each write in case of kernel interrupt
def write_log(file,str):
    f = open(file,'a')
    print(str)
    f.write(str+'\n')
    f.close()
    return

### The real work is done in this next loop - can be done in parallel

In [12]:
def _run(args):
    """Run an external command (argument list, no shell); log instead of raising.

    Returns the exit status, or None when the program could not be started.
    """
    try:
        returncode = subprocess.run(args).returncode
    except OSError as err:
        write_log(log_file,f'could not run {args[0]}: {err}')
        return None
    if returncode != 0:
        write_log(log_file,f'{args[0]} exited with status {returncode}')
    return returncode


new_zarrs = df_needed.zstore.unique()

zdict = {}  # construct dictionary for new rows to add to master catalog
for item,zarr in enumerate(new_zarrs):
    #zbdir  = 'ztemp'  + zarr
    zbdir  = '/d5/naomi/zarr-minimal'  + zarr

    # Total is taken from new_zarrs itself, so this cell does not rely on
    # num_stores having been set by a branch of an earlier cell.
    write_log(log_file,f"\n{item+1}/{len(new_zarrs)}: local file: {zbdir}")

    # is the consolidated store already in the bucket?
    # exists() answers that directly; listing a missing object raises
    # FileNotFoundError in current gcsfs instead of returning an empty list.
    gsurl = 'gs://cmip6' + zarr
    if fs.exists(gsurl+'/.zmetadata'):
        write_log(log_file,'store already in cloud')
        continue

    # does it exist locally?
    if os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,f'already exists: {zbdir}')
        continue

    gfiles,troubles = get_ncfiles(zarr,df_needed,skip_sites)
    if len(gfiles) == 0:
        write_log(log_file,'no files available')
        continue

    # zarr paths end in .../<variable_id>/<grid_label>
    variable_id = zarr.split('/')[-2]
    for gfile in gfiles:   # changes file sizes!!
        # delete the missing_value attribute of the variable, in place;
        # passed as an argument list, so no shell escaping of the commas
        _run(['/usr/bin/ncatted','-h','-O','-a',f'missing_value,{variable_id},d,,',gfile])

    # concatenate in time with mfdataset
    status, ds, dstr = concatenate(zarr,gfiles)
    write_log(log_file,dstr)
    if status == 'failure':
        continue

    ds.to_zarr(zbdir, consolidated=True, mode='w')

    if not os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,'to_zarr failure')
        continue

    vlist = get_details(ds,zbdir,zarr)

    # upload to cloud
    upload = ['/usr/bin/gsutil','-m','cp','-r',zbdir,gsurl]
    write_log(log_file,' '.join(upload))
    _run(upload)

    # verify the upload by re-opening the store from the bucket; only the
    # open itself is guarded, and the reason for a failure is logged
    try:
        ds = xr.open_zarr(fs.get_mapper(gsurl), consolidated=True)
    except Exception as err:
        write_log(log_file,f'store did not get saved to GCS properly: {err}')
        continue

    zdict[item] = vlist
    write_log(log_file,f'successfully saved as {zbdir}')

    # the downloaded netcdf files are no longer needed
    for gfile in gfiles:
        try:
            os.remove(gfile)
        except FileNotFoundError:
            pass  # already gone: same outcome as `rm -f`
        except OSError as err:
            write_log(log_file,f'could not remove {gfile}: {err}')
    


1/5: local file: /d5/naomi/zarr-minimal/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/Omon/vsf/gn
no files available

2/5: local file: /d5/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM/historical/r2i1p1f1/Omon/vsf/gn
curl http://aims3.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2-WACCM/historical/r2i1p1f1/Omon/vsf/gn/v20190227/vsf_Omon_CESM2-WACCM_historical_r2i1p1f1_gn_185001-201412.nc -o nctemp/vsf_Omon_CESM2-WACCM_historical_r2i1p1f1_gn_185001-201412.nc

/usr/bin/gsutil -m cp -r /d5/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM/historical/r2i1p1f1/Omon/vsf/gn gs://cmip6/CMIP/NCAR/CESM2-WACCM/historical/r2i1p1f1/Omon/vsf/gn
successfully saved as /d5/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM/historical/r2i1p1f1/Omon/vsf/gn

3/5: local file: /d5/naomi/zarr-minimal/CMIP/NCAR/CESM2-WACCM/historical/r3i1p1f1/Omon/vsf/gn
curl http://aims3.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/NCAR/CESM2-WACCM/historical/r3i1p1f1/Omon/vsf/gn/v20190227/vsf_Omon_CESM2-WACCM_historical_r3i1p1f1_gn_

### Make a table of acquired data to send in an email to the requester

In [13]:
# Convert the per-store details collected by the main loop into catalog rows.
if len(zdict) == 0:
    print('nothing else to do')
    # A bare `exit` is only a name lookup and stops nothing, which would leave
    # `dz` undefined for the cells below.  Raising SystemExit ends this cell
    # with an error status and halts "Run All" without killing the kernel.
    raise SystemExit('stopping: no new stores were added')

dz = dict_to_dfcat(zdict)

In [14]:
# Display the new catalog rows (these are appended to the master catalog next)
dz

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,date_start,date_stop,time_len,sizeG,dcpp_init_year
1,CMIP,NCAR,CESM2-WACCM,historical,r2i1p1f1,Omon,vsf,gn,gs://cmip6/CMIP/NCAR/CESM2-WACCM/historical/r2...,1850-01-15,2014-12-15,1980,0.629,
2,CMIP,NCAR,CESM2-WACCM,historical,r3i1p1f1,Omon,vsf,gn,gs://cmip6/CMIP/NCAR/CESM2-WACCM/historical/r3...,1850-01-15,2014-12-15,1980,0.629,
3,CMIP,UA,MCM-UA-1-0,historical,r1i1p1f2,Omon,vsf,gn,gs://cmip6/CMIP/UA/MCM-UA-1-0/historical/r1i1p...,1850-01-17,2014-12-17,1980,0.0705,
4,CMIP,FIO-QLNM,FIO-ESM-2-0,piControl,r1i1p1f1,Omon,vsf,gn,gs://cmip6/CMIP/FIO-QLNM/FIO-ESM-2-0/piControl...,0301-01-16,0875-12-16,6900,2.19,


In [15]:
# Append the newly created stores to the in-memory master catalog.
# NOTE(review): row labels are kept as-is, so labels from dz can repeat labels
# already present in df_master — confirm response() does not need a unique index.
df_master_new = pd.concat([df_master, dz])

In [16]:
# Report, for the new requests, what the updated catalog now holds
response(df_request_new,df_master_new)

GCS request for: dbalwada@uw.edu ; table_id= Omon
(number of member_ids)*(number of grid_labels)
--------------------------
variable_id                vsf
experiment_id source_id       
historical    CESM2         11
              CESM2-WACCM    3
              MCM-UA-1-0     2
              NorCPM1        1
piControl     CESM2          1
              CESM2-WACCM    1
              FIO-ESM-2-0    1
              MCM-UA-1-0     1
              NorESM1-F      1



In [17]:
# Overwrite csv/requests.csv (the file loaded into df_prior near the top).
# NOTE(review): presumably csv/request_new.csv was written by requests() — confirm.
! mv csv/request_new.csv csv/requests.csv