# To Handle New Data Requests Automatically
- beginning of notebook is assumed to be interactive until the requests have been checked
- all progress and exception logging is done only for main loop

In [24]:
import numpy as np
import pandas as pd
import os
import gcsfs
import xarray as xr

### Local modules

In [25]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from netcdf import get_ncfiles, concatenate
from identify import needed
from response import response, dict_to_dfcat, get_details

### Initialization

In [26]:
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')

# ONLY WHEN NECESSARY: Re-initialize cmip6-master.csv with enhanced csv file
#url_cloud = 'https://storage.googleapis.com/cmip6/cmip6.csv'  
#df = pd.read_csv(url_cloud)
#df.to_csv('csv/cmip6-master.csv',index=False)

### Choose basic configuration parameters

In [41]:
dtype = esgf_search_sites()
print('possible ESGF API search nodes: ',list(dtype.keys()))
ESGF_site = dtype['llnl']

# Skip the following sites for getting netcdf files: broken or slow sites
skip_sites = ['dist.nmlab.snu.ac.kr','esg.lasg.ac.cn','esgf-data2.diasjp.net']

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'jpl', 'gfdl', 'dkrz']


### Get new Google Sheet requests

In [30]:
df_prior = pd.read_csv('csv/requests.csv')
#df_prior

### Get new Google Sheet requests
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [29]:
# new, but also some older rows:
#df_request_new, dtrouble = requests(df_prior,rows = [70])

# new, but also some older emails:
#df_request_new, dtrouble = requests(df_prior,emails=['henrifdrake@gmail.com'])

# only new:
df_request_new, dtrouble = requests(df_prior)

request_id = set_request_id()
if len(dtrouble)>=1:
    print(dtrouble)

df_request_new

Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester
74,12/11/2019 3:51:36,bbronselaer@email.arizona.edu,,[All],[1pctCO2],[All],[tos],Omon,Ben Bronselaer


### Search ESGF for the availability of requested data

In [31]:
df_ESGF = search(ESGF_site,df_request_new)
len(df_ESGF)


bbronselaer@email.arizona.edu
Omon tos ['All'] 1pctCO2


319

### Get the master list of existing zarr stores

In [32]:
df_master = pd.read_csv('gs://cmip6/cmip6-zarr-consolidated-stores-noQC.csv')

### Check the new requests:
- already exists in df_master (what we have) - not needed
- exists in df_ESGF (what is available) - if not available, then not needed

In [45]:
df_needed = needed(df_master, df_request_new, df_ESGF)

if len(df_needed) > 0:
    num_stores = df_needed.zstore.nunique() 
    print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
else:
    print('no new data available')
    exit
    
print(df_needed.zstore.unique())

needed: nfiles=22, nstores=15
['/CMIP/CAMS/CAMS-CSM1-0/1pctCO2/r2i1p1f1/Omon/tos/gn'
 '/CMIP/CAS/FGOALS-f3-L/1pctCO2/r1i1p1f1/Omon/tos/gn'
 '/CMIP/CAS/FGOALS-f3-L/1pctCO2/r2i1p1f1/Omon/tos/gn'
 '/CMIP/CAS/FGOALS-f3-L/1pctCO2/r3i1p1f1/Omon/tos/gn'
 '/CMIP/CCCma/CanESM5/1pctCO2/r1i1p2f1/Omon/tos/gn'
 '/CMIP/CCCma/CanESM5/1pctCO2/r2i1p1f1/Omon/tos/gn'
 '/CMIP/CCCma/CanESM5/1pctCO2/r2i1p2f1/Omon/tos/gn'
 '/CMIP/CCCma/CanESM5/1pctCO2/r3i1p1f1/Omon/tos/gn'
 '/CMIP/CCCma/CanESM5/1pctCO2/r3i1p2f1/Omon/tos/gn'
 '/CMIP/CNRM-CERFACS/CNRM-CM6-1-HR/1pctCO2/r1i1p1f2/Omon/tos/gn'
 '/CMIP/MOHC/HadGEM3-GC31-LL/1pctCO2/r2i1p1f3/Omon/tos/gn'
 '/CMIP/MOHC/HadGEM3-GC31-LL/1pctCO2/r3i1p1f3/Omon/tos/gn'
 '/CMIP/MOHC/HadGEM3-GC31-LL/1pctCO2/r4i1p1f3/Omon/tos/gn'
 '/CMIP/MOHC/UKESM1-0-LL/1pctCO2/r2i1p1f2/Omon/tos/gn'
 '/CMIP/NASA-GISS/GISS-E2-1-G/1pctCO2/r1i1p3f1/Omon/tos/gn']


### Start logging the progress and exceptions

In [34]:
cat_file = 'csv/cmip6_'+request_id+'.csv'
log_file = 'txt/request_'+request_id+'.log'

In [35]:
# open and close for each write in case of kernel interrupt
def write_log(file,str):
    f = open(file,'a')
    print(str)
    f.write(str+'\n')
    f.close()
    return

### The real work is done in this next loop - can be done in parallel

In [42]:
new_zarrs = df_needed.zstore.unique()

zdict = {}  # construct dictionary for new rows to add to master catalog
for item,zarr in enumerate(new_zarrs):
    #if item > 20:
    #    continue
    #zbdir  = 'ztemp'  + zarr
    zbdir  = '/h36/naomi/zarr-minimal'  + zarr
    
    write_log(log_file,f"\n{item+1}/{num_stores}: local file: {zbdir}")
    
    gsurl = 'gs://cmip6' + zarr
    contents = fs.ls(gsurl+'/.zmetadata')
    if any("zmetadata" in s for s in contents):
        write_log(log_file,'store already in cloud')
        continue

    # does it exist locally? 
    if os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,f'already exists: {zbdir}')
        continue

    gfiles,troubles = get_ncfiles(zarr,df_needed,skip_sites)
    if len(gfiles) == 0: 
        write_log(log_file,'no files available')
        continue
    
    variable_id = zarr.split('/')[-2]
    for gfile in gfiles:   # changes file sizes!!
        command = '/usr/bin/ncatted -h -O -a missing_value\,'+variable_id+',d,, '+gfile
        os.system(command)
    
    # concatenate in time with mfdataset
    print(gfiles)
    status, ds, dstr = concatenate(zarr,gfiles)  

    if status == 'failure':
        print(status,dstr)
        write_log(log_file,dstr)
        continue
    else:
        write_log(log_file,dstr)

    ds.to_zarr(zbdir, consolidated=True, mode='w')  

    if not os.path.isfile(zbdir+'/.zmetadata'):
        write_log(log_file,'to_zarr failure')
        continue
   
    vlist = get_details(ds,zbdir,zarr)
    
    # upload to cloud
    command = '/usr/bin/gsutil -m cp -r '+ zbdir + ' ' + gsurl
    write_log(log_file,command)
    os.system(command) 
        
    try:
        ds = xr.open_zarr(fs.get_mapper(gsurl), consolidated=True)
        zdict[item] = vlist
        write_log(log_file,f'successfully saved as {zbdir}') 
        for gfile in gfiles:
            os.system('rm -f '+ gfile)
    except:
        write_log(log_file,'store did not get saved to GCS properly')
    


1/15: local file: /h36/naomi/zarr-minimal/CMIP/CAMS/CAMS-CSM1-0/1pctCO2/r2i1p1f1/Omon/tos/gn
already exists: /h36/naomi/zarr-minimal/CMIP/CAMS/CAMS-CSM1-0/1pctCO2/r2i1p1f1/Omon/tos/gn

2/15: local file: /h36/naomi/zarr-minimal/CMIP/CAS/FGOALS-f3-L/1pctCO2/r1i1p1f1/Omon/tos/gn
no files available

3/15: local file: /h36/naomi/zarr-minimal/CMIP/CAS/FGOALS-f3-L/1pctCO2/r2i1p1f1/Omon/tos/gn
no files available

4/15: local file: /h36/naomi/zarr-minimal/CMIP/CAS/FGOALS-f3-L/1pctCO2/r3i1p1f1/Omon/tos/gn
no files available

5/15: local file: /h36/naomi/zarr-minimal/CMIP/CCCma/CanESM5/1pctCO2/r1i1p2f1/Omon/tos/gn
already exists: /h36/naomi/zarr-minimal/CMIP/CCCma/CanESM5/1pctCO2/r1i1p2f1/Omon/tos/gn

6/15: local file: /h36/naomi/zarr-minimal/CMIP/CCCma/CanESM5/1pctCO2/r2i1p1f1/Omon/tos/gn
already exists: /h36/naomi/zarr-minimal/CMIP/CCCma/CanESM5/1pctCO2/r2i1p1f1/Omon/tos/gn

7/15: local file: /h36/naomi/zarr-minimal/CMIP/CCCma/CanESM5/1pctCO2/r2i1p2f1/Omon/tos/gn
already exists: /h36/naomi/zar

In [None]:
assert False

### Make a table of aquired data to send in email to requestor

In [43]:
if len(zdict) == 0 :
    print('nothing else to do')
    exit
else:
    dz = dict_to_dfcat(zdict)

nothing else to do


In [44]:
# New data added
dz.zstore.values

array(['gs://cmip6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg/ssp245/r1i1p1f1/Amon/pr/gr',
       'gs://cmip6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg/ssp245/r2i1p1f1/Amon/pr/gr'],
      dtype=object)

In [46]:
df_master_new = pd.concat([df_master, dz])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [47]:
response(df_request_new,df_master_new)

GCS request for: bbronselaer@email.arizona.edu ; table_id= Omon
(number of member_ids)*(number of grid_labels)
--------------------------
variable_id                    tos
experiment_id source_id           
1pctCO2       AWI-CM-1-1-MR      1
              BCC-CSM2-MR        1
              BCC-ESM1           1
              CAMS-CSM1-0        1
              CESM2              1
              CESM2-WACCM        1
              CNRM-CM6-1         1
              CNRM-ESM2-1        4
              CanESM5            1
              E3SM-1-0           1
              EC-Earth3-Veg      1
              GFDL-CM4           1
              GFDL-ESM4          1
              GISS-E2-1-G        2
              GISS-E2-1-H        1
              HadGEM3-GC31-LL    1
              IPSL-CM6A-LR       1
              MCM-UA-1-0         1
              MIROC-ES2L         1
              MIROC6             1
              MRI-ESM2-0         1
              NESM3              1
              SAM0-UNI

In [48]:
! mv csv/request_new.csv csv/requests.csv