# To Handle New Data Requests Automatically

In [1]:
import os
import subprocess

import gcsfs
import numpy as np
import pandas as pd
import xarray as xr

### Local modules

In [2]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from netcdf import get_ncfiles, concatenate
from identify import needed
from response import response, dict_to_dfcat, get_details

### Initialization

In [3]:
# Google Cloud Storage filesystem handle, opened anonymously and read-only.
# It is used further down to list bucket contents and to build the mapper
# that re-opens an uploaded store.
fs = gcsfs.GCSFileSystem(access='read_only', token='anon')

# ONLY WHEN NECESSARY: Re-initialize cmip6-master.csv with enhanced csv file
#url_cloud = 'https://storage.googleapis.com/cmip6/cmip6.csv'  
#df = pd.read_csv(url_cloud)
#df.to_csv('csv/cmip6-master.csv',index=False)

### Choose basic configuration

In [4]:
# Mapping of ESGF search nodes as returned by the project helper; its keys
# are the short node names printed below.
dtype = esgf_search_sites()
node_names = list(dtype.keys())
print('possible ESGF API search nodes: ', node_names)

# Search node used for this run.
ESGF_site = dtype['llnl']

# Data nodes to skip when fetching netcdf files: broken or slow sites.
skip_sites = [
    'dist.nmlab.snu.ac.kr',
    'esg.lasg.ac.cn',
    'esgf-data2.diasjp.net',
]

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'jpl', 'gfdl', 'dkrz']


### Get new Google Sheet requests, open some log files

In [5]:
# Fetch the new requests and the identifier used to tag this run's files.
df_request_new = requests()
request_id = set_request_id()

# Per-request output paths: catalog csv, exceptions list and run log.
file_cat = f'csv/cmip6_{request_id}.csv'
file_x = f'txt/exceptions_{request_id}.txt'
file_log = f'txt/request_{request_id}.log'

# Open all three for writing (creates or truncates them).
# NOTE(review): nothing visible in this notebook writes to or closes these
# handles -- confirm where they are consumed and closed.
f_cat = open(file_cat, 'w')
f_x = open(file_x, 'w')
f_log = open(file_log, 'w')

# Display the requested members as the cell output.
df_request_new.members.values

67 naomi@ldeo.columbia.edu []


array([list(['All'])], dtype=object)

### Search ESGF for the availability of requested data

In [6]:
# Search the selected ESGF node for the data named in the new requests.
df_ESGF = search(ESGF_site,df_request_new)
# Length of the search result, shown as the cell output.
len(df_ESGF)

naomi@ldeo.columbia.edu
IfxGre hfgeoubed ['All'] ['All']
IfxGre lithk ['All'] ['All']
IfxGre topg ['All'] ['All']


89

In [8]:
# Load the local master catalog; it is passed to needed() and later
# concatenated with the newly created catalog rows.
df_master = pd.read_csv('csv/cmip6-master.csv')

In [9]:

# Combine the master catalog, the new requests and the ESGF search results
# into the table of entries to process (one `zstore` value per target store).
# NOTE(review): presumably this drops entries already in the master catalog;
# the selection logic lives in identify.needed and is not visible here.
df_needed = needed(df_master, df_request_new, df_ESGF)


In [10]:
# Report the size of the job: rows in df_needed and distinct target stores.
n_needed = len(df_needed)
print('number of files needed', n_needed)

# num_stores is reused by the progress line of the processing loop.
num_stores = df_needed['zstore'].nunique()
print('number of stores to be created', num_stores)

number of files needed 89
number of stores to be created 89


In [11]:
# Halt a Run-All here when df_needed is empty: there is nothing to process.
assert len(df_needed) > 0

### The work is done in this next loop - can be done in parallel

In [None]:
def _run_command(argv):
    """Run an external command without a shell.

    `argv` is the argument list (program first). Returns the exit status,
    or None when the program could not be started. Never raises for a
    failing command, so the loop stays best-effort like the original
    os.system calls.
    """
    try:
        return subprocess.run(argv).returncode
    except OSError as err:
        print('could not run', argv[0], ':', err)
        return None


new_zarrs = df_needed.zstore.unique()

# Local root under which each new zarr store is written before upload.
# To stage under a scratch directory instead, use e.g. 'ztemp'.
zarr_local_root = '/d5/naomi/zarr-minimal'

zdict = {}  # construct dictionary for new rows to add to master catalog
for item, zarr in enumerate(new_zarrs):
    zbdir = zarr_local_root + zarr

    print(f"{item}/{num_stores}: local file: {zbdir}")

    # Skip stores that a previous run already wrote locally
    # (consolidated metadata present).
    if os.path.isfile(zbdir + '/.zmetadata'):
        print(item, 'already exists:', zbdir)
        continue

    gfiles = get_ncfiles(zarr, df_needed, skip_sites)
    if len(gfiles) == 0:
        print(item, 'no files available')
        continue

    # Delete the `missing_value` attribute of the variable in every netcdf
    # file. This rewrites the files in place, so their sizes change.
    # The variable name is the second-to-last component of the store path.
    variable_id = zarr.split('/')[-2]
    for gfile in gfiles:
        ncatted_status = _run_command(
            ['/usr/bin/ncatted', '-h', '-O',
             '-a', f'missing_value,{variable_id},d,,', gfile]
        )
        if ncatted_status not in (0, None):
            print(item, 'ncatted exited with status', ncatted_status, 'for', gfile)

    # concatenate in time with mfdataset
    status, ds, ddict = concatenate(zarr, gfiles)
    if status == 'failure':
        print(item, 'oops, no dice')
        continue

    ds.to_zarr(zbdir, consolidated=True, mode='w')

    # Consolidated metadata is written last; its absence means the write failed.
    if not os.path.isfile(zbdir + '/.zmetadata'):
        print('to_zarr failure')
        continue

    gsurl, vlist = get_details(ds, zbdir, zarr)

    # Upload to cloud unless the store is already there. Depending on the
    # gcsfs version, listing a missing object returns an empty list or raises
    # FileNotFoundError; both mean "not uploaded yet".
    try:
        contents = fs.ls(gsurl + '/.zmetadata')
    except FileNotFoundError:
        contents = []
    if any("zmetadata" in s for s in contents):
        print(item, 'store already in cloud')
        continue

    upload_argv = ['/usr/bin/gsutil', '-m', 'cp', '-r', zbdir, gsurl]
    print(' '.join(upload_argv))
    upload_status = _run_command(upload_argv)
    if upload_status not in (0, None):
        print(item, 'gsutil exited with status', upload_status)

    # Verify the upload by re-opening the store from GCS. Only the open is
    # guarded, so a cleanup problem is never reported as an upload failure,
    # and KeyboardInterrupt still stops the loop.
    try:
        ds = xr.open_zarr(fs.get_mapper(gsurl), consolidated=True)
    except Exception as err:
        print('store did not get saved to GCS properly')
        print(item, 'verification error:', repr(err))
        continue

    zdict[item] = vlist
    print(item, 'successfully saved as ', zbdir)

    # The source netcdf files are no longer needed once the store is verified.
    for gfile in gfiles:
        try:
            os.remove(gfile)
        except FileNotFoundError:
            pass  # already gone, same as `rm -f`
        except OSError as err:
            print(item, 'could not remove', gfile, ':', err)
    

In [14]:
# Convert the per-store entries collected in zdict by the processing loop
# into a dataframe of new catalog rows.
# NOTE(review): column layout is defined by response.dict_to_dfcat -- confirm
# it matches the master catalog.
dz = dict_to_dfcat(zdict)

In [15]:
# Append the new catalog rows to the master catalog in memory.
# NOTE(review): the original row indices of both frames are kept, and nothing
# visible here writes the result back to csv/cmip6-master.csv.
df_master_new = pd.concat([df_master, dz])

### Make a table of acquired data to send in email to requestor

In [16]:
# Print the per-request summary table (shown below) from the updated catalog.
response(df_request_new,df_master_new)

GCS request for: naomi@ldeo.columbia.edu ; table_id= IfxGre
(number of member_ids)*(number of grid_labels)
--------------------------
variable_id                hfgeoubed  lithk  topg
experiment_id source_id                          
1pctCO2       CESM2                1      1     1
              CESM2-WACCM          1      1     1
amip          CESM2                3      3     3
              CESM2-WACCM          3      3     2
esm-hist      CESM2                2      0     1
esm-piControl CESM2                1      1     1
historical    CESM2               11     11    11
              CESM2-WACCM          3      3     3
lig127k       CESM2                1      1     1
piControl     CESM2                1      1     1
              CESM2-WACCM          1      1     1
ssp585        CESM2                2      2     2
              CESM2-WACCM          1      1     1

