## Evaluate what is available on ESGF vs. what is already in GC (Google Cloud)
- Searches an ESGF node to see what is available
- Prints the models that are missing from GC for each experiment and variable

In [1]:
import numpy as np
import pandas as pd

### Local modules

In [2]:
from request import requests, set_request_id
from search import search, esgf_search_sites
from utilities import search_df

### Get prior Google Sheet requests

In [3]:
# Requests that were recorded earlier, read from the local CSV copy.
# It is passed to requests() in the next step.
df_prior = pd.read_csv('csv/requests.csv')

### Get new Google Sheet requests
- by default, only the new rows from the sheet are considered
- specifying a list of rows or emails will add older entries 

In [9]:
# modify here:
# Sheet rows and requester e-mails to pass to requests(); leave a list empty
# to add nothing by that criterion.
rows = [186, 187, 188]
emails = []

# requests() returns the selected request rows plus a second object (dtrouble)
# that is not used in this cell.
df_request_new, dtrouble = requests(df_prior, rows=rows, emails=emails)

request_id = set_request_id()

df_request_new

Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester,science,comments
186,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],"[historical, ssp126, ssp245, ssp370]",[All],[tasmax],day,Rhodium,global downscaling,R1
187,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],"[historical, ssp126, ssp245, ssp370]",[All],[tasmin],day,Rhodium,global downscaling,R2
188,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],"[historical, ssp126, ssp245, ssp370]",[All],[pr],day,Rhodium,global downscaling,R3
127,4/1/2020 12:56:10,maximilian.goebel@phd.iseg.ulisboa.pt,,[All],"[1pctCO2, abrupt-4xCO2, esm-hist, esm-piContro...",[All],"[siextentn, siarean]",SImon,Maximilian GÃ¶bel,I would like to conduct research in the field ...,Will the repository be updated on a regular ba...
197,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],[ssp126],[NorESM2-LM],"[pr, tasmin, tasmax]",day,Naomi,global downscaling,
198,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],[ssp126],[NorESM2-LM],"[pr, tasmin, tasmax]",day,Naomi,global downscaling,
199,2020-06-24,naomi@ldeo.columbia.edu,,[r1i1p1f1],[historical],[ACCESS-ESM1-5],[so],Omon,Naomi,issue testing,old JB issue


In [10]:
# choose a new request to process:
# Distinct submission timestamps of the candidate requests.
timestamps = df_request_new.Timestamp.unique()

# Keep only the rows submitted under the 'Rhodium' requester name.
# (Other ways to select: the latest entry of `timestamps`, or a specific
# value of the `members` column.)
is_rhodium = df_request_new['requester'] == 'Rhodium'
df_request_new = df_request_new.loc[is_rhodium]

# Member selection label for this request.
member = 'all'
df_request_new

Unnamed: 0,Timestamp,E-mail,response status,members,experiments,models,variables,table,requester,science,comments
186,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],"[historical, ssp126, ssp245, ssp370]",[All],[tasmax],day,Rhodium,global downscaling,R1
187,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],"[historical, ssp126, ssp245, ssp370]",[All],[tasmin],day,Rhodium,global downscaling,R2
188,2020-06-23,naomi@ldeo.columbia.edu,,[r1i1p1f1],"[historical, ssp126, ssp245, ssp370]",[All],[pr],day,Rhodium,global downscaling,R3


### Search ESGF for the availability of requested data

In [14]:
# Send search request to chosen node (might take awhile)

# Mapping of node name -> search endpoint, as provided by the local search module.
dtype = esgf_search_sites()
print('possible ESGF API search nodes: ',list(dtype.keys()))

# Node to query; other nodes tried with this notebook: 'ceda', 'dkrz', 'ipsl'
# (all with local_node = False).
ESGF = 'llnl'
local_node = False

df_ESGF = search(
    dtype[ESGF],
    df_request_new,
    local_node=local_node,
    verbose=False,
)

possible ESGF API search nodes:  ['llnl', 'ipsl', 'nci', 'ceda', 'gfdl', 'dkrz']

naomi@ldeo.columbia.edu
day tasmax ['All'] historical
https://esgf-node.llnl.gov/esg-search/search/?offset=0&limit=500&mip_era=CMIP6&variable_id=tasmax&table_id=day&experiment_id=historical&type=File&format=application%2Fsolr%2Bjson&latest=true
https://esgf-node.llnl.gov/esg-search/search/?offset=0&limit=500&mip_era=CMIP6&variable_id=tasmax&table_id=day&experiment_id=historical&type=File&format=application%2Fsolr%2Bjson&latest=true
https://esgf-node.llnl.gov/esg-search/search/?offset=500&limit=500&mip_era=CMIP6&variable_id=tasmax&table_id=day&experiment_id=historical&type=File&format=application%2Fsolr%2Bjson&latest=true
https://esgf-node.llnl.gov/esg-search/search/?offset=1000&limit=500&mip_era=CMIP6&variable_id=tasmax&table_id=day&experiment_id=historical&type=File&format=application%2Fsolr%2Bjson&latest=true
https://esgf-node.llnl.gov/esg-search/search/?offset=1500&limit=500&mip_era=CMIP6&variable_id=t

In [None]:
# Make dataframe of the search results

# Path template of a store below the bucket root; each placeholder is filled
# from the search-result column of the same name.
zarr_format = '/%(activity_drs)s/%(institution_id)s/%(source_id)s/%(experiment_id)s/%(member_id)s/%(table_id)s/%(variable_id)s/%(grid_label)s'
keys = ['activity_drs','institution_id','source_id','experiment_id','member_id','table_id','variable_id','grid_label']

# One store URL per search-result row, in row order.
key_columns = [df_ESGF[key] for key in keys]
zstores = [
    'gs://cmip6' + zarr_format % dict(zip(keys, values))
    for values in zip(*key_columns)
]

df_ESGF['zstore'] = zstores

# File-level columns; once they are dropped, rows belonging to the same
# dataset become duplicates and collapse to a single row.
do_not_need = ['HTTPServer_url', 'OPENDAP_url', 'size', 'tracking_id', 'file_name']

dESGF = df_ESGF.drop(columns=do_not_need).drop_duplicates()
dESGF.to_csv(f'dESGF-{ESGF}.csv', index=False)

In [None]:
# PRINT simple table of results
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# One row per distinct (experiment, member, model, variable) combination, each
# carrying a member_id count of 1 (rows that differ only in other columns
# collapse into the same group).
# member_id is selected explicitly on the groupby: recent pandas releases no
# longer return the grouping columns from DataFrameGroupBy.nunique(), so the
# earlier `.nunique()[['member_id']]` form fails there with a KeyError.
run_keys = ['experiment_id', 'member_id', 'source_id', 'variable_id']
dm = dESGF.groupby(run_keys)['member_id'].nunique().to_frame()

# Summing the per-run 1s gives, for every model/variable pair (rows) and
# experiment (columns), the number of distinct member_ids found; 0 where the
# search returned nothing.
table = dm.pivot_table(values='member_id',
                       index=['source_id', 'variable_id'],
                       columns=['experiment_id'],
                       aggfunc='sum',  # string form; passing np.sum is deprecated in newer pandas
                       fill_value=0)
print(ESGF)
print('activity_drs, table_id = ',dESGF.activity_drs.unique(),dESGF.table_id.unique())
print(table)

In [45]:
# print names of missing MODELS (in dESGF, but not in GC)

# To reuse a saved search instead of the in-memory one:
#dESGF = pd.read_csv(f'dESGF-{ESGF}.csv')

exps = dESGF.experiment_id.unique()
variables = dESGF.variable_id.unique()
tables = dESGF.table_id.unique()

# Catalog of stores already in the cloud bucket, narrowed to the requested
# variables / experiments / tables.
d2 = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv')
dGC = search_df(d2,variable_id=variables,experiment_id=exps,table_id=tables)

# The comparison only looks at the r1i1p1f1 member, so filter both sides once.
esgf_r1 = dESGF[dESGF.member_id == 'r1i1p1f1']
gc_r1 = dGC[dGC.member_id == 'r1i1p1f1']

for var in variables:
    for exp in exps:
        on_esgf = esgf_r1[(esgf_r1.variable_id == var) & (esgf_r1.experiment_id == exp)]
        in_gc = gc_r1[(gc_r1.variable_id == var) & (gc_r1.experiment_id == exp)]
        # Models ESGF offers for this variable/experiment that GC does not have.
        missing = set(on_esgf.source_id.unique()) - set(in_gc.source_id.unique())
        print(var,exp,missing)

tasmax historical set()
tasmax ssp126 {'FGOALS-g3', 'CESM2'}
tasmax ssp245 {'FGOALS-g3', 'CESM2'}
tasmax ssp370 {'CESM2'}
tasmin historical {'FGOALS-f3-L'}
tasmin ssp126 {'FGOALS-g3', 'CESM2'}
tasmin ssp245 {'FGOALS-g3', 'CESM2'}
tasmin ssp370 {'CESM2'}
pr historical {'IITM-ESM'}
pr ssp126 set()
pr ssp245 set()
pr ssp370 set()


CEDA, local_node = False

number of models with r1i1p1f1:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     34      22      24      24
tasmax                 31      21      23      23
tasmin                 31      21      23      23

number of models with any member_id:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     41      29      29      28
tasmax                 38      28      29      27
tasmin                 38      28      29      27

number of runs (all member_ids):
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                    423     121     155     161
tasmax                410     119     154     158
tasmin                410     119     154     158

DKRZ, local_node = False

number of models with r1i1p1f1:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     34      22      24      24
tasmax                 31      21      23      23
tasmin                 31      21      23      23

number of models with any member_id:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     41      29      29      28
tasmax                 38      28      29      27
tasmin                 38      28      29      27

number of runs (all member_ids):
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                    423     121     155     161
tasmax                410     119     154     158
tasmin                410     119     154     158    

IPSL, local_node = False

number of models with r1i1p1f1:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     34      22      24      24
tasmax                 31      21      23      23
tasmin                 31      21      23      23

number of models with any member_id:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     41      28      29      28
tasmax                 38      27      29      27
tasmin                 38      27      29      27

number of runs (all member_ids):
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                    415     120     155     155
tasmax                398     118     154     152
tasmin                398     118     154     152


LLNL, local_node = False

number of models with r1i1p1f1:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     34      22      24      24
tasmax                 30      20      23      23
tasmin                 31      21      23      23

number of models with any member_id:
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                     41      29      29      28
tasmax                 36      26      29      27
tasmin                 38      28      29      27

number of runs (all member_ids):
activity_drs, table_id =  ['CMIP' 'ScenarioMIP' 'AerChemMIP'] ['day']
experiment_id  historical  ssp126  ssp245  ssp370
variable_id                                      
pr                    423     121     155     161
tasmax                286     112     154     158
tasmin                410     119     154     158