In [1]:
import warnings

import fsspec
import pandas as pd
import zarr

In [3]:
from myidentify import tracks2version, tracks2source, jdict2source, id2jdict
from myutilities import search_df
from mysearch import esgf_search
import myconfig

In [4]:
def gsurl2tracks(gsurl):
    """Return the raw ``tracking_id`` attribute of a consolidated zarr store.

    The store at ``gsurl`` is opened through an fsspec mapper and its
    ``tracking_id`` group attribute is read. The attribute is returned
    unchanged; it is treated here as one string holding newline-separated ids.

    A warning (``warnings.warn``, default category ``UserWarning``) is issued
    when the same tracking id occurs more than once in that attribute.

    Parameters
    ----------
    gsurl : str
        URL of the zarr store, in whatever form ``fsspec.get_mapper`` accepts.

    Returns
    -------
    str
        The unmodified ``tracking_id`` attribute of the store.
    """
    mapper = fsspec.get_mapper(gsurl)
    group = zarr.open_consolidated(mapper)
    tracks = group.attrs['tracking_id']
    tracking_ids = tracks.split('\n')

    # Duplicates are only reported, never removed: the caller still receives
    # the attribute exactly as stored.
    if len(tracking_ids) != len(set(tracking_ids)):
        msg = f'\nnetcdf file tracking_ids are NOT UNIQUE!\n{tracking_ids}\n'
        warnings.warn("\n" + msg)

    return tracks

In [11]:
# Load the catalog of consolidated zarr stores; every column is read as text.
dfcat = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)

In [25]:
# check current search results matching any dataset in GCS:

# Facet names, in the order in which their values appear in a store URL
# after the bucket name.
target_keys = [
    'mip_era',
    'activity_id',
    'institution_id',
    'source_id',
    'experiment_id',
    'member_id',
    'table_id',
    'variable_id',
    'grid_label',
]


def gsurl2search(gsurl):
    """Convert a store URL into a dict of search facets.

    The scheme (``gs://``) and the bucket name are dropped, and the remaining
    path components are paired with ``target_keys`` in order. Components
    beyond the last key (e.g. a trailing version directory) are ignored, and
    a missing trailing slash no longer truncates the final value.

    Parameters
    ----------
    gsurl : str
        Store URL such as
        ``gs://cmip6/CMIP6/<activity>/<institution>/.../<grid_label>/``,
        with or without a trailing slash.

    Returns
    -------
    dict
        Mapping of facet name to the matching URL component.
    """
    # Parse the URL instead of slicing fixed offsets, which assumed both the
    # exact 'gs://cmip6/' prefix and a trailing slash.
    path = gsurl.split('://', 1)[-1]
    _bucket, _sep, key_path = path.partition('/')
    values = key_path.rstrip('/').split('/')
    return dict(zip(target_keys, values))
# Alternative input: use the first store listed in the catalog instead of the
# hard-coded URL below.
#gsurl = dfcat.zstore.values[0]

# Example store URL; it is split into search facets by gsurl2search.
gsurl = 'gs://cmip6/CMIP6/AerChemMIP/AS-RCEC/TaiESM1/histSST/r1i1p1f1/AERmon/od550aer/gn/'
asearch = gsurl2search(gsurl)
print(asearch)
# Last expression of the cell, so the value returned by esgf_search is shown
# as the cell output. NOTE(review): the meaning of toFilter is defined in
# mysearch, not here -- confirm what is skipped when it is False.
esgf_search(asearch, toFilter = False)

{'mip_era': 'CMIP6', 'activity_id': 'AerChemMIP', 'institution_id': 'AS-RCEC', 'source_id': 'TaiESM1', 'experiment_id': 'histSST', 'member_id': 'r1i1p1f1', 'table_id': 'AERmon', 'variable_id': 'od550aer', 'grid_label': 'gn'}


Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,version_id,ncfile,file_size,url,data_node,ds_dir,node_order,start,stop
0,AerChemMIP,AS-RCEC,TaiESM1,histSST,r1i1p1f1,AERmon,od550aer,gn,v20201223,od550aer_AERmon_TaiESM1_histSST_r1i1p1f1_gn_18...,244071707,https://esgf-data1.llnl.gov/thredds/fileServer...,esgf-data1.llnl.gov,AerChemMIP/AS-RCEC/TaiESM1/histSST/r1i1p1f1/AE...,0,185001,197912
1,AerChemMIP,AS-RCEC,TaiESM1,histSST,r1i1p1f1,AERmon,od550aer,gn,v20201223,od550aer_AERmon_TaiESM1_histSST_r1i1p1f1_gn_19...,65768031,https://esgf-data1.llnl.gov/thredds/fileServer...,esgf-data1.llnl.gov,AerChemMIP/AS-RCEC/TaiESM1/histSST/r1i1p1f1/AE...,0,198001,201412


In [20]:
# check size of any dataset in GCS:

import gcsfs
# File system handle created with token='anon' (presumably anonymous access
# to the public bucket -- confirm against gcsfs docs).
fs = gcsfs.GCSFileSystem(token='anon')
gsurl = 'gs://cmip6/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Oyr/expc/gr/v20190514/'
# Total size reported by du for everything under the store URL.
size_remote = fs.du(gsurl)
# Scaled by 1e9 and labelled 'G' (presumably bytes -> gigabytes).
print(size_remote/1e9,'G')

0.635802018 G


In [29]:
# check current and/or available versions for any dataset in GCS:
gsurl_approx = 'gs://cmip6/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Oyr/expc/gr/'
# regex=False: the URL is a literal substring, not a pattern, so characters
# such as '.' or '+' in it must not be interpreted as regex syntax.
# na=False: rows with a missing zstore simply do not match.
# NOTE: .values[0] raises IndexError when no store matches gsurl_approx.
gsurl = dfcat[dfcat.zstore.str.contains(gsurl_approx, regex=False, na=False)].zstore.values[0]
print(gsurl)
# If you just want to see the current version:
version_cat = dfcat[dfcat.zstore.str.contains(gsurl, regex=False, na=False)].version.values[0]
print('current version from GC catalog = ',version_cat)

# But this checks for all versions (as listed in the Data Handle Service)
tracks = gsurl2tracks(gsurl)
(version,jdict) = tracks2version(tracks)
print('latest version from handler = ', version)

# And this checks for all versions currently available in ESGF
asearch = gsurl2search(gsurl)
dfs = esgf_search(asearch, toFilter = False)
# De-duplicate: the search result has one row per file, so the same
# version_id can appear many times.
version_ESGF = list(set(dfs.version_id))
print('version(s) available from ESGF = ', version_ESGF)


gs://cmip6/CMIP6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Oyr/expc/gr/v20190514/
current version from GC catalog =  20190514
current version from GC tracks =  ['20190514']
latest version from handler =  20190514
version(s) available from ESGF =  ['v20190514']


In [None]:
# To make a dataframe of a google sheet
#data = wks.get_all_values()
#headers = data.pop(0)
#pd.DataFrame(data, columns=headers)

In [30]:
# THIS IS NEAT:
#   get current counts for one experiment_id, table_id, by variable:
import requests

# Named `response_format` rather than `format` so the builtin format() is not
# shadowed for the rest of the kernel session. The value is already
# percent-encoded ('application/solr+json').
response_format = 'application%2Fsolr%2Bjson'
table = 'SImon'
exp = 'historical'
# limit=0 asks for no result records; only the variable_id facet counts are used.
url = f'https://esgf-node.llnl.gov/esg-search/search?format={response_format}&experiment_id={exp}&table_id={table}&facets=variable_id&limit=0'
# A timeout keeps the cell from hanging forever on an unresponsive node, and
# raise_for_status surfaces HTTP errors before the body is parsed as JSON.
r = requests.get(url, timeout=120)
r.raise_for_status()
vcount = r.json()['facet_counts']['facet_fields']['variable_id']
# vcount is consumed as a flat list alternating name, count, name, count, ...
variables = vcount[::2]
var_count = vcount[1::2]
vardict = dict(zip(variables,var_count))

In [31]:
# Re-order the per-variable counts from the smallest count to the largest;
# dict preserves that insertion order when displayed.
D = dict(sorted(vardict.items(), key=lambda pair: pair[1]))
#plt.bar(range(len(D)), list(D.values()), align='center')
#plt.xticks(range(len(D)), list(D.keys()))
#plt.show()
D

{'sndmasswindrif': 12,
 'simprefrozen': 32,
 'siitdsnconc': 61,
 'sidragbot': 62,
 'sirdgthick': 67,
 'sidragtop': 87,
 'sistresave': 137,
 'sisali': 139,
 'sistremax': 139,
 'sishevel': 145,
 'siitdsnthick': 150,
 'sirdgconc': 157,
 'siitdthick': 162,
 'sndmasssubl': 165,
 'siflswdbot': 167,
 'siitdconc': 172,
 'simpmass': 199,
 'sfdsi': 205,
 'siforcetiltx': 205,
 'siforcetilty': 205,
 'siforceintstrx': 233,
 'siforceintstry': 240,
 'siforcecoriolx': 242,
 'siforcecorioly': 246,
 'siareaacrossline': 250,
 'snmassacrossline': 251,
 'siflfwdrain': 260,
 'sidivvel': 263,
 'sitempsnic': 264,
 'sisaltmass': 266,
 'sndmassdyn': 282,
 'simpconc': 284,
 'sndmasssi': 299,
 'simassacrossline': 309,
 'sitempbot': 329,
 'siflsensupbot': 332,
 'siextentn': 345,
 'siextents': 345,
 'siflfwbot': 352,
 'siflcondbot': 357,
 'sifb': 359,
 'sisnhc': 363,
 'sifllatstop': 376,
 'sicompstren': 398,
 'sidconcdyn': 402,
 'sidmassth': 406,
 'siflswutop': 410,
 'sidconcth': 411,
 'sidmassdyn': 415,
 'siflsens