### Versions
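- Unfortunately, the dataset `version` was stored neither in the metadata of each file nor in the file name.
- The cells below therefore compare the version reported by three sources: the Google Cloud catalog, the tracking-id handle records, and an ESGF search.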

In [1]:
import pandas as pd
import zarr
import fsspec

In [2]:
from myidentify import gsurl2tracks, tracks2version, tracks2source, dsid2source, id2jdict, jdict_ds2source
from myutilities import search_df
from mysearch import esgf_search
import myconfig

In [9]:
# Load the Google Cloud CMIP6 zarr-store catalog ("noQC" listing), reading
# every column as text.
dfcat = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)

In [4]:
def gsurl2search(gsurl, keys=None):
    """Turn a zarr-store URL into a ``{facet: value}`` search dictionary.

    The URL is expected to look like ``gs://<bucket>/<v1>/<v2>/.../`` — the
    scheme and bucket are discarded and the remaining path components are
    paired, in order, with ``keys``.

    Parameters
    ----------
    gsurl : str
        Store URL, with or without a trailing slash.
    keys : sequence of str, optional
        Facet names to pair with the path components. Defaults to
        ``myconfig.target_keys``.

    Returns
    -------
    dict
        Mapping of each key to the path component in the same position.
        Pairing stops at the shorter of the two sequences (``zip``).
    """
    if keys is None:
        keys = myconfig.target_keys

    # Drop the scheme ("gs://") and then the bucket name, instead of slicing
    # off a fixed 11 characters, so other bucket names also work.
    without_scheme = gsurl.split('://', 1)[-1]
    _bucket, _, path = without_scheme.partition('/')

    # Tolerate a missing (or repeated) trailing slash; the old `[11:-1]`
    # slice cut the last character off the final component in that case.
    path = path.strip('/')
    values = path.split('/') if path else []
    return dict(zip(keys, values))

In [15]:
# Standard example: compare the version recorded in the catalog, the one
# resolved from the tracking ids, and those listed by ESGF.
gsurl = 'gs://cmip6/CMIP/NCAR/CESM2/historical/r11i1p1f1/Oyr/expc/gr/'

# Version recorded for this store in the Google Cloud catalog.
version_cat = dfcat.loc[dfcat.zstore == gsurl, 'version'].values[0]
print('current version from GC catalog = ',version_cat)

# Version resolved from the store's tracking ids.
tracks = gsurl2tracks(gsurl)
version, jdict = tracks2version(tracks)
print('latest version from handler = ', version)

# Distinct version ids returned by an unfiltered ESGF search for the same facets.
asearch = gsurl2search(gsurl)
dfs = esgf_search(asearch, toFilter=False)
version_ESGF = list(set(dfs.version_id))
print('version(s) available from ESGF = ', version_ESGF)

# Optional: uncomment to list the source URLs for these tracking ids.
# source_urls = tracks2source(tracks)
# source_urls

current version from GC catalog =  20190514
current version from GC tracks =  20190514
latest version from handler =  20190514
version(s) available from ESGF =  ['v20190514']


In [13]:
# Alternative store, kept for reference: just created, but the scripts say
# there is a newer version!
# gsurl = 'gs://cmip6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/3hr/uas/gn/'

# This store has duplicate dataset_ids and versions, since two files are in
# both datasets and one file is only in the most recent.
gsurl = 'gs://cmip6/ScenarioMIP/HAMMOZ-Consortium/MPI-ESM-1-2-HAM/ssp370/r1i1p1f1/3hr/uas/gn/'

# gsurl2tracks returns the tracking ids as one newline-separated string.
tracks = gsurl2tracks(gsurl)
tracking_ids = tracks.split('\n')
tracking_ids

['hdl:21.14100/d5aae306-16fe-4a2f-9706-8c41b4d20584',
 'hdl:21.14100/43133a86-783d-4d67-a73f-c512d2e27582',
 'hdl:21.14100/4fd3a7ec-d1ab-421b-963e-bcf82fcd8cdb']

In [5]:
# Another example, with two features: the tracking_ids of the netcdf files
# are not unique, and this is a replacement version.
gsurl = 'gs://cmip6/ScenarioMIP/NCAR/CESM2/ssp370/r4i1p1f1/Amon/ts/gn/'

tracks = gsurl2tracks(gsurl)
version, jdict = tracks2version(tracks)
print('version = ', version)

# Because the tracking_ids are not unique, tracks2source will not return all
# of the urls!
source_urls = tracks2source(tracks)
source_urls


netcdf file tracking_ids are NOT UNIQUE!
['hdl:21.14100/33cbdc29-fbc9-44ab-9e09-5dc7824441cf', 'hdl:21.14100/33cbdc29-fbc9-44ab-9e09-5dc7824441cf']



cloud version from tracks =  20200528
version =  20200528


['http://esgf-data.ucar.edu/thredds/fileServer/esg_dataroot/CMIP6/ScenarioMIP/NCAR/CESM2/ssp370/r4i1p1f1/Amon/ts/gn/v20200528/ts_Amon_CESM2_ssp370_r4i1p1f1_gn_206501-210012.nc']

In [14]:
# Example of a Dataset with three versions (and an ES-DOC ERRATA link from
# the second version).
gsurl = 'gs://cmip6/CMIP/EC-Earth-Consortium/EC-Earth3/historical/r1i1p1f1/Amon/tasmin/gr/'

# Version recorded for this store in the Google Cloud catalog.
version_cat = dfcat.loc[dfcat.zstore == gsurl, 'version'].values[0]
print('current version from GC catalog = ',version_cat)

# Version resolved from the store's tracking ids.
tracks = gsurl2tracks(gsurl)
version, jdict = tracks2version(tracks)
print('latest version from handler = ', version)

# Distinct version ids returned by an unfiltered ESGF search for the same facets.
asearch = gsurl2search(gsurl)
dfs = esgf_search(asearch, toFilter=False)
version_ESGF = list(set(dfs.version_id))
print('version(s) available from ESGF = ', version_ESGF)

# Optional: uncomment to list the source URLs for these tracking ids.
# source_urls = tracks2source(tracks)
# source_urls

current version from GC catalog =  20190926
current version from GC tracks =  20190926



*** Newer version exists, see: http://hdl.handle.net/hdl:21.14100/480d0915-c4de-3b4a-89da-dbce9ace46ce


*** Newer version exists, see: http://hdl.handle.net/hdl:21.14100/b7fc3bc4-2489-3627-b8ce-bf665b908fb6



latest version from handler =  20200310
version(s) available from ESGF =  ['v20200310', 'v20190926']


In [None]:
# Re-read the catalog with the same all-text dtype used for the first load,
# so `dfcat` keeps the same column types no matter which cell ran last.
dfcat = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)

# Select the Amon / historical / tas stores and order them by `member`, the
# integer between the last 'r' and the following 'i' of member_id
# (e.g. 'r11i1p1f1' -> 11). `.assign` builds a new frame, so the result of
# search_df is never modified in place.
df = (
    search_df(dfcat, table_id='Amon', experiment_id='historical', variable_id='tas')
    .assign(member=lambda d: [int(s.split('r')[-1].split('i')[0]) for s in d['member_id']])
    .sort_values(by=['member'])
    .reset_index(drop=True)
)

len(df)

In [None]:
# Only rows whose index label is at most MAX_INDEX are checked.
MAX_INDEX = 13

# Filter before iterating instead of walking every remaining row just to
# `continue` past it; the same rows are visited in the same order.
for index, row in df[df.index <= MAX_INDEX].iterrows():
    gsurl = row['zstore']
    version = row['version']
    print(index, gsurl, version)

    # Version resolved from the store's tracking ids, for comparison with
    # the catalog version printed above.
    tracks = gsurl2tracks(gsurl)
    (version_new, jdict) = tracks2version(tracks)
    print(version_new)

    surls = jdict_ds2source(jdict)
    print(surls)