In [1]:
import pandas as pd
import numpy as np
import sys, os
from glob import glob
import xarray as xr
import datetime
import warnings
import gcsfs
import datetime

In [2]:
# local modules
from myidentify import gsurl2tracks, tracks2version

In [3]:
# Record when this notebook was last run; the timestamp is kept in the saved output
run_started = datetime.datetime.now()
date = f"{run_started:%m/%d/%Y, %H:%M:%S}"
print(date)

01/23/2021, 00:27:19


In [4]:
# are there extra local drives with new zarr stores? (helps limit the search for new datasets)

# Fallback zarr directory used when the extra local drives are not available.
# It is bound unconditionally so that the name exists on every host; previously it
# was only defined on non-'haden' machines and any other reference raised NameError.
zarr_local = '/d1/naomi/cmip6-zarrs'  # usually matches location in GetSpecified and Requests

mach = os.uname()[1]  # second field of os.uname() is the node (host) name
# True only on hosts whose name contains 'haden'
local_storage = 'haden' in mach

In [5]:
# A. Make new noQC catalog
#    1. Collect listings of all files in GC to make dataframe 'dz_GC'
#       a. figure out which file listings need to be updated, update and concatenate

if not local_storage:
    # This must match location in Requests.ipynb or GetSpecified.ipynb
    drives = [zarr_local]
else:
    #new_drives =  ['/net/abbey/a2','/net/abbey/a3','/net/carney/d2','/h120']
    new_drives = ['/net/abbey/a4','/net/carney/d3','/h119','/h121','/h122']
    drives = [f'{root}/naomi/zarr-minimal' for root in new_drives]

# De-duplicate. Note that from here on new_drives holds the full zarr directories,
# not the bare drive roots listed above.
new_drives = list(set(drives))

# Every entry two levels below a drive contributes '<parent dir>/<entry name>'
found = set()
for drive in new_drives:
    for path in glob(drive + '/*/*'):
        parts = path.split('/')
        found.add(parts[-2] + '/' + parts[-1])

new_activities = sorted(found)   #. [:2]   # just test a few ***

# override, if necessary:
#new_activities = []

print(new_activities)

['C4MIP/BCC', 'C4MIP/CCCma', 'C4MIP/CNRM-CERFACS', 'C4MIP/CSIRO', 'C4MIP/IPSL', 'C4MIP/MIROC', 'C4MIP/MOHC', 'C4MIP/MPI-M', 'C4MIP/MRI', 'C4MIP/NASA-GISS', 'C4MIP/NCAR', 'C4MIP/NCC', 'C4MIP/NOAA-GFDL', 'CMIP/AS-RCEC', 'CMIP/AWI', 'CMIP/BCC', 'CMIP/CAMS', 'CMIP/CAS', 'CMIP/CCCR-IITM', 'CMIP/CCCma', 'CMIP/CMCC', 'CMIP/CNRM-CERFACS', 'CMIP/CSIRO', 'CMIP/CSIRO-ARCCSS', 'CMIP/E3SM-Project', 'CMIP/EC-Earth-Consortium', 'CMIP/FIO-QLNM', 'CMIP/HAMMOZ-Consortium', 'CMIP/INM', 'CMIP/IPSL', 'CMIP/KIOST', 'CMIP/MIROC', 'CMIP/MOHC', 'CMIP/MPI-M', 'CMIP/MRI', 'CMIP/NASA-GISS', 'CMIP/NCAR', 'CMIP/NCC', 'CMIP/NIMS-KMA', 'CMIP/NOAA-GFDL', 'CMIP/NUIST', 'CMIP/SNU', 'CMIP/THU', 'CMIP/UA', 'ScenarioMIP/AS-RCEC', 'ScenarioMIP/AWI', 'ScenarioMIP/BCC', 'ScenarioMIP/CAMS', 'ScenarioMIP/CAS', 'ScenarioMIP/CCCR-IITM', 'ScenarioMIP/CCCma', 'ScenarioMIP/CMCC', 'ScenarioMIP/CNRM-CERFACS', 'ScenarioMIP/CSIRO', 'ScenarioMIP/CSIRO-ARCCSS', 'ScenarioMIP/DKRZ', 'ScenarioMIP/DWD', 'ScenarioMIP/E3SM-Project', 'ScenarioMI

In [6]:
# Anonymous, read-only handle on the Google Cloud Storage bucket
fs = gcsfs.GCSFileSystem(token='anon',access='read_only',cache_timeout=-1)

# The listings are written below GC/. Create it up front so that a missing
# directory is not misreported as every activity being "skipped".
os.makedirs('GC', exist_ok=True)

# For each activity found locally, list the consolidated-metadata files in the
# bucket and save one listing file per activity (one path per line).
for activity_id in new_activities:
    print(activity_id)
    aname = activity_id.replace('/','-')
    GCfile = f'GC/GC_files_{aname}.txt'
    try:
        flist = fs.glob(f'gs://cmip6/{activity_id}/**/.zmetadata')
        with open(GCfile, "w") as file:
            file.write('\n'.join(flist))
    except Exception as err:
        # Best effort: keep going with the other activities, but say why this one
        # failed. Exception (not a bare except) lets Ctrl-C interrupt the loop.
        print(f'skipping {activity_id}', repr(err))

C4MIP/BCC
C4MIP/CCCma
C4MIP/CNRM-CERFACS
C4MIP/CSIRO
C4MIP/IPSL
C4MIP/MIROC
C4MIP/MOHC
C4MIP/MPI-M
C4MIP/MRI
C4MIP/NASA-GISS
C4MIP/NCAR
C4MIP/NCC
C4MIP/NOAA-GFDL
CMIP/AS-RCEC
CMIP/AWI
CMIP/BCC
CMIP/CAMS
CMIP/CAS
CMIP/CCCR-IITM
CMIP/CCCma
CMIP/CMCC
CMIP/CNRM-CERFACS
CMIP/CSIRO
CMIP/CSIRO-ARCCSS
CMIP/E3SM-Project
CMIP/EC-Earth-Consortium
CMIP/FIO-QLNM
CMIP/HAMMOZ-Consortium
CMIP/INM
CMIP/IPSL
CMIP/KIOST
CMIP/MIROC
CMIP/MOHC
CMIP/MPI-M
CMIP/MRI
CMIP/NASA-GISS
CMIP/NCAR
CMIP/NCC
CMIP/NIMS-KMA
CMIP/NOAA-GFDL
CMIP/NUIST
CMIP/SNU
CMIP/THU
CMIP/UA
ScenarioMIP/AS-RCEC
ScenarioMIP/AWI
ScenarioMIP/BCC
ScenarioMIP/CAMS
ScenarioMIP/CAS
ScenarioMIP/CCCR-IITM
ScenarioMIP/CCCma
ScenarioMIP/CMCC
ScenarioMIP/CNRM-CERFACS
ScenarioMIP/CSIRO
ScenarioMIP/CSIRO-ARCCSS
ScenarioMIP/DKRZ
ScenarioMIP/DWD
ScenarioMIP/E3SM-Project
ScenarioMIP/EC-Earth-Consortium
ScenarioMIP/FIO-QLNM
ScenarioMIP/HAMMOZ-Consortium
ScenarioMIP/INM
ScenarioMIP/IPSL
ScenarioMIP/KIOST
ScenarioMIP/MIROC
ScenarioMIP/MOHC
ScenarioMIP/MPI-M

In [7]:
# A. Make new noQC catalog
#    1. Collect listings of all files in GC to make dataframe 'dz_GC'
#       b. read in list of zarr stores and turn into df with 8-tuple dataset id

DATASET_ID_COLUMNS = ["activity_id", "institution_id", "source_id", "experiment_id",
                      "member_id", "table_id", "variable_id", "grid_label"]


def store_listing_to_frame(listing):
    """Turn '.zmetadata' paths into a dataframe keyed by the 8-part dataset id.

    Parameters
    ----------
    listing : iterable of str
        Paths ending in '/.zmetadata', without the 'gs://' prefix. Blank entries
        (e.g. from an empty listing file or a trailing newline) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per store with the DATASET_ID_COLUMNS, 'zstore' ('gs://' + the path
        up to '.zmetadata') and 'dcpp_init_year'. For member ids starting with 's',
        'dcpp_init_year' is the number between the 's' and the first '-' and
        'member_id' keeps only the part after the last '-'. Otherwise
        'dcpp_init_year' is NaN and 'member_id' is unchanged.

    Raises
    ------
    ValueError
        If a path does not provide eight id components, or if its last two
        directory names are identical (a store nested inside itself).
    """
    records = []
    for tdir in listing:
        if not tdir.strip():
            continue
        store = 'gs://' + tdir.split('.zmetadata')[0]
        # the eight directories directly above '.zmetadata'
        vlist = tdir.split('/')[-9:-1]
        if len(vlist) != len(DATASET_ID_COLUMNS) or vlist[-2] == vlist[-1]:
            raise ValueError(f'must fix: {tdir}')
        records.append(vlist + [store])

    frame = pd.DataFrame(records, columns=DATASET_ID_COLUMNS + ["zstore"])

    raw_member = frame["member_id"]
    frame["dcpp_init_year"] = raw_member.map(
        lambda x: float(x.split("-")[0][1:]) if x.startswith("s") else np.nan)
    frame["member_id"] = raw_member.map(
        lambda x: x.split("-")[-1] if x.startswith("s") else x)
    return frame


GCfiles = sorted(glob('GC/GC_files_*.txt'))
files = []
for GCfile in GCfiles:
    with open(GCfile, "r") as file:
        # splitlines() does not produce a spurious '' entry for an empty file
        files += file.read().splitlines()

dz_GC = store_listing_to_frame(files)

In [8]:
# A. Make new noQC catalog
#    2. read dz_exclude from errata files

dz_exclude = pd.read_csv('csv/errata-files.csv')


def _errata_zstore(file_id):
    """Map an errata file_id to the matching 'gs://cmip6/.../' store path."""
    # Keep what follows 'CMIP6.' and cut the last 9 characters
    # (presumably a version suffix -- TODO confirm against the errata file).
    dotted = file_id.split('CMIP6.')[-1][:-9]
    return 'gs://cmip6/' + dotted.replace('.','/') + '/'


dz_exclude['zstore'] = [_errata_zstore(s) for s in dz_exclude.file_id]

# vstore = zstore + 'v' + version identifies one specific version of a store
dz_exclude['vstore'] = [z + 'v' + str(v)
                        for z, v in zip(dz_exclude.zstore, dz_exclude.version)]
len(dz_exclude)

  interactivity=interactivity, compiler=compiler, result=result)


164212

In [9]:
# A. Make new noQC catalog
#    3. read in dz_old from old noQC to get known versions from old catalog

# dtype='unicode' reads every column of the published catalog as text
dz_old = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)

# Left join: every store currently listed in GC is kept, and stores that are
# absent from the old catalog get a missing version.
known_versions = dz_old[['zstore','version']]
dz_new = dz_GC.merge(known_versions, how='left', on='zstore', sort=False)

In [10]:
# A.
#   4a. update version if store has been replaced by a newer version

df_newversions = pd.read_csv('csv/newversions.csv', dtype='unicode')

# zstore -> replacement version. If a store is listed more than once the last row
# wins, which is what applying the rows one after another would give.
replacement_version = dict(zip(df_newversions.zstore, df_newversions.version))

# A listed store that is not in dz_new used to abort the cell with a bare
# IndexError; report such stores by name instead and carry on with the rest.
known_stores = set(dz_new.zstore)
unknown_stores = [z for z in replacement_version if z not in known_stores]
if unknown_stores:
    print('newversions.csv lists stores that are not in dz_new (ignored):', unknown_stores)

# Overwrite the version of every listed store in one pass. The listed version is
# written whether or not it differs from the current one.
replaced = dz_new.zstore.isin(list(replacement_version))
dz_new.loc[replaced, 'version'] = dz_new.loc[replaced, 'zstore'].map(replacement_version)

In [11]:
### try write using list comprehension

# A. Make new noQC catalog
#    4b. fix ambiguous or missing versions in dz_new

version = []

for zstore, ver in zip(dz_new.zstore, dz_new.version):
    # Only stores without a usable version are looked up again
    if ver == 'ambiguous' or pd.isna(ver):
        try:
            tracks = gsurl2tracks(zstore)
            (newver,jdict) = tracks2version(tracks,verbose=False)
            ver = newver
        except Exception as err:
            # Best effort: flag the store and keep going, but show the cause.
            # Catching Exception (not everything) lets Ctrl-C stop the loop.
            print('trouble determining version for ',zstore, repr(err))
            ver = 'ambiguous'

    version.append(ver)

dz_new['version'] = version

hdl:21.14100/a0c3ebf8-d91a-39b8-8fbf-fc3f9186c61f;hdl:21.14100/8c32f8e1-91fa-3760-b76e-431716c66902
hdl:21.14100/13f935da-282d-3a37-a0d8-672076e8f069;hdl:21.14100/354bc274-4007-347b-ae24-089107a0fe62
hdl:21.14100/89f5aef1-2ac0-3fde-b527-320946e44a99;hdl:21.14100/5e034560-81af-3acb-9fb9-9af76c85f76d
hdl:21.14100/806dca52-aafe-3859-8be4-9df7717dbcf8;hdl:21.14100/a516d5db-9274-3363-9649-e771cc109d94
hdl:21.14100/b7e961e9-2f67-388a-b47d-4b43892a132b;hdl:21.14100/0d3ce68b-a421-33c3-84a9-199f18da8f39

*** Newer version exists, see: http://hdl.handle.net/hdl:21.14100/fbde07e0-6472-3cfd-ad7c-16c3d08512ee


*** Newer version exists, see: http://hdl.handle.net/hdl:21.14100/6dc8fe35-5d00-3471-84b4-c51905f11372


*** Newer version exists, see: http://hdl.handle.net/hdl:21.14100/bbc59391-58eb-3c41-a73b-6854dd612719

hdl:21.14100/f4148f37-4960-3f95-acb9-ed71c68ed395;hdl:21.14100/97f86fb1-af36-38e7-a9a3-24f5aa8d3a3e
hdl:21.14100/09a84e98-2f5e-38fb-b777-da08b6970bd5;hdl:21.14100/f0bb1ff8-e1cd-32e2-a06

In [12]:
# A. Make new noQC catalog
#    5. using vstore=zstore+version, update dz_new with status, severity and url columns

# combine zstore and version
dz_new['vstore'] = [z + 'v' + str(v) for z, v in zip(dz_new.zstore, dz_new.version)]

# Find vstores (= zstore+version) in dz_GC which have issues at ES-DOC
set_A = set(dz_new.vstore)
set_B = set(dz_exclude.vstore)

in_both = sorted(set_A & set_B)
print(len(set_A),len(set_B),len(in_both))

# One dictionary lookup per row instead of searching a list and re-filtering
# dz_exclude for every store. When a vstore has several errata rows the first
# one is used.
first_issue = dz_exclude.drop_duplicates(subset='vstore', keep='first').set_index('vstore')
issue_lookup = first_issue[['status','severity','issue_url']].to_dict(orient='index')
no_issue = {'status': 'good', 'severity': 'none', 'issue_url': 'none'}

matched = [issue_lookup.get(vstore, no_issue) for vstore in dz_new.vstore]
status = [m['status'] for m in matched]
severity = [m['severity'] for m in matched]
url = [m['issue_url'] for m in matched]

dz_new['status'] = status
dz_new['severity'] = severity
dz_new['issue_url'] = url

401937 164212 11485


In [13]:
# MAKE SURE NEW CATALOG IS LARGER THAN OLD
# (stops the notebook here, before anything is saved or uploaded, if it is not)
n_new, n_old = len(dz_new), len(dz_old)
print(n_new, n_old)
assert n_new > n_old

401937 401452


In [14]:
# A. Make new noQC catalog
#    6a. save old noQC catalog and then upload to GC

date = datetime.datetime.now().strftime("%Y%m%d")
old_cat = f'old_catalogs/pangeo-cmip6-{date}-noQC.csv'

# Copy, compress and upload the current catalog. The status of every step is
# checked and the sequence stops at the first failure, so a broken backup is
# reported before the catalog is overwritten further down.
backup_steps = [
    f'cp csv/pangeo-cmip6-noQC.csv {old_cat}',
    f'gzip {old_cat}',
    f'/usr/bin/gsutil -m cp {old_cat}.gz gs://cmip6/{old_cat}.gz',
]
for cmd in backup_steps:
    ret = os.system(cmd)
    if ret != 0:
        print(f'noQC backup not working (status {ret}): {cmd}')
        break

In [15]:
#    6b. save new noQC catalog and then upload to GC

# vstore is a working column and is not written out. The column is dropped by
# keyword because the positional axis argument of DataFrame.drop was removed in pandas 2.0.
dz_new.drop(columns=['vstore']).to_csv('csv/pangeo-cmip6-noQC.csv', mode='w+', index=False)

ret = os.system('/usr/bin/gsutil -m cp csv/pangeo-cmip6-noQC.csv gs://cmip6/cmip6-zarr-consolidated-stores-noQC.csv')
if ret != 0:
    print('noQC upload not working')

In [16]:
# B. Make new standard catalog
#    1. eliminate harmless issues from dz_exclude
#       these should all be properly evaluated - I just made a first guess

# Issues treated as harmless (first guess, see above)
harmless_issue_uids = [
    'b6302400-3620-c8f1-999b-d192c0349084',
    '45f9e7b9-1844-7a92-8b54-10d954e621db',
    '4aa40e49-b2d4-0b29-a6b1-c80ee8dce11a',
    '2f6b5963-f87e-b2df-a5b0-2f12b6b68d32',
    '61fb170e-91bb-4c64-8f1d-6f5e342ee421',
    '90cac29b-eaff-c450-8621-ea31e305a40e',
    '8fbd8df5-c349-315b-9ec3-5a2f4ec4ec63',
    'ad5ca671-39d0-39ed-bf4f-6c8fb1a06047',
]

# Keep only issues that are unresolved, not low severity and not listed above
still_serious = (
    (dz_exclude.status != 'resolved')
    & (dz_exclude.severity != 'low')
    & ~dz_exclude.issue_uid.isin(harmless_issue_uids)
)
dz_exclude = dz_exclude[still_serious]

In [17]:
# B. Make new standard catalog
#    2. use this (smaller) list of issues to eliminate the more serious issues from standard catalog

# Find zstores in both:
set_A = set(dz_new.vstore)
set_B = set(dz_exclude.vstore)

in_both = sorted(set_A & set_B)
print(len(set_A),len(set_B),len(in_both))

dfz = dz_new.copy()
# isin() hashes the candidates once instead of scanning the list for every row
dfz['issue'] = dfz.vstore.isin(in_both)
dz_issues = dfz[dfz.issue]
dz_clean  = dfz[~dfz.issue]

# The standard catalog is exactly the rows without a serious issue. Taking them
# directly replaces the concat/drop_duplicates(keep=False) round trip, which would
# also have discarded any clean row that happened to appear twice.
# Columns are dropped by keyword (positional axis was removed in pandas 2.0).
dz_orig = dz_clean.drop(columns=['issue','vstore'])

401937 61783 4737


In [18]:
# MAKE SURE NEW CATALOG IS LARGER THAN OLD
# (compares against the currently published standard catalog)
dfcat = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv')
n_candidate, n_published = len(dz_orig), len(dfcat)
print(n_candidate, n_published)
assert n_candidate > n_published

397200 396709


  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
# B. Make new standard catalog
#    3a. save old standard catalog and then upload to GC

date = datetime.datetime.now().strftime("%Y%m%d")
old_cat = f'old_catalogs/pangeo-cmip6-{date}.csv'

# Copy, compress and upload the current catalog. The status of every step is
# checked and the sequence stops at the first failure, so a broken backup is
# reported before the catalog is overwritten further down.
backup_steps = [
    f'cp csv/pangeo-cmip6.csv {old_cat}',
    f'gzip {old_cat}',
    f'/usr/bin/gsutil -m cp {old_cat}.gz gs://cmip6/{old_cat}.gz',
]
for cmd in backup_steps:
    ret = os.system(cmd)
    if ret != 0:
        print(f'standard catalog backup not working (status {ret}): {cmd}')
        break

In [20]:
# B. Make new standard catalog
#    3b. save new standard catalog and then upload to GC

# The errata columns are left out of the standard catalog. They are dropped by
# keyword because the positional axis argument of DataFrame.drop was removed in pandas 2.0.
dz_orig.drop(columns=['status','severity','issue_url']).to_csv('csv/pangeo-cmip6.csv', mode='w+', index=False)

# The same file is published under two object names
uploads = [
    ('gs://cmip6/cmip6-zarr-consolidated-stores.csv', 'standard catalog upload not working'),
    ('gs://cmip6/pangeo-cmip6.csv', 'duplicate standard catalog upload not working'),
]
for target, failure_message in uploads:
    ret = os.system(f'/usr/bin/gsutil -m cp csv/pangeo-cmip6.csv {target}')
    if ret != 0:
        print(failure_message)

In [21]:
# Show the zarr directories that were searched for activities in this run
print(new_drives)

['/h119/naomi/zarr-minimal', '/net/abbey/a4/naomi/zarr-minimal', '/h122/naomi/zarr-minimal', '/net/carney/d3/naomi/zarr-minimal', '/h121/naomi/zarr-minimal']
