In [1]:
import s3fs
import pandas as pd 
import xarray as xr
from glob import glob
import numpy as np
import os
import datetime, calendar, time

In [2]:
# Shared S3 filesystem handle used by the listing cells below; anon=True requests
# anonymous (credential-free) access.
fs = s3fs.S3FileSystem(anon=True)

In [3]:
from myutilities import search_df

In [4]:
if False:
    # Disabled cell: writes one listing file per sub-directory of the DCPP prefix.
    # Change the condition to True to regenerate the S3_ours/S3_DCPP-*.txt files.
    tdir = 's3://cmip6-pds/DCPP/'
    acts = fs.ls(tdir)
    # last path component of every entry, de-duplicated and sorted
    activity_ids = sorted({entry.split('/')[-1] for entry in acts})
    print(activity_ids)
    for activity_id in activity_ids:
        aname = activity_id.replace('/','-')
        GCfile = f'S3_ours/S3_DCPP-{aname}.txt'
        print('making:',GCfile)
        # every consolidated-metadata file below this sub-directory marks one zarr store
        path = f's3://cmip6-pds/DCPP/{aname}/**/.zmetadata'
        flist = fs.glob(path)
        print(activity_id,len(flist))
        listing_text = '\n'.join(flist)
        with open(GCfile, "w") as file:
            file.write(listing_text)

In [5]:
# List the top level of the CMIP6 prefix; every entry except 'old_catalogs'
# is treated as an activity to be scanned.
tdir = 's3://cmip6-pds/CMIP6/'
acts = fs.ls(tdir)
activity_ids = sorted({entry.split('/')[-1] for entry in acts})
activity_ids.remove('old_catalogs')
print(activity_ids)
# To rescan a single activity only, override the list here, e.g.:
#activity_ids = ['PAMIP']

['AerChemMIP', 'C4MIP', 'CDRMIP', 'CFMIP', 'CMIP', 'DAMIP', 'FAFMIP', 'GMMIP', 'HighResMIP', 'LS3MIP', 'LUMIP', 'OMIP', 'PAMIP', 'PMIP', 'RFMIP', 'ScenarioMIP']


In [6]:
# For every activity, glob all .zmetadata files below it and save the paths,
# one per line, to S3_ours/S3_<activity>.txt for the catalog-building cell.
for activity_id in activity_ids:
    aname = activity_id.replace('/','-')
    GCfile = f'S3_ours/S3_{aname}.txt'
    print('making:',GCfile)
    try:
        path = f's3://cmip6-pds/CMIP6/{activity_id}/**/.zmetadata'
        flist = fs.glob(path)
        print(activity_id,len(flist))
        # the listing file is only opened (and truncated) after the glob succeeded
        with open(GCfile, "w") as file:
            file.write('\n'.join(flist))
    except Exception as err:
        # Best effort: keep going with the next activity, but report why this one
        # failed. `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f'skipping {activity_id}: {err!r}')

making: S3_ours/S3_AerChemMIP.txt
AerChemMIP 8879
making: S3_ours/S3_C4MIP.txt
C4MIP 2992
making: S3_ours/S3_CDRMIP.txt
CDRMIP 778
making: S3_ours/S3_CFMIP.txt
CFMIP 2970
making: S3_ours/S3_CMIP.txt
CMIP 125959
making: S3_ours/S3_DAMIP.txt
DAMIP 22740
making: S3_ours/S3_FAFMIP.txt
FAFMIP 420
making: S3_ours/S3_GMMIP.txt
GMMIP 960
making: S3_ours/S3_HighResMIP.txt
HighResMIP 1604
making: S3_ours/S3_LS3MIP.txt
LS3MIP 187
making: S3_ours/S3_LUMIP.txt
LUMIP 1119
making: S3_ours/S3_OMIP.txt
OMIP 329
making: S3_ours/S3_PAMIP.txt
PAMIP 42619
making: S3_ours/S3_PMIP.txt
PMIP 253
making: S3_ours/S3_RFMIP.txt
RFMIP 4731
making: S3_ours/S3_ScenarioMIP.txt
ScenarioMIP 138233


In [7]:
# A. Make new catalog
#    1. Collect the saved listings of all .zmetadata files to make dataframe 'dz'
#       b. read in the list of zarr stores and turn it into a dataframe with one
#          column per path component plus the store URL

S3files = sorted(glob('S3_ours/S3_*.txt'))
zmetas = []
for S3file in S3files:
    with open(S3file, "r") as file:
        # Skip blank lines: an empty listing file or a trailing newline would
        # otherwise yield a '' entry and a bogus, too-short row further down.
        zmetas += [line.strip() for line in file if line.strip()]

if not zmetas:
    raise RuntimeError("no .zmetadata paths found in 'S3_ours/S3_*.txt'; run the listing cells first")

ddict = {}
for item, tdir in enumerate(zmetas):
    # path of the store relative to the bucket, without the trailing '/.zmetadata'
    vstore = tdir.split('/.zmetadata')[0].split('cmip6-pds/')[-1]
    if 'DCPP' in tdir:
        # DCPP paths get a placeholder version component appended here;
        # it is stripped from zstore again below
        vstore = vstore + '/v00000000'
    # keep the last nine path components (activity ... version)
    vlist = vstore.split('/')[-9:]
    zstore = 's3://cmip6-pds/'+vstore+'/'
    vlist += [zstore]
    ddict[item] = vlist


dz = pd.DataFrame.from_dict(ddict, orient='index')
dz = dz.rename(columns={0: "activity_id", 1: "institution_id", 2:"source_id",
                        3:"experiment_id",4:"member_id",5:"table_id",6:"variable_id",
                        7:"grid_label",8:"version",9:"zstore"}) #,12:"size"})

# member ids starting with 's' are split at '-': the digits after the 's' become
# dcpp_init_year (NaN for all other members) and the last part becomes member_id
dz["dcpp_init_year"] = dz.member_id.map(lambda x: float(x.split("-")[0][1:] if x.startswith("s") else np.nan))
dz["member_id"] = dz["member_id"].map(lambda x: x.split("-")[-1] if x.startswith("s") else x)
# drop the leading character of the version component ('v20200310' -> '20200310')
dz["version"] = dz["version"].map(lambda x: x[1:])
# remove the 10-character placeholder 'v00000000/' that was appended to DCPP stores
dz["zstore"] = dz["zstore"].map(lambda x: x[:-10] if x.startswith("s3://cmip6-pds/DCPP") else x)

dz = dz[['activity_id','institution_id','source_id','experiment_id','member_id','table_id','variable_id'
         ,'grid_label','zstore','dcpp_init_year','version']]

In [8]:
# A. Make new noQC catalog
#    2. read dz_exclude from the errata file; it is combined with the store
#       listing further down to produce the two catalogs

dz_exclude = pd.read_csv('csv/errata-files.csv')

# Turn each file_id into a store URL shaped like dz.zstore:
# '#' becomes '/v', every '.' becomes '/', and the bucket prefix and a trailing '/' are added.
dz_exclude['zstore'] = dz_exclude.file_id.map(
    lambda file_id: 's3://cmip6-pds/' + file_id.replace('#','/v').replace('.','/') + '/'
)

# show one URL from each table side by side to eyeball that the formats agree
dz_exclude.zstore.values[0], dz.zstore.values[0]

  interactivity=interactivity, compiler=compiler, result=result)


('s3://cmip6-pds/CMIP6/DAMIP/NASA-GISS/GISS-E2-1-G/hist-nat/r2i1p1f1/AERmon/bldep/gn/v20180906/',
 's3://cmip6-pds/CMIP6/AerChemMIP/AS-RCEC/TaiESM1/histSST/r1i1p1f1/AERmon/od550aer/gn/v20200310/')

In [9]:
# A. Make new noQC catalog
#    5. add status, severity and issue_url columns to dz, taken from the errata
#       table for every store that has an entry there

# zstores in the listing (A), in the errata table (B), and in both
set_A = set(dz.zstore.values)
set_B = set(dz_exclude.zstore.values)

in_both = sorted(set_A & set_B)
print(len(set_A),len(set_B),len(in_both))

# One pass over the errata table, remembering the FIRST entry per zstore. This is
# the same row the previous per-row mask lookup (`...values[0]`) selected, without
# rescanning the errata table for every matching store.
first_issue = {}
for zstore, *issue in zip(dz_exclude.zstore.values,
                          dz_exclude.status.values,
                          dz_exclude.severity.values,
                          dz_exclude.issue_url.values):
    first_issue.setdefault(zstore, issue)

# stores without an errata entry get these placeholder values
no_issue = ['good', 'none', 'none']
matched = [first_issue.get(zstore, no_issue) for zstore in dz.zstore.values]

status = [entry[0] for entry in matched]
severity = [entry[1] for entry in matched]
url = [entry[2] for entry in matched]

dz['status'] = status
dz['severity'] = severity
dz['issue_url'] = url

497701 168186 12986


In [10]:
# Work on a copy so the columns added below do not modify dz itself
dz_s3 = dz.copy()
# number of rows in the copy
len(dz_s3)

497701

In [11]:
import myconfig

# Render one ds_dir string per row by %-formatting myconfig.target_format with the row
# (presumably a template with %(column)s placeholders; confirm in myconfig)
def _render_ds_dir(record):
    """Return myconfig.target_format %-formatted with the given row."""
    return myconfig.target_format % record

dz_s3['ds_dir'] = dz_s3.apply(_render_ds_dir, axis=1)

is_dcpp = dz_s3.activity_id == 'DCPP'
not_dcpp = dz_s3.activity_id != 'DCPP'
dz_DCPP = dz_s3[is_dcpp]

# Non-DCPP rows: after sorting by version, keep only the last row of every ds_dir
dz_nDCPP = (
    dz_s3[not_dcpp]
    .sort_values(by=['version'])
    .drop_duplicates(subset=["ds_dir"], keep='last')
)

# DCPP rows are appended without de-duplication
dz_new = pd.concat([dz_nDCPP, dz_DCPP])
len(dz_new)

496165

In [12]:
# Sanity check before overwriting: the rebuilt noQC catalog must have at least
# as many rows as the one currently on disk.
dz_old = pd.read_csv('csv/s3_pangeo-cmip6-noQC.csv', dtype='unicode')

n_rebuilt, n_on_disk = len(dz_new), len(dz_old)
print(n_rebuilt,n_on_disk)
assert n_rebuilt >= n_on_disk

496165 491362


In [13]:
#    6b. save the new noQC catalog (it is uploaded in the last cells)
noqc_columns = [
    'activity_id', 'institution_id', 'source_id', 'experiment_id',
    'member_id', 'table_id', 'variable_id', 'grid_label',
    'zstore', 'dcpp_init_year', 'version', 'status', 'severity', 'issue_url',
]
noqc_sort_keys = ['activity_id','source_id','experiment_id','member_id']

# select and order the published columns, then sort the rows
dz_new = dz_new[noqc_columns].sort_values(by=noqc_sort_keys)
dz_new.to_csv('csv/s3_pangeo-cmip6-noQC.csv', mode='w+', index=False)

In [14]:
# B. Make new standard catalog
#    1. eliminate harmless issues from dz_exclude
#       these should all be properly evaluated - this selection is only a first guess

# issues hand-picked as harmless, identified by their issue_uid
harmless_issue_uids = [
    'b6302400-3620-c8f1-999b-d192c0349084',
    '45f9e7b9-1844-7a92-8b54-10d954e621db',
    '4aa40e49-b2d4-0b29-a6b1-c80ee8dce11a',
    '2f6b5963-f87e-b2df-a5b0-2f12b6b68d32',
    '61fb170e-91bb-4c64-8f1d-6f5e342ee421',
    '90cac29b-eaff-c450-8621-ea31e305a40e',
    '8fbd8df5-c349-315b-9ec3-5a2f4ec4ec63',
    'ad5ca671-39d0-39ed-bf4f-6c8fb1a06047',
]

# keep an erratum only if it is unresolved, not low severity, and not hand-picked as harmless
still_relevant = (
    (dz_exclude.status != 'resolved')
    & (dz_exclude.severity != 'low')
    & ~dz_exclude.issue_uid.isin(harmless_issue_uids)
)
dz_exclude = dz_exclude[still_relevant]

In [15]:
# B. Make new standard catalog
#    2. use this (smaller) list of issues to eliminate the more serious issues
#       from the standard catalog

# zstores in the noQC catalog (A), in the remaining errata (B), and in both
set_A = set(dz_new.zstore.values)
set_B = set(dz_exclude.zstore.values)

in_both = sorted(set_A & set_B)
print(len(set_A),len(set_B),len(in_both))

dfz = dz_new.copy()
# hashed membership test instead of scanning the in_both list once per row
dfz['issue'] = dfz.zstore.isin(in_both)
dz_issues = dfz[dfz.issue]
dz_clean  = dfz[~dfz.issue]

# Same rows as the former concat([dfz, dz_issues, dz_issues]).drop_duplicates(keep=False):
# every flagged row is gone, and keep=False additionally discards all copies of any
# fully duplicated clean row, exactly as that construct did.
dz_orig = dz_clean.drop_duplicates(keep=False)
# `columns=` keyword: the positional axis argument of drop() is not accepted by pandas >= 2.0
dz_orig = dz_orig.drop(columns=['issue'])

496165 60528 5345


In [16]:
# Sanity check before overwriting: the rebuilt standard catalog must have at
# least as many rows as the one currently on disk.
dfcat = pd.read_csv('csv/s3_pangeo-cmip6.csv')

n_candidate, n_published = len(dz_orig), len(dfcat)
print(n_candidate,n_published)
assert n_candidate >= n_published

490820 486021


In [17]:
# B. Make new standard catalog
#    3b. save the new standard catalog (it is uploaded in the last cells)

# The standard catalog is written without the errata columns.
# `columns=` keyword: the positional axis argument of drop() is not accepted by pandas >= 2.0
dz_orig.drop(columns=['status','severity','issue_url']).to_csv('csv/s3_pangeo-cmip6.csv', mode='w+', index=False)

In [18]:
# Keep a dated, gzipped snapshot of both catalogs locally (old_catalogs_s3/) and
# upload it below s3://cmip6-pds/CMIP6/old_catalogs/.

def _run(cmd):
    """Run `cmd` through the shell, print a notice if it exits non-zero, and return the status."""
    status = os.system(cmd)
    if status != 0:
        print(f'command failed with status {status}: {cmd}')
    return status

date = str(datetime.datetime.now().strftime("%Y%m%d"))

# (catalog written by the cells above, file name of its dated snapshot)
snapshots = [
    ('csv/s3_pangeo-cmip6-noQC.csv', f'pangeo-cmip6-{date}-noQC.csv'),
    ('csv/s3_pangeo-cmip6.csv', f's3_pangeo-cmip6-{date}.csv'),
]

for catalog_csv, snapshot_name in snapshots:
    old_cat_loc = f'old_catalogs_s3/{snapshot_name}'
    old_cat = f'old_catalogs/{snapshot_name}'

    ret = _run(f'cp {catalog_csv} {old_cat_loc}')
    if ret == 0:
        # gzip replaces the copy with <copy>.gz, so no separate rm is needed.
        # -f overwrites a .gz left by an earlier run on the same day; without it
        # gzip refuses and the stale archive would be uploaded.
        ret = _run(f'gzip -f {old_cat_loc}')
    if ret == 0:
        # only upload when the snapshot was actually (re)created
        ret = _run(f'/usr/local/bin/aws s3 cp {old_cat_loc}.gz s3://cmip6-pds/CMIP6/{old_cat}.gz')

In [19]:
# Publish the current catalogs at the bucket root. Each upload's exit status is
# checked right after the call so that no failure is masked by a later command.

ret = os.system('/usr/local/bin/aws s3 cp csv/s3_pangeo-cmip6-noQC.csv s3://cmip6-pds/pangeo-cmip6-noQC.csv')
if ret != 0:
    print('noQC upload not working')

# The noQC catalog is also published gzipped: compress in place, upload,
# then restore the uncompressed file.
os.system(f'gzip csv/s3_pangeo-cmip6-noQC.csv')
ret = os.system('/usr/local/bin/aws s3 cp csv/s3_pangeo-cmip6-noQC.csv.gz s3://cmip6-pds/pangeo-cmip6-noQC.csv.gz')
os.system(f'gunzip csv/s3_pangeo-cmip6-noQC.csv.gz')
if ret != 0:
    print('noQC gzip upload not working')

ret = os.system('/usr/local/bin/aws s3 cp csv/s3_pangeo-cmip6.csv s3://cmip6-pds/pangeo-cmip6.csv')
if ret != 0:
    # this is the standard (QC'd) catalog, not the noQC one
    print('standard catalog upload not working')