In [None]:
import pandas as pd
import numpy as np
import sys, os
from glob import glob
import xarray as xr
import datetime
import warnings
import gcsfs
import datetime

In [None]:
# local modules
from identify import get_version

In [None]:
# Stamp the notebook with the wall-clock time of this run so it is clear
# when it was last executed.
run_started = datetime.datetime.now()
date = run_started.strftime("%m/%d/%Y, %H:%M:%S")
print(date)

In [None]:
# Are there extra local drives with new zarr stores?
# When True, later cells look for stores under /hNN/naomi/zarr-minimal and
# compare the local shelf listings against the cloud catalog; when False only
# the single default drive is scanned and the comparison is skipped.

#local_storage = True
local_storage = False

In [None]:
# A. Make new noQC catalog
#    1. Collect listings of all files in GC to make dataframe 'dz_GC'
#       a. get all 'activities' = 'activity_id/institution_id'

# Anonymous, read-only handle on the public bucket.
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')

# Two-level listing: the bucket root, then the children of every root entry.
# (The loop variable is no longer called `dir`, which shadowed the builtin.)
dirs = fs.ls('gs://cmip6')
acts = []
for top_level in dirs:
    acts += fs.ls('gs://' + top_level)

# Entries whose path contains any of these substrings are skipped.
skip_markers = ('CMIP3', 'CMIP5', 'tracmip', 'csv', 'json', 'GFDL_CM2_6')
bucket_prefix = 'cmip6/'

activities_all = []
for act in acts:
    if any(marker in act for marker in skip_markers):
        continue

    # Drop the leading bucket name and a trailing '/' if there is one.
    # The previous slice [6:-1] removed the last character unconditionally,
    # which corrupts the name whenever the listing has no trailing slash.
    activities_all += [act[len(bucket_prefix):].rstrip('/')]

# Full refresh of every per-activity listing; deliberately disabled.
# Flip the condition by hand to regenerate all ncsv/GC_files_*.csv files.
if False:
    for activity_id in activities_all:
        #print(activity_id)
        os.system(f"/usr/bin/gsutil -m ls gs://cmip6/{activity_id}/**/.zmetadata > ncsv/GC_files_{activity_id.replace('/','-')}.csv")

In [None]:
#       b. figure out which file listings need to be updated, update and concatenate

if local_storage:
    new_drives=['/h68','/h84','/h85']
    drives = [s + '/naomi/zarr-minimal' for s in new_drives]
else:
    drives = ['/d1/naomi/cmip6-zarrs']  # This must match location in nb1-DataRequests.ipynb

new_drives = list(set(drives))

# Every <drive>/<activity_id>/<institution_id> directory found on disk names
# an activity whose cloud listing may be stale.
new_activities = sorted({'/'.join(path.split('/')[-2:])
                         for drive in new_drives
                         for path in glob(drive + '/*/*')})

# override, if necessary:
# NOTE: this assignment replaces the list computed from the drives above.
new_activities = ['CMIP/EC-Earth-Consortium']
#new_activities=[]

print(new_activities)

# Refresh the listing of each selected activity. The shell redirect truncates
# the csv before gsutil runs, so a failed listing leaves an empty or partial
# file; report it instead of ignoring the exit status.
for activity_id in new_activities:
    print(activity_id)
    ret = os.system(f"/usr/bin/gsutil -m ls gs://cmip6/{activity_id}/**/.zmetadata > ncsv/GC_files_{activity_id.replace('/','-')}.csv")
    if ret != 0:
        print('listing not working for', activity_id)

# Concatenate all per-activity listings into the single file read by the next step.
ret = os.system("cat ncsv/GC_files_*.csv > ncsv/GC_files.csv")
if ret != 0:
    print('concatenation of listings not working')

In [None]:
#       c. read in list of zarr stores and turn into df with 8-tuple dataset id

df = pd.read_csv('ncsv/GC_files.csv',names=(['zstore']))
# The bare expression that used to sit here was discarded mid-cell; print it
# so the number of distinct listing entries is actually shown.
print(len(df.zstore.unique()))

files = df.zstore.values
column_names = ["activity_id", "institution_id", "source_id", "experiment_id",
                "member_id", "table_id", "variable_id", "grid_label", "zstore"]

rows = []
for tdir in files:
    # Store URL is everything before '.zmetadata'; the eight path components
    # just before the file name form the dataset id.
    store = tdir.split('.zmetadata')[0]
    vlist = tdir.split('/')[-9:-1]
    if vlist[-2] == vlist[-1]:
        # The last two path components must differ. An explicit raise is
        # used because `assert False` disappears under `python -O`.
        print('must fix:',tdir)
        raise ValueError(f'must fix: {tdir}')

    rows.append(vlist + [store])

# Passing the column names up front keeps the frame well-formed even when
# the listing is empty.
dz_GC = pd.DataFrame(rows, columns=column_names)

# member_id values of the form 's<year>-<member>' are split into a numeric
# dcpp_init_year and the plain member; all others get NaN and stay unchanged.
dz_GC["dcpp_init_year"] = dz_GC.member_id.map(
    lambda x: float(x.split("-")[0][1:]) if x.startswith("s") else np.nan)
dz_GC["member_id"] = dz_GC["member_id"].map(
    lambda x: x.split("-")[-1] if x.startswith("s") else x)

print(dz_GC.activity_id.count())

In [None]:
# If zarrs are stored locally, make sure all have been uploaded to GC and none are missing locally

# Defined up front so that the following cell, which displays `extra_cloud`,
# also works on a fresh kernel when local_storage is False.
extra_cloud = set()
extra_local = set()

if local_storage:
    dfcat = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')

    # Every drive name that may hold stores: /h1 ... /h85.
    all_drives = [f"/h{i}" for i in range(1,86)]

    drives = sorted(glob('/h[0-9]*/naomi/zarr-minimal'))

    # Drives currently mounted vs. the remainder of the known drive names.
    mounted_drives = sorted('/' + drive.split('/')[1] for drive in drives)
    shelf_drives = sorted(list(set(all_drives) - set(mounted_drives)))

    # Read one shelf listing per drive; a missing, unreadable or empty listing
    # is reported and skipped. Other errors (e.g. a malformed csv) are no
    # longer swallowed by a bare `except:`.
    dz = []
    for drive in all_drives:
        file = f"shelf-new{drive}.csv"
        try:
            dz += [pd.read_csv(file, dtype='unicode')]
            #print(file,len(dd))
        except (OSError, pd.errors.EmptyDataError):
            print(file,'Warning - file not yet created/drive does not exist')

    dz_all = pd.concat(dz)
    print(len(dz_all.zstore.unique()), len(dfcat.zstore.unique()))

    # Reduce both sides to the path relative to the bucket / drive root so
    # that they can be compared as sets.
    dfcat['path'] = [s.split('cmip6/')[-1].split('/.zmetadata')[0][:-1] for s in dfcat.zstore.values]
    dz_all['path']= [s.split('zarr-minimal/')[-1] for s in dz_all.zstore.values]
    extra_cloud = set(dfcat.path.unique()) - set(dz_all.path.unique())
    extra_local = set(dz_all.path.unique()) - set(dfcat.path.unique())

    if len(extra_cloud) > 0:
        print(len(extra_cloud),'extra files are in cloud, run ShelfDrive')
    if len(extra_local) > 0:
        print(len(extra_local),'extra files are in local, run UploadNew')

    dz_all.to_csv('shelf-new/local.csv', mode='w+', index=False)

In [None]:
# Show the store paths that the local-storage check found in the cloud
# catalog but not in the local shelf listings.
extra_cloud

In [None]:
# A. Make new noQC catalog
#    2. read dz_exclude from errata files

dz_exclude = pd.read_csv('csv/errata-files.csv')

# Turn each errata file_id into a bucket URL: keep what follows 'CMIP6.',
# drop the last 9 characters, swap dots for slashes and close with '/'.
dz_exclude['zstore'] = dz_exclude.file_id.map(
    lambda file_id: 'gs://cmip6/' + file_id.split('CMIP6.')[-1][:-9].replace('.','/') + '/')

# Find zstores in dz_GC which have issues at ES-DOC
set_A = set(dz_GC.zstore.values)
set_B = set(dz_exclude.zstore.values)

in_both = sorted(set_A & set_B)
print(len(set_A),len(set_B),len(in_both))

In [None]:
# A. Make new noQC catalog
#    3. read in dz_old from old noQC to get known versions from old catalog

dz_old = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv',
    dtype='unicode',
)

# Left join: every row of dz_GC is kept; stores absent from the old catalog
# end up with a missing version.
known_versions = dz_old[['zstore','version']]
dz_new = dz_GC.merge(known_versions, on='zstore', how='left', sort=False)

In [None]:
# A. Make new noQC catalog
#    4. make dz_new with status,severity, url and version columns

# One lookup entry per flagged zstore, taken from its first errata row (the
# same row the previous per-row filter `.values[0]` selected). Building it
# once avoids re-scanning dz_exclude for every flagged store, and deriving it
# from dz_exclude itself removes the dependency on a `set_B` that another cell
# reassigns.
issue_lookup = (dz_exclude.drop_duplicates(subset='zstore', keep='first')
                          .set_index('zstore')[['status', 'severity', 'issue_url']]
                          .to_dict(orient='index'))

status = []
severity = []
url = []
version = []
for zstore, ver in zip(dz_new.zstore.values, dz_new.version.values):
    issue = issue_lookup.get(zstore)
    if issue is not None:
        status.append(issue['status'])
        severity.append(issue['severity'])
        url.append(issue['issue_url'])
    else:
        # No errata entry for this store.
        status.append('good')
        severity.append('none')
        url.append('none')

    # Stores without a version from the old catalog are resolved through
    # the local helper.
    if pd.isna(ver):
        (ds_id,ver) = get_version(zstore)
    version.append(ver)

dz_new['status'] = status
dz_new['severity'] = severity
dz_new['issue_url'] = url
dz_new['version'] = version

In [None]:
# MAKE SURE NEW CATALOG IS LARGER THAN OLD
n_noqc_new, n_noqc_old = len(dz_new), len(dz_old)
print(n_noqc_new, n_noqc_old)
assert n_noqc_new > n_noqc_old

In [None]:
# A. Make new noQC catalog
#    5. save new noQC catalog and then upload to GC

# Date tag (YYYYMMDD) used in the name of the backup copy.
date = datetime.datetime.now().strftime("%Y%m%d")
print(date)

# Back up the previous catalog before it is overwritten. The exit status is
# checked so a missing backup is visible instead of silently ignored.
ret = os.system('cp csv/pangeo-cmip6-noQC.csv csv/pangeo-cmip6-'+date+'-noQC.csv')
if ret != 0:
    print('noQC backup copy not working')
dz_new.to_csv('csv/pangeo-cmip6-noQC.csv', mode='w+', index=False)

ret = os.system('/usr/bin/gsutil -m cp csv/pangeo-cmip6-noQC.csv gs://cmip6/cmip6-zarr-consolidated-stores-noQC.csv')
if ret != 0:
    print('noQC upload not working')

In [None]:
# TODO: is this still needed? Uses apply to create a new 'zarrp' column (zstore + 'v' + version).
#dfcat['zarrp'] = dfcat.apply(lambda row: row.zstore + 'v' + str(row.version), axis = 1) 
#dfcat.zarrp.values[:3]

In [None]:
#dz_exclude = pd.read_csv('csv/errata-files.csv')
#
#dz_exclude['zarrp'] = dz_exclude.apply(lambda row: 'gs://cmip6/' + \
#    row.file_id.split('CMIP6.')[-1][:-9].replace('.','/') + '/' + 'v' + str(row.version) , axis = 1)

In [None]:
# B. Make new standard catalog
#    1. eliminate harmless issues from dz_exclude

# Issue ids that are dropped from the exclusion list wholesale.
harmless_issue_uids = [
    'b6302400-3620-c8f1-999b-d192c0349084',
    '45f9e7b9-1844-7a92-8b54-10d954e621db',
    '4aa40e49-b2d4-0b29-a6b1-c80ee8dce11a',
    '2f6b5963-f87e-b2df-a5b0-2f12b6b68d32',
    '61fb170e-91bb-4c64-8f1d-6f5e342ee421',
    '90cac29b-eaff-c450-8621-ea31e305a40e',
    '8fbd8df5-c349-315b-9ec3-5a2f4ec4ec63',
    'ad5ca671-39d0-39ed-bf4f-6c8fb1a06047',
]
still_relevant = (
    (dz_exclude.status != 'resolved')
    & (dz_exclude.severity != 'low')
    & ~dz_exclude.issue_uid.isin(harmless_issue_uids)
)
dz_exclude = dz_exclude[still_relevant]

# Put datasets back as they are fixed in GCS
fixed_in_gcs = (
    (dz_exclude.issue_uid == 'eb69632c-a6e2-7667-a112-a98b7745e2ea')
    & (dz_exclude.member_id == 'r4i1p1f1')
    & (dz_exclude.experiment_id == 'ssp370')
    & (dz_exclude.table_id == 'Amon')
)
dz_exclude = dz_exclude[~fixed_in_gcs]

# Put back so that Diana can see them:
# daily pr / tasmin / tasmax of member r1i1p1f1 stay in the standard catalog.
daily_r1_fields = (
    (dz_exclude.member_id == 'r1i1p1f1')
    & dz_exclude.variable_id.isin(['pr', 'tasmin', 'tasmax'])
    & (dz_exclude.table_id == 'day')
)
dz_exclude = dz_exclude[~daily_r1_fields]

In [None]:
# B. Make new standard catalog
#    2. use this (smaller) version to eliminate the more serious issues from standard catalog

# Find zstores in both:
set_A = set(dz_GC.zstore.values)
set_B = set(dz_exclude.zstore.values)

in_both = sorted(set_A & set_B)
print(len(set_A),len(set_B),len(in_both))

# Flag every store that still has a serious issue. isin() hashes the values
# instead of scanning the `in_both` list once per row.
dfz = dz_GC.copy()
dfz['issue'] = dfz.zstore.isin(in_both)
dz_issues = dfz[dfz.issue]
dz_clean  = dfz[~dfz.issue]

# The standard catalog is the clean subset without the helper column.
# - drop(columns=...) replaces the positional axis argument `drop([...], 1)`,
#   which pandas 2.0 no longer accepts.
# - The former concat([dfz, dz_issues, dz_issues]).drop_duplicates(keep=False)
#   trick also removed every copy of a clean row that was listed twice;
#   here one copy of such a row is kept.
dz_orig = dz_clean.drop(columns=['issue']).drop_duplicates()

In [None]:
# MAKE SURE NEW CATALOG IS LARGER THAN OLD
dfcat = pd.read_csv(
    'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv'
)
n_standard_new, n_standard_old = len(dz_orig), len(dfcat)
print(n_standard_new, n_standard_old)
assert n_standard_new > n_standard_old

In [None]:
# B. Make new standard catalog
#    3. save new standard catalog and then upload to GC

# Date tag (YYYYMMDD) used in the name of the backup copy.
date = datetime.datetime.now().strftime("%Y%m%d")

# Back up the previous catalog before it is overwritten. The exit status is
# checked so a missing backup is visible instead of silently ignored.
ret = os.system('cp csv/pangeo-cmip6.csv csv/pangeo-cmip6-'+date+'.csv')
if ret != 0:
    print('standard catalog backup copy not working')
dz_orig.to_csv('csv/pangeo-cmip6.csv', mode='w+', index=False)

# The same file is published under two object names in the bucket.
ret = os.system('/usr/bin/gsutil -m cp csv/pangeo-cmip6.csv gs://cmip6/cmip6-zarr-consolidated-stores.csv')
if ret != 0:
    print('standard catalog upload not working')

ret = os.system('/usr/bin/gsutil -m cp csv/pangeo-cmip6.csv gs://cmip6/pangeo-cmip6.csv')
if ret != 0:
    print('duplicate standard catalog upload not working')
