# If a new version is available, download it and create a new zarr store. If that succeeds, then:
    0. save new zstore in zbtemp
    1. delete old version in GC
    2. delete entry in ncsv/GC_files_{activity_id}-{institution_id}.csv
    if saving local copy:
        3. delete old local copy(ies)
        4. delete entry(ies) in shelf-new/h*.csv
        5. copy zarr from zbtemp to zbdir
    6. upload to cloud
    7. fix version name in LOCAL COPY of noQC catalog (df_GCS)

### STILL BEING CODED - DO NOT USE ###

In [2]:
import pandas as pd
import gcsfs
import xarray as xr
from datetime import datetime
import itertools
import os
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [3]:
import myconfig
from myidentify import tracking2version
from mysets import all_search
from mydataset import id2dict_, dir2url_, dir2dict
from mytasks import Check, Download, ReadFiles, SaveAsZarr, Upload, Cleanup
from mysearch import esgf_search

In [None]:
# Connect directly to the Search Status Page (a Google Sheet) so the notebook can
# update it as it runs.
# Anyone can view: https://docs.google.com/spreadsheets/d/1yAt7604tVt7OXXZUyL2uALtGP2WVa-Pb5NMuTluFsAc/edit?usp=sharing

# Authorize gspread with a service-account key file (machine-specific path).
json_keyfile = '/home/naomi/json/CMIP6-d0cb1df722d1.json'
scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name(json_keyfile, scope)
gc = gspread.authorize(credentials)
sheet_name = "CMIP6_UpdateSchedule"
sh = gc.open(sheet_name)
wks = sh.worksheet("Searches")

# Read the column labels. The row passed to row_values is 3, not the first row.
# NOTE(review): the earlier comment said "first row" - confirm which row holds the labels.
columns = wks.row_values(3)
# list.index gives a 0-based position; later cells add 1 when calling update_cell.
# NOTE(review): later cells also use col_run, col_drive, col_dataset and col_status,
# none of which are defined in the visible source - confirm where they come from.
col_vupdate = columns.index('version update')

In [None]:
# Make a dataframe of the google sheet
#data = wks.get_all_values()
#headers = data.pop(0)
#pd.DataFrame(data, columns=headers)

In [None]:
# CONFIGURE ESGF Search here
# Settings for this run: where zarr stores are written locally, which search to
# perform, and which ESGF site to query.
node_pref = myconfig.node_pref
dtype = myconfig.dtype
hd = '/h116'
# The local target prefix is published through myconfig and also wrapped by
# dir2url_ into dir2local, which the task functions below receive.
myconfig.local_target_prefix = hd + '/naomi/zarr-minimal/'
dir2local = dir2url_(myconfig.local_target_prefix)

# Run switches, consumed by later cells:
#   clear_logs  - if a lock file already exists, remove this search's log/lock files instead of failing
#   update_ESGF - query ESGF again instead of re-reading csv/ESGF_{search}.csv
#   NewNeeded   - recompute the needed list instead of re-reading csv/needed_{search}.csv
clear_logs = True
update_ESGF = True
NewNeeded = True

search = 'Omon-1g'   # a key of all_search; any other label uses the custom search defined in a later cell
# Record today's date and the drive in this search's row of the status sheet.
search_row = wks.find(search).row
date = datetime.now().strftime('%Y-%m-%d')
# NOTE(review): col_run and col_drive are not defined anywhere in the visible
# source (only col_vupdate is) - confirm, otherwise these two calls raise NameError.
wks.update_cell(search_row, col_run + 1, date)
wks.update_cell(search_row, col_drive + 1, hd)

# ESGF site to query; the alternative is kept commented out for quick switching.
ESGF_site = dtype['llnl']
#ESGF_site = dtype['dkrz']

print('zarrs will be written to: ',myconfig.local_target_prefix)

In [None]:
# CONFIGURE GCS
# Anonymous, read-only file system handle plus the current noQC catalog of
# zarr stores, read with every column as text.
fs     = gcsfs.GCSFileSystem(token='anon', access='read_only',cache_timeout=-1)
df_GCS = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')

# Share both objects with the project modules by storing them on myconfig.
# NOTE(review): the earlier comment read "(except those)" without naming the
# exceptions - confirm which modules, if any, do not read these from myconfig.
myconfig.fs = fs
myconfig.df_GCS = df_GCS

In [None]:
# Pick the search facets: the pre-defined entry of all_search when `search`
# names one, otherwise the custom search written out below. In the custom
# search, choose keyword values to narrow the results; leaving a keyword out
# means it will find all values for it.
asearch = (
    all_search[search]
    if search in all_search
    else {
        'table_id': ['SImon'],
        'experiment_id': ['ssp119', 'ssp126'],
        'variable_id': ["sithick", "siconc", "sisnthick", "sisnconc"],
        # 'member_id': ['r1i1p1f1'],
        # 'source_id': ['CESM2-WACCM'],
    }
)

# Check whether ANOTHER notebook is already running the same search, using a
# per-search lock file in logs/.
lock_file = f'logs/{search}-vup.lock'
if os.path.exists(lock_file):
    if not clear_logs:
        # Raised explicitly rather than via `assert False`, which is stripped
        # when Python runs with -O and would let this run overwrite the lock.
        raise AssertionError(f'lockfile {lock_file} exists')
    # clear_logs is set: remove every file of this search matching the pattern
    # below (the stale lock included) and carry on.
    command = f'/bin/rm logs/*{search}-vup*'
    print(command)
    os.system(command)

# (Re)create the lock file with a start stamp; the context manager guarantees
# the handle is closed even if the write fails.
date = datetime.now().strftime("%Y%m%d-%H%M")
with open(lock_file, 'w') as f:
    f.write(f'started {search} at {date}')

# cell output: show which search is being run
search, asearch

In [None]:
# Build df_ESGF, the dataframe of ESGF search results, either by querying ESGF
# now or by re-reading the csv written by an earlier run.

if not update_ESGF:
    df_ESGF = pd.read_csv(f'csv/ESGF_{search}.csv', dtype='unicode')
else:
    # One query per combination of the requested keyword values.
    x = list(asearch.values())
    searches = list(itertools.product(*x))

    dESGF = []
    for s in searches:
        dsearch = dict(zip(asearch.keys(), s))
        print(dsearch)
        df = esgf_search(dsearch, server=ESGF_site)
        # keep only the queries that returned at least one row
        if len(df) > 0:
            dESGF.append(df)

    df_ESGF = pd.concat(dESGF)
    df_ESGF.to_csv(f'csv/ESGF_{search}.csv', index=False)

# cell output: number of rows and number of distinct dataset directories
len(df_ESGF), len(df_ESGF.ds_dir.unique())

In [None]:
# Make the dataframe of everything that is needed: the ESGF rows that have no
# matching row in the GCS catalog. The result is cached in
# csv/needed_{search}.csv so a later run can skip the comparison.

if NewNeeded:
    # With no `on=` argument, merge joins on the columns the two frames share;
    # indicator=True adds a `_merge` column saying which side each row came from.
    df = pd.merge(df_ESGF, df_GCS, how='outer', indicator=True)
    df_needed = df[df._merge == 'left_only']

    # Keep only the columns that came from the ESGF search.
    keep_keys = df_ESGF.keys()
    all_keys = df.keys()
    drop_keys = list(set(all_keys) - set(keep_keys))
    # `columns=` replaces the positional axis argument, which pandas 2.0 no
    # longer accepts. The explicit copy makes the column assignment below write
    # to an independent frame rather than to a slice of `df`.
    df_needed = df_needed.drop(columns=drop_keys).copy()

    num_stores = 0
    if len(df_needed) > 0:
        num_stores = df_needed.ds_dir.nunique()
        print(f'needed: nfiles={len(df_needed)}, nstores={num_stores}')
    else:
        print('no new data available')
        # A bare `exit` used to follow here. As a plain expression it did
        # nothing, so it was removed. Execution still continues with an empty
        # frame, and the remaining cells then have no datasets to process.

    # Sort by the integer realization number parsed from member_id
    # (the digits between the last 'r' and the following 'i').
    df_needed['member'] = [int(s.split('r')[-1].split('i')[0]) for s in df_needed['member_id']]
    df_needed = df_needed.sort_values(by=['member'])
    #df_needed['zsize'] = [df_needed[df_needed.ds_dir==zs]['file_size'].sum() for zs in df_needed['ds_dir']]
    #df_needed = df_needed.sort_values(by=['zsize'])

    df_needed.to_csv(f'csv/needed_{search}.csv', index=False)
else:
    df_needed = pd.read_csv(f'csv/needed_{search}.csv')

# Summary: number of distinct dataset directories per variable and per experiment.
print('Variables')
for var in df_needed.variable_id.unique():
    print(var, df_needed[df_needed.variable_id == var].ds_dir.nunique())

print('\nExperiments')
for exp in df_needed.experiment_id.unique():
    print(exp, df_needed[df_needed.experiment_id == exp].ds_dir.nunique())

In [None]:
# Publish the needed-files table on myconfig so the project modules can read it.
myconfig.df_needed = df_needed

In [None]:
# The distinct dataset directories to process, and how many there are.
ds_dirs = df_needed['ds_dir'].unique()
numdsets = len(ds_dirs)

In [None]:
# Per-search log files. `logs` maps a task's non-zero ierr code to the file its
# message is appended to: 1 -> progress, 2 -> failure, 3 -> success.
progress_log  = f'logs/progress_{search}-vup.log'
failure_log  = f'logs/failure_{search}-vup.log'
success_log  = f'logs/success_{search}-vup.log'
logs = {1:progress_log, 2:failure_log, 3:success_log}

### ierr:
- 0 : proceed with next task
- 1 : write to progress_log, go to next dataset (finished or try again)
- 2 : write to failure_log,  go to next dataset (mark as un-usable - do not try again until problem is solved) 
- 3 : write to success_log,  go to next dataset (dataset added to cloud)

In [None]:
def write_log(file, str, verbose=True):
    """Append one message line to a log file, optionally echoing it.

    Parameters
    ----------
    file : path of the log file; it is opened in append mode.
    str : the message. It is formatted into an f-string, so any object is
        accepted. The parameter name shadows the builtin but is kept so
        existing keyword callers keep working.
    verbose : when true, the message is also printed before it is written.

    Returns
    -------
    None
    """
    # The context manager closes the handle even if print or write raises.
    with open(file, 'a') as f:
        if verbose:
            print(str)
        f.write(f'{str}\n')

In [None]:
# Reload the catalog so this cell starts from the current state of the cloud.
# NOTE(review): this only rebinds the notebook name df_GCS; myconfig.df_GCS
# still refers to the frame loaded in the GCS configuration cell. Confirm
# whether the task modules should see the refreshed catalog.
df_GCS = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores-noQC.csv', dtype='unicode')

# Refresh the gcsfs handle by dropping its cached listings.
fs.invalidate_cache()

# Process every needed dataset directory in turn: Check -> Download ->
# ReadFiles -> SaveAsZarr -> Upload -> Cleanup. Each task returns an ierr code;
# a non-zero code selects the log file to write to and ends work on that dataset.
# NOTE(review): col_dataset and col_status are not defined anywhere in the
# visible source - confirm, otherwise the sheet updates below raise NameError.
for item, ds_dir in enumerate(ds_dirs):

    print(f'\n{item}/{numdsets-1}', ds_dir)

    # Datasets whose ds_dir contains `skip` are passed over; 'none' is
    # presumably a value that matches no ds_dir (verify).
    #skip = 'EC-Earth3'
    skip = 'none'
    if skip in ds_dir:
        write_log(progress_log, f'{ds_dir} skipping {skip}')
        continue

    ierr, exc = Check(ds_dir, dir2local)
    if ierr > 0:
        write_log(logs[ierr], f'{ds_dir} {exc}')
        continue

    gfiles, ierr, exc = Download(ds_dir)
    if ierr > 0:
        write_log(logs[ierr], f'{ds_dir}, {exc}')
        continue

    # show on the status sheet which dataset is being worked on
    search_row = wks.find(search).row
    wks.update_cell(search_row, col_dataset + 1, ds_dir)

    ds, ierr, exc = ReadFiles(ds_dir, gfiles, dir2dict)
    if ierr > 0:
        write_log(logs[ierr], f'{ds_dir}, {exc}')
        continue

    ierr, exc = SaveAsZarr(ds_dir, ds, dir2local)
    if ierr > 0:
        write_log(logs[ierr], f'{ds_dir}, {exc}')
        continue

    ierr, exc = Upload(ds_dir, dir2local)
    if ierr > 0:
        write_log(logs[ierr], f'{ds_dir}, {exc}')
        continue

    ierr, exc = Cleanup(ds_dir, gfiles, dir2local)
    if ierr > 0:
        write_log(logs[ierr], f'{ds_dir}, {exc}')
        continue

    # every task succeeded: record progress on the sheet and in the success log
    date = datetime.now().strftime('%H:%M, %b%d')
    status_str = f'{item+1} of {numdsets} at {date}'
    search_row = wks.find(search).row
    wks.update_cell(search_row, col_status + 1, status_str)

    write_log(success_log, f'{dir2local(ds_dir)} saved to GCS')

# KNOWN ISSUES:

### 1. Identical datasets with different version_ids
- some new versions are IDENTICAL to old - same checksums, tracking_ids, etc. 
- WHAT TO DO???? Need to prevent these datasets from being updated again and again

skip_stores = ['gs://cmip6/CMIP/E3SM-Project/E3SM-1-1-ECA/historical/r1i1p1f1/Amon/prc/gr/'] # new version same as old

skip_stores += ['gs://cmip6/CMIP/E3SM-Project/E3SM-1-1-ECA/historical/r1i1p1f1/Amon/pr/gr/'] # new version same as old

### 2. Retracted Dataset Issue
- the ESGF API and web search pages give different results!!!
- skipping due to multiple esgf versions: 

Example:  /CMIP/CCCma/CanESM5/historical/r1i1p1f1/Amon/pr/gn/ ['v20190306' 'v20190429']

- The API returns both versions (version=latest DOES NOT WORK if a version has been retracted!!!), but ESGF search page only has one if other has been retracted

```
    1.	
    CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.pr.gn
    IMPORTANT: this dataset has been retracted and is no longer available for download
    Data Node: crd-esgf-drc.ec.gc.ca
    Version: 20190306
    Full Dataset Services:   [ Show Metadata ]
    2.	
    CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.pr.gn
    Data Node: crd-esgf-drc.ec.gc.ca
    Version: 20190429
    Total Number of Files (for all variables): 1
    Full Dataset Services:  	[ Show Metadata ]   [ List Files ]   [ WGET Script ]   [ LAS ]   [ Show Citation ]   [ PID ] [ Further Info ]
```



### CloudCat will read newversions.csv and update the version ids
- if newversions.csv gets large, just remove all but first line after running MakeCloudCat