In [None]:
import os
import pandas as pd
from glob import glob
import json
import fnmatch
#import qgrid
import urllib.request, json
import datetime

## Check the ES-DOC errata pages for updates
- make 'csv/errata-files.csv', which is used to build the updated catalog (nb2-NewCloudCat.ipynb)
- many questions remain:
  - does status=resolved mean these files have been REPLACED? same version?
  - what do the various severity levels really mean? seems to be different for different models
  - could make a list of changed/new issues
  - could go through all 200+ issues and evaluate - feasible? make new boolean column 'include'
  - how to deal more efficiently with issues that affect >1,000 files, etc.

In [None]:
# Keep a date-stamped copy of the previously downloaded issue list, then
# fetch the current full list of errata issues from the ES-DOC errata service.
date = datetime.datetime.now().strftime("%Y%m%d")
save_file = f'catalogs/retrieve-{date}.json'
os.system(f'cp catalogs/retrieve.json {save_file}')

# wget overwrites catalogs/retrieve.json with the fresh download.
os.system(
    'wget --output-document=catalogs/retrieve.json '
    'https://errata.es-doc.org/1/issue/retrieve-all'
)

In [None]:
# Read the date-stamped snapshot (previous download) and the fresh download,
# then print both issue counts so a change in the number of issues is visible.
with open(save_file) as old_handle:
    esdoc_data_old = json.loads(old_handle.read())
with open('catalogs/retrieve.json') as new_handle:
    esdoc_data = json.loads(new_handle.read())
print(*(listing['count'] for listing in (esdoc_data_old, esdoc_data)))

In [None]:
# Collect the uid of every listed issue and run the esgissue client once per
# issue; it is asked to write into ./issues and ./dsets (the -i / -d options).
# NOTE(review): uid comes from downloaded data and is interpolated into a
# shell command - confirm the source is trusted.
issues = []
for issue in range(esdoc_data['count']):
    uid = esdoc_data['issues'][issue]['uid']
    issues.append(uid)
    command = f'/usr/bin/esgissue retrieve -i $PWD/issues -d $PWD/dsets --id {uid}'
    print(issue,command)
    os.system(command)

In [None]:
# Build one row per errata issue from the files retrieved for each uid:
#   issues/issue_<uid>.json - issue metadata (loaded as a dict)
#   dsets/dset_<uid>.txt    - whitespace-delimited list of affected dataset ids
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
base_columns = ['uid', 'title', 'description', 'project', 'severity', 'status','urls']
issue_records = []
df_list = []
for issue in issues:
    file_dsets = 'dsets/dset_'+issue+'.txt'
    if not os.path.isfile(file_dsets):
        # No dataset list for this issue: fall back to the placeholder list,
        # which must exist on disk (its content is not visible from here).
        file_dsets = 'dsets/dset_none.txt'

    file_issue = 'issues/issue_'+issue+'.json'

    with open(file_issue) as json_file:
        dict_issue = json.load(json_file)

    # Some issues carry no 'urls' entry; give them an empty list.
    dict_issue.setdefault('urls', [])
    issue_records.append(dict_issue)

    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    df_dsets = pd.read_csv(file_dsets, sep=r'\s+', header=None)
    # Only the first column holds the dataset ids.
    df_list.append(list(df_dsets[0].values))

df = pd.DataFrame(issue_records)
# Keep the expected columns first (created even if no issue provides them),
# followed by any extra keys in first-seen order.
extra_columns = [col for col in df.columns if col not in base_columns]
df = df.reindex(columns=base_columns + extra_columns)

df['file_ids'] = df_list
df = df.rename(columns={"uid": "issue_uid"})

In [None]:
# Write the per-issue table to disk (without the index) and preview its last rows.
df.to_csv(path_or_buf='csv/ES-DOC_issues.csv', index=False)
df.tail(n=5)

In [None]:
# Expand the per-issue table into one row per affected dataset id.
# Rows are collected as dicts and the frame is built once. The previous
# version appended the growing per-issue frame to a list on every file
# (quadratic time and memory) and used DataFrame.append (removed in pandas 2.0).
keywords = ['issue_uid','source_id', 'experiment_id', 'member_id', 'table_id', 'variable_id', 'grid_label', 'version', 'file_id','status','severity','issue_url']


def _dataset_row(file_id, issue, status, severity, issue_url):
    """Split one dataset id into its facets and return a row dict.

    The id must have nine dot-separated fields, the last one of the form
    ``<grid_label>#<version>``.

    Raises:
        ValueError: if the id does not split into the expected fields.
    """
    [fill,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_version] = file_id.split('.')
    [grid_label,version] = grid_version.split('#')
    return {
        'issue_uid': issue,
        'source_id': source_id,
        'experiment_id': experiment_id,
        'member_id': member_id,
        'table_id': table_id,
        'variable_id': variable_id,
        'grid_label': grid_label,
        'version': version,
        'file_id': file_id,
        'status': status,
        'severity': severity,
        'issue_url': issue_url,
    }


expand_records = []
for index, row in df.iterrows():
    file_id = row['file_ids']
    issue = row['issue_uid']
    status = row['status']
    severity = row['severity']
    issue_url = "https://errata.es-doc.org/static/view.html?uid="+issue
    print(index, ': issue uid:', issue, ', number of affected files:',len(file_id))
    for file in file_id:
        try:
            expand_records.append(_dataset_row(file, issue, status, severity, issue_url))
        except ValueError:   # one issue has a list of datasets with two lines that need to be split apart
            # Two ids run together: split on the 'CMIP6' prefix and restore it.
            [s1,s2,s3]=file.split('CMIP6')
            for f in ('CMIP6'+s2,'CMIP6'+s3):
                expand_records.append(_dataset_row(f, issue, status, severity, issue_url))

# A dataset listed by several issues is kept once, for the first issue listing it.
# Passing columns= keeps the expected columns even when no dataset is listed.
df_expand = pd.DataFrame(expand_records, columns=keywords).drop_duplicates(subset =["file_id"])

In [None]:
# Number of affected dataset ids per source_id.
df_expand.groupby('source_id')[['file_id']].count()

In [None]:
import datetime

# Copy the existing errata CSV to a date-stamped name, then overwrite it
# with the freshly expanded table (index column omitted).
date = datetime.datetime.now().strftime("%Y%m%d")
os.system(f'cp csv/errata-files.csv csv/errata-files-{date}.csv')
df_expand.to_csv(path_or_buf='csv/errata-files.csv', mode='w+', index=False)

In [None]:
# Deliberate stop: this assertion always fails and raises AssertionError.
# Presumably it is here so that running all cells halts before the
# experimental cells that follow.
assert False
# rest of notebook is just for testing

In [None]:
# Make a dataframe to pass to nb1-ReplaceFixed.ipynb - will check 'resolved' tracking_ids and replace if needed

# .copy() makes df_fixed an independent frame, so adding the 'zstore' column
# below does not trigger SettingWithCopyWarning on a slice of df_expand.
df_fixed = df_expand[df_expand.status=='resolved'].copy()
# zstore path: drop everything up to 'CMIP6.', drop the last 9 characters
# (presumably '#' plus an 8-digit version - TODO confirm), turn dots into slashes.
df_fixed['zstore'] = ['gs://cmip6/' + s.split('CMIP6.')[-1][:-9].replace('.','/') + '/' for s in df_fixed.file_id]

zstores= df_fixed.zstore.unique()
# A bare expression in the middle of a cell is never displayed, so print the count.
print(len(zstores))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
try:
    # qgrid is not used in this cell; do not fail the cell when it is not installed.
    import qgrid
except ImportError:
    pass
df_fixed.groupby(['source_id','table_id']).count()

In [None]:
# get dictionary of our existing zstores which may need to be updated
import pickle

# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
with open('data/dict.pickle', mode='rb') as handle:
    ldict = pickle.load(handle)
len(ldict)

In [None]:
# Keep only the zstores of resolved issues that are present in ldict.
lfix = {}
for zstore in zstores:
    try:
        lfix[zstore] = ldict[zstore]
    except KeyError:
        # Not in ldict: skip it. Only the missing-key case is caught,
        # so any other error still surfaces.
        continue

In [None]:
# Build the table of zstores to fix: one row per zstore in lfix, with the
# facets recovered from the zstore path and the uid of the issue that listed it.
# Rows are collected and the frame is built once (DataFrame.append was removed
# in pandas 2.0).
keywords = ['activity_id','institution_id','source_id', 'experiment_id', 'member_id', 'table_id', 
            'variable_id', 'grid_label', 'zstore', 'issue_uid']

# First issue uid per zstore, in df_fixed row order. This is the same row the
# per-zstore query used to pick, without rescanning df_fixed for every zstore.
issue_by_zstore = df_fixed.drop_duplicates(subset='zstore').set_index('zstore')['issue_uid']

fix_records = []
for zstore in lfix:
    issue = issue_by_zstore.loc[zstore]
    # Strip the bucket prefix and the trailing '/', leaving the eight path facets.
    zarr = zstore.split('gs://cmip6/')[-1][:-1]
    [activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label] = zarr.split('/')
    klist = [activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,issue]
    fix_records.append(dict(zip(keywords, klist)))

# columns= keeps the expected columns even when lfix is empty.
dfs = pd.DataFrame(fix_records, columns=keywords)

In [None]:
# Copy the existing to_fix CSV to a date-stamped name, then overwrite it
# with the current table (index column omitted).
date = datetime.datetime.now().strftime("%Y%m%d")
os.system(f'cp csv/to_fix.csv csv/to_fix-{date}.csv')
dfs.to_csv(path_or_buf='csv/to_fix.csv', index=False)

In [None]:
# Display the dfs table as the cell output.
dfs
