In [1]:
import datetime
import fnmatch
import json
import os
import shutil
import subprocess
import urllib.request, json
from glob import glob

import pandas as pd
#import qgrid

## Check the ES-DOC errata pages for updates
- creates 'csv/errata-files.csv', which is used to build the updated catalog (MakeCloudCat.ipynb)
- ES-DOC errata client installation: https://es-doc.github.io/esdoc-errata-client/installation.html
- many questions remain:
  - does status=resolved mean the affected files have been REPLACED? If so, under the same version?
  - what do the various severity levels really mean? seems to be different for different models
  - could make a list of changed/new issues
  - could go through all 200+ issues and evaluate - feasible? make new boolean column 'include'
  - how to deal more efficiently with issues that affect >1,000 files, etc.

In [4]:
# ORIGINAL INEFFICIENT METHOD - since  `esgissue retrieve` still does not work
# Retrieve every issue listed in issues.txt one at a time. Each useful line of
# issues.txt is expected to hold the issue id between double quotes.
cwd = os.getcwd()
with open('issues.txt', 'r') as f:
    for line in f:
        quoted_fields = line.strip().split('"')
        if len(quoted_fields) < 2:
            # blank line or line without a quoted id: nothing to retrieve
            continue
        issue_id = quoted_fields[1]
        # Argument list (no shell): paths containing spaces and odd characters
        # in issue_id cannot break or alter the command.
        command = ['esgissue', 'retrieve',
                   '-i', f'{cwd}/issues_new',
                   '-d', f'{cwd}/dsets_new',
                   '--id', issue_id]
        # Raises FileNotFoundError right away if `esgissue` is not on PATH.
        result = subprocess.run(command)
        if result.returncode != 0:
            # keep going with the remaining ids, but do not hide the failure
            print('esgissue retrieve failed for', issue_id, '- exit status', result.returncode)

In [17]:
# Open question: should the contents of the issues and dsets directories be
# deleted before retrieving?
#   os.system('rm -rf issues dsets')
# Bulk-retrieve variants, kept for reference only (not executed):
#   command = '/usr/local/python/anaconda3/envs/pangeo-Oct2019/bin/esgissue retrieve -i $PWD/issues -d $PWD/dsets'
#   command = 'esgissue retrieve -i $PWD/issues -d $PWD/dsets'
#   os.system(command)

# Every retrieved issue is stored as issues_new/issue_<uid>.json; recover the
# uid from each file name.
ifiles = glob('issues_new/issue*.json')
issues = []
for issue_path in ifiles:
    after_prefix = issue_path.split('issue_')[1]
    issues.append(after_prefix.split('.json')[0])

# number of issues found and a short sample of their uids
len(issues),issues[:4]

(305,
 ['ee4e0a5c-d1e1-a67d-8640-722e761e7da8',
  '99ffcdca-17dd-6843-9c42-511e28dce445',
  '97c484f8-293f-95de-adb9-854b365c4951',
  '5c00b84c-4622-689d-9a8d-e49d1a4acff6'])

In [None]:
# Build the issue table `df`: one row per entry of `issues`, holding the fields
# of issues_new/issue_<uid>.json plus a `file_ids` column with the dataset ids
# read from the matching dsets_new/dset_<uid>.txt file.
base_columns = ['uid', 'title', 'description', 'project', 'severity', 'status','urls']
issue_records = []   # one dict per issue; the frame is built once after the loop
df_list = []         # df_list[i] holds the dataset ids listed for issues[i]
for issue in issues:
    file_dsets = 'dsets_new/dset_'+issue+'.txt'
    if not os.path.isfile(file_dsets):
        # no dataset list was retrieved for this issue: use the placeholder file
        file_dsets = 'dsets_new/dset_none.txt'

    file_issue = 'issues_new/issue_'+issue+'.json'
    with open(file_issue) as json_file:
        dict_issue = json.load(json_file)

    # issues without a 'urls' entry get an empty list
    dict_issue.setdefault('urls', [])
    issue_records.append(dict_issue)

    # the first whitespace-delimited field of every line is taken as the id
    df_dsets = pd.read_csv(file_dsets, sep=r'\s+', header=None)
    df_list.append(list(df_dsets[0].values))

# DataFrame.append was removed in pandas 2.0 (and re-copied the frame on every
# call), so the rows are assembled in a single step instead.
df = pd.DataFrame(issue_records)
# Expected columns first (present even when there are no issues), followed by
# any additional JSON keys in order of first appearance.
extra_columns = [col for col in df.columns if col not in base_columns]
df = df.reindex(columns=base_columns + extra_columns)

df['file_ids'] = df_list
df = df.rename(columns={"uid": "issue_uid"})

In [10]:
# Write the issue table to disk (without the index), then show how many
# distinct issue uids it contains.
issues_csv = 'csv/ES-DOC_issues.csv'
df.to_csv(issues_csv, index=False)
df['issue_uid'].nunique()

305

In [12]:
# Expand the issue table into one row per affected dataset id, splitting each
# id into its facets. The result, `df_expand0`, keeps only the first row seen
# for any given file_id.
keywords = ['issue_uid','source_id', 'experiment_id', 'member_id', 'table_id', 'variable_id', 'grid_label', 'version', 'file_id','status','severity','issue_url']


def _parse_dataset_id(dataset_id):
    """Split one dataset id into a dict of facets.

    The id must consist of exactly nine dot-separated fields, the last of
    which has the form ``<grid_label>#<version>``.

    Raises:
        ValueError: if the id does not have that shape.
    """
    (_prefix, _activity_id, _institution_id, source_id, experiment_id,
     member_id, table_id, variable_id, grid_version) = dataset_id.split('.')
    grid_label, version = grid_version.split('#')
    return {
        'source_id': source_id,
        'experiment_id': experiment_id,
        'member_id': member_id,
        'table_id': table_id,
        'variable_id': variable_id,
        'grid_label': grid_label,
        'version': version,
        'file_id': dataset_id,
    }


def _expand_entry(entry):
    """Return the facet dicts for one entry of a dataset list.

    An entry is normally a single dataset id. One issue has a dataset list in
    which two ids ended up fused on one line; such entries are split at every
    'CMIP6' prefix (any number of fused ids is accepted).

    Raises:
        ValueError: if neither the entry nor its pieces can be parsed.
    """
    try:
        return [_parse_dataset_id(entry)]
    except ValueError:
        pieces = ['CMIP6' + tail for tail in entry.split('CMIP6')[1:]]
        if len(pieces) < 2:
            raise
        return [_parse_dataset_id(piece) for piece in pieces]


df_all = []   # one frame per issue that has at least one parsed dataset id
for index, row in df.iterrows():
    file_id = row['file_ids']
    issue = row['issue_uid']
    issue_fields = {
        'issue_uid': issue,
        'status': row['status'],
        'severity': row['severity'],
        'issue_url': "https://errata.es-doc.org/static/view.html?uid="+issue,
    }
    print(index, ': issue uid:', issue, ', number of affected files:',len(file_id))

    # Collect plain dicts and build the frame once per issue. Appending the
    # growing frame to df_all after every file duplicated rows quadratically,
    # and DataFrame.append no longer exists in pandas 2.0.
    records = []
    for entry in file_id:
        try:
            parsed = _expand_entry(entry)
        except (ValueError, AttributeError):
            # not a dataset id (wrong shape, or not a string at all): report it
            # instead of aborting the whole expansion
            print('  skipping unparseable dataset id:', repr(entry))
            continue
        records.extend({**facets, **issue_fields} for facets in parsed)
    if records:
        df_all.append(pd.DataFrame(records, columns=keywords))

if df_all:
    df_expand0 = pd.concat(df_all,sort=False).drop_duplicates(subset =["file_id"])
else:
    # pd.concat raises on an empty list; return an empty, correctly shaped table
    df_expand0 = pd.DataFrame(columns=keywords)

0 : issue uid: ee4e0a5c-d1e1-a67d-8640-722e761e7da8 , number of affected files: 158
1 : issue uid: 99ffcdca-17dd-6843-9c42-511e28dce445 , number of affected files: 50
2 : issue uid: 97c484f8-293f-95de-adb9-854b365c4951 , number of affected files: 112
3 : issue uid: 5c00b84c-4622-689d-9a8d-e49d1a4acff6 , number of affected files: 127
4 : issue uid: 780ffab8-ce57-e486-8994-32c5be093bff , number of affected files: 89
5 : issue uid: 4e6fbba4-6f27-9c2f-906e-ef699bb68dab , number of affected files: 10
6 : issue uid: dca706d0-3a8a-a707-b350-a939122f9132 , number of affected files: 4
7 : issue uid: c628a0f9-0f74-942c-9583-dacbcd306469 , number of affected files: 4
8 : issue uid: f345199d-e417-98fe-a79d-e2f2fa8b0cd5 , number of affected files: 1
9 : issue uid: 750a143a-5131-5631-a0ca-589273c817a1 , number of affected files: 6
10 : issue uid: b0e9b074-425e-0f90-9f98-bbf3be5913ea , number of affected files: 19
11 : issue uid: d9ab69ba-2cfd-40a0-b3f2-0271f6cee3ff , number of affected files: 14
12 

In [15]:
# Merge the expanded per-file tables and keep the first row for each file_id.
# NOTE(review): no visible cell of this notebook creates `df_expand1`, so a
# fresh Restart & Run All would stop here with a NameError. It is included only
# when it already exists in the kernel - confirm where it is meant to come from.
frames = [df_expand0]
if 'df_expand1' in globals():
    frames.append(df_expand1)
df_expand = pd.concat(frames,sort=False).drop_duplicates(subset =["file_id"])

# Number of affected files per source_id. Only the last expression of a cell is
# rendered, so the table is bound to a name for inspection in its own cell.
files_per_source = df_expand[['source_id','file_id']].groupby(['source_id']).count()
df_expand.head()

Unnamed: 0,issue_uid,source_id,experiment_id,member_id,table_id,variable_id,grid_label,version,file_id,status,severity,issue_url
0,ee4e0a5c-d1e1-a67d-8640-722e761e7da8,GISS-E2-1-G,hist-nat,r2i1p1f1,AERmon,bldep,gn,20180906,CMIP6.DAMIP.NASA-GISS.GISS-E2-1-G.hist-nat.r2i...,new,high,https://errata.es-doc.org/static/view.html?uid...
1,ee4e0a5c-d1e1-a67d-8640-722e761e7da8,GISS-E2-1-G,hist-nat,r2i1p1f1,AERmon,cltc,gn,20180906,CMIP6.DAMIP.NASA-GISS.GISS-E2-1-G.hist-nat.r2i...,new,high,https://errata.es-doc.org/static/view.html?uid...
2,ee4e0a5c-d1e1-a67d-8640-722e761e7da8,GISS-E2-1-G,hist-nat,r2i1p1f1,AERmon,cod,gn,20180906,CMIP6.DAMIP.NASA-GISS.GISS-E2-1-G.hist-nat.r2i...,new,high,https://errata.es-doc.org/static/view.html?uid...
3,ee4e0a5c-d1e1-a67d-8640-722e761e7da8,GISS-E2-1-G,hist-nat,r2i1p1f1,AERmon,ptp,gn,20180906,CMIP6.DAMIP.NASA-GISS.GISS-E2-1-G.hist-nat.r2i...,new,high,https://errata.es-doc.org/static/view.html?uid...
4,ee4e0a5c-d1e1-a67d-8640-722e761e7da8,GISS-E2-1-G,hist-nat,r2i1p1f1,AERmon,tatp,gn,20180906,CMIP6.DAMIP.NASA-GISS.GISS-E2-1-G.hist-nat.r2i...,new,high,https://errata.es-doc.org/static/view.html?uid...


In [16]:
# Keep a dated copy of the previous errata file list, then overwrite it with
# the freshly expanded table.
date = datetime.datetime.now().strftime("%Y%m%d")
errata_csv = 'csv/errata-files.csv'
os.makedirs('csv', exist_ok=True)   # first run: the output directory may not exist yet
if os.path.isfile(errata_csv):
    # shutil instead of shelling out to `cp`: works on every platform and
    # raises if the copy fails instead of silently continuing
    shutil.copyfile(errata_csv, 'csv/errata-files-'+date+'.csv')
df_expand.to_csv(errata_csv, index=False)