### CMIP6 Data Issues 
1. Modelling centers use the ES-DOC errata pages to report issues with their published data:
      [ES-DOC ERRATA]( https://errata.es-doc.org )
2. A separate list of exceptions is kept as we process the data (concatenating netCDF files and saving them as Zarr stores):
      [ESGF to GCS Issues]( https://docs.google.com/spreadsheets/d/e/2PACX-1vRxKgz1xCH7zhUoDnl_llgEvbj2ssxoJiTUdbkHkkfWiCKU8EfZtPerar3ELjoIzAda5giR06QvbWGE/pubhtml?gid=128595157&single=true )
3. Issues with the existing Google Cloud collection are crowdsourced here:
      [GCS Issues]( https://tinyurl.com/y5cw76at )

### This notebook updates the list of processing exceptions described in item 2 above.

In [28]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
from datetime import datetime
from ast import literal_eval
import os
import numpy as np
import os
gspread.__version__

'3.1.0'

In [45]:
# Authorise a gspread client with a Google service-account key file.
# NOTE(review): the key file path is an absolute path on one machine; anyone
# else running this notebook has to point it at their own key file.
json_keyfile = '/home/naomi/cmip6-zarr/json/Pangeo Hackathon-e48a41b13c91.json'

# API scopes requested for the credentials (Sheets feeds and Drive).
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

credentials = ServiceAccountCredentials.from_json_keyfile_name(
    json_keyfile,
    scope,
)
gc = gspread.authorize(credentials)

In [46]:
# Open the spreadsheet by its title using the authorised client `gc`.
sheet_name = "CMIP6_DataExceptions (Responses)"
sh = gc.open(sheet_name)
# Show the worksheets it contains; the cells below work on "NH_additions".
print(sh.worksheets())

[<Worksheet 'Form Responses 1' id:252604281>, <Worksheet 'NH_additions' id:128595157>]


In [47]:
# Push rows that exist in the local CSV but not yet in the sheet.
wks = sh.worksheet("NH_additions")

# The first sheet row holds the column headers; everything after it is data.
data = wks.get_all_values()
headers = data[0]
data = data[1:]

# Both tables are expected to carry a `name` column as their last column.
df_cloud = pd.DataFrame(data, columns=headers)
df_local = pd.read_csv('csv/exceptions.csv', na_filter= False)

# add new from df_local
cloud_names = set(df_cloud.name)
for item, row in enumerate(df_local.values):
    name = row[-1]
    if name in cloud_names:
        continue
    print(item,name,' was not in df_cloud')
    wks.append_row(list(row))

296 CNRM-CM6-1-HR/ssp126/r1i1p1f2/Omon/uo/gn/noUse  was not in df_cloud


In [48]:
# Remove rows from the sheet that no longer exist in the local CSV.
wks = sh.worksheet("NH_additions")

# The first sheet row holds the column headers; everything after it is data.
data = wks.get_all_values()
headers = data.pop(0)

# Both tables are expected to carry a `name` column as their last column.
df_cloud = pd.DataFrame(data, columns=headers)
df_local = pd.read_csv('csv/exceptions.csv', na_filter= False)

# delete old from df
local_names = set(df_local.name)
stale_sheet_rows = []
for item,row in enumerate(df_cloud.values):
    name = row[-1]
    if name not in local_names:
        print(item,name,' is not in df_local')
        # Sheet rows are 1-based and row 1 is the header, hence the +2.
        stale_sheet_rows.append(item+2)

# Delete bottom-up. Each deletion shifts every row below it up by one, so
# deleting top-down with the precomputed numbers would remove the wrong rows
# as soon as more than one row is stale.
for sheet_row in reversed(stale_sheet_rows):
    wks.delete_row(sheet_row)

In [6]:
# Always raises AssertionError. NOTE(review): presumably a deliberate stop so that "Run All" halts before the cells below - confirm.
assert False 

AssertionError: 

In [58]:
# add a new local entry
# Today's date as YYYY-MM-DD. Uses the `datetime` class imported at the top of
# the notebook; re-importing the `datetime` module here would rebind that name
# for every other cell.
date = datetime.now().strftime("%Y-%m-%d")

store = 'CNRM-CM6-1/historical/r23i1p1f2/Omon/umo/gn/'
code = 'noUse'
reason = 'missing chunks of data'

# Split the store path into its components, with or without a trailing '/'.
facets = store.strip('/').split('/')

# Row layout: path components, reason code, reason text, date, then the
# components and code joined with '/' as the row's name.
row = facets + [code, reason, date, '/'.join(facets + [code])]
print(row)

['CNRM-CM6-1', 'historical', 'r23i1p1f2', 'Omon', 'umo', 'gn', 'noUse', 'missing chunks of data', '2020-03-02', 'CNRM-CM6-1/historical/r23i1p1f2/Omon/umo/gn/noUse']


In [59]:
# if it looks good, now check
# Build the extended table without touching df_local. pd.concat is used
# because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
new_entry = pd.DataFrame([row], columns=df_local.columns)
dfl = pd.concat([df_local, new_entry], ignore_index=True)
dfl.tail()

Unnamed: 0,source_id,experiment_id,member_id,table_id,variable_id,grid_label,reason_code,reason_txt,status,name
293,MRI-ESM2-0,ssp585,r1i1p1f1,CFmon,albisccp,gn,noUse,multiple time gaps,2020-02-10,MRI-ESM2-0/ssp585/r1i1p1f1/CFmon/albisccp/gn/n...
294,ACCESS-ESM1-5,piControl,r1i1p1f1,Omon,umo,gn,noUse,HDF error,2020-02-16,ACCESS-ESM1-5/piControl/r1i1p1f1/Omon/umo/gn/n...
295,CNRM-CM6-1,historical,r22i1p1f2,Omon,wo,gn,noUse,missing files,2020-03-01,CNRM-CM6-1/historical/r22i1p1f2/Omon/wo/gn/noUse
296,CNRM-CM6-1-HR,ssp126,r1i1p1f2,Omon,uo,gn,noUse,HDF errpr,2020-03-02,CNRM-CM6-1-HR/ssp126/r1i1p1f2/Omon/uo/gn/noUse
297,CNRM-CM6-1,historical,r23i1p1f2,Omon,umo,gn,noUse,missing chunks of data,2020-03-02,CNRM-CM6-1/historical/r23i1p1f2/Omon/umo/gn/noUse


In [60]:
# Write the extended table `dfl` to csv/exceptions.csv; index=False keeps the DataFrame index out of the file.
dfl.to_csv('csv/exceptions.csv', index=False)

In [27]:
# Always raises AssertionError. NOTE(review): presumably a deliberate stop so that "Run All" halts before the cells below - confirm.
assert False

AssertionError: 

In [None]:
# Scratch calls on the worksheet `wks`, all commented out (nothing in this cell executes).
#wks.update_acell('B1', 'Bingo!')
#wks.append_row(['junk','more_junk'])
#wks.add_rows(2)
#for row in df_local.values[:100]:
#    wks.append_row(list(row))
#wks.delete_row(7)
#wks.row_values(8)

In [18]:
# Compare the sheet copy (df_cloud) with the local CSV copy (df_local).
common_cols = list(df_cloud.columns)      # join on every column of df_cloud

# Rows that agree on all of those columns in both tables.
df12 = df_cloud.merge(df_local, how='inner', on=common_cols)

# Rows whose `name` has no fully matching row in the other table.
matched_names = df12['name']
df1 = df_cloud.loc[~df_cloud['name'].isin(matched_names)]
df2 = df_local.loc[~df_local['name'].isin(matched_names)]

len(df_cloud),len(df_local)

(282, 282)