In [1]:
import ast
import os
import subprocess
from datetime import datetime

import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials

def set_bnds_as_coords(ds):
    """Promote every bounds-like data variable of ``ds`` to a coordinate.

    A data variable counts as bounds-like when its name contains the
    substring ``'bnds'`` or ``'bounds'``.  The selected names are handed to
    ``ds.set_coords`` and whatever that call returns is returned here.
    """
    bounds_like = []
    for name in ds.data_vars:
        if 'bnds' in name or 'bounds' in name:
            bounds_like.append(name)
    return ds.set_coords(bounds_like)

def read_codes(zarr):
    """Collect the exception reason codes that apply to one zarr store.

    ``zarr`` is a '/'-separated string with exactly six parts, in the order
    source_id/experiment_id/member_id/table_id/variable_id/grid_label.
    Every row of 'csv/exceptions.csv' is treated as a rule; a rule applies
    when each of those six columns either equals the matching part of
    ``zarr`` or holds the wildcard 'all'.  The ``reason_code`` of each
    applicable rule is printed and collected.

    Returns the list of matching reason codes in file order (possibly empty).
    Raises ValueError when ``zarr`` does not split into exactly six parts.
    """
    rules = pd.read_csv('csv/exceptions.csv', skipinitialspace=True)

    # Fixed-length unpacking doubles as the "exactly six parts" check.
    (source_id, experiment_id, member_id,
     table_id, variable_id, grid_label) = zarr.split('/')
    wanted = {
        'source_id': source_id,
        'experiment_id': experiment_id,
        'member_id': member_id,
        'table_id': table_id,
        'variable_id': variable_id,
        'grid_label': grid_label,
    }

    codes = []
    for row in rules.values:
        rule = dict(zip(rules.keys(), row))
        # all() stops at the first facet that fails, in the same order the
        # facets are listed above.
        applies = all(
            rule[facet] == value or rule[facet] == 'all'
            for facet, value in wanted.items()
        )
        if applies:
            codes += [rule['reason_code']]
            print('special treatment needed:', rule['reason_code'])
    return codes

def _file_size(path):
    """Return the size of ``path`` in bytes, or -1 when it cannot be read."""
    try:
        return os.path.getsize(path)
    except OSError:
        return -1


def _curl(url, dest):
    """Run ``curl url -o dest`` without a shell; a launch failure is only reported."""
    try:
        # An argument list keeps '&', '?', spaces etc. in the URL away from a shell.
        subprocess.run(['curl', url, '-o', dest])
    except OSError as err:
        print('could not run curl:', err)


def get_new(ds, skip_sites, okay, trouble, zarr_name=None):
    """Make sure every netCDF file of one zarr store is present in 'nctemp'.

    Parameters
    ----------
    ds : pandas.DataFrame
        File catalog; the columns read here are ``zarr_name``, ``file_name``,
        ``size`` and ``HTTPServer_url``.
    skip_sites : iterable of str
        Substrings of download URLs; a URL containing any of them aborts the
        attempt for this store.
    okay : bool
        Initial go/no-go flag.  When false nothing is checked or downloaded.
    trouble : dict
        Updated in place: ``trouble[<store name>]`` receives a short message
        when the attempt is aborted.
    zarr_name : str, optional
        Name of the store to process.  When omitted, the module-level
        variable ``zarr`` is used, which is what the original code relied on.

    Returns
    -------
    list of str
        Paths ('nctemp/<file_name>') of the files confirmed so far.  After an
        abort the list holds only the files handled before the failure; check
        ``trouble`` to tell the two outcomes apart.
    """
    target = zarr if zarr_name is None else zarr_name
    tmp = 'nctemp'
    # A file already on disk is reused if its size is within this many bytes
    # of the catalog size; a fresh download must match exactly.
    reuse_tolerance = 1000

    files = ds[ds.zarr_name == target].file_name.unique()
    gfiles = []
    for file in files:
        if not okay:
            # An earlier file failed or was skipped: nothing more to do.
            break

        save_file = tmp + '/' + file
        expected_size = ds[ds.file_name == file]['size'].values[0]
        if os.path.isfile(save_file):
            if abs(os.path.getsize(save_file) - expected_size) <= reuse_tolerance:
                print('already have: ', save_file)
                gfiles += [save_file]
                continue

        url = ds[ds.file_name == file].HTTPServer_url.values[0]

        for site in skip_sites:
            if site in url:
                print('skip ', site, 'domain for now')
                trouble[target] = 'skipping ' + site + ' domain'
                okay = False
        if not okay:
            break

        os.makedirs(tmp, exist_ok=True)
        print('curl ' + url + ' -o ' + save_file)
        _curl(url, save_file)

        # A missing file reports size -1 and is therefore treated as a
        # failed download instead of raising.
        if _file_size(save_file) != expected_size:
            print('trying curl command again')
            _curl(url, save_file)
            if _file_size(save_file) != expected_size:
                print('second download did not fix issue - skipping file:', file)
                trouble[target] = 'netcdf download not complete'
                okay = False

        # Do not leave empty placeholders behind.
        if _file_size(save_file) == 0:
            os.remove(save_file)
        if okay:
            gfiles += [save_file]
    return gfiles

def get_size(start_path = '.'):
    """Return the total size in bytes of all regular files below ``start_path``.

    The tree is walked with ``os.walk``.  Symbolic links are skipped, so a
    dangling link no longer raises and a linked file is not counted.  A
    ``start_path`` that does not exist yields 0.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            # skip if it is symbolic link
            if os.path.islink(fp):
                continue
            total_size += os.path.getsize(fp)

    return total_size

def getsheet(json_keyfile,sheet_name):
    """Download the first worksheet of a Google Sheet as a tidied DataFrame.

    Parameters
    ----------
    json_keyfile : str
        Path to the service-account JSON key used to authorise with Google.
    sheet_name : str
        Title of the spreadsheet to open.

    Returns
    -------
    pandas.DataFrame
        One row per sheet row (the first sheet row supplies the column
        names).  The free-text request columns are parsed into the list
        columns ``experiments``, ``models`` and ``variables`` and the string
        column ``table``; ``requester`` is a copy of 'Your name'.  The raw
        form columns named in the final ``drop`` are removed.
    """
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']

    credentials = ServiceAccountCredentials.from_json_keyfile_name(json_keyfile, scope)
    gc = gspread.authorize(credentials)
    wks = gc.open(sheet_name).sheet1

    # All cells come back as strings; the first row holds the headers.
    data = wks.get_all_values()
    headers = data.pop(0)

    df = pd.DataFrame(data, columns=headers)

    # Comma-separated answers -> lists, with blanks (and '*' markers) removed.
    df['experiments'] = [s.replace('*','').replace(' ','').split(',') for s in df.experiment_ids.values]
    df['models'] = [s.replace('All Available','All').replace(' ','').split(',') for s in df.source_ids.values]
    df['variables'] = [s.replace(' ','').split(',') for s in df['variable_ids (comma separated list)'].values]
    # Keep only the part of the table answer before the first ':'.
    df['table'] = [s.replace(' ','').split(':')[0] for s in df.table_id.values]
    df['requester'] = df['Your name']
    # NOTE(review): 'status' is created here and dropped again just below, so
    # only 'LDEO status' survives in the result.  Kept as-is to leave the
    # returned columns unchanged.
    df['status'] = df['LDEO status']

    # `columns=` replaces the positional axis argument (`drop(labels, 1)`),
    # which current pandas versions no longer accept.
    df = df.drop(columns=['Your name', 'Science Question/Motivation','Have you verified the existence of the data you will request?',
                          'table_id', 'source_ids', 'experiment_ids','variable_ids (comma separated list)', 'Questions and comments', 'status'])

    return df

def requests(json_keyfile='/home/nhn2/json/Pangeo Hackathon-e48a41b13c91.json',
             sheet_name="CMIP6 Hackathon Data Request (Responses)"):
    """Return the data requests that are not yet listed in 'csv/requests.csv'.

    Parameters
    ----------
    json_keyfile : str, optional
        Service-account key passed to ``getsheet``.  The default is the path
        the notebook was written with; pass your own to run elsewhere.
    sheet_name : str, optional
        Title of the Google Sheet holding the request form responses.

    Returns
    -------
    pandas.DataFrame
        Rows of the current sheet that have no identical row in
        'csv/requests.csv'.

    Notes
    -----
    The sheet is written to 'csv/dummy.csv' and read back on purpose: after
    the round trip it has the same on-disk representation as the prior
    requests file (list columns become their string form), so the row-by-row
    comparison is like with like.

    The function name shadows the third-party ``requests`` package; do not
    import that package under its own name in the same namespace.
    """
    df_all = getsheet(json_keyfile, sheet_name)
    df_all.to_csv('csv/dummy.csv',index=False)
    df_all = pd.read_csv('csv/dummy.csv')

    df_prior = pd.read_csv('csv/requests.csv')

    # Left merge on all shared columns; '_merge' == 'left_only' marks rows
    # that exist in the sheet but not in the prior file.
    df_new = df_all.merge(df_prior, how='left', indicator=True)
    # `columns=` replaces the positional axis argument, which current pandas
    # versions no longer accept.
    df_new = df_new[df_new['_merge']=='left_only'].drop(columns='_merge')

    return df_new

def set_request_id():
    """Build an id string from the current local time.

    The layout is ``YYYYMM-DDHH-MMSS`` (for example '201911-0907-5418').
    """
    now = datetime.now()
    return format(now, '%Y%m-%d%H-%M%S')

In [23]:
def search(server, df_req, esgf_search=None):
    """Query an ESGF search node for the files behind each data request.

    Parameters
    ----------
    server : str
        URL of the ESGF search endpoint.
    df_req : pandas.DataFrame
        One row per request.  Columns read here: 'E-mail', 'table', and the
        columns 'experiments', 'models' and 'variables', each holding the
        string form of a Python list (e.g. "['ts', 'tas']").
    esgf_search : callable, optional
        Function that performs one query and returns a DataFrame.  Defaults
        to ``my_search.esgf_search``; it is called with keyword arguments
        only.

    Returns
    -------
    pandas.DataFrame
        All files found, de-duplicated on (file_name, version, checksum) and
        reduced to the catalog columns listed in ``keys_show``.  When no
        query returns anything, an empty frame with those columns.

    Notes
    -----
    A model list whose first entry is 'All' runs one query per
    experiment/variable without a ``source_id`` filter; otherwise one query
    is run per model.  A query that raises is reported and skipped.
    """
    if esgf_search is None:
        esgf_search = my_search.esgf_search

    keys_show = ['activity_drs','institution_id',"source_id","experiment_id","member_id","table_id","variable_id",'grid_label',"file_name",'HTTPServer_url']
    dedup_keys = ["file_name","version","checksum"]

    df_list = []
    for _, row in df_req.iterrows():
        email = row['E-mail']
        experiment_ids = ast.literal_eval(row['experiments'])
        source_ids = ast.literal_eval(row['models'])
        variable_ids = ast.literal_eval(row['variables'])
        table_id = row['table']
        print(email)

        # One extra-keyword dict per query: a single unfiltered query for
        # 'All', otherwise one source_id filter per requested model.
        if source_ids and source_ids[0] == 'All':
            source_filters = [{}]
        else:
            source_filters = [{'source_id': source_id} for source_id in source_ids]

        for experiment_id in experiment_ids:
            for variable_id in variable_ids:
                print(experiment_id, variable_id, table_id, source_ids)
                for source_filter in source_filters:
                    try:
                        files = esgf_search(server=server, mip_era='CMIP6', variable_id=variable_id,
                                            table_id=table_id, experiment_id=experiment_id,
                                            page_size=500, verbose=False, **source_filter)
                    except Exception as err:
                        # Best effort: one failing query must not stop the
                        # others, but it should not disappear silently.
                        print('search failed:', experiment_id, variable_id, table_id,
                              source_filter.get('source_id', 'All'), '-', err)
                        continue

                    if files is None or len(files) == 0:
                        continue

                    # The last two URL path components are taken as the
                    # version directory and the file name.
                    urls = files['HTTPServer_url']
                    files = files.assign(
                        version=[url.split('/')[-2] for url in urls],
                        file_name=[url.split('/')[-1] for url in urls],
                        # might need to set activity_id to activity_drs for some files (see old versions)
                        activity_id=files['activity_drs'],
                    )
                    df_list.append(files.drop_duplicates(subset=dedup_keys))

    if not df_list:
        return pd.DataFrame(columns=keys_show)

    dESGF = pd.concat(df_list, sort=False).drop_duplicates(subset=dedup_keys)
    keys_drop = [key for key in dESGF.keys() if key not in keys_show]
    return dESGF.drop(columns=keys_drop)

In [47]:
def identify(dfm, df_req, dESGF):
    """Select the catalog rows each request asks for and add their zarr path.

    Parameters
    ----------
    dfm : any
        Not used.  The original implementation rebound this name internally
        before reading it; the parameter is kept so existing calls work.
    df_req : pandas.DataFrame
        One row per request.  Columns read here: 'table', and 'experiments',
        'models' and 'variables', each holding the string form of a Python
        list.
    dESGF : pandas.DataFrame
        File catalog with at least the columns activity_drs, institution_id,
        source_id, experiment_id, member_id, table_id, variable_id and
        grid_label.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``dESGF`` (original index kept) with an added
        ``zstore`` column, ordered by request, experiment, variable, model
        and member.  Empty, but with the same columns, when nothing matches.
        Rows matched by more than one request appear once per request.

    Notes
    -----
    * A model list whose first entry is 'All' selects every model present in
      the catalog for that experiment/variable/table, instead of looking for
      a model literally named 'All'.
    * ``zstore`` is built from each row's own fields, so files of one member
      that differ in ``grid_label`` get different store paths.
    """
    zarr_format = '/%(activity_drs)s/%(institution_id)s/%(source_id)s/%(experiment_id)s/%(member_id)s/%(table_id)s/%(variable_id)s/%(grid_label)s'
    df_list = []
    for _, row in df_req.iterrows():
        experiment_ids = ast.literal_eval(row['experiments'])
        source_ids = ast.literal_eval(row['models'])
        variable_ids = ast.literal_eval(row['variables'])
        table_id = row['table']
        all_models = bool(source_ids) and source_ids[0] == 'All'

        for experiment_id in experiment_ids:
            for variable_id in variable_ids:
                candidates = dESGF[(dESGF.experiment_id==experiment_id)&(dESGF.table_id==table_id)&
                                   (dESGF.variable_id==variable_id)]
                wanted_sources = candidates.source_id.unique() if all_models else source_ids
                for source_id in wanted_sources:
                    df = candidates[candidates.source_id==source_id]
                    for member_id in df.member_id.unique():
                        # Work on a copy so adding a column never writes
                        # through to (or warns about) the caller's frame.
                        member_files = df[df.member_id==member_id].copy()
                        member_files['zstore'] = [zarr_format % record
                                                  for record in member_files.to_dict('records')]
                        df_list.append(member_files)

    if not df_list:
        return dESGF.head(0).assign(zstore=pd.Series(dtype='object'))
    return pd.concat(df_list)

In [48]:
# NOTE(review): in the cells shown here `dESGF` is only a local name inside
# search() and a parameter of identify(); at notebook level the search result
# is bound to `df_ESGF`.  This cell presumably relied on a variable left over
# in the kernel — confirm before re-running on a fresh kernel.
dESGF.keys()

Index(['HTTPServer_url', 'activity_drs', 'experiment_id', 'grid_label',
       'institution_id', 'member_id', 'source_id', 'table_id', 'variable_id',
       'file_name'],
      dtype='object')

In [40]:
import numpy as np
import my_search
import qgrid

In [54]:
# Initialize cmip6-master.csv 
#url_cloud = 'https://storage.googleapis.com/cmip6/cmip6.csv'
#df = pd.read_csv(url_cloud)
#df.to_csv('csv/cmip6-master.csv',index=False)

In [41]:
# ESGF search endpoints, keyed by a short node nickname.
# The trailing remarks are the notebook author's notes about each node.
dtype = {
    'llnl': "https://esgf-node.llnl.gov/esg-search/search",
    'ipsl': "https://esgf-node.ipsl.upmc.fr/esg-search/search",
    'nci': "https://esgf.nci.org.au/esg-search/search",
    'ceda': "https://esgf-index1.ceda.ac.uk/esg-search/search",  # nothing yet
    'jpl': "https://esgf-node.jpl.nasa.gov/esg-search/search",   # connection refused
    'gfdl': "https://esgdata.gfdl.noaa.gov/esg-search/search",   # only amip and piControl
    'dkrz': "https://esgf-data.dkrz.de/esg-search/search",       # no historical
}

In [42]:
# Make some choices
# Search endpoint handed to search() further down, looked up by nickname in `dtype`.
ESGF_site = dtype['llnl']
# Host-name fragments of download servers to avoid.  get_new() takes a list
# like this as its `skip_sites` argument and aborts a store as soon as a
# download URL contains one of the entries.
skip_sites = ['dist.nmlab.snu.ac.kr','esg.lasg.ac.cn','esgf-data2.diasjp.net']
# Table ids read by identify() through this module-level name; in the code
# shown it is only copied to a local there and not used further.
single_member_tables = ['Omon', 'CF3hr','3hr','E3hr', '6hrLev', 'day', '6hrPlev', '6hrPlevPt', 'fx', 'Ofx']

In [59]:
# Fetch the requests that are not yet recorded in csv/requests.csv and create
# a time-based id for this run.
df_request_new = requests()
request_id = set_request_id()

# Per-run file names built from the request id.
c_file = 'csv/cmip6_'+request_id+'.csv'
# NOTE(review): the saved output of this cell shows an 'exceptions_….csv'
# name while this line builds a '.txt' name — the output looks stale; re-run
# to confirm.
x_file = 'csv/exceptions_'+request_id+'.txt'

print(x_file,c_file)
df_request_new

csv/exceptions_201911-0907-5418.csv csv/cmip6_201911-0907-5418.csv


Unnamed: 0,Timestamp,E-mail,LDEO status,hackathon location,experiments,models,variables,table,requester
66,11/8/2019 11:44:12,test@gmail.com,,,['historical'],['CESM2'],"['ts', 'tas']",Amon,Naomi Henderson


In [44]:
# Query the chosen ESGF node for the files behind the new requests.
print(ESGF_site)
df_ESGF = search(ESGF_site,df_request_new)

https://esgf-node.llnl.gov/esg-search/search
test@gmail.com
historical ts Amon ['CESM2']
historical tas Amon ['CESM2']


In [55]:
# Load the master catalog, then attach a zarr store path to the files found
# by the search.
# NOTE(review): identify() does not read its first argument in the code
# shown, so the contents of df_master do not influence df_needed.
df_master = pd.read_csv('csv/cmip6-master.csv')
df_needed = identify(df_master, df_request_new, df_ESGF)

In [57]:
# Select the master-catalog rows whose table_id is 'junk'; the saved output
# shows only the column header, i.e. no rows matched.
# NOTE(review): presumably a way to get an empty frame that carries the
# master catalog's columns — confirm the intent.
df_new = df_master[df_master.table_id=='junk']
df_new

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,date_start,date_stop,time_len,sizeG,dcpp_init_year


In [53]:
# NOTE(review): `df` is not assigned in any cell shown here (only inside the
# commented-out initialisation cell).  The displayed columns include
# 'zstore', so this presumably previewed the result of identify() — confirm
# and use that variable's name so the cell works on a fresh kernel.
df.head()

Unnamed: 0,HTTPServer_url,activity_drs,experiment_id,grid_label,institution_id,member_id,source_id,table_id,variable_id,file_name,zstore
0,http://aims3.llnl.gov/thredds/fileServer/css03...,CMIP,historical,gn,NCAR,r10i1p1f1,CESM2,Amon,ts,ts_Amon_CESM2_historical_r10i1p1f1_gn_185001-1...,/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/ts/gn
6,http://aims3.llnl.gov/thredds/fileServer/css03...,CMIP,historical,gn,NCAR,r10i1p1f1,CESM2,Amon,ts,ts_Amon_CESM2_historical_r10i1p1f1_gn_190001-1...,/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/ts/gn
12,http://aims3.llnl.gov/thredds/fileServer/css03...,CMIP,historical,gn,NCAR,r10i1p1f1,CESM2,Amon,ts,ts_Amon_CESM2_historical_r10i1p1f1_gn_195001-1...,/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/ts/gn
18,http://aims3.llnl.gov/thredds/fileServer/css03...,CMIP,historical,gn,NCAR,r10i1p1f1,CESM2,Amon,ts,ts_Amon_CESM2_historical_r10i1p1f1_gn_200001-2...,/CMIP/NCAR/CESM2/historical/r10i1p1f1/Amon/ts/gn
24,http://aims3.llnl.gov/thredds/fileServer/css03...,CMIP,historical,gn,NCAR,r11i1p1f1,CESM2,Amon,ts,ts_Amon_CESM2_historical_r11i1p1f1_gn_185001-1...,/CMIP/NCAR/CESM2/historical/r11i1p1f1/Amon/ts/gn
