# build a database of CESM-LE files

In [27]:
import intake
import intake_esm
import yaml

from tqdm import tqdm
import sys

import pandas as pd

import os
from subprocess import check_call, CalledProcessError

Show the directories to store database and cached files (set in .intake_esm/config.yaml)

In [13]:
for key in ['data-cache-directory', 'database-directory']:
    print(f'{key}: {intake_esm.config.get(key)}')


data-cache-directory: /glade/scratch/mclong/intake-esm-data
database-directory: /glade/work/mclong/intake-esm-collections/future-arctic


### query collection using intake plugin interface

Connect to database

In [14]:
col = intake.open_esm_metadatastore(collection_input_file='cesm1-le-collection.yml',
                                    overwrite_existing=True)
col                     

Working on experiment: CTRL
Getting file listing : GLADE:posix:/glade/collections/cdg/data/cesmLE/CESM-CAM5-BGC-LE
Building file database : GLADE:posix:/glade/collections/cdg/data/cesmLE/CESM-CAM5-BGC-LE
Filename : f.e11.FAMIPC5CN.f09_f09.rcp85.toga.ens10.clm2.h0.THBOT.200601-201412.nc_temp_.nc does not conform to expected pattern
Filename : f.e11.FAMIPC5CN.f09_f09.rcp85.toga.ens10.clm2.h0.ERRH2OSNO.200601-201412.nc_temp_.nc does not conform to expected pattern
Filename : f.e11.FAMIPC5CN.f09_f09.rcp85.toga.ens10.clm2.h0.PSNSUN.200601-201412.nc_temp_.nc does not conform to expected pattern
Filename : f.e11.FAMIPC5CN.f09_f09.rcp85.toga.ens10.clm2.h0.PRODUCT_CLOSS.200601-201412.nc_temp_.nc does not conform to expected pattern
Filename : f.e11.FAMIPC5CN.f09_f09.rcp85.toga.ens10.clm2.h0.PSNSUN_TO_CPOOL.200601-201412.nc_temp_.nc does not conform to expected pattern
Could not identify CESM fileparts for : HadISST.cvdp_data.1979-2012.nc
Could not identify CESM fileparts for : NASA_Team.cvdp_da

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261628 entries, 0 to 261627
Data columns (total 18 columns):
resource            261628 non-null object
resource_type       261628 non-null object
direct_access       261628 non-null object
experiment          261628 non-null object
case                261628 non-null object
component           261628 non-null object
stream              261628 non-null object
variable            261628 non-null object
date_range          261628 non-null object
ensemble            261628 non-null object
files               261628 non-null object
files_basename      261628 non-null object
files_dirname       261628 non-null object
ctrl_branch_year    0 non-null object
year_offset         34112 non-null object
sequence_order      261628 non-null object
has_ocean_bgc       261628 non-null object
grid                52299 non-null object
dtypes: object(18)
memory usage: 35.9+ MB


<Intake catalog: esm_metadatastore>

In [15]:
col.df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261628 entries, 0 to 261627
Data columns (total 18 columns):
resource            261628 non-null object
resource_type       261628 non-null object
direct_access       261628 non-null bool
experiment          261628 non-null object
case                261628 non-null object
component           261628 non-null object
stream              261628 non-null object
variable            261628 non-null object
date_range          261628 non-null object
ensemble            261628 non-null int64
files               261628 non-null object
files_basename      261628 non-null object
files_dirname       261628 non-null object
ctrl_branch_year    0 non-null float64
year_offset         34112 non-null float64
sequence_order      261628 non-null int64
has_ocean_bgc       261628 non-null bool
grid                52299 non-null object
dtypes: bool(2), float64(2), int64(2), object(12)
memory usage: 34.4+ MB


Determine which ensembles have ocean biogeochemistry variables.

In [16]:
experiments=['20C', 'RCP85']
ensembles = col.search(experiment=experiments, has_ocean_bgc=True).results.ensemble.unique().tolist()
print(ensembles)

[1, 2, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 34, 35, 101, 102, 103, 104, 105]


Determine what ocean variables are available at monthly and daily resolution.  The POP model is not smart enough to write the same variable to two different streams, so it is not necessary to inclued `stream` in database queries for POP. We can, however, view all the variables defined in each stream.

In [17]:
stream_def = {'mon': 'pop.h', 'day': ['pop.h.nday1', 'pop.h.ecosys.nday1']}

all_variables = {'mon': col.search(component='ocn', stream=stream_def['mon']).results.variable.unique().tolist(), 
             'day': col.search(component='ocn', stream=stream_def['day']).results.variable.unique().tolist()}
print(yaml.dump(all_variables))

day: [CaCO3_form_zint, DpCO2_2, ECOSYS_IFRAC_2, ECOSYS_XKW_2, FG_CO2_2, HBLT_2, HMXL_2,
  SSH_2, SST, SST2, STF_O2_2, TAUX_2, TAUY_2, WVEL_50m, XBLT_2, XMXL_2, diatC_zint_100m,
  diatChl_SURF, diazC_zint_100m, diazChl_SURF, photoC_diat_zint, photoC_diaz_zint,
  photoC_sp_zint, spC_zint_100m, spCaCO3_zint_100m, spChl_SURF, zooC_zint_100m]
mon: [ADVS, ADVS_ISOP, ADVS_SUBM, ADVT, ADVT_ISOP, ADVT_SUBM, ALK, AOU, ATM_ALT_CO2,
  ATM_CO2, BSF, CFC11, CFC12, CFC_ATM_PRESS, CFC_IFRAC, CFC_XKW, CO2STAR, CO3, CaCO3_FLUX_IN,
  CaCO3_PROD, CaCO3_form, DCO2STAR, DCO2STAR_ALT_CO2, DENITRIF, DIA_DEPTH, DIA_IMPVF_CFC11,
  DIA_IMPVF_CFC12, DIA_IMPVF_IAGE, DIA_IMPVF_SALT, DIA_IMPVF_TEMP, DIC, DIC_ALT_CO2,
  DOC, DOC_prod, DOC_remin, DOFe, DOFe_prod, DON, DON_prod, DOP, DOP_prod, DpCO2,
  DpCO2_ALT_CO2, ECOSYS_ATM_PRESS, ECOSYS_IFRAC, ECOSYS_XKW, EVAP_F, FG_ALT_CO2, FG_CO2,
  FW, Fe, Fe_scavenge, Fe_scavenge_rate, FvICE_ALK, FvICE_DIC, FvPER_ALK, FvPER_DIC,
  H2CO3, HBLT, HCO3, HDIFB_CFC11, HDIFB_CFC12, H

This analysis will focus on a subset of those variables.

In [18]:
variables = yaml.load('''
day: [ECOSYS_IFRAC_2, HMXL_2, SST, XMXL_2, 
      diatC_zint_100m, diatChl_SURF, photoC_diat_zint,
      photoC_sp_zint, spC_zint_100m, spChl_SURF, zooC_zint_100m]
mon: [ECOSYS_IFRAC, Fe, HMXL, Jint_100m_DIC, NH4, NO3, NOx_FLUX, PAR_avg, PD, PO4, 
      POC_FLUX_IN, POC_PROD, SALT, SiO3, TBLT, TEMP, XMXL, diatC, diatChl, 
      diat_Fe_lim, diat_N_lim, diat_PO4_lim, diat_SiO3_lim, diat_agg, diat_light_lim, diat_loss, 
      graze_diat, graze_diaz, graze_sp, photoC_NO3_diat, photoC_NO3_diat_zint, photoC_NO3_sp, 
      photoC_NO3_sp_zint, photoC_diat, photoC_sp, spC, spChl, sp_Fe_lim, sp_N_lim, sp_PO4_lim, 
      sp_agg, sp_light_lim, sp_loss, tend_zint_100m_NO3, zooC, zoo_loss]
''')
with open('variables.yml', 'w') as fid:
    yaml.dump(variables, fid)
    
variable_list = []
for k, v in variables.items():
    variable_list.extend(v)
    
!cat variables.yml

/bin/sh: module: line 1: syntax error: unexpected end of file
/bin/sh: error importing function definition for `BASH_FUNC_module'
/bin/sh: ml: line 1: syntax error: unexpected end of file
/bin/sh: error importing function definition for `BASH_FUNC_ml'
day: [ECOSYS_IFRAC_2, HMXL_2, SST, XMXL_2, diatC_zint_100m, diatChl_SURF, photoC_diat_zint,
  photoC_sp_zint, spC_zint_100m, spChl_SURF, zooC_zint_100m]
mon: [ECOSYS_IFRAC, Fe, HMXL, Jint_100m_DIC, NH4, NO3, NOx_FLUX, PAR_avg, PD, PO4,
  POC_FLUX_IN, POC_PROD, SALT, SiO3, TBLT, TEMP, XMXL, diatC, diatChl, diat_Fe_lim,
  diat_N_lim, diat_PO4_lim, diat_SiO3_lim, diat_agg, diat_light_lim, diat_loss, graze_diat,
  graze_diaz, graze_sp, photoC_NO3_diat, photoC_NO3_diat_zint, photoC_NO3_sp, photoC_NO3_sp_zint,
  photoC_diat, photoC_sp, spC, spChl, sp_Fe_lim, sp_N_lim, sp_PO4_lim, sp_agg, sp_light_lim,
  sp_loss, tend_zint_100m_NO3, zooC, zoo_loss]


### query catalog for full-subset

In [19]:
cat = col.search(experiment=experiments,
                 ensemble=ensembles, 
                 variable=variable_list)
cat.results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6580 entries, 226491 to 1379
Data columns (total 18 columns):
resource            6580 non-null object
resource_type       6580 non-null object
direct_access       6580 non-null bool
experiment          6580 non-null object
case                6580 non-null object
component           6580 non-null object
stream              6580 non-null object
variable            6580 non-null object
date_range          6580 non-null object
ensemble            6580 non-null int64
files               6580 non-null object
files_basename      6580 non-null object
files_dirname       6580 non-null object
ctrl_branch_year    0 non-null float64
year_offset         0 non-null float64
sequence_order      6580 non-null int64
has_ocean_bgc       6580 non-null bool
grid                6580 non-null object
dtypes: bool(2), float64(2), int64(2), object(12)
memory usage: 886.8+ KB


### ensure all data is accessible on spinning-disk

The catalog includes data from multiple locations; what are those locations and are they accessible via direct access?

In [20]:
df = cat.results
resource_access = {}
for res in df.resource.unique():
    resource_access[res] = df.direct_access.loc[df.resource == res].unique()[0]
resource_access

{'HPSS:hsi:/CCSM/csm/CESM-CAM5-BGC-LE': False,
 'GLADE:posix:/glade/collections/cdg/data/cesmLE/CESM-CAM5-BGC-LE': True}

Determine which files need to be transferred from tape.

In [21]:
df_transfer = pd.DataFrame(columns=df.columns)
for file in df.files_basename.unique():
    df_file_subset = df.loc[df.files_basename == file]
    if not any(df_file_subset.direct_access):
        df_transfer = pd.concat((df_transfer, df_file_subset), ignore_index=True) 
df_transfer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3744 entries, 0 to 3743
Data columns (total 18 columns):
resource            3744 non-null object
resource_type       3744 non-null object
direct_access       3744 non-null object
experiment          3744 non-null object
case                3744 non-null object
component           3744 non-null object
stream              3744 non-null object
variable            3744 non-null object
date_range          3744 non-null object
ensemble            3744 non-null object
files               3744 non-null object
files_basename      3744 non-null object
files_dirname       3744 non-null object
ctrl_branch_year    0 non-null object
year_offset         0 non-null object
sequence_order      3744 non-null object
has_ocean_bgc       3744 non-null object
grid                3744 non-null object
dtypes: object(18)
memory usage: 526.6+ KB


In [None]:
os.makedirs(intake_esm.config.get('data-cache-directory'), exist_ok=True)

for idx, row in tqdm(df_transfer.iterrows(), file=sys.stdout):
    file_remote = row.files
    file_local = os.path.join(intake_esm.config.get('data-cache-directory'), row.files_basename)
    try:
        check_call(['hsi',f"cget {file_local} : {file_remote}"])
    except (CalledProcessError) as err:
        print('\n'+'-'*80)
        print('error!')       
        print(' '.join(['hsi',f"cget {file_local} : {file_remote}"]))
        print('-'*80)
        raise err



0it [00:00, ?it/s][A[A

1it [00:00,  3.36it/s][A[A

2it [00:38, 11.66s/it][A[A

3it [01:53, 30.73s/it][A[A

4it [02:56, 40.40s/it][A[A

5it [04:28, 55.74s/it][A[A

6it [06:31, 75.90s/it][A[A

7it [08:18, 85.30s/it][A[A

8it [09:31, 81.76s/it][A[A

9it [11:15, 88.32s/it][A[A

10it [12:46, 89.22s/it][A[A

11it [14:51, 99.81s/it][A[A

12it [16:42, 103.21s/it][A[A

13it [18:27, 103.82s/it][A[A

14it [19:38, 93.92s/it] [A[A

15it [21:45, 103.73s/it][A[A

16it [22:52, 92.73s/it] [A[A

17it [24:53, 101.19s/it][A[A

In [None]:
col = intake.open_esm_metadatastore(collection_input_file='cesm1-le-collection.yml',
                                    overwrite_existing=True)
col  

In [None]:
%load_ext watermark

In [None]:
%watermark --iversion -g -h -m -v -u -d