# Identify CESM1 data dependencies

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to the local helper
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from subprocess import check_call
import yaml

import intake

import pandas as pd
import xarray as xr

import ann_avg_utils as aau
import funnel
import operators as ops
import utils

## Raw data

In [3]:
# Open the intake-esm datastore described by the collection JSON file.
# The bare `catalog` on the last line renders its summary as the cell output.
esm_collection_json = "data/glade-cesm1-cmip5-timeseries.json"
catalog = intake.open_esm_datastore(esm_collection_json)
catalog

Unnamed: 0,unique
experiment,9
case,9
component,1
stream,1
variable,12
date_range,133
member_id,1
path,2124
ctrl_branch_year,4


In [4]:
# Entries stored under the 'vars' key of what ann_avg_utils.global_vars() returns;
# displayed here for comparison with the variables available in the catalog.
variable_list = aau.global_vars()['vars']
variable_list

['photoC_TOT_zint_100m',
 'photoC_diat_zint_100m',
 'photoC_TOT_zint',
 'photoC_diat_zint',
 'POC_FLUX_100m',
 'CaCO3_FLUX_100m',
 'SiO2_FLUX_100m',
 'diaz_Nfix',
 'NOx_FLUX',
 'NHy_FLUX',
 'NHx_SURFACE_EMIS',
 'DENITRIF',
 'SedDenitrif',
 'DON_RIV_FLUX',
 'DONr_RIV_FLUX',
 'NO3_RIV_FLUX',
 'ponToSed',
 'FG_CO2',
 'O2',
 'O2_under_thres']

In [5]:
# Distinct values of the catalog's `variable` column, as a plain Python list
catalog_vars = catalog.df["variable"].unique().tolist()
catalog_vars

['DENITRIF',
 'IRON_FLUX',
 'NHy_FLUX',
 'NOx_FLUX',
 'CaCO3_FLUX_IN',
 'FG_CO2',
 'O2',
 'POC_FLUX_IN',
 'diaz_Nfix',
 'photoC_diat',
 'photoC_diaz',
 'photoC_sp']

In [6]:
# Catalog variables to extract in the loop further down.
# The catalog exposes three photoC_* fields (diat, diaz, sp). This list used to
# name 'photoC_diaz' twice and never 'photoC_sp' -- presumably a typo, so the
# repeated entry is replaced with 'photoC_sp'.
table_vars = [
    'photoC_diat',
    'photoC_diaz',
    'photoC_sp',
    'POC_FLUX_IN',
    'CaCO3_FLUX_IN',
    'diaz_Nfix',
    'NHy_FLUX',
    'NOx_FLUX',
    'DENITRIF',
    'FG_CO2',
    'IRON_FLUX',
]

# A repeated name would silently hide an omitted variable; fail loudly instead.
assert len(table_vars) == len(set(table_vars)), 'duplicate entries in table_vars'

# Every requested variable must be present in the catalog.
missing_vars = set(table_vars) - set(catalog_vars)
assert not missing_vars, f'variables not in catalog: {sorted(missing_vars)}'

In [7]:
# Obtain a cluster and client from the project helper and scale the cluster to 12.
# The trailing comment on the scale() line keeps an adaptive-scaling alternative
# for reference. The bare `client` displays the client summary as the cell output.
cluster, client = utils.get_ClusterClient()
cluster.scale(12) #adapt(minimum_jobs=0, maximum_jobs=24)
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://10.12.206.42:46226  Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/mclong/proxy/42332/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [8]:
# Experiments to process, in order.
experiment_list = [
    "piControl", "historical", "RCP-8.5",
]
# Time window(s) to extract for each experiment:
#   'slices'      -- slice objects applied along the `time` dimension
#   'date_ranges' -- one label per slice, paired positionally with 'slices' and
#                    embedded in the output file name
#   'type'        -- 'sel' applies the slice with ds.sel (label-based); any other
#                    value applies it with ds.isel (positional)
time_slices_dict = {
    'piControl': {
        'slices': [slice(120, 320)], 
        'date_ranges': ['012101-032012'],
        'type': 'isel',
    },
    'historical': {
        'slices': [slice("1981-01-15", "2006-01-15")],
        'date_ranges': ['198101-200512'],
        'type': 'sel',
    },
    'RCP-8.5': {
        # NOTE(review): the first slice covers 2090-01-15..2100-01-15 (10 years)
        # but its label '209001-210012' reads as 2090-01 through 2100-12. The
        # other 'sel' entries pair a slice ending YYYY-01-15 with a label ending
        # the preceding December -- confirm whether the slice or the label is
        # the intended one.
        'slices': [
            slice("2090-01-15", "2100-01-15"), 
            slice("2005-01-15", "2015-01-15")
        ],
        'date_ranges': ['209001-210012', '200501-201412'],
        'type': 'sel',
    },
}
time_slices_dict

{'piControl': {'slices': [slice(120, 320, None)],
  'date_ranges': ['012101-032012'],
  'type': 'isel'},
 'historical': {'slices': [slice('1981-01-15', '2006-01-15', None)],
  'date_ranges': ['198101-200512'],
  'type': 'sel'},
 'RCP-8.5': {'slices': [slice('2090-01-15', '2100-01-15', None),
   slice('2005-01-15', '2015-01-15', None)],
  'date_ranges': ['209001-210012', '200501-201412'],
  'type': 'sel'}}

In [None]:
# For every experiment, time window and table variable: locate the matching
# catalog entry, load it, cut out the requested time window and write the
# result to `data_dir` as one dataset per (experiment, variable, date range).
clobber = False  # set True to overwrite outputs that already exist
file_format = "zarr"  # "zarr" -> Dataset.to_zarr; anything else -> Dataset.to_netcdf

data_dir = f'/glade/scratch/{os.environ["USER"]}/cesm2-marbl-data-v2'
# make sure the output directory exists before anything is written into it
os.makedirs(data_dir, exist_ok=True)

# keyword arguments used when opening the source files, keyed by stream name
cdf_kwargs_stream = {
    'pop.h': dict(chunks={'time': 12}, decode_coords=False),
    'pop.h.ecosys.nyear1': dict(chunks={'time': 1}, decode_coords=False),
}

for experiment in experiment_list:
    slice_info = time_slices_dict[experiment]
    slices = slice_info["slices"]
    date_ranges = slice_info["date_ranges"]
    slice_type = slice_info["type"]

    # each slice is paired positionally with the label used in the file name
    for this_slice, date_range in zip(slices, date_ranges):

        for v in table_vars:
            cat_sub = catalog.search(
                experiment=experiment,
                variable=v,
            )

            # exactly one dataset key is expected per (experiment, variable)
            keys = list(cat_sub.keys())
            if not keys:
                print(f'missing: {experiment}, {v}')
                continue

            if len(keys) > 1:
                print(f'{v} -- expecting single key, found: {keys}')
                continue

            key = keys[0]
            key_info = funnel.core.intake_esm_get_keys_info(cat_sub)[key]
            stream = key_info['stream']

            name = f"cesm1-cmip5.{experiment}.{stream}.{v}.{date_range}"
            file_out = f'{data_dir}/{name}.{file_format}'

            # to write or not to write? if not, move on
            if os.path.exists(file_out) and not clobber:
                continue

            case_list = sorted(set(cat_sub.df.case.to_list()))

            # get the dataset; it is opened once, with the kwargs that belong
            # to this entry's stream
            dsets = cat_sub.to_dataset_dict(cdf_kwargs=cdf_kwargs_stream[stream])
            _, ds = dsets.popitem()

            # record which cases contributed to this dataset
            ds.attrs['cases'] = ','.join(case_list)

            # center the time axis (currently disabled)
            #ds = ops.center_time(ds)

            # cut out the requested time window
            if slice_type == "sel":
                ds = ds.sel(time=this_slice)
            else:
                ds = ds.isel(time=this_slice)

            # write
            if file_format == 'zarr':
                ds.to_zarr(file_out, mode='w', consolidated=True)
            else:
                ds.to_netcdf(file_out, unlimited_dims=['time'])


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'


In [None]:
!