# Identify data dependencies

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from subprocess import check_call
import yaml

import intake

import pandas as pd

import operators as ops
import utils
import funnel

## Post-processed assets

In [3]:
# Ask funnel for its intake-esm collection description and open it as a catalog.
# NOTE(review): assumes `funnel.to_intake_esm()` returns something
# `intake.open_esm_datastore` accepts (e.g. a path to a JSON file) — confirm in funnel.
funnel_collection_json = funnel.to_intake_esm()
funnel_cat = intake.open_esm_datastore(funnel_collection_json)
# Display the catalog's underlying table (one row per post-processed asset).
funnel_cat.df

Unnamed: 0,component,experiment,stream,variable,name,path
0,ocn,SSP5-8.5,pop.h,POC_FLUX_100m,global_integral_timeseries_ann,/glade/scratch/mclong/cesm2-marbl-archive/0050...
1,ocn,SSP3-7.0,pop.h,photoC_TOT_zint_100m,global_integral_timeseries_ann,/glade/scratch/mclong/cesm2-marbl-archive/0579...
2,ocn,SSP5-8.5,pop.h,photoC_TOT_zint_100m,global_integral_timeseries_ann,/glade/scratch/mclong/cesm2-marbl-archive/0714...
3,ocn,SSP5-8.5,pop.h,photoC_diat_zint_100m,global_integral_timeseries_ann,/glade/scratch/mclong/cesm2-marbl-archive/0b9f...
4,ocn,SSP1-2.6,pop.h,SST,global_mean_timeseries_ann,/glade/scratch/mclong/cesm2-marbl-archive/1122...
...,...,...,...,...,...,...
58,ocn,historical,pop.h,spC_zint_100m,epoch_mean,/glade/scratch/mclong/cesm2-marbl-archive/ec1d...
59,ocn,SSP2-4.5,pop.h,SST,global_mean_timeseries_ann,/glade/scratch/mclong/cesm2-marbl-archive/f585...
60,ocn,historical,pop.h,DOCt,epoch_mean,/glade/scratch/mclong/cesm2-marbl-archive/f730...
61,ocn,historical,pop.h,DOP,epoch_mean,/glade/scratch/mclong/cesm2-marbl-archive/f891...


## Raw data

In [4]:
# Open the intake-esm catalog described by the local JSON collection file.
# `catalog` is the source that the cells below search for raw model output.
esm_collection_json = 'data/campaign-cesm2-cmip6-timeseries.json'
catalog = intake.open_esm_datastore(esm_collection_json)
# Echo the catalog summary (number of unique values per column).
catalog

Unnamed: 0,unique
Unnamed: 0,312503
experiment,13
case,33
component,6
stream,22
variable,2641
date_range,596
member_id,12
path,312503
ctrl_branch_year,18


### Write csv of data used

In [5]:
# Load the dataset specification. It is iterated below as
# {collection_type: {experiment: [variable, ...]}}.
with open('_datasets.yml') as fid:
    datasets = yaml.safe_load(fid)

# Collect one record per raw-data file that the requested datasets depend on.
lines = []
for collection_type, exp_dict in datasets.items():
    for experiment, variable_list in exp_dict.items():
        # a variable listed twice for the same experiment is a specification error
        assert len(variable_list) == len(set(variable_list))

        # restrict the catalog to ocean output of this experiment once,
        # then narrow it down per variable
        cat = catalog.search(
            experiment=experiment,
            variable=variable_list,
            component='ocn',
        )

        for v in variable_list:
            # subset catalog
            cat_sub = cat.search(variable=v)
            keys = list(cat_sub.keys())
            if not keys:
                print(f'missing: {collection_type}, {experiment}, {v}')
                continue

            if len(keys) > 1:
                # ambiguous match: report it and skip rather than guess
                print(f'{v} -- expecting single key, found: {keys}')
                continue

            # record every catalog row (file) backing this variable
            rows = cat_sub.df[['case', 'stream', 'variable', 'path']]
            lines.extend(
                {'collection': collection_type, **row}
                for row in rows.to_dict(orient='records')
            )

# The same file can be reached more than once; keep each record a single time.
df = pd.DataFrame(lines).drop_duplicates(ignore_index=True)
df.to_csv('data/data-dependencies.csv')
df

Unnamed: 0,collection,case,stream,variable,path
0,epoch_mean,b.e21.BHIST.f09_g17.CMIP6-historical.001,pop.h,ABIO_DIC14,/glade/campaign/collections/cmip/CMIP6/timeser...
1,epoch_mean,b.e21.BHIST.f09_g17.CMIP6-historical.002,pop.h,ABIO_DIC14,/glade/campaign/collections/cmip/CMIP6/timeser...
2,epoch_mean,b.e21.BHIST.f09_g17.CMIP6-historical.003,pop.h,ABIO_DIC14,/glade/campaign/collections/cmip/CMIP6/timeser...
3,epoch_mean,b.e21.BHIST.f09_g17.CMIP6-historical.004,pop.h,ABIO_DIC14,/glade/campaign/collections/cmip/CMIP6/timeser...
4,epoch_mean,b.e21.BHIST.f09_g17.CMIP6-historical.005,pop.h,ABIO_DIC14,/glade/campaign/collections/cmip/CMIP6/timeser...
...,...,...,...,...,...
2109,global_integral_timeseries_ann,b.e21.BSSP585cmip6.f09_g17.CMIP6-SSP5-8.5.101,pop.h,POC_FLUX_100m,/glade/campaign/collections/cmip/CMIP6/timeser...
2110,global_integral_timeseries_ann,b.e21.BSSP585cmip6.f09_g17.CMIP6-SSP5-8.5.102,pop.h,POC_FLUX_100m,/glade/campaign/collections/cmip/CMIP6/timeser...
2111,global_integral_timeseries_ann,b.e21.BSSP585cmip6.f09_g17.CMIP6-SSP5-8.5.102,pop.h,POC_FLUX_100m,/glade/campaign/collections/cmip/CMIP6/timeser...
2112,global_integral_timeseries_ann,b.e21.BSSP585cmip6.f09_g17.CMIP6-SSP5-8.5.103,pop.h,POC_FLUX_100m,/glade/campaign/collections/cmip/CMIP6/timeser...


In [6]:
# Per-user scratch directory that receives the subsetted datasets written further down.
data_dir = f'/glade/scratch/{os.environ["USER"]}/cesm2-marbl-data'

# When True, delete any previous output so the run starts from an empty directory.
# WARNING: this is destructive and makes the later `clobber = False` check moot,
# because no output file can pre-exist. Set to False to keep existing output.
reset_data_dir = True
if reset_data_dir:
    check_call(['rm', '-fr', data_dir])
os.makedirs(data_dir, exist_ok=True)

In [7]:
# Read the calculation settings. `safe_load` is used (as for `_datasets.yml`)
# so that the file can only produce plain Python containers and scalars,
# never arbitrary objects.
with open('_config_calc.yml') as fid:
    config_dict = yaml.safe_load(fid)

# Per-collection settings, e.g. the epoch boundaries for each experiment.
_collections = config_dict['data_collections']
_collections

{'epoch_mean': {'experiment': {'historical': ['1990', '2014'],
   'SSP5-RCP8.5': ['2086', '2100'],
   'SSP1-2.6': ['2086', '2100'],
   'SSP2-4.5': ['2086', '2100'],
   'SSP3-7.0': ['2086', '2100'],
   'SSP5-8.5': ['2086', '2100']}},
 'global_mean_timeseries_ann': None,
 'global_integral_timeseries_ann': None}

In [8]:
# Obtain a cluster/client pair from the project helper and request 12 workers.
# NOTE(review): presumably a dask cluster (the rendered output shows a scheduler
# address and a dashboard link) — confirm in `utils.get_ClusterClient`.
cluster, client = utils.get_ClusterClient()
cluster.scale(12) #adapt(minimum_jobs=0, maximum_jobs=24)
# Echo the client so its connection details are displayed.
client

0,1
Client  Scheduler: tcp://10.12.206.63:44527  Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/mclong/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [9]:
# Collections that are written without a time selection (only 'epoch_mean'
# is subset in time further down).
collections_w_full_ts = ['global_integral_timeseries_ann', 'global_mean_timeseries_ann']

# Flatten the 'historical' variable lists of those collections into one list.
complete_records = [
    variable
    for collection_name in collections_w_full_ts
    for variable in datasets[collection_name]['historical']
]
complete_records

['FG_CO2',
 'photoC_TOT_zint_100m',
 'photoC_diat_zint_100m',
 'POC_FLUX_100m',
 'ATM_CO2',
 'TEMP']

In [10]:
# Same layout as `datasets`, with each variable list sorted. For collections
# outside `collections_w_full_ts`, variables already present in
# `complete_records` are dropped.
datasets_trimmed = {
    collection_type: {
        experiment: (
            sorted(variable_list)
            if collection_type in collections_w_full_ts
            else sorted(v for v in variable_list if v not in complete_records)
        )
        for experiment, variable_list in exp_dict.items()
    }
    for collection_type, exp_dict in datasets.items()
}
datasets_trimmed

{'epoch_mean': {'historical': ['ABIO_DIC',
   'ABIO_DIC14',
   'ALK',
   'CFC11',
   'CFC12',
   'CO3',
   'CaCO3_FLUX_100m',
   'DENITRIF',
   'DIA_IMPVF_DOC',
   'DIA_IMPVF_DOCr',
   'DIC',
   'DIC_ALT_CO2',
   'DOC',
   'DOCr',
   'DON',
   'DON_RIV_FLUX',
   'DONr',
   'DONr_RIV_FLUX',
   'DOP',
   'DOPr',
   'Fe',
   'Fe_RIV_FLUX',
   'HDIFB_DOC',
   'HDIFB_DOCr',
   'IRON_FLUX',
   'Jint_100m_DIC',
   'NHx_SURFACE_EMIS',
   'NHy_FLUX',
   'NO3',
   'NO3_RIV_FLUX',
   'NOx_FLUX',
   'PO4',
   'POP_FLUX_100m',
   'SALT',
   'SedDenitrif',
   'SiO2_FLUX_100m',
   'SiO3',
   'WT_DOC',
   'WT_DOCr',
   'co3_sat_arag',
   'co3_sat_calc',
   'diatC',
   'diatChl',
   'diat_Fe_lim_Cweight_avg_100m',
   'diat_N_lim_Cweight_avg_100m',
   'diat_P_lim_Cweight_avg_100m',
   'diat_SiO3_lim_Cweight_avg_100m',
   'diazC',
   'diazChl',
   'diaz_Fe_lim_Cweight_avg_100m',
   'diaz_Nfix',
   'diaz_P_lim_Cweight_avg_100m',
   'dust_REMIN',
   'pfeToSed',
   'photoC_diaz_zint_100m',
   'photoC_sp_zin

## Make datasets with subsetting applied

In [None]:
# Existing output files are skipped unless `clobber` is True.
clobber = False
# Output format: 'zarr' store or 'nc' (netCDF) file.
file_format = 'zarr'

# Keyword arguments handed to `to_dataset_dict(cdf_kwargs=...)`, keyed by stream.
# Only streams listed here can be processed.
cdf_kwargs_stream = {
    'pop.h': dict(chunks={'time': 12}, decode_coords=False),
    'pop.h.ecosys.nyear1': dict(chunks={'time': 1}, decode_coords=False),
}

assert file_format in ['zarr', 'nc']

for collection_type, exp_dict in datasets_trimmed.items():
    for experiment, variable_list in exp_dict.items():

        # restrict the catalog to ocean output of this experiment once,
        # then narrow it down per variable
        cat = catalog.search(
            experiment=experiment,
            variable=variable_list,
            component='ocn',
        )

        for v in variable_list:

            # subset catalog
            cat_sub = cat.search(variable=v)
            keys = list(cat_sub.keys())
            if not keys:
                print(f'missing: {collection_type}, {experiment}, {v}')
                continue

            if len(keys) > 1:
                # ambiguous match: report it and skip rather than guess
                print(f'{v} -- expecting single key, found: {keys}')
                continue

            # get info
            key = keys[0]
            key_info = funnel.core.intake_esm_get_keys_info(cat_sub)[key]
            stream = key_info['stream']

            # default file label: start of the first catalog entry to the end of the last
            # NOTE(review): assumes the catalog rows are in chronological order — confirm
            date_ranges = cat_sub.df.date_range.to_list()
            date_range = '-'.join([date_ranges[0].split('-')[0], date_ranges[-1].split('-')[-1]])

            # set operations: only 'epoch_mean' is subset in time
            sel_dict = dict()
            if collection_type == 'epoch_mean':
                time_range = _collections['epoch_mean']['experiment'][experiment]
                sel_dict = dict(time=slice(time_range[0], time_range[1]))

                # relabel the file with the epoch bounds instead of the full record
                if stream == 'pop.h':
                    # append '01'/'12' to the bounding years (presumably monthly output)
                    date_range = '-'.join([time_range[0]+'01', time_range[-1]+'12'])

                elif stream == 'pop.h.ecosys.nyear1':
                    date_range = '-'.join([time_range[0], time_range[-1]])

                else:
                    raise ValueError(f'{stream} not handled')

            # name this dataset
            name = f"cesm2-cmip6.{experiment}.{stream}.{v}.{date_range}"
            file_out = f'{data_dir}/{name}.{file_format}'

            # to write or not to write? if not, move on
            if os.path.exists(file_out) and not clobber:
                continue

            # fail with the same explicit message as above rather than a bare KeyError
            if stream not in cdf_kwargs_stream:
                raise ValueError(f'{stream} not handled')

            case_list = sorted(set(cat_sub.df.case.to_list()))

            # get the dataset (exactly one key is present, so one dataset is returned)
            dsets = cat_sub.to_dataset_dict(cdf_kwargs=cdf_kwargs_stream[stream])
            _, ds = dsets.popitem()

            # keep track of which cases contributed to this dataset
            ds.attrs['cases'] = ','.join(case_list)

            # center the time axis
            ds = ops.center_time(ds)

            # apply operations: select the epoch and keep only the requested variable
            if sel_dict:
                ds = ds.sel(**sel_dict)[[v]]

            # write
            if file_format == 'zarr':
                ds.to_zarr(file_out, mode='w', consolidated=True)
            else:
                ds.to_netcdf(file_out, unlimited_dims=['time'])


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream'


In [None]:
# data_dir = f'/glade/scratch/{os.environ["USER"]}/cesm2-marbl-data'
# os.makedirs(data_dir, exist_ok=True)

# data_vol_GB = {}
# for key, paths in paths.items():
#     data_vol_GB[key] = 0.
#     for path in paths[key]:
#         data_vol_GB[key] += os.stat(path).st_size / 1024**3
#         check_call(['ln', '-sf', path, data_dir+'/.'])
        
#     print(f'Data volume {key}: {data_vol_GB[key]:0.2f} GB')    

In [None]:
# Leftover inspection cell: echoes `v`, the name last bound by the
# `for v in variable_list` loops in the cells above. It raises NameError if
# those loops have not bound it.
v
