# 00 - Build an Intake-ESM Catalog

In [1]:
import yaml

from ecgtools import Builder
from ecgtools.parsers.cesm import parse_cesm_timeseries
import pandas as pd

In [2]:
# Read the case configuration; a later cell takes the catalog paths from its
# 'catalog_gen_path' entry.
# The encoding is pinned so the file is decoded the same way on every
# platform (open() otherwise falls back to the locale's default encoding).
with open("case_config.yml", mode="r", encoding="utf-8") as fptr:
    case_config = yaml.safe_load(fptr)

In [3]:
def build_catalog(path):
    """Build an ecgtools catalog for one case output directory.

    Parameters
    ----------
    path
        Directory containing the case output; handed to ``Builder`` as its
        first positional argument.

    Returns
    -------
    object
        Whatever ``Builder.build(parse_cesm_timeseries)`` returns; the
        notebook reads its ``.df`` attribute.
    """
    builder_options = {
        # Search one level deep, since `path` is already the case output directory
        "depth": 1,
        # Skip anything that sits under a hist/ or rest/ directory
        "exclude_patterns": ["*/hist/*", "*/rest/*"],
        # Parallel job count; -1 appears as joblib's n_jobs=-1 in the build log
        "njobs": -1,
    }
    catalog_builder = Builder(path, **builder_options)

    # Each discovered file is parsed with the CESM timeseries parser
    return catalog_builder.build(parse_cesm_timeseries)

In [4]:
# Case output directories to catalog, taken from the 'catalog_gen_path' config entry
cases = case_config["catalog_gen_path"]
# A single path written as a plain YAML string would otherwise be iterated
# character by character by the build loop; wrap it in a list instead.
if isinstance(cases, str):
    cases = [cases]

In [5]:
# Build one catalog per case and collect each builder's DataFrame.
df_list = []
for case in cases:
    # NOTE: `b` stays bound after the loop; later cells reuse the builder
    # from the final iteration to hold and save the combined catalog.
    b = build_catalog(case)
    df_list.append(b.df)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    0.7s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 878 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 1148 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 

In [6]:
# Merge the per-case tables into one catalog and store it on `b`, the builder
# left over from the last loop iteration. ignore_index=True renumbers the rows
# so row labels from the individual tables cannot collide.
b.df = pd.concat(df_list, ignore_index=True)

In [8]:
# Write the combined catalog to disk (the CSV plus its companion JSON file).
b.save(
    "../data/hires_catalog.csv",
    # Catalog column that stores each file's path
    path_column_name="path",
    # Catalog column that stores the variable name(s)
    variable_column_name="variable",
    # Format of the data files - netcdf or zarr (netcdf here)
    data_format="netcdf",
    # Attributes intake-esm groups by when loading variables
    groupby_attrs=["component", "stream", "case"],
    # Aggregation rules passed on to xarray when intake reads the data
    aggregations=[
        dict(
            type="join_existing",
            attribute_name="time_range",
            options=dict(dim="time", coords="minimal", compat="override"),
        )
    ],
)

Saved catalog location: ../data/hires_catalog.json and ../data/hires_catalog.csv


  b.save(
