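# Build the Catalog for the Pinatubo Files

This notebook uses `ecgtools` to catalog the CESM timeseries output under three directories (`cesmLE_no_pinatubo`, `cesmLE_cheyenne`, and `CESM-CAM5-BGC-LE`) for use with intake-esm, and loads the Pinatubo catalog with `intake` as a check.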

In [1]:
from ecgtools import Builder
from ecgtools.parsers.cesm import parse_cesm_timeseries

## Set Up the Builder

In [3]:
# Root directory handed to the Builder, and the glob patterns it should skip.
root_dir = "/glade/campaign/univ/udeo0005/cesmLE_no_pinatubo/"
skip_patterns = ["*/hist/*", "*/rest/*"]

# njobs=-1 shows up as n_jobs=-1 in the joblib log printed by the build step.
b = Builder(root_dir, depth=5, exclude_patterns=skip_patterns, njobs=-1)

## Build the Catalog

In [4]:
# Run the build using the CESM timeseries parser imported at the top of the
# notebook. The call is the last expression of the cell, so its return value
# (displayed as a Builder repr) becomes the cell output.
b.build(parse_cesm_timeseries)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 738 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 1314 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 2034 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 2448 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 2898 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 3384 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: D

Builder(root_path=PosixPath('/glade/campaign/univ/udeo0005/cesmLE_no_pinatubo'), extension='.nc', depth=5, exclude_patterns=['*/hist/*', '*/rest/*'], njobs=-1)

## Save the Catalog

In [8]:
# Aggregation spec used by intake when reading the data through xarray:
# a "join_existing" over the `time_range` attribute along the `time` dim.
time_join = {
    "type": "join_existing",
    "attribute_name": "time_range",
    "options": {"dim": "time", "coords": "minimal", "compat": "override"},
}

b.save(
    'data/pinatubo-LE.csv',
    path_column_name='path',  # column that holds the file path
    variable_column_name='variable',  # column that holds the variable names
    data_format="netcdf",  # netcdf or zarr; these files are netcdf
    # attributes intake-esm groups by when reading in variables
    groupby_attrs=["component", "stream", "case"],
    aggregations=[time_join],
)

Saved catalog location: data/pinatubo-LE.json and data/pinatubo-LE.csv




## Test the Catalog

In [9]:
import intake

In [10]:
# Open the catalog JSON with intake-esm.
# NOTE(review): this path is under /glade/work/mgrover/..., whereas the save
# step in this notebook reports writing data/pinatubo-LE.json. Confirm that
# the file opened here is the catalog that was just built (e.g. a copy of it).
col = intake.open_esm_datastore('/glade/work/mgrover/intake-esm-catalogs/pinatubo-LE.json')

In [11]:
# List the distinct values of the catalog's `stream` column.
col.df["stream"].unique()

array(['cam.h1', 'cam.h0', 'pop.h.ecosys.nday1', 'pop.h.nday1',
       'pop.h.ecosys.nyear1', 'pop.h', 'clm2.h1', 'clm2.h0', 'rtm.h1',
       'rtm.h0', 'cice.h1', 'cice.h'], dtype=object)

In [12]:
# Narrow the catalog to entries with variable 'TEMP' and frequency 'month_1'.
temp_query = {"variable": "TEMP", "frequency": "month_1"}
cat = col.search(**temp_query)

In [13]:
# Load the search results into a dictionary of datasets. Per the message
# printed by this call, the keys have the form 'component.stream.case',
# matching the groupby_attrs used when the catalog was saved.
dsets = cat.to_dataset_dict()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.stream.case'


In [14]:
# Show the dataset keys in sorted order (iterating a dict yields its keys).
sorted(dsets)

['ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.001',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.002',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.009',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.010',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.011',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.012',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.013',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.014',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.015',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.016',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.017',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.018',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.019',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.020',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.021',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.022',
 'ocn.pop.h.b.e11.B20TRC5CNBDRD_no_pinatubo.f09_g16.023',
 'ocn.pop.h.b.

---

## Re-run of original forcing on Cheyenne

In [23]:
# Root directory handed to the Builder, and the glob patterns it should skip.
root_dir = "/glade/campaign/univ/udeo0005/cesmLE_cheyenne"
skip_patterns = ["*/hist/*", "*/rest/*", "*/restarts/*"]

# njobs=-1 shows up as n_jobs=-1 in the joblib log printed by the build step.
b = Builder(root_dir, depth=5, exclude_patterns=skip_patterns, njobs=-1)

In [24]:
# Run the build for the Cheyenne re-run directory using the CESM timeseries
# parser; the returned Builder is displayed as the cell output.
b.build(parse_cesm_timeseries)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 738 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1314 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 2034 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 2448 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 2898 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 3384 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: D

Builder(root_path=PosixPath('/glade/campaign/univ/udeo0005/cesmLE_cheyenne'), extension='.nc', depth=5, exclude_patterns=['*/hist/*', '*/rest/*', '*/restarts/*'], njobs=-1)

In [26]:
# Aggregation spec used by intake when reading the data through xarray:
# a "join_existing" over the `time_range` attribute along the `time` dim.
time_join = {
    "type": "join_existing",
    "attribute_name": "time_range",
    "options": {"dim": "time", "coords": "minimal", "compat": "override"},
}

b.save(
    'data/cesmLE-cheyenne.csv',
    path_column_name='path',  # column that holds the file path
    variable_column_name='variable',  # column that holds the variable names
    data_format="netcdf",  # netcdf or zarr; these files are netcdf
    # attributes intake-esm groups by when reading in variables
    groupby_attrs=["component", "stream", "case"],
    aggregations=[time_join],
)

Saved catalog location: data/cesmLE-cheyenne.json and data/cesmLE-cheyenne.csv


---
## Rebuild catalog for `CESM-CAM5-BGC-LE`

In [27]:
# Root directory handed to the Builder, and the glob patterns it should skip.
root_dir = "/glade/campaign/cesm/collections/cesmLE/CESM-CAM5-BGC-LE"
skip_patterns = ["*/hist/*", "*/rest/*", "*/restarts/*"]

# Same depth and njobs settings as the other Builder cells in this notebook.
b = Builder(root_dir, depth=5, exclude_patterns=skip_patterns, njobs=-1)

In [None]:
# Run the build for CESM-CAM5-BGC-LE using the CESM timeseries parser.
# NOTE(review): this cell has no execution count in the saved notebook, and
# no save step follows it, so this catalog does not appear to have been
# written yet.
b.build(parse_cesm_timeseries)