In [34]:
from getpass import getuser # Look up the login name of the current user
from pathlib import Path # Object-oriented library to deal with paths
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary files/dirs
from subprocess import run, PIPE
import sys

import dask # Distributed data library
import numpy as np
from dask_jobqueue import SLURMCluster # Setting up distributed memory via Slurm
from distributed import Client, progress, wait # Library to orchestrate distributed resources
import xarray as xr # Library to work with labeled n-dimensional data and dask

In [35]:
import warnings
warnings.filterwarnings(action='ignore')

In [36]:
# Locations used throughout the notebook.
home = Path(os.path.expanduser('~'))
work_dir = Path('/work/mh0731/m300414/')
gridfile = Path('/pool/data/ICON/grids/public/mpim/0015/icon_grid_0015_R02B09_G.nc')
input_path = work_dir / 'DyWinter_b9'


def day_from_path(path):
    '''Return the YYYYMMDD stamp encoded in a file path, or None.

    The stamp is read from the fixed position [-16:-8] of the path string,
    i.e. the eight characters that precede the last eight characters of
    the file name.  Paths whose characters at that position are not eight
    digits yield None.  This keeps files that do not follow the naming
    scheme -- e.g. the merged ``pr_<day>.nc`` files that the merge cell
    writes below ``input_path / 'Daily'`` -- from injecting bogus "days"
    when the notebook is run again.
    '''
    stamp = str(path)[-16:-8]
    if len(stamp) == 8 and stamp.isdigit():
        return stamp
    return None


# One entry per netCDF file found (recursively) below input_path.
days_from_files = [
    day
    for day in map(day_from_path, input_path.rglob('*.nc'))
    if day is not None
]

In [37]:
# Reduce the per-file day stamps to a sorted array of distinct days.
unique_days = np.unique(days_from_files)
unique_days

array(['20200120', '20200121', '20200122', '20200123', '20200124',
       '20200125', '20200126', '20200127', '20200128', '20200129',
       '20200130', '20200131', '20200201', '20200202', '20200203',
       '20200204', '20200205', '20200206', '20200207', '20200208',
       '20200209', '20200210', '20200211', '20200212', '20200213',
       '20200214', '20200215', '20200216', '20200217', '20200218',
       '20200219', '20200220', '20200221', '20200222', '20200223',
       '20200224', '20200225', '20200226', '20200227', '20200228',
       '20200229', '20200301'], dtype='<U8')

In [40]:
# Set some user specific variables
user = getuser()
scratch_dir = Path('/scratch') / user[0] / user # The user's scratch dir: /scratch/<first letter>/<user>
# Create a temp directory where the output of the distributed cluster will be
# written to. The directory is removed again once the TemporaryDirectory
# object is cleaned up (e.g. when the notebook kernel is shut down).
dask_tmp_dir = TemporaryDirectory(dir=scratch_dir, prefix='PostProc')
cluster = SLURMCluster(memory='500GiB',
                       cores=72,
                       project='mh0731',
                       walltime='1:00:00',
                       queue='gpu',
                       name='PostProc',
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory=dask_tmp_dir.name,
                       # Extra directives for the Slurm job script. No
                       # separate --error file is requested, so Slurm is
                       # expected to write stderr to the --output file too.
                       job_extra=['-J PostProc',
                                  f'-D {dask_tmp_dir.name}',
                                  '--begin=now',
                                  f'--output={dask_tmp_dir.name}/LOG_cluster.%j.o',
                                 ],
                       interface='ib0')
# Request two Slurm jobs and block until the workers have registered.
cluster.scale(jobs=2)
dask_client = Client(cluster)
# NOTE(review): 18 presumably corresponds to 2 jobs x 9 worker processes per
# job -- confirm against the dask_jobqueue default for `processes`.
dask_client.wait_for_workers(18)

In [41]:
@dask.delayed
def run_cmd(cmd, path_extra=Path(sys.exec_prefix)/'bin'):
    '''Run an external command and return its standard output.

    The command is executed directly (no shell is involved), with
    ``path_extra`` prepended to the ``PATH`` of the child process.

    Parameters
    ----------
    cmd : sequence
        Program name followed by its arguments.
    path_extra : path-like
        Directory that is searched first for the program; defaults to the
        ``bin`` directory of the running Python installation.

    Returns
    -------
    str
        The decoded standard output of the command.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero return code; the message
        contains the command line and the captured standard error.
    '''
    env_extra = os.environ.copy()
    search_path = [str(path_extra)]
    # PATH may be unset or empty; only append it when there is something to keep.
    if env_extra.get('PATH'):
        search_path.append(env_extra['PATH'])
    env_extra['PATH'] = os.pathsep.join(search_path)
    status = run(cmd, check=False, stderr=PIPE, stdout=PIPE, env=env_extra)
    if status.returncode != 0:
        # Decode leniently so that odd bytes in the error output cannot mask
        # the actual failure with a UnicodeDecodeError.
        stderr_text = status.stderr.decode('utf-8', errors='replace')
        raise RuntimeError(f"{' '.join(map(str, cmd))}: {stderr_text}")
    return status.stdout.decode('utf-8')

In [46]:
# Build one delayed `cdo mergetime` task per day; nothing is executed yet.
run_futures = []
daily_dir = work_dir / 'DyWinter_b9' / 'Daily'
# Make sure the output directory exists before any task tries to write to it.
daily_dir.mkdir(parents=True, exist_ok=True)
for day in unique_days:
    print(day)
    outfile = daily_dir / f'pr_{day}.nc'
    # Results of an earlier run live below daily_dir and match the same
    # pattern; leave them out so an output file is never merged into itself.
    # Sorting makes the command line reproducible between runs.
    infiles = sorted(
        str(f)
        for f in input_path.rglob(f'pr_{day}*.nc')
        if daily_dir not in f.parents
    )
    if not infiles:
        # Without inputs the cdo call could only fail, so skip this day.
        print(f'  no input files found for {day}, skipping')
        continue
    command = ('cdo', 'mergetime', *infiles, f'{outfile}')
    run_futures.append(run_cmd(command))

20200120
20200121
20200122
20200123
20200124
20200125
20200126
20200127
20200128
20200129
20200130
20200131
20200201
20200202
20200203
20200204
20200205
20200206
20200207
20200208
20200209
20200210
20200211
20200212
20200213
20200214
20200215
20200216
20200217
20200218
20200219
20200220
20200221
20200222
20200223
20200224
20200225
20200226
20200227
20200228
20200229
20200301


In [47]:
# Hand all delayed merge tasks to dask for execution. With the distributed
# Client created earlier in this notebook, this presumably submits the work
# to the SLURM workers without blocking -- TODO confirm.
# NOTE(review): the task results are never gathered here, so a RuntimeError
# raised inside run_cmd will not surface in this cell; inspect the futures
# (or compute them) to see failures.
run_jobs = dask.persist(run_futures)
# notebook=False requests the plain-text progress bar instead of a widget.
progress(run_jobs, notebook=False)

[                                        ] | 2% Completed |  3min 14.7s