In [1]:
# Approach found in the history of the only netCDF file of the b10 simulation (newest command first):
# Thu Apr 02 03:49:15 2020: cdo mergetime /work/mh1119/DYAMOND_winter/sst_sic_final.nc* /work/mh1119/DYAMOND_winter/sst-sic-runmean_0017_R02B10.nc
# Thu Apr 02 01:24:21 2020: cdo setmisstonn /work/mh1119/DYAMOND_winter/sst_sic_missing.nc.2020-01-13 /work/mh1119/DYAMOND_winter/sst_sic_final.nc2020-01-13
# Thu Apr 02 01:01:48 2020: cdo seldate,2020-01-13 /work/mh1119/DYAMOND_winter/sst_sic_missing.nc /work/mh1119/DYAMOND_winter/sst_sic_missing.nc.2020-01-13
# Thu Apr 02 00:55:45 2020: cdo -P 8 -f nc4 remapnn,/home/dkrz/k203123/experiments/input/2.5km/icon_grid_0017_R02B10_G.nc -setpartabn,grib2cf.tab /work/mh1119/DYAMOND_winter/dyamond-winter-sst-sic-runmean.grb /work/mh1119/DYAMOND_winter/sst_sic_missing.nc

# Alternative approach that works as well:
# cdo -P 8 -f nc4 selvar,tp -sellonlatbox,-180,180,-20,20 -setgrid,/pool/data/ICON/grids/public/mpim/0017/icon_grid_0017_R02B10_G.nc /work/bk1040/experiments/DYAMOND_winter/nwp_R2B10_lkm1007_atm2_2d_ml_20200130T000000Z.grb tp_20200130_tropics.nc

In [2]:
from getpass import getuser # Libary to copy things
from pathlib import Path    # Object oriented libary to deal with paths
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary Files/Dirs
from subprocess import run, PIPE
import sys
 
import dask # Distributed data libary
from dask_jobqueue import SLURMCluster # Setting up distributed memories via slurm
from distributed import Client, progress, wait # Libaray to orchestrate distributed resources
import xarray as xr # Libary to work with labeled n-dimensional data and dask

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# User-specific settings
user_name = getuser()
# The scratch directory is built as /scratch/<first letter of user name>/<user name>
scratch_dir = Path('/scratch') / user_name[0] / user_name

# Temporary directory that receives the worker-local files and the Slurm log
# files of the distributed cluster. It is deleted again when `dask_tmp_dir` is
# cleaned up, e.g. when the kernel of this notebook exits.
dask_tmp_dir = TemporaryDirectory(dir=scratch_dir, prefix='grib2nc_')

cluster = SLURMCluster(memory='500GiB',
                       cores=72,
                       project='mh0731',
                       walltime='0:45:00',
                       queue='gpu',
                       name='grib2nc',
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory=dask_tmp_dir.name,
                       # Extra sbatch directives: job name, working directory,
                       # immediate start and the log file (%j = Slurm job id).
                       # Slurm sends stderr to the same file unless --error is given,
                       # so a single --output directive is sufficient.
                       job_extra=['-J grb2nc',
                                  f'-D {dask_tmp_dir.name}',
                                  '--begin=now',
                                  f'--output={dask_tmp_dir.name}/LOG_cluster.%j.o',
                                 ],
                       interface='ib0')

# Request two Slurm jobs and block until the workers have connected.
cluster.scale(jobs=2)
dask_client = Client(cluster)
# NOTE(review): 18 is presumably 9 worker processes per job x 2 jobs - confirm
# against the process layout dask_jobqueue derives from cores=72.
dask_client.wait_for_workers(18)

In [4]:
# Directory with the GRIB output of the simulation and the substring that
# identifies the files of interest.
data_path = Path('/work/bk1040/experiments/DYAMOND_winter/')
glob_pattern = 'omega_3d_pl'

# Collect every matching GRIB file below data_path (recursively), sorted by path.
# The first two entries are dropped on purpose: according to the original note
# they come from a sub-directory holding further matching files that must not
# be processed.
matching_files = sorted(map(str, data_path.rglob(f'*{glob_pattern}*.grb')))
data_files = matching_files[2:]

In [5]:
# Sanity check: number of GRIB files selected for conversion
len(data_files)


42

In [6]:
@dask.delayed
def run_cmd(cmd, path_extra=Path(sys.exec_prefix)/'bin'):
    '''Run an external command and return its standard output.

    The function is wrapped in ``dask.delayed``: calling it only creates a lazy
    task, the command itself runs once that task is computed.

    Parameters
    ----------
    cmd : sequence of str or path-like
        Program name followed by its arguments; handed to ``subprocess.run``
        without a shell.
    path_extra : path-like, optional
        Directory that is put in front of ``PATH`` for the child process.
        Defaults to the ``bin`` directory below ``sys.exec_prefix``.

    Returns
    -------
    str
        Standard output of the command, decoded as UTF-8.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero return code. The message consists
        of the command line followed by the decoded standard error.
    '''
    env_extra = os.environ.copy()
    # Prepend path_extra so that its executables take precedence. An environment
    # without PATH is tolerated instead of failing with a KeyError.
    inherited_path = env_extra.get('PATH', '')
    env_extra['PATH'] = os.pathsep.join(
        entry for entry in (str(path_extra), inherited_path) if entry
    )

    status = run(cmd, check=False, stderr=PIPE, stdout=PIPE, env=env_extra)

    if status.returncode != 0:
        # A plain string is reported verbatim; otherwise every argument is
        # stringified so non-str entries (e.g. Path objects) cannot break the report.
        cmd_line = cmd if isinstance(cmd, str) else ' '.join(map(str, cmd))
        # errors='replace': undecodable bytes must not hide the actual failure.
        stderr_text = status.stderr.decode('utf-8', errors='replace')
        raise RuntimeError(f'{cmd_line}: {stderr_text}')
    return status.stdout.decode('utf-8')

In [7]:
# Frequently used locations: the user's home directory, the work directory
# below which the converted files are written, and the grid file handed to
# cdo's setgrid operator.
home = Path(os.path.expanduser('~'))
work_dir = Path('/work') / 'mh0731' / 'm300414'
gridfile = Path('/pool/data/ICON/grids/public/mpim/0017') / 'icon_grid_0017_R02B10_G.nc'

In [8]:
# Build one lazy cdo task per input file; nothing is executed in this cell.

# Target directory of the extracted files. It is created up front because the
# cdo commands below write straight into it and would fail if it were missing.
out_dir = work_dir / 'DyWinter_b10' / 'Tropics_fromGrib'
out_dir.mkdir(parents=True, exist_ok=True)

run_futures = []
# Only the files from index 28 onwards are handled here.
# NOTE(review): presumably the first 28 files were converted in an earlier run - confirm.
for infile in data_files[28:]:
    # Time stamp cut from the end of the file name; the printed values look like
    # 20200217T0000. Assumes names end in <YYYYMMDD>T<HHMMSS>Z.grb - TODO confirm.
    date = infile[-20:-7]
    print(date)

    outfile = out_dir / f'omega500_{date}_tropics.nc'
    # Chained cdo operators are applied from right to left: attach the grid,
    # cut out the latitude band -20..20, select variable w and keep level 50000
    # (presumably Pa, i.e. 500 hPa as in the output file name - confirm).
    command = (
        'cdo', '-P', '8', '-f', 'nc4',
        'sellevel,50000',
        '-selvar,w',
        '-sellonlatbox,-180,180,-20,20',
        f'-setgrid,{gridfile}',
        f'{infile}',
        f'{outfile}',
    )

    run_futures.append(run_cmd(command))

20200217T0000
20200218T0000
20200219T0000
20200220T0000
20200221T0000
20200222T0000
20200223T0000
20200224T0000
20200225T0000
20200226T0000
20200227T0000
20200228T0000
20200229T0000
20200301T0000


In [9]:
# Start all queued tasks on the cluster and keep their results in distributed memory.
run_jobs = dask.persist(run_futures)
# Text progress bar that tracks the tasks until they have finished.
progress(run_jobs, notebook=False)
# Collect the (small) stdout strings of the commands. This re-raises the
# RuntimeError of any failed command, which persist/progress alone would leave
# unnoticed. Assigned to a name so the cell does not echo the result.
cmd_outputs = dask.compute(*run_jobs)

[########################################] | 100% Completed | 40min  5.2s

In [10]:
# Visual marker that execution has reached the end of the notebook
print('done.')

done.
