In [1]:
from getpass import getuser # Look up the login name of the current user
from pathlib import Path # Object-oriented library to deal with paths
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary Files/Dirs
from subprocess import run, PIPE
import sys
 
import dask # Distributed data library
from dask_jobqueue import SLURMCluster # Setting up distributed memories via slurm
from distributed import Client, progress, wait # Library to orchestrate distributed resources
import xarray as xr # Library to work with labeled n-dimensional data and dask

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Set some user specific variables
user_name = getuser()
# The user's scratch dir has the layout /scratch/<first letter of login>/<login>
scratch_dir = Path('/scratch') / user_name[0] / user_name

# Create a temp directory where the output of the distributed cluster will be
# written to. The directory is owned by the TemporaryDirectory object and is
# removed when that object is cleaned up (e.g. when the kernel shuts down).
dask_tmp_dir = TemporaryDirectory(dir=scratch_dir, prefix='PostProc')

# Extra directives handed to the SLURM batch job. The log file directive used
# to be listed twice with the identical value; a single entry is equivalent.
# Only entries that interpolate a value are f-strings.
slurm_directives = [
    '-J PostProc',
    f'-D {dask_tmp_dir.name}',
    '--begin=now',
    f'--output={dask_tmp_dir.name}/LOG_cluster.%j.o',
]

cluster = SLURMCluster(memory='500GiB',
                       cores=72,
                       project='mh0731',
                       walltime='1:00:00',
                       queue='gpu',
                       name='PostProc',
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory=dask_tmp_dir.name,
                       job_extra=slurm_directives,
                       interface='ib0')
# Request two batch jobs and connect a client to the resulting cluster.
cluster.scale(jobs=2)
dask_client = Client(cluster)
# Block until 18 workers have connected.
# NOTE(review): no timeout is given, so this waits indefinitely if the two jobs
# start fewer than 18 worker processes -- confirm the number against the
# worker-process configuration of the cluster.
dask_client.wait_for_workers(18)

# Directory holding the model output and the filename fragment to select.
data_path = Path('/work/mh0287/k203123/GIT/icon-aes-dyw/experiments/dpp0014')
glob_pattern = 'atm2_'
# rglob searches recursively, so matching files in subdirectories of data_path
# are included as well. According to the original note a subdirectory holds
# additional matching files that are not wanted, and dropping the first element
# (`[1:]`, currently disabled) was the workaround -- TODO confirm whether that
# slice needs to be re-enabled.
matching_paths = data_path.rglob(f'*{glob_pattern}*.nc')
data_files = sorted(map(str, matching_paths))

In [4]:
@dask.delayed
def run_cmd(cmd, path_extra=Path(sys.exec_prefix)/'bin'):
    '''Run an external command and return its standard output.

    The function is wrapped in ``dask.delayed``: calling ``run_cmd(...)`` only
    creates a lazy task, the command itself is executed when that task is
    computed.

    Parameters
    ----------
    cmd : sequence
        Program name followed by its arguments. It is passed to
        ``subprocess.run`` as an argument list (no shell is involved).
    path_extra : path-like, optional
        Directory that is put in front of ``PATH`` for the child process.
        The default is the ``bin`` directory below ``sys.exec_prefix``; it is
        evaluated once, when the function is defined.

    Returns
    -------
    str
        The captured standard output of the command, decoded as UTF-8.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero return code. The message
        contains the command line followed by the captured standard error.
    '''
    env_extra = os.environ.copy()
    # PATH can be absent in a stripped-down environment; in that case the
    # extra directory becomes the whole search path instead of a KeyError.
    inherited_path = env_extra.get('PATH')
    if inherited_path is None:
        env_extra['PATH'] = str(path_extra)
    else:
        env_extra['PATH'] = str(path_extra) + os.pathsep + inherited_path
    status = run(cmd, check=False, stderr=PIPE, stdout=PIPE, env=env_extra)
    if status.returncode != 0:
        # Decode leniently and stringify every argument so that building the
        # error message can never mask the actual failure of the command.
        stderr_text = status.stderr.decode('utf-8', errors='replace')
        cmd_text = ' '.join(str(part) for part in cmd)
        raise RuntimeError(f'{cmd_text}: {stderr_text}')
    return status.stdout.decode('utf-8', errors='replace')

In [5]:
# Home directory of the user running the notebook ('~' expanded).
home = Path(os.path.expanduser('~'))
# Base directory below which the post-processed files are stored.
work_dir = Path('/work/mh0731/m300414/')
# Grid description file handed to cdo's `-setgrid` operator further down.
gridfile = Path('/pool/data/ICON/grids/public/mpim/0015/icon_grid_0015_R02B09_G.nc')

In [6]:
# Directory that is searched (recursively) for output files that already exist.
# NOTE(review): the processing loop further down writes its results to
# 'DyWinter_again2', not to this directory -- presumably intentional, confirm.
output_path = Path('/work/mh0731/m300414/Data/DyWinter_again/')
existing_output = list(map(str, output_path.rglob('*.nc')))

In [7]:
# Sanity check: number of output files that were found in output_path.
len(existing_output)

3872

In [8]:
# Build one lazy cdo task per input file that has no matching output file yet.
run_futures = []
for infile in data_files:
    # Characters [-19:-6] of the input path (e.g. '20200120T1030') identify the
    # file; an input counts as done when that key occurs in any existing path.
    stamp = infile[-19:-6]
    already_done = any(stamp in existing_file for existing_file in existing_output)
    if already_done:
        continue

    print(stamp)
    outfile = work_dir / 'Data' / 'DyWinter_again2' / f'pr_{stamp}.nc'
    # Alternative, smaller region that was used before:
    # command = ('cdo', 'sellonlatbox,129,133,-14,-10', f'-setgrid,{gridfile}', '-select,name=pr', f'{infile}', f'{outfile}')
    command = (
        'cdo',
        'sellonlatbox,-180,180,-20,20',
        f'-setgrid,{gridfile}',
        '-select,name=pr',
        f'{infile}',
        f'{outfile}',
    )
    # run_cmd is delayed, so this only records the task; nothing runs yet.
    run_futures.append(run_cmd(command))

20200120T1030
20200120T1330
20200122T1730
20200122T1930
20200123T0145
20200123T1115
20200123T1800
20200124T0230
20200126T0530
20200126T0615
20200126T1000
20200127T0345
20200127T0945
20200127T1500
20200127T1845
20200127T2130
20200130T1230
20200130T2015
20200131T0115
20200201T2245
20200202T1400
20200202T2015
20200203T0330
20200203T1145
20200203T2230
20200204T0300
20200205T0400
20200205T1515
20200205T1930
20200206T0830
20200206T1800
20200207T0300
20200207T1115
20200207T1830
20200208T0300
20200208T0545
20200208T0730
20200208T1045
20200208T1545
20200209T0015
20200209T2215
20200209T2345
20200210T0400
20200210T2045
20200211T0700
20200212T1545
20200212T1645
20200212T2015
20200214T1200
20200214T1530
20200214T1945
20200214T2215
20200215T0415
20200216T1315
20200216T1615
20200216T2315
20200217T1245
20200217T1300
20200217T1915
20200217T2045
20200218T0515
20200218T2045
20200219T1515
20200220T1130
20200221T0730
20200221T0845
20200221T1700
20200221T2215
20200222T1045
20200222T1800
20200223T0915
202002

In [9]:
# Hand the delayed cdo tasks to dask for execution. The list is passed as one
# single argument, so run_jobs is a one-element tuple wrapping the list of
# persisted tasks.
run_jobs = dask.persist(run_futures)
# Show a text progress bar (notebook=False) for the submitted tasks.
# NOTE(review): the task results are never gathered in this notebook, so a
# RuntimeError raised inside run_cmd is presumably not surfaced here -- confirm
# and consider checking the task statuses once the bar has finished.
progress(run_jobs, notebook=False)

[########################################] | 100% Completed |  1min 18.3s

In [10]:
# Scratch expression, unrelated to the processing above -- presumably a kernel
# liveness check; can be removed.
1+1

2

2+1

In [11]:
# Scratch expression, unrelated to the processing above -- presumably a kernel
# liveness check; can be removed.
2+1

3