In [6]:
from getpass import getuser # Look up the login name of the current user
from pathlib import Path # Object-oriented library to deal with paths
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary Files/Dirs
from subprocess import run, PIPE
import sys
 
import dask # Distributed data library
from dask_jobqueue import SLURMCluster # Setting up distributed memory via Slurm
from distributed import Client, progress, wait # Library to orchestrate distributed resources
import xarray as xr # Library to work with labeled n-dimensional data and dask
import numpy as np
import matplotlib.pyplot as plt

In [7]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [8]:
# Set some user specific variables
user_name = getuser()
# The user's scratch dir: /scratch/<first letter of login name>/<login name>
scratch_dir = Path('/scratch') / user_name[0] / user_name

# Create a temp directory where the output of the distributed cluster will be written to.
# The directory is removed again when the TemporaryDirectory object is cleaned up
# (at the latest when the notebook kernel shuts down).
dask_tmp_dir = TemporaryDirectory(dir=scratch_dir, prefix='threshold_')
cluster = SLURMCluster(memory='500GiB',
                       cores=72,
                       project='mh0731',
                       walltime='00:25:00',
                       queue='gpu',
                       name='threshold',
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory=dask_tmp_dir.name,
                       # Extra sbatch directives: job name, working directory,
                       # start time and log file. The log file directive used to
                       # be listed twice with the same value; once is enough.
                       job_extra=['-J thrshld',
                                  f'-D {dask_tmp_dir.name}',
                                  '--begin=now',
                                  f'--output={dask_tmp_dir.name}/LOG_cluster.%j.o',
                                 ],
                       interface='ib0')

cluster.scale(jobs=1) # requests whole nodes
dask_client = Client(cluster)
# Block until the workers have registered with the scheduler.
dask_client.wait_for_workers(9) # gpu-partition has 9 workers per node

In [9]:
# Directory that is searched (recursively) for the precipitation input files
data_path = Path('/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/')
glob_pattern_2d = 'pr_*[0-9]_reggrid.nc'

# Collect all matching file names with pathlib's rglob and sort them.
# The first ten entries are skipped (spin-up days); the slice also leaves out
# the very last entry.
file_names = sorted(map(str, data_path.rglob(glob_pattern_2d)))[10:-1]

In [10]:
# Display the selected input files to check the glob pattern and the slice.
file_names

['/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200131T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200201T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200202T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200203T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200204T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200205T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200206T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200207T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200208T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200209T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200210T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/pr_20200211T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWin

In [11]:
@dask.delayed
def get_convective_rain(file, rain_threshold):
    """Write a boolean convective-rain mask for one precipitation file.

    Reads the ``pr`` variable from ``file``, flags every value strictly
    greater than ``rain_threshold`` and writes the resulting boolean field as
    a netCDF file into the ``Fake_Steiner`` directory. Because the function is
    wrapped in ``dask.delayed``, calling it only creates a lazy task; the work
    is done when that task is computed or persisted.

    Parameters
    ----------
    file : str
        Path of the input netCDF file. The characters ``file[-24:-11]`` are
        reused in the output file name; for the ``pr_<stamp>_reggrid.nc``
        inputs used in this notebook that is the time stamp.
    rain_threshold : float
        Threshold compared against ``pr``. Multiplied by 3600 it provides the
        ``mmhour`` label of the output file name.

    Returns
    -------
    None
        The result is the file written to disk.
    """
    # Open the dataset as a context manager so the input file handle is
    # released as soon as the mask is written instead of staying open on the
    # worker.
    with xr.open_dataset(file) as dataset:
        mask = dataset['pr'] > rain_threshold
        mask.name = 'conv_rain_class'
        mask.attrs['long_name'] = 'Boolean for convective precipitation'

        outfile = Path('/work/mh0731/m300414/DyWinter_b10/Fake_Steiner/') / f'bool_{file[-24:-11]}_{rain_threshold*3600:.0f}mmhour.nc'
        mask.to_netcdf(outfile)

Bring data into distributed memory via persist()

In [12]:
def run_data(rain_threshold_in_mm):
    """Start the mask computation for every input file on the cluster.

    Builds one lazy ``get_convective_rain`` task per entry of the module-level
    ``file_names`` list, submits all of them with ``dask.persist`` and shows a
    text progress bar for the submitted work.

    Parameters
    ----------
    rain_threshold_in_mm : float
        Rain threshold per hour; it is divided by 3600 before being handed to
        ``get_convective_rain``.

    Returns
    -------
    list
        The persisted task objects, one per input file. Returning these
        (rather than the freshly built lazy tasks) keeps a reference to the
        results on the cluster, so a later ``dask.compute`` on them does not
        have to run the tasks a second time.
    """
    rain_threshold = rain_threshold_in_mm / 3600

    rain_tasks = [get_convective_rain(file, rain_threshold) for file in file_names]

    # Bring data into distributed memory via persist(); unpacking the list
    # yields one persisted object per task.
    persisted = dask.persist(*rain_tasks)
    progress(persisted, notebook=False)

    return list(persisted)

Gather the results into local memory with compute(). This acts on data that was already brought into distributed memory via persist(), so it should be fast.

In [13]:
def gather_data(futures):
    """Compute all given dask objects and discard what they return.

    Parameters
    ----------
    futures : iterable
        Dask objects that are unpacked into a single ``dask.compute`` call.

    Returns
    -------
    None
    """
    dask.compute(*futures)

In [14]:
# Passed to run_data as ``rain_threshold_in_mm``.
threshold = 10 # rain threshold above which rain is considered 'convective'
# Submit one mask-writing task per input file and show a progress bar ...
dask_futures = run_data(threshold)
# ... then compute the returned task objects; the results themselves are discarded.
gather_data(dask_futures)

[########################################] | 100% Completed |  2min 17.6s

In [15]:
# Print a marker once the cells above have finished.
print('done.')

done.
