In [1]:
from getpass import getuser # Look up the login name of the current user
from pathlib import Path # Object-oriented library for dealing with paths
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary files/dirs
from subprocess import run, PIPE
import sys
from dask.utils import format_bytes
 
import dask # Distributed data library
from dask_jobqueue import SLURMCluster # Setting up distributed memory via Slurm
from distributed import Client, progress, wait # Library to orchestrate distributed resources
import xarray as xr # Library to work with labelled n-dimensional data and dask

In [2]:
import warnings

# Silence every warning, regardless of category or originating module, for
# the rest of the session.
warnings.simplefilter('ignore')

In [3]:
# User-specific settings.
# Per-user scratch directory, laid out as /scratch/<first letter of login>/<login>.
scratch_dir = Path('/scratch') / getuser()[0] / getuser()
# Temporary directory that receives the worker files and the Slurm logs of the
# distributed cluster. It is deleted again when `dask_tmp_dir` is cleaned up
# (explicitly via .cleanup(), or when the object is finalized at kernel exit).
dask_tmp_dir = TemporaryDirectory(dir=scratch_dir, prefix='PostProc')
cluster = SLURMCluster(memory='500GiB',
                       cores=72,
                       project='mh0731',
                       walltime='1:00:00',
                       queue='gpu',
                       name='PostProc',
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory=dask_tmp_dir.name,
                       # Extra #SBATCH directives. Only --output is set (once):
                       # without --error, Slurm sends stderr to the same file.
                       job_extra=['-J PostProc',
                                  f'-D {dask_tmp_dir.name}',
                                  '--begin=now',
                                  f'--output={dask_tmp_dir.name}/LOG_cluster.%j.o',
                                 ],
                       interface='ib0')
# Submit two Slurm jobs and connect this notebook to the resulting cluster.
cluster.scale(jobs=2)
dask_client = Client(cluster)
# Block until the workers have registered with the scheduler.
# NOTE(review): 18 presumably equals 2 jobs x 9 worker processes per job (the
# dask_jobqueue default split for 72 cores) - confirm if cores/jobs change.
dask_client.wait_for_workers(18)

In [4]:
# Alternative inputs, kept for reference (currently disabled):
# data_path = Path('/work/mh0287/k203123/GIT/icon-aes-dyw_albW/experiments/dpp0016/')
# glob_pattern_2d = 'atm2_2d_ml'
# data_path = Path('/work/mh0731/m300414/Data/TropicsBox/')
# glob_pattern_2d = 'pr_'
 
# A single daily file is hard-coded here. To process a whole directory instead,
# collect the file names with pathlib's rglob and a list comprehension:
#   sorted([str(f) for f in data_path.rglob(f'*{glob_pattern_2d}*.nc')])  # optionally [1:]
file_names = '/work/mh0731/m300414/DyWinter_b9/Tropics_20to20/Daily/pr_20200120.nc'
# Further open_mfdataset options that were tried: combine='by_coords', parallel=True
dset = xr.open_mfdataset(file_names)
var_names = ['pr']
# Keep only the selected variables and trigger loading them via dask (persist).
dset_subset = dset[var_names].persist()
dset_subset

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 56.86 MB 56.86 MB Shape (7107910,) (7107910,) Count 1 Tasks 1 Chunks Type float64 numpy.ndarray",7107910  1,

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 56.86 MB 56.86 MB Shape (7107910,) (7107910,) Count 1 Tasks 1 Chunks Type float64 numpy.ndarray",7107910  1,

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.73 GB,2.73 GB
Shape,"(96, 7107910)","(96, 7107910)"
Count,1 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.73 GB 2.73 GB Shape (96, 7107910) (96, 7107910) Count 1 Tasks 1 Chunks Type float32 numpy.ndarray",7107910  96,

Unnamed: 0,Array,Chunk
Bytes,2.73 GB,2.73 GB
Shape,"(96, 7107910)","(96, 7107910)"
Count,1 Tasks,1 Chunks
Type,float32,numpy.ndarray


In [5]:
# Average over the 'time' dimension and keep the computed result persisted via dask.
time_mean = dset_subset.mean(dim='time').persist()
# Disabled alternative: average over the 'ncells' dimension instead.
# field_mean = dset_subset.mean(dim='ncells').persist()

In [6]:
# Human-readable total size of the selected variables (including coordinates).
format_bytes(dset_subset.nbytes)

'2.84 GB'

In [7]:
# Inspect the 'pr' variable: shape, chunking and dtype.
dset_subset['pr']

Unnamed: 0,Array,Chunk
Bytes,2.73 GB,2.73 GB
Shape,"(96, 7107910)","(96, 7107910)"
Count,1 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.73 GB 2.73 GB Shape (96, 7107910) (96, 7107910) Count 1 Tasks 1 Chunks Type float32 numpy.ndarray",7107910  96,

Unnamed: 0,Array,Chunk
Bytes,2.73 GB,2.73 GB
Shape,"(96, 7107910)","(96, 7107910)"
Count,1 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 56.86 MB 56.86 MB Shape (7107910,) (7107910,) Count 1 Tasks 1 Chunks Type float64 numpy.ndarray",7107910  1,

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 56.86 MB 56.86 MB Shape (7107910,) (7107910,) Count 1 Tasks 1 Chunks Type float64 numpy.ndarray",7107910  1,

Unnamed: 0,Array,Chunk
Bytes,56.86 MB,56.86 MB
Shape,"(7107910,)","(7107910,)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [8]:
def get_griddes(y_res, x_res, x_first=-180, y_first=-90):
    """Create a description for a regular global grid at given x, y resolution.

    Note the argument order: the latitude resolution comes first.

    Parameters
    ==========

    y_res : float
        Latitude increment in degrees, must be positive.
    x_res : float
        Longitude increment in degrees, must be positive.
    x_first : float, default -180
        Western edge of the grid. The first cell centre is placed half a
        cell (x_res / 2) east of it.
    y_first : float, default -90
        Southern edge of the grid. The first cell centre is placed half a
        cell (y_res / 2) north of it.

    Returns
    =======
    str : Text of a lonlat grid description with xsize = 360 / x_res columns
          and ysize = 180 / y_res rows.

    Raises
    ======
    ValueError : if one of the resolutions is not positive
    """
    if x_res <= 0 or y_res <= 0:
        raise ValueError(f'Grid resolution must be positive, got x_res={x_res}, y_res={y_res}')

    # Round away floating point noise before truncating, so that a quotient
    # like 15999.999999999998 does not silently lose a grid column.
    xsize = int(round(360 / x_res, 6))
    ysize = int(round(180 / y_res, 6))
    # Cell centres sit half a cell inside the requested edges; each axis uses
    # its own increment.
    xfirst = x_first + x_res / 2
    yfirst = y_first + y_res / 2

    # gridsize is derived from the integer sizes so it always equals
    # xsize * ysize as written into the description.
    return f'''
#
# gridID 1
#
gridtype  = lonlat
gridsize  = {xsize * ysize}
xsize     = {xsize}
ysize     = {ysize}
xname     = lon
xlongname = "longitude"
xunits    = "degrees_east"
yname     = lat
ylongname = "latitude"
yunits    = "degrees_north"
xfirst    = {xfirst}
xinc      = {x_res}
yfirst    = {yfirst}
yinc      = {y_res}
 
 
    '''

In [9]:
@dask.delayed
def gen_dis(dataset, xres, yres, gridfile):
    '''Create distance weights using cdo.

    The dataset is written to a temporary netCDF file, `cdo gendis` is run
    against a regular lon/lat target grid of the requested resolution, and
    the resulting weight file is read back into memory.

    Parameters
    ==========

    dataset : xarray.Dataset
        Data on the source grid; only needs to provide `to_netcdf`.
    xres : float
        Longitude increment of the target grid.
    yres : float
        Latitude increment of the target grid.
    gridfile : Path, str
        Source grid file handed to cdo's `setgrid` operator.

    Returns
    =======
    xarray.Dataset : The weights, fully loaded into memory
    '''
    scratch_dir = Path('/scratch') / getuser()[0] / getuser() # Define the users scratch dir
    with TemporaryDirectory(dir=scratch_dir, prefix='Weights_') as td:
        in_file = Path(td) / 'in_file.nc'
        weightfile = Path(td) / 'weight_file.nc'
        griddes = Path(td) / 'griddes.txt'
        # get_griddes takes the latitude resolution first; pass by keyword so
        # the two axes cannot be swapped.
        griddes.write_text(get_griddes(y_res=yres, x_res=xres))
        dataset.to_netcdf(in_file, mode='w') # Write the data to a temporary netcdf file
        cmd = ('cdo', '-O', f'gendis,{griddes}', f'-setgrid,{gridfile}', str(in_file), str(weightfile))
        run_cmd(cmd)
        # Load everything and close the file handle before the temporary
        # directory (and with it the weight file) is deleted. No wait() is
        # needed: the loaded dataset is plain in-memory data, not a future.
        with xr.open_dataset(weightfile) as weights:
            return weights.load()
 
def run_cmd(cmd, path_extra=Path(sys.exec_prefix)/'bin'):
    '''Run an external command (without a shell) and return its standard output.

    Parameters
    ==========

    cmd : sequence of str or Path
        Program and arguments, passed to `subprocess.run` as is.
    path_extra : Path, str
        Directory that is searched first for the program; defaults to the
        `bin` directory of the running Python installation.

    Returns
    =======
    str : Decoded standard output of the command

    Raises
    ======
    RuntimeError : if the command exits with a non-zero status; the message
                   holds the command line and its standard error output
    '''
    env_extra = os.environ.copy()
    # Prepend path_extra; tolerate an environment that has no PATH at all.
    search_path = [str(path_extra), env_extra.get('PATH', '')]
    env_extra['PATH'] = os.pathsep.join(entry for entry in search_path if entry)
    status = run(cmd, check=False, stderr=PIPE, stdout=PIPE, env=env_extra)
    if status.returncode != 0:
        # str() each argument so Path objects in cmd cannot break the report.
        cmd_text = ' '.join(str(arg) for arg in cmd)
        error = f'''{cmd_text}: {status.stderr.decode('utf-8', errors='replace')}'''
        raise RuntimeError(error)
    return status.stdout.decode('utf-8', errors='replace')

In [10]:
@dask.delayed
def remap(dataset, x_res, y_res, weights, gridfile):
    """Perform a weighted remapping.

    The input and the weights are written to temporary netCDF files,
    `cdo remap` interpolates onto a regular lon/lat grid of the requested
    resolution, and the result is read back into memory.

    Parameters
    ==========

    dataset : xarray.Dataset, xarray.DataArray
        The data that will be regridded; a DataArray is wrapped in a Dataset
    x_res : float
        Longitude increment of the target grid
    y_res : float
        Latitude increment of the target grid
    weights : xarray.Dataset
        Distance weights
    gridfile : Path, str
        Source grid file handed to cdo's `setgrid` operator

    Returns
    =======
    xarray.Dataset : Remapped dataset, fully loaded into memory
    """
    if isinstance(dataset, xr.DataArray):
        # If a dataArray is given create a dataset
        dataset = xr.Dataset(data_vars={dataset.name: dataset})
    scratch_dir = Path('/scratch') / getuser()[0] / getuser() # Define the users scratch dir
    with TemporaryDirectory(dir=scratch_dir, prefix='Remap_') as td:
        infile = Path(td) / 'input_file.nc'
        weightfile = Path(td) / 'weight_file.nc'
        griddes = Path(td) / 'griddes.txt'
        outfile = Path(td) / 'remaped_file.nc'
        # get_griddes takes the latitude resolution first; pass by keyword so
        # the two axes cannot be swapped.
        griddes.write_text(get_griddes(y_res=y_res, x_res=x_res))
        dataset.to_netcdf(infile, mode='w') # Write the data to a temporary netcdf file
        weights.to_netcdf(weightfile, mode='w')
        cmd = ('cdo', '-O', f'remap,{griddes},{weightfile}', f'-setgrid,{gridfile}',
               str(infile), str(outfile))
        run_cmd(cmd)
        # Load everything and close the file handle before the temporary
        # directory (and with it the output file) is deleted.
        with xr.open_dataset(outfile) as remapped:
            return remapped.load()

In [11]:
# Source grid file that gen_dis/remap hand to cdo's setgrid operator.
grid_file = '/pool/data/ICON/grids/public/mpim/0015/icon_grid_0015_R02B09_G.nc'
# Build the (delayed) weight generation from the time mean for a target grid
# with an increment of 0.0225 in both directions; nothing is computed yet.
weights_future = gen_dis(time_mean, 0.0225, 0.0225, grid_file)
weights_future

Delayed('gen_dis-1d8cfe2e-2895-4f1f-8df5-9a8ef8b459dd')

In [12]:
# Queue one delayed remap task per time step of 'pr'; the tasks are
# independent of each other and can run in parallel once computed.
pr_field = dset_subset['pr']
remap_futures = []
for snapshot in pr_field:
    # Address the time step by its timestamp rendered as a string.
    time_label = snapshot.time.values.astype(str)
    remap_futures += [remap(pr_field.sel(time=time_label), 0.0225, 0.0225, weights_future, grid_file)]
remap_futures

[Delayed('remap-cbec6142-9f64-4d82-9e2d-8631bc0570e6'),
 Delayed('remap-4352b77c-4528-4728-99ba-ab10a8ea6586'),
 Delayed('remap-e3b52d3a-5c7d-46f2-8d21-6845a844b151'),
 Delayed('remap-b52a25c9-18f8-4a01-8f2f-326357622061'),
 Delayed('remap-439b398f-7e86-4b1e-aa3a-ab712573b5df'),
 Delayed('remap-5bffcd76-b06b-47da-9192-ee4271fa37fd'),
 Delayed('remap-6d08a253-bea3-4b92-b54f-82b7bdc8ec35'),
 Delayed('remap-3cd04bc3-fcd8-4be6-9700-a050400051bb'),
 Delayed('remap-32fdf302-2d10-421c-8025-01ef7fbd2d28'),
 Delayed('remap-9ddd42ab-dc13-4a35-8afe-5bc52e143835'),
 Delayed('remap-1e5f38e5-52a7-4557-a417-77cb10cc27f7'),
 Delayed('remap-3758789f-4aea-4314-8977-81ff1777be28'),
 Delayed('remap-3ddeac96-3c31-4a21-8826-e56e84887cce'),
 Delayed('remap-a73d7b4e-22cf-4f10-af00-70070184e6a3'),
 Delayed('remap-d58a4901-fc87-442d-a2d3-55437ee9446e'),
 Delayed('remap-9181499d-9306-49c7-9778-7d9889517589'),
 Delayed('remap-0dbeb7e6-4627-47d9-917d-5a53daa5341b'),
 Delayed('remap-74abf421-4933-439c-bbf3-8b5806ba

In [13]:
# Start computing all remap tasks in the background and keep the results
# referenced through remap_jobs.
remap_jobs = dask.persist(remap_futures)
# Text progress bar (notebook=False) for the running tasks.
progress(remap_jobs, notebook=False)

[###########################             ] | 67% Completed | 59min 11.5s

In [None]:
# Inspect the persisted remap tasks.
remap_jobs

In [None]:
# Gather the results of all remap tasks into this process and display them.
list(dask.compute(*remap_futures))

In [None]:
# Gather one remapped dataset per time step and stack them along 'time'.
remapped_steps = list(dask.compute(*remap_futures))
# The concat coordinate must have exactly one entry per dataset, so fail
# early with a clear message if the task list and the time axis disagree.
if len(remapped_steps) != len(dset_subset.time):
    raise ValueError(f'Got {len(remapped_steps)} remapped fields for '
                     f'{len(dset_subset.time)} time steps')
dset_remap = xr.concat(remapped_steps, dim=dset_subset.time)
dset_remap

In [None]:
# Save the remapped time series as netCDF in the user's scratch directory,
# overwriting an existing file of the same name (mode='w').
out_file = Path(scratch_dir) / 'dpp0014_precip.nc'
dset_remap.to_netcdf(out_file, mode='w')