In [1]:
from getpass import getuser # Look up the login name of the current user
from pathlib import Path # Object-oriented library to deal with paths
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary Files/Dirs
from subprocess import run, PIPE
import sys
 
import dask # Distributed data library
from dask_jobqueue import SLURMCluster # Setting up distributed memory via Slurm
from distributed import Client, progress, wait # Library to orchestrate distributed resources

import xarray as xr # Library to work with labeled n-dimensional data and dask
import numpy as np
import skimage.util as sutil
import matplotlib.pyplot as plt

# sys.path.insert(0, os.path.abspath('/home/mpim/m300414/phd/'))
from org_metrics import Pairs, gen_regionprops_objects_all, gen_shapely_objects_all, gen_tuplelist
from org_metrics import radar_organisation_metric, avg_area, lower_rom_limit

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# User-specific scratch location: /scratch/<first letter of login>/<login>
scratch_dir = Path('/scratch') / getuser()[0] / getuser()

# Temporary directory that receives the SLURM log files of the dask workers.
# It is removed again when the TemporaryDirectory object is cleaned up
# (e.g. when the kernel behind this notebook shuts down).
dask_tmp_dir = TemporaryDirectory(dir=scratch_dir, prefix='dive_')
cluster = SLURMCluster(memory='500GiB',
                       cores=72,
                       project='mh0731',
                       walltime='03:20:00',
                       queue='gpu',
                       name='dive',
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory='/home/mpim/m300414/phd/Notebooks/',
                       job_extra=['-J dive',
                                  '-D /home/mpim/m300414/phd/Notebooks/',
                                  '--begin=now',
                                  # --output was listed twice; once is enough. Without an
                                  # explicit --error, sbatch writes stderr to this file too.
                                  f'--output={dask_tmp_dir.name}/LOG_cluster.%j.o',
                                 ],
                       interface='ib0')

cluster.scale(jobs=2) # requests whole nodes
dask_client = Client(cluster)
dask_client.wait_for_workers(18) # gpu-partition has 9 workers per node
# Alternative without SLURM (default local client):
# dask_client = Client()
dask_client

0,1
Client  Scheduler: tcp://10.50.40.25:38826  Dashboard: http://10.50.40.25:12435/status,Cluster  Workers: 18  Cores: 144  Memory: 1.07 TB


In [4]:
# Field read from rome_14mmhour.nc; used further down to derive the coarse mask.
rome = xr.open_dataarray('/work/mh0731/m300414/DyWinter_b10/RadarDomain_Grid/rome_14mmhour.nc')

data_path = Path('/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/')
glob_pattern_2d = 'div900_*.nc'

# Recursively collect every matching file below data_path as a string,
# order the paths alphabetically and keep only the last 31 of them.
file_names = sorted(map(str, data_path.rglob(glob_pattern_2d)))[-31:]
file_names

['/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200131T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200201T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200202T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200203T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200204T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200205T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200206T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200207T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200208T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200209T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_20200210T0000_reggrid.nc',
 '/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/div900_2020021

In [5]:
# ref = xr.open_dataarray('/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/rh500_20200227T0000_reggrid.nc')
# ref_lat = ref['lat']
# ref_lon = ref['lon']
# for f in file_names:
#     ar = xr.open_dataarray(f)
#     ar['lat'] = ref_lat
#     ar['lon'] = ref_lon
#     ar.to_netcdf(f'/work/mh0731/m300414/DyWinter_b10/{f[-31:]}')

In [6]:
@dask.delayed
def mask_highres_by_coarse(bool_coarse, highres_mask):
    """Flag all high-res cells inside a square window around each True coarse cell.

    Parameters
    ----------
    bool_coarse : boolean field with 'lat' and 'lon' dimensions. The lat/lon
        labels of its True cells are looked up by label (``.sel``) on the
        coordinates of ``highres_mask``.
    highres_mask : field with 'lat' and 'lon' coordinates that serves as the
        template for the result. It is copied, not modified.

    Returns
    -------
    A copy of ``highres_mask`` (stacked over lat/lon and unstacked again) in
    which every cell within a 117 x 117 index window centred on a True coarse
    cell is set to True. Because of the ``dask.delayed`` decorator, a call
    returns a lazy task rather than the mask itself.

    Notes
    -----
    * Longitude indices are wrapped with a modulo, latitude indices are used
      as they are.
      NOTE(review): latitude offsets are not range-checked - confirm that the
      windows never reach the latitude edges of the high-res grid.
    * The centre positions come from ``argsort()`` of the high-res coordinate.
      NOTE(review): this presumably relies on lat/lon being sorted ascending -
      verify against the input grids.
    """
    domain_size = (117, 117)
    assert domain_size[0]     == domain_size[1] # window is square
    assert domain_size[0] % 2 == 1              # odd width, so there is a centre pixel
    half_size = domain_size[0] // 2
    offsets = range(-half_size, half_size + 1)

    # Flatten the coarse field to one (lat, lon) entry per cell and keep the True ones.
    coarse_flat = bool_coarse.stack({'z': ('lat', 'lon')})
    flagged = coarse_flat[coarse_flat]

    # Positions on the high-res axes (via argsort()) that carry the flagged coarse labels.
    lat_centres = highres_mask['lat'].argsort().sel({'lat': flagged['lat'].values}).values
    lon_centres = highres_mask['lon'].argsort().sel({'lon': flagged['lon'].values}).values
    del coarse_flat, flagged

    # For every centre enumerate the full window, latitude offset varying slowest and
    # longitude offset fastest, so that entry k of both lists describes the same cell.
    window_lat_idx = [centre + d_lat for centre in lat_centres for d_lat in offsets for _ in offsets]
    window_lon_idx = [centre + d_lon for centre in lon_centres for _ in offsets for d_lon in offsets]
    del lat_centres, lon_centres

    # Translate positions back into coordinate values. Longitude is cyclic, hence the modulo.
    n_lon = len(highres_mask['lon'])
    lat_select = highres_mask['lat'][window_lat_idx].values
    lon_select = highres_mask['lon'][np.array(window_lon_idx) % n_lon].values
    del window_lat_idx, window_lon_idx

    # Pair lat with lon to mimic the stacked dimension; the set drops overlapping windows' duplicates.
    lat_lon_unique = list(set(zip(lat_select, lon_select)))

    # Work on a stacked copy so the (lat, lon) pairs can be addressed through .loc in one go.
    highres_stack = highres_mask.stack({'k': ('lat', 'lon')}).copy()
    highres_stack.loc[dict(k=lat_lon_unique)] = True

    return highres_stack.unstack()

In [7]:
# The coarse threshold mask (rome above its own 90th percentile, NaNs ignored) does not
# depend on the file being processed, so build it once instead of once per file.
l_rome_p90 = (rome > np.nanpercentile(rome, q=90))

for file in file_names:
    # The date stamp is the second '_'-separated token of the file name,
    # e.g. 'div900_20200131T0000_reggrid.nc' -> '20200131T0000'.
    date = Path(file).stem.split('_')[1]
    print(date)

    # The context manager closes the input file once the mask for it has been written.
    with xr.open_mfdataset(file) as dataset:
        var_to_process = dataset['div']

        # All-False template with the shape and coordinates of the input variable.
        highres_mask = xr.full_like(var_to_process, fill_value=False, dtype='bool')

        # parallelisation on time level: one delayed task per time step
        map_singletime = []
        for t in highres_mask.time:
            timestamp = str(t.values)
            # Coarse mask at the time step closest to the high-res time step.
            coarse_mask = l_rome_p90.sel(time=timestamp, method='nearest')
            map_singletime.append(
                mask_highres_by_coarse(coarse_mask, highres_mask.sel(time=timestamp))
            )

        # Start the tasks on the cluster and show a text progress bar while they run.
        jobs = dask.persist(map_singletime)
        progress(jobs, notebook=False)

        # Gather the per-time-step masks and glue them back together along time.
        result = xr.concat(dask.compute(*map_singletime), dim=var_to_process.time)

        result.name = 'high_rome_mask'
        result.attrs['units'] = '1'
        result.attrs['long_name'] = 'Mask for 90perc-ROME across (117*2.5)x(117*2.5) km domain.'

        result.to_netcdf(f'/work/mh0731/m300414/DyWinter_b10/Cartesian_Grid/rome90p_{date}_mask.nc')

20200131T0000
20200201T0000############################] | 100% Completed |  2min 20.3s
20200202T0000############################] | 100% Completed |  2min 35.0s
20200203T0000############################] | 100% Completed |  2min 24.6s
20200204T0000############################] | 100% Completed |  2min 39.3s
20200205T0000############################] | 100% Completed |  2min 39.3s
20200206T0000############################] | 100% Completed |  2min 52.0s
20200207T0000############################] | 100% Completed |  2min 52.9s
20200208T0000############################] | 100% Completed |  2min 55.5s
20200209T0000############################] | 100% Completed |  2min 58.1s
20200210T0000############################] | 100% Completed |  2min 58.5s
20200211T0000############################] | 100% Completed |  2min 58.0s
20200212T0000############################] | 100% Completed |  2min 52.3s
20200213T0000############################] | 100% Completed |  3min 14.2s
20200214T0000###########