In [1]:
from getpass import getuser # Get the login name of the current user
from pathlib import Path    # Object-oriented library to deal with paths
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory # Creating temporary Files/Dirs
from subprocess import run, PIPE
import sys
 
import dask # Distributed data library
from dask_jobqueue import SLURMCluster # Setting up distributed memory via Slurm
from distributed import Client, progress, wait # Library to orchestrate distributed resources
import xarray as xr # Library to work with labeled n-dimensional data and dask

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Set some user specific variables
# The user's scratch dir is built as /scratch/<first letter of user name>/<user name>
scratch_dir = Path('/scratch') / getuser()[0] / getuser()

# Create a temp directory where the output of the distributed cluster will be written to.
# The directory (and its content) is removed again once the TemporaryDirectory object
# is cleaned up, e.g. after this notebook's kernel is shut down.
dask_tmp_dir = TemporaryDirectory(dir=scratch_dir, prefix='accum2flux_')

# Describe one Slurm job; the jobs themselves are only submitted by `cluster.scale` below.
cluster = SLURMCluster(memory='500GiB',
                       cores=72,
                       project='mh0731',
                       walltime='1:00:00',
                       queue='gpu',
                       name='accum2flux',
                       scheduler_options={'dashboard_address': ':12435'},
                       local_directory=dask_tmp_dir.name,
                       # Extra options handed to the Slurm batch script.
                       # NOTE(review): the `--output` entry used to be listed twice with the
                       # same value; the duplicate was dropped. If a separate stderr log was
                       # intended, add an `--error=...` entry here.
                       job_extra=['-J accm2flx',
                                  f'-D {dask_tmp_dir.name}',
                                  '--begin=now',
                                  f'--output={dask_tmp_dir.name}/LOG_cluster.%j.o',
                                 ],
                       interface='ib0')

# Request two Slurm jobs and connect a client to the cluster
cluster.scale(jobs=2)
dask_client = Client(cluster)
# Do not continue before 18 workers have connected to the scheduler
dask_client.wait_for_workers(18)


In [3]:
@dask.delayed
def accumulation_to_flux(present_file, previous_file):
    """Turn the accumulated variable 'tp' of one file into a flux and write it to disk.

    The difference between consecutive time steps is divided by the length of
    the time step in seconds. The first time step of ``present_file`` is
    differenced against the last time step of ``previous_file``.

    Because of the ``dask.delayed`` decorator, calling this function only
    builds a lazy task; the work is done once that task is computed.

    Parameters
    ----------
    present_file : str
        Path of the file to convert. Must be a string, since the date stamp of
        the output file name is cut out of it with ``present_file[-24:-11]``.
    previous_file : str
        Path of the file directly preceding ``present_file`` in time.

    Returns
    -------
    None
        The result is written to
        ``/work/mh0731/m300414/DyWinter_b10/Precip_Flux/pr_<date>_tropics.nc``.
    """
    # Open both inputs in a context manager so their file handles are released
    # again; otherwise they stay open in the (long-lived) worker processes.
    with xr.open_mfdataset(present_file) as present_ds, \
         xr.open_mfdataset(previous_file) as previous_ds:
        data = present_ds['tp']
        # to assign values to a dask-backed array, first load it into memory
        data.load()
        # only the last time step of the preceding file is needed
        last_of_previous = previous_ds['tp'][{'time': -1}].values

    # the short time steps (15 minutes) inside one file
    # (the right-hand side is evaluated completely before anything is overwritten)
    n_minutes = 15
    later_steps = {'time': slice(1, None)}
    earlier_steps = {'time': slice(None, -1)}
    data[later_steps] = (data[later_steps].values - data[earlier_steps].values) / (n_minutes * 60)

    # the long time step (30 minutes) across two files, because the midnight value is stored in neither file
    # (index 0 was not touched by the assignment above, so it still holds the accumulated value)
    n_minutes = 30
    data[{'time': 0}] = (data[{'time': 0}].values - last_of_previous) / (n_minutes * 60)

    # adjust attributes
    data.attrs['units'] = 'kg m**-2 s**-1'
    data.attrs['long_name'] = 'Precipitation flux'
    data.name = 'pr'

    # write results to disk
    date = present_file[-24:-11]
    outfile = Path('/work/mh0731/m300414/') / 'DyWinter_b10' / 'Precip_Flux' / f'pr_{date}_tropics.nc'
    # make sure the target directory exists, otherwise writing the file fails
    outfile.parent.mkdir(parents=True, exist_ok=True)
    data.to_netcdf(outfile)

In [4]:
# Part of the file name that identifies the files to process
glob_pattern = 'tp_'
# Directory that is searched for input files
data_path = Path('/work/mh0731/m300414/DyWinter_b10/Tropics_fromGrib/')
# Recursively collect all NetCDF files whose name contains the pattern, as sorted strings.
# NOTE: a sub-directory holds more matching files that are not wanted; dropping the first
# element(s) with `[2:]` was meant to exclude them, but that slice is currently disabled.
data_files = sorted(map(str, data_path.rglob(f'*{glob_pattern}*.nc')))

In [14]:
# Build one lazy task per file, pairing every file from index 31 onwards
# with the file directly before it in the sorted list.
run_futures = []
for idx in range(31, len(data_files)):
    previous_file, present_file = data_files[idx - 1], data_files[idx]

    # show the date stamp of the file that is being scheduled
    print(present_file[-24:-11])

    task = accumulation_to_flux(present_file=present_file, previous_file=previous_file)
    run_futures.append(task)

20200220T0000
20200221T0000
20200222T0000
20200223T0000
20200224T0000
20200225T0000
20200226T0000
20200227T0000
20200228T0000
20200229T0000
20200301T0000


In [15]:
# Start executing all tasks on the cluster and keep handles to the running results
run_jobs = dask.persist(run_futures)
# Show a text progress bar (notebook=False) instead of a notebook widget
progress(run_jobs, notebook=False)
# Gather the (empty) task results: this re-raises any exception that occurred
# inside a task, so a failed conversion does not go unnoticed.
run_results = dask.compute(*run_jobs)

[########################################] | 100% Completed |  2min 26.2s

In [16]:
# Marker that the cells above have finished running
print('done.')

done.
