# Using Dask with climpred

This demo shows how `climpred` can be used together with `dask`.

In [1]:
import warnings

%matplotlib inline
import numpy as np
import xarray as xr
import dask
import climpred

warnings.filterwarnings("ignore")

### Load large data

In [75]:
# Synthetic (uniform random) initialized ensemble and control run.
SEED = 42  # fixed seed so re-running the notebook regenerates identical data
rng = np.random.RandomState(SEED)

ny, nx = 256, 220        # grid size along 'y' and 'x'
nl, ni, nm = 20, 12, 10  # sizes of the 'lead', 'init' and 'member' dimensions
nt = 300                 # number of 'time' steps in the control run
t0 = 3000                # first 'time' / 'init' label

ds = xr.DataArray(
    rng.random_sample((nl, ni, nm, ny, nx)),
    dims=('lead', 'init', 'member', 'y', 'x'),
)
# Build exactly `ni` evenly spaced init labels. np.arange(t0, t0 + nt, nt // ni)
# returns more than `ni` values whenever `ni` does not divide `nt`, which makes
# the coordinate assignment fail with a size mismatch.
ds['init'] = t0 + (nt // ni) * np.arange(ni)
ds['lead'] = np.arange(1, 1 + nl)

control = xr.DataArray(
    rng.random_sample((nt, ny, nx)),
    dims=('time', 'y', 'x'),
)
control['time'] = np.arange(t0, t0 + nt)

In [76]:
# Keyword arguments shared by the skill computations below.
kw = dict(comparison='m2e', metric='rmse')

In [77]:
%time s = climpred.prediction.compute_perfect_model(ds, control, **kw)

CPU times: user 11.5 s, sys: 6.88 s, total: 18.4 s
Wall time: 19.6 s


2 core Mac Book Pro 2018: CPU times: user 11.5 s, sys: 6.88 s, total: 18.4 s
Wall time: 19.6 s

In order to use `dask` efficiently, we need to chunk the data appropriately. Processing chunks of data lazily with `dask` creates a small overhead per task, so chunking mostly pays off when it is applied to large data.

In [86]:
chunked_dim = 'y'
# Chunk length is one eighth of the dimension length (integer division).
chunks = {chunked_dim: ds.sizes[chunked_dim] // 8}
# Re-chunk and, if memory allows, persist the chunked array.
ds = ds.chunk(chunks).persist()
# Display the underlying dask array to inspect the chunk layout.
ds.data

Unnamed: 0,Array,Chunk
Bytes,1.08 GB,33.79 MB
Shape,"(20, 12, 10, 256, 220)","(20, 12, 10, 32, 55)"
Count,32 Tasks,32 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.08 GB 33.79 MB Shape (20, 12, 10, 256, 220) (20, 12, 10, 32, 55) Count 32 Tasks 32 Chunks Type float64 numpy.ndarray",12  20  220  256  10,

Unnamed: 0,Array,Chunk
Bytes,1.08 GB,33.79 MB
Shape,"(20, 12, 10, 256, 220)","(20, 12, 10, 32, 55)"
Count,32 Tasks,32 Chunks
Type,float64,numpy.ndarray


In [87]:
%%time
s_chunked = climpred.prediction.compute_perfect_model(ds, control, **kw)
assert dask.is_dask_collection(s_chunked)
s_chunked = s_chunked.compute()

CPU times: user 18.8 s, sys: 5.4 s, total: 24.2 s
Wall time: 8.62 s


2 core Mac Book Pro 2018:
CPU times: user 18.8 s, sys: 5.4 s, total: 24.2 s
Wall time: 8.62 s

In [88]:
# Check that the chunked and the reference results agree. On a mismatch,
# plot the difference per lead instead of stopping the notebook.
try:
    xr.testing.assert_allclose(s, s_chunked, atol=1e-6)
except AssertionError:
    diff = s - s_chunked  # computed once and reused for every plot
    if isinstance(diff, xr.Dataset):
        for v in diff.data_vars:
            diff[v].plot(robust=True, col='lead')
    else:
        # A DataArray has no `data_vars`; iterating over it would raise
        # AttributeError, so plot the array directly.
        diff.plot(robust=True, col='lead')

## bootstrap skill

This speedup carries over to `bootstrap_perfect_model`, where `bootstrap` resamplings of initialized, uninitialized and persistence skill are computed and then translated into p-values and confidence intervals.

In [89]:
# Same settings as `kw`, plus the number of bootstrap resamplings.
kwp = dict(kw, bootstrap=4)

In [90]:
# Replace both inputs with their computed (in-memory) versions.
ds, control = ds.compute(), control.compute()

In [91]:
%time s_p = climpred.bootstrap.bootstrap_perfect_model(ds, control, **kwp)

CPU times: user 2min 3s, sys: 1min 22s, total: 3min 26s
Wall time: 3min 43s


2 core Mac Book Pro 2018
CPU times: user 2min 3s, sys: 1min 22s, total: 3min 26s
Wall time: 3min 43s

In [92]:
chunked_dim = 'y'
# One eighth of the dimension length (integer division) per chunk.
chunks = {chunked_dim: len(ds[chunked_dim]) // 8}
# Chunk first, then persist the result if memory allows.
ds = ds.chunk(chunks).persist()
# Show the dask array so the chunk layout can be checked.
ds.data

Unnamed: 0,Array,Chunk
Bytes,1.08 GB,135.17 MB
Shape,"(20, 12, 10, 256, 220)","(20, 12, 10, 32, 220)"
Count,8 Tasks,8 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.08 GB 135.17 MB Shape (20, 12, 10, 256, 220) (20, 12, 10, 32, 220) Count 8 Tasks 8 Chunks Type float64 numpy.ndarray",12  20  220  256  10,

Unnamed: 0,Array,Chunk
Bytes,1.08 GB,135.17 MB
Shape,"(20, 12, 10, 256, 220)","(20, 12, 10, 32, 220)"
Count,8 Tasks,8 Chunks
Type,float64,numpy.ndarray


In [93]:
%time s_p_chunked = climpred.bootstrap.bootstrap_perfect_model(ds, control, **kwp)

CPU times: user 2min 35s, sys: 1min 4s, total: 3min 40s
Wall time: 2min 10s


2 core Mac Book Pro 2018
CPU times: user 2min 35s, sys: 1min 4s, total: 3min 40s
Wall time: 2min 10s