In [1]:
import numpy as np
from dask import delayed
from dask.distributed import Client, progress
import dask.array as da

import warnings
warnings.filterwarnings("ignore")

In [7]:
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:33725  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


Time without persisting the array

In [8]:
%%timeit
x = da.random.random((10000, 10000), chunks=(1000, 1000))
y = x + x.T
z = y[::2, 5000:].mean(axis=1)
z.compute()

826 ms ± 23.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Time with persisting the array

In [17]:
x = da.random.random((10000, 10000), chunks=(1000,1000))

In [18]:
x.persist()

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,8.00 MB
Shape,"(10000, 10000)","(1000, 1000)"
Count,100 Tasks,100 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 8.00 MB Shape (10000, 10000) (1000, 1000) Count 100 Tasks 100 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,8.00 MB
Shape,"(10000, 10000)","(1000, 1000)"
Count,100 Tasks,100 Chunks
Type,float64,numpy.ndarray


In [19]:
%%time
y = x + x.T
z = y[::2, 5000:].mean(axis=1)
z.compute()

CPU times: user 190 ms, sys: 17.3 ms, total: 207 ms
Wall time: 419 ms


array([1.00045728, 1.00428095, 1.00009171, ..., 1.00190588, 1.01012714,
       0.99789713])

Time with chunks set to 250, 250

In [21]:
x = da.random.random((10000, 10000), chunks=(250,250))

In [22]:
x.persist()

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,500.00 kB
Shape,"(10000, 10000)","(250, 250)"
Count,1600 Tasks,1600 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 500.00 kB Shape (10000, 10000) (250, 250) Count 1600 Tasks 1600 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,500.00 kB
Shape,"(10000, 10000)","(250, 250)"
Count,1600 Tasks,1600 Chunks
Type,float64,numpy.ndarray


In [23]:
%%time
y = x + x.T
z = y[::2, 5000:].mean(axis=1)
z.compute()

CPU times: user 1.45 s, sys: 71.2 ms, total: 1.52 s
Wall time: 1.93 s


array([0.9970875 , 1.00337498, 1.01081629, ..., 1.00118455, 0.99495405,
       1.00043262])

Time with chunks 500, 500

In [25]:
x = da.random.random((10000, 10000), chunks=(500,500))

In [26]:
x.persist()

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,2.00 MB
Shape,"(10000, 10000)","(500, 500)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 2.00 MB Shape (10000, 10000) (500, 500) Count 400 Tasks 400 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,2.00 MB
Shape,"(10000, 10000)","(500, 500)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray


In [27]:
%%time
y = x + x.T
z = y[::2, 5000:].mean(axis=1)
z.compute()

CPU times: user 534 ms, sys: 24.8 ms, total: 559 ms
Wall time: 1.04 s


array([0.99780152, 1.00792159, 0.99552531, ..., 0.98990972, 1.0088973 ,
       1.00244963])

### Discussion
Dask is not as fast at Numpy. However it would allow for arrays larger than a single computer's memory. Here we see the drawbacks of segmenting the information across several arrays as Dask does. As chunksize decreases processing time increases because accessing each array adds additional steps rather than if only one array was being read.