[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/07.Performance_Guideline.ipynb)

In [None]:
try:
    import squirrel
    from dask.distributed import Client
except:
    !pip install -q --ignore-requires-python --upgrade squirrel "dask[distributed]" # noqa
    import squirrel
    from dask.distributed import Client

print(squirrel.__version__)

In [None]:
import random
from concurrent.futures import ProcessPoolExecutor
from time import sleep

import numpy as np
from dask.distributed import Client
from numba import jit
from squirrel.iterstream import IterableSource

In [None]:
def io_bound_func(x):
    """Stand-in for an IO-bound task: block for 0.1 s, then return ``x`` unchanged."""
    wait_seconds = 0.1
    sleep(wait_seconds)
    return x


def python_func(x, n_iter=10 ** 7):
    """Stand-in for a CPU-bound, pure-Python task.

    Calls ``random.random()`` ``n_iter`` times and discards the results; the
    loop exists only to keep the interpreter busy.

    Args:
        x: Any value; it is returned unchanged.
        n_iter: Number of ``random.random()`` calls to perform. Defaults to
            ``10 ** 7``, the workload used by the benchmarks in this notebook.

    Returns:
        The input ``x``.
    """
    for _ in range(n_iter):
        random.random()
    return x


def np_trace(a):
    """Return ``a`` shifted by the sum of ``tanh`` over its main-diagonal entries.

    The diagonal is walked with an explicit Python loop over ``a.shape[0]``
    indices, and the accumulated scalar is added to every element of ``a``.
    """
    n_rows = a.shape[0]
    tanh_sum = 0.0
    for k in range(n_rows):
        tanh_sum = tanh_sum + np.tanh(a[k, k])
    return a + tanh_sum


@jit(nopython=True)
def np_trace_nb(a):
    """Numba-compiled twin of ``np_trace``: add the summed ``tanh`` of the diagonal to ``a``."""
    n_rows = a.shape[0]
    tanh_sum = 0.0
    for k in range(n_rows):
        tanh_sum = tanh_sum + np.tanh(a[k, k])
    return a + tanh_sum


# Fixtures shared by the benchmark cells.
d = 10  # side length of each square input matrix
# 1000 independent d x d matrices of random floats
arrs = [np.random.random(size=(d, d)) for _ in range(1000)]
# One process pool, created once and reused by every `executor=ppool` benchmark
ppool = ProcessPoolExecutor()

In [None]:
# Warm-up: the first call to the numba-jitted function compiles it, so call it
# once here to keep that one-off compile time out of the timed cells.
_ = np_trace_nb(arrs[0])

In [None]:
# Dask client that is passed as `executor=` to `async_map` in the benchmarks.
dask_client = Client()

# IO

For IO-bound operations, use `async_map()` without the `executor` argument; it then uses a `ThreadPoolExecutor`,
which is less resource-intensive than a `ProcessPoolExecutor`.

In [None]:
%%timeit

# Baseline: plain `map` over the sleeping function, no executor involved.
IterableSource(range(10)).map(io_bound_func).join()

In [None]:
%%timeit

# `async_map` with no `executor` argument, i.e. its default executor.
IterableSource(range(10)).async_map(io_bound_func).join()

In [None]:
%%timeit

# `async_map` backed by the shared process pool `ppool`.
IterableSource(range(10)).async_map(io_bound_func, executor=ppool).join()

In [None]:
%%timeit

# `numba_map` applied to a function that does nothing but sleep.
IterableSource(range(10)).numba_map(io_bound_func).join()

In [None]:
%%timeit

# `async_map` with the dask client as executor.
IterableSource(range(10)).async_map(io_bound_func, executor=dask_client).join()

# Py func

For pure-Python (CPU-bound) functions, `async_map` with a `ProcessPoolExecutor` provides the best performance.

In [None]:
%%timeit

# Baseline: plain `map` over the pure-Python loop, no executor involved.
IterableSource(range(10)).map(python_func).join()

In [None]:
%%timeit

# `async_map` with no `executor` argument, i.e. its default executor.
IterableSource(range(10)).async_map(python_func).join()

In [None]:
%%timeit

# `async_map` backed by the shared process pool `ppool`.
_ = IterableSource(range(10)).async_map(python_func, executor=ppool).join()

In [None]:
%%timeit

# `numba_map` applied to the pure-Python loop.
IterableSource(range(10)).numba_map(python_func).join()

In [None]:
%%timeit

# `async_map` with the dask client as executor.
IterableSource(range(10)).async_map(python_func, executor=dask_client).join()

# numpy computation

Numba offers the fastest execution while still running single-threaded (so it uses fewer resources).

In [None]:
%%timeit

# Baseline: plain `map` of the un-jitted `np_trace` over all arrays.
IterableSource(arrs).map(np_trace).join()

In [None]:
%%timeit

# `async_map` with no `executor` argument, i.e. its default executor.
IterableSource(arrs).async_map(np_trace).join()

In [None]:
%%timeit

# `async_map` backed by the shared process pool `ppool`.
IterableSource(arrs).async_map(np_trace, executor=ppool).join()

In [None]:
%%timeit

# `numba_map` applied to the un-jitted `np_trace`.
IterableSource(arrs).numba_map(np_trace).join()

In [None]:
%%timeit

# Plain `map` over the `@jit`-decorated `np_trace_nb`.
IterableSource(arrs).map(np_trace_nb).join()

In [None]:
%%timeit

# `numba_map` applied to the already `@jit`-decorated `np_trace_nb`.
IterableSource(arrs).numba_map(np_trace_nb).join()