# Examle: Dask Pipeline with Caching 

## Setup

In [1]:
import pandas as pd
import numpy as np
import time
import dask
from dask import delayed
from loguru import logger
from dutil.pipeline import cached

## Basic dataflow

In [2]:
%%time
@delayed()
@cached()
def load_1():
    time.sleep(2)
    df = pd.DataFrame({'a': [1., 2.], 'b': [0.1, np.nan]})
    logger.info('Loaded {} records'.format(len(df)))
    df.app
    return df

@delayed()
@cached(folder='cache')
def load_2(timestamp):
    time.sleep(2)
    df = pd.DataFrame({'a': [0.9, 3.], 'b': [0.001, 1.]})
    logger.info('Loaded {} records'.format(len(df)))
    return df

@delayed()
@cached(folder='cache')
def compute_1(x, y, eps):
    time.sleep(2)
    assert x.shape == y.shape
    diff = ((x - y).abs() / (y.abs()+eps)).mean().mean()
    logger.info('Difference is computed')
    return diff

@delayed()
@cached(folder='cache')
def compute_2(diff):
    diff_perc = diff * 100.
    logger.info('Relative difference is computed')
    return diff_perc
    
ts = pd.Timestamp(2019, 1, 1)
eps = 0.01
s1 = load_1()
s2 = load_2(ts)
diff = compute_1(s1, s2, eps)
diff_perc = compute_2(diff)

print('--- First run: all computations are run and data is saved to cache ---')
print('diff: {:.3f}'.format(diff.compute().load()))
# print('diff perc: {:.3f}'.format(diff_perc.compute().load()))

--- First run: all computations are run and data is saved to cache ---


2020-10-29 17:40:39.302 | INFO     | __main__:load_2:14 - Loaded 2 records
2020-10-29 17:40:39.304 | DEBUG    | dutil.pipeline._cached:dump:201 - Task load_2_2019-01-01 00:00:00.pickle: data has been saved to cache
2020-10-29 17:40:39.305 | INFO     | __main__:load_1:6 - Loaded 2 records
2020-10-29 17:40:39.305 | INFO     | dutil.pipeline._cached:new_foo:279 - Task load_2_2019-01-01 00:00:00.pickle: data has been computed and saved to cache
2020-10-29 17:40:39.307 | DEBUG    | dutil.pipeline._cached:dump:201 - Task load_1.pickle: data has been saved to cache
2020-10-29 17:40:39.308 | INFO     | dutil.pipeline._cached:new_foo:279 - Task load_1.pickle: data has been computed and saved to cache
2020-10-29 17:40:39.310 | DEBUG    | dutil.pipeline._cached:get_hash:215 - Task load_1.pickle: hash has been computed from data
2020-10-29 17:40:39.310 | DEBUG    | dutil.pipeline._cached:get_hash:215 - Task load_2_2019-01-01 00:00:00.pickle: hash has been computed from data
2020-10-29 17:40:41.325

diff: 4.611
CPU times: user 30.1 ms, sys: 14.7 ms, total: 44.8 ms
Wall time: 4.04 s


In [3]:
%%time
print('--- Second run: all data is loaded from cache ---')
print('diff: {:.3f}'.format(diff.compute().load()))
print('diff perc: {:.3f}'.format(diff_perc.compute().load()))

2020-10-29 17:40:41.355 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_2_2019-01-01 00:00:00.pickle: skip (cache exists)
2020-10-29 17:40:41.357 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_1.pickle: skip (cache exists)
2020-10-29 17:40:41.363 | INFO     | dutil.pipeline._cached:new_foo:268 - Task compute_1_11093643941594257765_7914328111557193193_0.01.pickle: skip (cache exists)
2020-10-29 17:40:41.366 | DEBUG    | dutil.pipeline._cached:load:193 - Task compute_1_11093643941594257765_7914328111557193193_0.01.pickle: data has been loaded from cache
2020-10-29 17:40:41.371 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_2_2019-01-01 00:00:00.pickle: skip (cache exists)
2020-10-29 17:40:41.371 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_1.pickle: skip (cache exists)
2020-10-29 17:40:41.374 | INFO     | dutil.pipeline._cached:new_foo:268 - Task compute_1_11093643941594257765_7914328111557193193_0.01.pickle: skip (cache exists)
2020-1

--- Second run: all data is loaded from cache ---
diff: 4.611
diff perc: 461.053
CPU times: user 31.3 ms, sys: 13.1 ms, total: 44.4 ms
Wall time: 31.5 ms


## Dataflow with parameters

In [4]:
params = dict(ts=None, eps=None)

@delayed()
@cached(parameters=params)
def load_1():
    time.sleep(2)
    df = pd.DataFrame({'a': [1., 2.], 'b': [0.1, np.nan]})
    logger.info('Loaded {} records'.format(len(df)))
    return df

@delayed()
@cached(parameters=params)
def load_2(timestamp):
    time.sleep(2)
    df = pd.DataFrame({'a': [0.9, 3.], 'b': [0.001, 1.]})
    logger.info('Loaded {} records'.format(len(df)))
    return df

@delayed()
@cached(parameters=params)
def compute_1(x, y, eps):
    time.sleep(2)
    assert x.shape == y.shape
    diff = ((x - y).abs() / (y.abs()+eps)).mean().mean()
    logger.info('Difference is computed')
    return diff

@delayed()
@cached(parameters=params)
def compute_2(diff):
    diff_perc = diff * 100.
    logger.info('Relative difference is computed')
    return diff_perc

print('--- First run: all computations are run and saved to cache ---')

ts = delayed(lambda: params['ts'])()
eps = delayed(lambda: params['eps'])()
s1 = load_1()
s2 = load_2(ts)
diff = compute_1(s1, s2, eps)
diff_perc = compute_2(diff)

params['ts'] = pd.Timestamp(2019, 1, 1)
params['eps'] = 0.01
print('diff: {:.3f}'.format(diff.compute().load()))
print('diff perc: {:.3f}'.format(diff_perc.compute().load()))

--- First run: all computations are run and saved to cache ---


2020-10-29 17:40:43.399 | INFO     | __main__:load_1:8 - Loaded 2 records
2020-10-29 17:40:43.403 | INFO     | __main__:load_2:16 - Loaded 2 records
2020-10-29 17:40:43.414 | DEBUG    | dutil.pipeline._cached:dump:201 - Task load_1_ts2019-01-01 00:00:00_eps0.01.pickle: data has been saved to cache
2020-10-29 17:40:43.415 | DEBUG    | dutil.pipeline._cached:dump:201 - Task load_2_ts2019-01-01 00:00:00_eps0.01.pickle: data has been saved to cache
2020-10-29 17:40:43.419 | INFO     | dutil.pipeline._cached:new_foo:279 - Task load_2_ts2019-01-01 00:00:00_eps0.01.pickle: data has been computed and saved to cache
2020-10-29 17:40:43.424 | INFO     | dutil.pipeline._cached:new_foo:279 - Task load_1_ts2019-01-01 00:00:00_eps0.01.pickle: data has been computed and saved to cache
2020-10-29 17:40:45.433 | INFO     | __main__:compute_1:25 - Difference is computed
2020-10-29 17:40:45.434 | DEBUG    | dutil.pipeline._cached:dump:201 - Task compute_1_ts2019-01-01 00:00:00_eps0.01.pickle: data has be

diff: 4.611
diff perc: 461.053


In [5]:
%%time
print('--- Second run: all data is loaded from cache ---')
print('diff: {:.3f}'.format(diff.compute().load()))
print('diff perc: {:.3f}'.format(diff_perc.compute().load()))

2020-10-29 17:40:45.451 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_1_ts2019-01-01 00:00:00_eps0.01.pickle: skip (cache exists)
2020-10-29 17:40:45.452 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_2_ts2019-01-01 00:00:00_eps0.01.pickle: skip (cache exists)
2020-10-29 17:40:45.454 | INFO     | dutil.pipeline._cached:new_foo:268 - Task compute_1_ts2019-01-01 00:00:00_eps0.01.pickle: skip (cache exists)
2020-10-29 17:40:45.454 | DEBUG    | dutil.pipeline._cached:load:193 - Task compute_1_ts2019-01-01 00:00:00_eps0.01.pickle: data has been loaded from cache
2020-10-29 17:40:45.456 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_1_ts2019-01-01 00:00:00_eps0.01.pickle: skip (cache exists)
2020-10-29 17:40:45.456 | INFO     | dutil.pipeline._cached:new_foo:268 - Task load_2_ts2019-01-01 00:00:00_eps0.01.pickle: skip (cache exists)
2020-10-29 17:40:45.457 | INFO     | dutil.pipeline._cached:new_foo:268 - Task compute_1_ts2019-01-01 00:00:00_eps0.01.pi

--- Second run: all data is loaded from cache ---
diff: 4.611
diff perc: 461.053
CPU times: user 8.91 ms, sys: 4.65 ms, total: 13.6 ms
Wall time: 9.77 ms
