# Dask with scikit-fingerprints

### First, load the dataset

In [1]:
from skfp.datasets.moleculenet import load_pcba

# Fetch the PCBA dataset, then keep only its first element, which the rest of
# the notebook passes to the fingerprint transformers as the molecules input.
pcba_dataset = load_pcba()
pcba_molecules = pcba_dataset[0]

### Compute fingerprints with the standard joblib backend

In [2]:
from time import perf_counter, time  # `time` stays bound for later cells that call it
from skfp.fingerprints import MAPFingerprint

# Baseline run: MAP fingerprints computed with 16 parallel jobs, the same
# number as the workers of the Dask cluster created further down, so the two
# timings are comparable.
fp = MAPFingerprint(n_jobs=16)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time() can jump if the system clock is adjusted mid-run.
start = perf_counter()
fps_joblib = fp.transform(pcba_molecules)
end = perf_counter()

print(f"computation time {end-start:.3} s")

computation time 82.3 s


### Compute with Dask

In [3]:
from dask.distributed import LocalCluster

# Number of workers to start in the local Dask cluster.
N_DASK_WORKERS = 16

# Start a Dask cluster on this machine.
cluster = LocalCluster(n_workers=N_DASK_WORKERS)

# Display the scheduler address; a client connects to the cluster through it.
cluster.scheduler_address

2024-12-16 21:16:09,451|INFO|To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
2024-12-16 21:16:09,469|INFO|State start
2024-12-16 21:16:09,479|INFO|  Scheduler at:     tcp://127.0.0.1:60199
2024-12-16 21:16:09,480|INFO|  dashboard at:  http://127.0.0.1:8787/status
2024-12-16 21:16:09,480|INFO|Registering Worker plugin shuffle
2024-12-16 21:16:09,585|INFO|        Start Nanny at: 'tcp://127.0.0.1:60220'
2024-12-16 21:16:09,587|INFO|        Start Nanny at: 'tcp://127.0.0.1:60204'
2024-12-16 21:16:09,589|INFO|        Start Nanny at: 'tcp://127.0.0.1:60206'
2024-12-16 21:16:09,591|INFO|        Start Nanny at: 'tcp://127.0.0.1:60222'
2024-12-16 21:16:09,594|INFO|        Start Nanny at: 'tcp://127.0.0.1:60224'
2024-12-16 21:16:09,598|INFO|        Start Nanny at: 'tcp://127.0.0.1:60208'
2024-12-16 21:16:09,601|INFO|        Start Nanny at: 'tcp://127.0.0.1:60210'
2024-12-16 21:16:09,604|INFO|        Start Nanny at: 'tcp:/

'tcp://127.0.0.1:60199'

In [4]:
# Display the URL of the cluster's diagnostics dashboard.
cluster.dashboard_link

'http://127.0.0.1:8787/status'

In [5]:
from time import perf_counter

from dask.distributed import Client

import joblib

# Connect a client to the scheduler of the local cluster started above.
client = Client(cluster.scheduler_address)

try:
    # Select joblib's "dask" backend for everything inside this block, so the
    # fingerprint computation is dispatched to the Dask cluster.
    with joblib.parallel_config("dask"):
        # n_jobs=-1: let the backend decide how many jobs to run.
        fp = MAPFingerprint(n_jobs=-1)

        # perf_counter() is a monotonic clock meant for measuring intervals.
        start = perf_counter()
        fps_dask = fp.transform(pcba_molecules)
        end = perf_counter()
finally:
    # Always shut the client and the cluster down, even when the computation
    # fails; otherwise the worker processes keep running in the background.
    client.close()
    cluster.close()

print(f"computation time {end-start:.3} s")

2024-12-16 21:17:36,410|INFO|Closing Nanny at 'tcp://127.0.0.1:60202'. Reason: nanny-close
2024-12-16 21:17:36,410|INFO|Nanny asking worker to close. Reason: nanny-close
2024-12-16 21:17:36,411|INFO|Closing Nanny at 'tcp://127.0.0.1:60204'. Reason: nanny-close
2024-12-16 21:17:36,412|INFO|Nanny asking worker to close. Reason: nanny-close
2024-12-16 21:17:36,412|INFO|Closing Nanny at 'tcp://127.0.0.1:60206'. Reason: nanny-close
2024-12-16 21:17:36,413|INFO|Nanny asking worker to close. Reason: nanny-close
2024-12-16 21:17:36,414|INFO|Closing Nanny at 'tcp://127.0.0.1:60208'. Reason: nanny-close
2024-12-16 21:17:36,415|INFO|Nanny asking worker to close. Reason: nanny-close
2024-12-16 21:17:36,415|INFO|Closing Nanny at 'tcp://127.0.0.1:60210'. Reason: nanny-close
2024-12-16 21:17:36,416|INFO|Nanny asking worker to close. Reason: nanny-close
2024-12-16 21:17:36,417|INFO|Closing Nanny at 'tcp://127.0.0.1:60212'. Reason: nanny-close
2024-12-16 21:17:36,418|INFO|Nanny asking worker to close. 

computation time 85.5 s


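The computation time on a local machine is very similar for both backends (82.3 s with joblib vs. 85.5 s with Dask in the run recorded above). We expect to see an improvement when running on a cloud cluster.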