In [2]:
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

In [15]:
# Let's define a toy problem:
def costly_simulation(list_param):
    """Stand-in for an expensive computation: wait a bit, then add the inputs.

    The call blocks for ``np.random.random()`` seconds (a float drawn from
    [0, 1)) before returning, which mimics a slow simulation step.

    Parameters
    ----------
    list_param : iterable of numbers
        The values to add up.

    Returns
    -------
    The result of the built-in ``sum`` applied to ``list_param``.
    """
    delay_seconds = np.random.random()
    time.sleep(delay_seconds)
    total = sum(list_param)
    return total

# Draw the parameter table from a seeded, local random generator so that a
# fresh "Restart Kernel -> Run All" reproduces exactly the same 30 x 4 inputs.
PARAM_SEED = 0
param_rng = np.random.default_rng(PARAM_SEED)
input_params = pd.DataFrame(
    param_rng.random(size=(30, 4)),  # uniform floats in [0, 1)
    columns=['param_a', 'param_b', 'param_c', 'param_d'],
)

# Preview only the first rows rather than dumping the whole frame.
input_params.head(10)

Unnamed: 0,param_a,param_b,param_c,param_d
0,0.983522,0.938315,0.242548,0.373498
1,0.103591,0.315438,0.6667,0.418915
2,0.727865,0.038372,0.767575,0.073396
3,0.564838,0.983012,0.247394,0.953011
4,0.85714,0.98819,0.595069,0.074358
5,0.982168,0.582689,0.345748,0.531498
6,0.921376,0.707658,0.400578,0.771813
7,0.856137,0.847027,0.965089,0.930372
8,0.869067,0.068355,0.965709,0.473567
9,0.724317,0.891009,0.548786,0.702796


In [21]:
# Let's see the vanilla for-loop approach:

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start_time = time.perf_counter()
sequential_results = []
# iterrows() is a generator without a length, so pass the total explicitly;
# otherwise tqdm can only show a counter instead of a progress bar with an ETA.
for _, param_row in tqdm(input_params.iterrows(), total=len(input_params)):
    # Each item yielded by iterrows() is (index, row); only the row's values
    # are fed to the simulation.
    sequential_results.append(costly_simulation(param_row.values))
stop_time = time.perf_counter()

print(f"The sequential approach took {stop_time - start_time} seconds")

30it [00:14,  2.02it/s]

The sequential approach took 14.872225999832153 seconds





In [22]:
# This is how easy it would be to use joblib to solve the same problem:

# perf_counter() is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()
# n_jobs=-1 asks joblib to use all available CPU cores.
# NOTE: tqdm wraps the generator of tasks, so the bar advances as joblib
# consumes (dispatches) the tasks, not as the simulations finish. It will
# therefore typically reach 100% well before the results are available.
parallel_results = Parallel(verbose=0, n_jobs=-1)(
    delayed(costly_simulation)(param_row.values)
    for _, param_row in tqdm(input_params.iterrows(), total=len(input_params))
)
stop_time = time.perf_counter()

print(f"The parallel approach took {stop_time - start_time} seconds")

30it [00:00, 164.65it/s]


The parallel approach took 2.630764961242676 seconds


In [23]:
# Sanity check: the parallel run must yield exactly the same values, in the
# same order, as the sequential run.
np.array_equal(
    np.asarray(parallel_results),
    np.asarray(sequential_results),
)

True