In [1]:
from platform import python_version

# Record the Python interpreter version this notebook was executed with.
print(python_version())

3.7.7


In [2]:
import fklearn

# Display the version of the imported fklearn package.
fklearn.__version__

'1.18.0'

In [3]:
import numpy as np

# Seed NumPy's global random number generator.
np.random.seed(42)

In [4]:
from fklearn.data.datasets import make_tutorial_data
from fklearn.preprocessing.splitting import space_time_split_dataset
from concurrent.futures import ThreadPoolExecutor

In [5]:
# Date boundaries handed to the splitter below.
TRAIN_START_DATE = "2015-01-01"
TRAIN_END_DATE = "2015-03-01"
HOLDOUT_END_DATE = "2015-04-01"

# Splitter configured once here and reused by the timing cells.
split_fn = space_time_split_dataset(
    train_start_date=TRAIN_START_DATE,
    train_end_date=TRAIN_END_DATE,
    holdout_end_date=HOLDOUT_END_DATE,
    space_holdout_percentage=0.5,
    split_seed=42,
    space_column="id",
    time_column="date",
)

In [6]:
def measure_time_split_fn(sample_size):
    """
    Times ``split_fn`` on the first ``sample_size`` rows of ``data``.

    ``split_fn`` and ``data`` are not parameters: they are looked up from the
    notebook namespace when the function is called, so whatever objects are
    bound to those names at that moment are the ones being timed.

    Parameters
    ----------
    sample_size : int
        Number of leading rows of ``data`` passed to ``split_fn``.

    Returns
    ----------
    result : TimeitResult
        The object produced by ``%timeit -o``.
    """
    # -r 1: a single repeat; -o: return the result object in addition to printing it.
    result = %timeit -r 1 -o split_fn(data[:sample_size])
    return result

In [7]:
# Row counts to benchmark, smallest to largest.
sample_sizes = [10_000, 100_000, 250_000, 500_000, 750_000, 1_000_000]
# Build the largest dataset once; each timing run slices its leading rows.
data = make_tutorial_data(sample_sizes[-1])

In [8]:
# Sequential baseline: time the split once per sample size, one run at a time.
# The measurement goes through measure_time_split_fn so this cell and the
# threaded runs share a single definition of what is being timed.
# `.best` is the fastest per-loop time of the (single) repeat.
old_function_times = [
    measure_time_split_fn(sample_size).best for sample_size in sample_sizes
]

7.12 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 100 loops each)
26.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)
54.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)
102 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)
152 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)
204 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [9]:
# Best per-loop time, in seconds, for each entry of sample_sizes.
old_function_times

[0.007119678000000001,
 0.026497800000000016,
 0.05441616999999992,
 0.10246849999999999,
 0.15187872999999996,
 0.2037241999999999]

In [10]:
# Run the timing helper for every sample size concurrently on a thread pool.
# Executor.map returns a one-shot iterator, and an exception raised in a worker
# only surfaces when its result is read from that iterator. Materialising the
# results inside the `with` block makes failures show up in this cell and
# leaves `result` as a list that can be inspected more than once.
with ThreadPoolExecutor() as executor:
    result = list(executor.map(measure_time_split_fn, sample_sizes))

985 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1.01 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)939 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

947 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
876 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
149 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [11]:
# TimeitResult objects, in the same order as sample_sizes.
list(result)

[<TimeitResult : 149 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)>,
 <TimeitResult : 985 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>,
 <TimeitResult : 1.01 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>,
 <TimeitResult : 939 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>,
 <TimeitResult : 947 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>,
 <TimeitResult : 876 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>]

In [12]:
import numpy as np
from numpy import nan
import pandas as pd


def new_make_tutorial_data(n: int) -> pd.DataFrame:
    """
    Generates fake data for a tutorial. The dataset has an entity identifier ("id"),
    a timestamp ("date"), two numerical features ("feature1" and "feature2"),
    one categorical feature ("feature3") and a numerical "target".

    The target is feature1 + feature2 + an offset that depends on feature3
    (0 for "a", 30 for "b", 10 for "c") + gaussian noise with standard deviation 5.
    After the target is computed, NaNs are written into "feature1" and into
    "feature3" at up to 100 randomly drawn rows each (fewer when draws repeat).

    Note: this function seeds numpy's global random generator with 1111 so that
    its output is reproducible; any seed set by the caller beforehand is overwritten.

    Parameters
    ----------
    n : int
        The number of samples to generate. With n == 0 an empty dataset with
        the same columns is returned.
    Returns
    ----------
    df : pd.DataFrame
        A tutorial dataset
    """
    np.random.seed(1111)

    # The draws below happen in a fixed order; reordering them would change
    # the generated values for a given seed.
    dataset = pd.DataFrame({
        "id": ["id%d" % i for i in np.random.randint(0, 1000000, n)],
        "date": np.random.choice(pd.date_range("2015-01-01", periods=100), n),
        "feature1": np.random.gamma(20, size=n),
        "feature2": np.random.normal(40, size=n),
        "feature3": np.random.choice(["a", "b", "c"], size=n)})

    # Dictionary lookup instead of a per-row Python lambda; feature3 only
    # contains "a", "b" or "c" at this point, so every row gets an offset.
    category_offset = dataset["feature3"].map({"a": 0, "b": 30, "c": 10})

    dataset["target"] = (dataset["feature1"]
                         + dataset["feature2"]
                         + category_offset
                         + np.random.normal(0, 5, size=n))

    # insert some NANs (skipped for an empty dataset: randint(0, 0, ...) raises)
    if n > 0:
        dataset.loc[np.random.randint(0, n, 100), "feature1"] = nan
        dataset.loc[np.random.randint(0, n, 100), "feature3"] = nan

    return dataset

In [13]:
# Eight identical workloads of 10,000 rows each, so the concurrent timings
# below are directly comparable with one another.
sample_sizes = [10_000] * 8
data = new_make_tutorial_data(sample_sizes[-1])

In [14]:
# Run the timing helper for every entry of sample_sizes concurrently.
# Executor.map only raises a worker's exception when its result is read from
# the returned iterator, so the results are materialised inside the `with`
# block: failures surface in this cell instead of being dropped silently, and
# `result` is a list that can be inspected more than once.
with ThreadPoolExecutor() as executor:
    result = list(executor.map(measure_time_split_fn, sample_sizes))

2.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.45 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.46 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.48 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)2.48 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

