Code along for article

In [1]:
%matplotlib inline

In [2]:
import os
import subprocess
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

In [3]:
# Seed NumPy's global random generator so the bootstrap draws made later in the
# notebook repeat from run to run.
np.random.seed(42)

#### Generate bootstrap data

In [4]:
def bootstrap(df, n, to_df=True):
    """
    Draw a bootstrap sample of n rows, with replacement, from a DataFrame.

    Row positions are drawn once and applied to every column, so the values
    of each sampled row stay together and the relationship between columns
    (e.g. predictors and response) is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to resample; must contain at least one row.
    n : int
        Number of rows to draw (converted with int()).
    to_df : bool, default True
        If True return a DataFrame; otherwise return a dict mapping each
        column name to a NumPy array of the sampled values.

    Returns
    -------
    pandas.DataFrame or dict
        The bootstrap sample, with the same columns as df.

    Raises
    ------
    AssertionError
        If df is not a pandas.DataFrame.
    """
    assert isinstance(df, pd.DataFrame),\
        f"Expected pandas.DataFrame, got type: {type(df)}"

    # A single shared draw of row positions (from NumPy's global RNG) keeps
    # rows intact; sampling each column separately would shuffle the columns
    # against each other.
    rows = np.random.randint(0, len(df), size=int(n))
    sample = {column: df[column].to_numpy()[rows] for column in df.columns}
    if to_df:
        sample = pd.DataFrame.from_dict(sample)

    return sample


#### Get the mean fit time over k-fold cross-validation

In [5]:
def test_model(model, sample, k=5):
    """
    get the mean time it takes to train the model over k-folds
    
    """
    x = sample.copy()  # set our predictors
    y = x.pop('y')  # set our response
    mean_fit_time = np.mean(cross_validate(model, x, y, cv=k)['fit_time']) * 100  # get mean run time

    return mean_fit_time


#### Evaluate the run times across all cores

In [6]:
def evaluate_cores(model, data, n_samples, stride=1000, printer=False):
    """
    Time model fitting for every n_jobs setting over growing sample sizes.

    One model instance is built per n_jobs value from 1 up to the number of
    CPUs reported by os.cpu_count(). For each sample size a single bootstrap
    sample is drawn and every instance is timed on it with test_model.

    Parameters
    ----------
    model : callable
        Class or factory called as model(n_jobs=n) to build each instance.
    data : pandas.DataFrame
        Source data handed to bootstrap for resampling.
    n_samples : int
        Upper end of the sample sizes; sizes are
        range(stride, n_samples + stride, stride).
    stride : int, default 1000
        Step between consecutive sample sizes.
    printer : bool, default False
        If True, also print a start banner and per-model progress lines.
        The "n=..." line for each sample size is printed regardless.

    Returns
    -------
    pandas.DataFrame
        One row per sample size: a column "model_<n_jobs>" per instance
        holding the value returned by test_model, plus column 'N' with the
        sample size.
    """
    N = list(range(stride, n_samples + stride, stride))  # sample sizes to benchmark

    # os.cpu_count() works on every platform (the sysctl call was macOS/BSD
    # only); it may return None, in which case fall back to a single core.
    cores = os.cpu_count() or 1

    model_dict = {f"model_{n}": model(n_jobs=n) for n in range(1, cores + 1)}  # name -> instance
    data_dict = {name: [] for name in model_dict}  # name -> recorded times

    if printer: print("Starting evaluation...")
    for n in N:
        print(f"\n  n={n}")
        sample = bootstrap(data, n=n, to_df=True)  # same sample for every model at this size

        for name, estimator in model_dict.items():
            if printer: print(f"\tTraining model {name}")
            start = time.perf_counter()
            data_dict[name].append(test_model(estimator, sample))
            # wall-clock time of the whole cross-validation run, not the mean fit time
            if printer: print(f"\t  - elapsed: {(time.perf_counter() - start):.2f} s")

    data_dict['N'] = N

    return pd.DataFrame.from_dict(data_dict)


#### Visualize the results

In [7]:
def visualize(data, save=False):
    """
    Plot training time against sample size, one line per timing column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain column 'N' (x-values); every other column is drawn as a
        separate line. Legend entries are numbered n_jobs=1, 2, ... in
        column order. The frame is not modified.
    save : bool, default False
        If True, also write the figure to 'spread_the_love_new.png' at
        250 dpi before showing it.

    Returns
    -------
    None
    """
    y = data.copy()  # copy so popping 'N' leaves the caller's frame intact
    x = y.pop('N')

    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_title("Average Model Training Time")
    ax.set_xlabel('Sample Size ($n$)')
    ax.set_ylabel('Time (ms)')
    # one label per plotted column ('N' has already been removed from y)
    ax.legend([f'n_jobs={n}' for n in range(1, len(y.columns) + 1)])

    # only touch the filesystem when the caller asked for it
    if save:
        fig.savefig('spread_the_love_new.png', dpi=250)

    plt.show()


#### Run the program

In [None]:
# Load the dataset to benchmark on; it needs a 'y' column, which test_model
# pops off as the response.
df = pd.read_csv('bank-clean.csv')

In [None]:
# Benchmark a random forest at sample sizes 10,000 through 250,000 in steps of
# 10,000, with one model instance per n_jobs setting.
results = evaluate_cores(RandomForestClassifier, df, 250000, stride=10000)


  n=10000

  n=20000

  n=30000

  n=40000

  n=50000

  n=60000

  n=70000

  n=80000

  n=90000

  n=100000


In [None]:
results.describe()  # summary statistics for each model's recorded times (and for N)

In [None]:
visualize(results)  # plot recorded time against sample size, one line per n_jobs setting