In [1]:
import os

# Directory holding the benchmark datasets: a "Data" folder that sits next to
# the folder this notebook is run from. Built with os.path so the result does
# not depend on the length of the working directory's name or on the
# platform's path separator. The trailing separator is kept on purpose:
# file names are appended to `data_path` with plain string concatenation.
data_path = os.path.join(os.path.dirname(os.getcwd()), "Data", "")

# Prepare datasets
# NOTE(review): presumably a one-off step that was commented out once the
# parquet files existed. Running it needs `import dask.dataframe as dd`,
# which this cell does not do.
# ddf = dd.read_csv(data_path + "HIGSS.csv")

# ddf.loc[:40000, :].to_parquet(data_path + "small.parquet")
# ddf.to_parquet(data_path + "medium.parquet")
# dd.concat([ddf, ddf]).to_parquet(data_path + "large.parquet")

In [None]:
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client
from dask_ml.linear_model import LinearRegression, LogisticRegression
from lightgbm import DaskLGBMClassifier
from dask_ml.cluster import KMeans
from distributed.client import FutureCancelledError
from time import perf_counter

# Start a client and cluster.
# The decision-tree (LightGBM) benchmark below needs a `client`; while the next
# line stays commented out, that benchmark is skipped.
# NOTE(review): the original note read "Uncomment if causing scheduler issues" -
# presumably the line was commented out because of scheduler issues; confirm.
# client = Client()

SIZES = ["small", "medium", "large"]
N_ITERATIONS = 10


def _timed_fit(model, *fit_args):
    """Fit ``model`` on ``fit_args`` and return the elapsed time in seconds."""
    start_time = perf_counter()
    model.fit(*fit_args)
    end_time = perf_counter()
    return end_time - start_time


# Resolve the optional client once, up front. Only the name lookup is guarded,
# so a NameError raised from inside model construction or fitting is no longer
# swallowed, and the skip is reported instead of happening silently.
try:
    dask_client = client
except NameError:
    dask_client = None
    print("No Dask client defined - decision tree benchmark will be skipped")

res = np.zeros((len(SIZES), 3, N_ITERATIONS))  # Size, model, iteration
for i in range(N_ITERATIONS):
    for j, size in enumerate(SIZES):
        # Prep data
        print(f"{size.capitalize()} data")
        dda = dd.read_parquet(data_path + f"{size}.parquet").to_dask_array(lengths=True)
        print(f"{len(dda.chunks[0])} partitions")

        # Column 0 is passed to the estimators as y, the remaining columns as X
        X = dda[:, 1:]
        y = dda[:, 0]

        # Linear regression
        lin_reg = LinearRegression()
        res[j, 0, i] = _timed_fit(lin_reg, X, y)

        # Logistic regression
        log_reg = LogisticRegression()
        res[j, 1, i] = _timed_fit(log_reg, X, y)

        # Decision trees (the timing is printed only; it is not stored in `res`)
        if dask_client is not None:
            try:
                dec_tree = DaskLGBMClassifier(client=dask_client)
                print(f"\tDecision trees: {_timed_fit(dec_tree, X, y):.3f}")
            except FutureCancelledError as e:
                print("Decision tree skipped on FutureCancelledError")
                print(e)

        # KMeans (unsupervised, so only X is passed)
        kmeans = KMeans(n_clusters=2)
        res[j, 2, i] = _timed_fit(kmeans, X)

In [None]:
# Report the mean recorded fit time for every dataset size and model.
# `res` is indexed as (size, model, iteration).
size_names = ["small", "medium", "large"]

# Choose the labels from the model axis only. Testing `4 in res.shape` would
# also match a result array with four sizes or four iterations and then index
# a model column that does not exist.
if res.shape[1] == 4:
    model_names = ["linear regression", "logistic regression", "decision tree", "k-means"]
else:
    model_names = ["linear regression", "logistic regression", "k-means"]

for i, size in enumerate(size_names):
    print(size)

    for j, model in enumerate(model_names):
        timings = res[i, j, :]
        # Entries equal to zero are excluded from the mean.
        recorded = timings[timings != 0]
        if recorded.size:
            print(f"{model}: {recorded.mean():.3f}")
        else:
            # Avoid the mean of an empty selection (nan plus a RuntimeWarning).
            print(f"{model}: no timings recorded")