In [None]:
import numpy as np
import vaex
from vaex.ml.sklearn import IncrementalPredictor, Predictor
from vaex.ml.xgboost import XGBoostModel
from vaex.ml.cluster import KMeans
from sklearn.linear_model import SGDRegressor, SGDClassifier
from time import perf_counter
import os

# Directory prefix for the benchmark parquet files. It is used by plain string
# concatenation further down, so it must end with a path separator.
# `os.sep` replaces the hard-coded backslash so the prefix is valid on every
# platform (on Windows the value is unchanged).
# NOTE(review): the slice drops the last six characters of the working
# directory, presumably to climb out of the notebook's folder into a sibling
# "Data" folder - confirm the layout and consider making this configurable.
data_path = os.getcwd()[:-6] + "Data" + os.sep

# Column names of the dataset, in order; "label" is the prediction target and
# every other column is used as a model input.
col_names = ["label", "lepton_pT", "lepton_eta", "lepton_phi", 
             "missing_energy_mag", "missing_energy_phi", 
             "jet_1_pt", "jet_1_eta", "jet_1_phi", "jet_1_b_tag",
             "jet_2_pt", "jet_2_eta", "jet_2_phi", "jet_2_b_tag",
             "jet_3_pt", "jet_3_eta", "jet_3_phi", "jet_3_b_tag",
             "jet_4_pt", "jet_4_eta", "jet_4_phi", "jet_4_b_tag",
             "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "b_wbb", "m_wwbb",
             ]

# Everything except the target column.
features = [col_name for col_name in col_names if col_name != "label"]

# Benchmark configuration: how often each fit is repeated, which dataset
# sizes are loaded, and the display label of each benchmarked model (the order
# here is the order of the model axis of `res`).
N_ITERATIONS = 10
SIZES = ["small", "medium", "large"]
MODEL_LABELS = ["Linear regression", "Logistic regression", "Decision tree", "K-Means"]

# scikit-learn renamed the logistic loss from "log" to "log_loss" and later
# removed the old spelling, so use whichever name the installed version knows.
# NOTE(review): this probes the class-level `loss_functions` table; when that
# attribute is missing the modern name is assumed - confirm against the
# scikit-learn version this notebook is pinned to.
_sgd_losses = getattr(SGDClassifier, "loss_functions", None)
log_loss_name = "log" if _sgd_losses is not None and "log_loss" not in _sgd_losses else "log_loss"


def _timed_fit(model, df):
    """Fit ``model`` on ``df`` and return the elapsed wall-clock seconds.

    Parameters
    ----------
    model : object exposing ``fit(df)``
        The estimator/wrapper to train.
    df : the data frame handed to ``model.fit``.

    Returns
    -------
    float
        ``perf_counter()`` difference measured around the ``fit`` call only.
    """
    start_time = perf_counter()
    model.fit(df)
    return perf_counter() - start_time


res = np.zeros((len(SIZES), len(MODEL_LABELS), N_ITERATIONS)) # Size, model, iteration
for i in range(N_ITERATIONS):
    print(f"Iteration {i}")
    for j, size in enumerate(SIZES):
        print(f"\t{size.capitalize()} data")
        df = vaex.open(data_path + f"{size}.parquet")

        # Linear regression full. The squared loss is SGDRegressor's default;
        # leaving `loss` out avoids the "squared_loss" spelling that newer
        # scikit-learn releases no longer accept.
        lin_reg = SGDRegressor()
        vaex_lin_reg = IncrementalPredictor(features=features, target="label", model=lin_reg)

        # Logistic regression
        log_reg = SGDClassifier(loss=log_loss_name)
        vaex_log_reg = IncrementalPredictor(features=features, target="label", model=log_reg, partial_fit_kwargs={'classes':[0, 1]})

        # Decision tree
        dec_tree = XGBoostModel(features=features, target="label", num_boost_round=100)

        # KMeans
        kmeans = KMeans(n_clusters=2, features=features)

        # Fresh models are built for every (iteration, size) pair above; only
        # the fit calls below are timed, in the order of MODEL_LABELS.
        models = [vaex_lin_reg, vaex_log_reg, dec_tree, kmeans]
        for k, (label, model) in enumerate(zip(MODEL_LABELS, models)):
            elapsed = _timed_fit(model, df)
            res[j, k, i] = elapsed
            print(f"\t\t{label}: {elapsed:.3f}")


In [None]:
# Print, per dataset size and per model, the average of the values stored in
# `res` (axes: size, model, iteration).
size_labels = ["small", "medium", "large"]
model_labels = ["linear regression", "logistic regression", "decision tree", "k-means"]

for size_idx, size in enumerate(size_labels):
    print(size)
    for model_idx, model in enumerate(model_labels):
        timings = res[size_idx, model_idx, :]
        # Entries that are exactly 0 are excluded from the average.
        recorded = timings[timings != 0]
        print(f"{model}: {recorded.mean():.3f}")