In [8]:
import numpy as np
import vaex
from vaex.ml.sklearn import IncrementalPredictor, Predictor
from vaex.ml.xgboost import XGBoostModel
from vaex.ml.cluster import KMeans
from sklearn.linear_model import SGDRegressor, SGDClassifier
from time import perf_counter

# Column names used throughout this notebook. "label" is the prediction target
# handed to the models; every other name is used as a model input.
# NOTE(review): "b_wbb" breaks the m_* naming pattern of its neighbours and may
# be a typo for "m_wbb" — it has to match the column name stored in the data
# files, so confirm before changing it.
col_names = [
    "label",
    "lepton_pT", "lepton_eta", "lepton_phi",
    "missing_energy_mag", "missing_energy_phi",
    # Expands to jet_1_pt, jet_1_eta, jet_1_phi, jet_1_b_tag, ..., jet_4_b_tag.
    *[f"jet_{jet}_{quantity}"
      for jet in range(1, 5)
      for quantity in ("pt", "eta", "phi", "b_tag")],
    "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "b_wbb", "m_wwbb",
]

# Model inputs: every column except the target.
features = [name for name in col_names if name != "label"]

# ---------------------------------------------------------------------------
# Benchmark: wall-clock fit time of four vaex.ml models on three data sizes.
# ---------------------------------------------------------------------------

# Folder that holds small.parquet, medium.parquet and large.parquet.
# Change this one constant to run the benchmark on another machine.
DATA_DIR = r"C:\Users\marij\Documents\GitHub\732A76_Research_Project\Data"
SIZES = ["small", "medium", "large"]
N_ITERATIONS = 10


def _make_linear_regression():
    """Return a fresh, unfitted SGD regressor wrapped for incremental fitting."""
    # NOTE(review): newer scikit-learn releases spell this loss "squared_error";
    # kept unchanged to match the environment this notebook ran in — confirm
    # against the installed version before upgrading.
    return IncrementalPredictor(features=features, target="label",
                                model=SGDRegressor(loss="squared_loss"))


def _make_logistic_regression():
    """Return a fresh, unfitted SGD classifier wrapped for incremental fitting."""
    # NOTE(review): newer scikit-learn releases spell this loss "log_loss";
    # kept unchanged for the same reason as above — confirm before upgrading.
    return IncrementalPredictor(features=features, target="label",
                                model=SGDClassifier(loss="log"),
                                partial_fit_kwargs={'classes': [0, 1]})


def _make_boosted_trees():
    """Return a fresh XGBoost model trained for 100 boosting rounds."""
    return XGBoostModel(features=features, target="label", num_boost_round=100)


def _make_kmeans():
    """Return a fresh two-cluster vaex KMeans model."""
    return KMeans(n_clusters=2, features=features)


# (printed label, factory) pairs in the order they are timed. The position in
# this list is the index along axis 1 of `res`.
MODELS = [
    ("Linear regression", _make_linear_regression),
    ("Logistic regression", _make_logistic_regression),
    ("Decision tree", _make_boosted_trees),
    ("K-Means", _make_kmeans),
]


def _time_fit(model, df):
    """Return the seconds of wall-clock time spent in ``model.fit(df)``.

    Only the ``fit`` call is timed; building the model happens outside.
    """
    started = perf_counter()
    model.fit(df)
    return perf_counter() - started


# res[size, model, iteration] holds the fit time in seconds. Entries stay 0
# until the corresponding run has finished.
res = np.zeros((len(SIZES), len(MODELS), N_ITERATIONS))
for i in range(N_ITERATIONS):
    print(f"Iteration {i}")
    for j, size in enumerate(SIZES):
        print(f"\t{size.capitalize()} data")
        # The file is re-opened for every (iteration, size) pair.
        df = vaex.open(fr"{DATA_DIR}\{size}.parquet")

        for k, (label, make_model) in enumerate(MODELS):
            # A new model per run, so no fitted state carries over.
            elapsed = _time_fit(make_model(), df)
            res[j, k, i] = elapsed
            print(f"\t\t{label}: {elapsed:.3f}")


Iteration 0
	Small data
		Linear regression: 12.114
		Logistic regression: 13.328
		Decision tree: 565.271
		K-Means: 32.043
	Medium data
		Linear regression: 8.215
		Logistic regression: 9.559
		Decision tree: 1396.933
		K-Means: 18.213
	Large data
		Linear regression: 28.449
		Logistic regression: 28.574
		Decision tree: 3046.837
		K-Means: 55.000
Iteration 1
	Small data
		Linear regression: 10.974
		Logistic regression: 7.473
		Decision tree: 594.051
		K-Means: 31.516
	Medium data
		Linear regression: 11.057
		Logistic regression: 9.598
		Decision tree: 1417.637
		K-Means: 36.078
	Large data
		Linear regression: 36.705
		Logistic regression: 28.775
		Decision tree: 3031.352
		K-Means: 1367.276
Iteration 2
	Small data
		Linear regression: 17.119
		Logistic regression: 13.334
		Decision tree: 537.810
		K-Means: 29.246
	Medium data
		Linear regression: 11.394
		Logistic regression: 8.418
		Decision tree: 1487.469
		K-Means: 50.907
	Large data
		Linear regression: 30.867
		Logistic regr

In [1]:
# Print the mean fit time for every (data size, model) pair.
# Requires the `res` array (indexed size, model, iteration) produced by the
# benchmark cell to be present in the kernel namespace. Entries equal to 0 are
# excluded from the mean.
size_names = ["small", "medium", "large"]
model_names = ["linear regression", "logistic regression", "decision tree", "k-means"]

for size_idx, size_name in enumerate(size_names):
    print(size_name)
    for model_idx, model_name in enumerate(model_names):
        timings = res[size_idx, model_idx, :]
        completed = timings[timings != 0]
        print(f"{model_name}: {completed.mean():.3f}")

small


NameError: name 'res' is not defined

In [4]:
import numpy as np

# Fit times in seconds, typed in as literals. The leading values of each list
# match the visible output of the benchmark cell, so these appear to have been
# copied from that printout — verify against the full log if one exists.
# Layout: {printed section header: {printed model header: [one time per run]}}.
# The header strings are printed verbatim (the first one keeps its trailing
# space), so the output of this cell is unchanged.
TIMINGS = {
    "Small data": {
        "Linear regression: ": [12.114, 10.974, 17.119, 43.917, 20.604, 27.727, 37.668, 33.948, 32.251, 20.757],
        "Logistic regression:": [13.328, 7.473, 13.334, 7.476, 15.842, 10.466, 12.795, 16.466, 12.386, 8.270],
        "Decision tree:": [565.271, 594.051, 537.810, 505.332, 487.438, 573.442, 595.534, 489.217, 524.235, 503.697],
        "K-Means:": [32.043, 31.516, 29.246, 39.141, 25.819, 53.099, 43.302, 60.497, 48.799, 25.564],
    },
    "Medium data": {
        "Linear regression:": [8.215, 11.057, 11.394, 14.222, 10.247, 11.555, 10.879, 17.234, 13.189, 19.980],
        "Logistic regression:": [9.559, 9.598, 8.418, 10.304, 9.039, 9.872, 9.505, 9.444, 10.388, 9.758],
        "Decision tree:": [1396.933, 1417.637, 1487.469, 1539.654, 1378.104, 1517.391, 1305.736, 1330.482, 1281.546, 1753.627],
        "K-Means:": [18.213, 36.078, 50.907, 292.212, 622.317, 40.925, 43.877, 1274.555, 60.292, 51.888],
    },
    "Large data": {
        "Linear regression:": [36.705, 28.449, 36.583, 30.867, 44.363, 46.528, 40.161, 43.089, 55.164, 42.834],
        "Logistic regression:": [28.574, 28.775, 25.725, 28.188, 39.712, 29.338, 27.631, 34.099, 30.600, 28.473],
        "Decision tree:": [3046.837, 3031.352, 3256.530, 3298.226, 3592.587, 3446.716, 3810.928, 3490.577, 3796.828, 3526.393],
        "K-Means:": [55.000, 1367.276, 184.639, 108.510, 202.190, 594.889, 487.622, 406.032, 107.754, 111.385],
    },
}

# Mean fit time per (section header, model name), kept so the numbers can be
# reused or checked instead of only being printed.
mean_times = {}
for size_header, per_model in TIMINGS.items():
    print(size_header)
    for model_header, seconds in per_model.items():
        mean_seconds = np.mean(seconds)
        # Key by the bare model name: drop the trailing ":" and any space.
        mean_times[size_header, model_header.rstrip(": ")] = mean_seconds
        print(model_header)
        print(mean_seconds)

Small data
Linear regression: 
25.707900000000002
Logistic regression:
11.7836
Decision tree:
537.6027
K-Means:
38.9026
Medium data
Linear regression:
12.7972
Logistic regression:
9.5885
Decision tree:
1440.8579
K-Means:
249.12639999999996
Large data
Linear regression:
40.4743
Logistic regression:
30.1115
Decision tree:
3429.6974
K-Means:
362.52970000000005
