In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.cluster import MiniBatchKMeans
from time import perf_counter

res = np.zeros((3, 3, 10)) # Size, model, iteration
for i in range(10):
    for j, size in enumerate(["small", "medium", "large"]):
        print(f"{size.capitalize()} data")

        # https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.files
        pqt_files = pq.ParquetDataset(fr"C:\Users\marij\Documents\GitHub\732A76_Research_Project\Data\{size}.parquet").files

        lin_reg = SGDRegressor(loss="squared_error")
        log_reg = SGDClassifier(loss="log_loss")
        kmeans = MiniBatchKMeans(n_clusters=2)

        # Linear regression
        start_time = perf_counter()
        for pqt_file in pqt_files:
            batch = pd.read_parquet(pqt_file)
            X = batch.loc[:, batch.columns != "label"]
            y = batch.loc[:, "label"]

            # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
            lin_reg.partial_fit(X, y)

        end_time = perf_counter()
        res[j, 0, i] = end_time - start_time
            
        # Logistic regression
        start_time = perf_counter()    
        for pqt_file in pqt_files:
            # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
            log_reg.partial_fit(X, y, [0, 1])

        end_time = perf_counter()
        res[j, 1, i] = end_time - start_time

        # K-Means
        start_time = perf_counter()    
        for pqt_file in pqt_files:
            # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans
            kmeans.partial_fit(batch.loc[:, batch.columns != "label"])
        
        end_time = perf_counter()
        res[j, 2, i] = end_time - start_time
        

Small data
	Linear regression: 7.161
	Logistic regression: 5.561
	K-Means: 3.258
Medium data
	Linear regression: 13.160
	Logistic regression: 8.343
	K-Means: 5.942
Large data
	Linear regression: 21.722
	Logistic regression: 16.137
	K-Means: 11.928
Small data
	Linear regression: 6.116
	Logistic regression: 4.687
	K-Means: 2.281
Medium data
	Linear regression: 11.511
	Logistic regression: 8.076
	K-Means: 5.859
Large data
	Linear regression: 20.845
	Logistic regression: 16.364
	K-Means: 11.663
Small data
	Linear regression: 5.496
	Logistic regression: 4.901
	K-Means: 2.690
Medium data
	Linear regression: 10.963
	Logistic regression: 8.161
	K-Means: 5.847
Large data
	Linear regression: 20.650
	Logistic regression: 16.389
	K-Means: 11.699
Small data
	Linear regression: 4.874
	Logistic regression: 4.484
	K-Means: 2.113
Medium data
	Linear regression: 11.217
	Logistic regression: 8.098
	K-Means: 6.164
Large data
	Linear regression: 21.066
	Logistic regression: 16.611
	K-Means: 11.663
Small da

In [7]:
for i in range(3):
    size = ["small", "medium", "large"][i]
    print(size)
    for j in range(3):
        model = ["linear regression", "logistic regression", "k-means"][j]
        print(f"{model}: {res[i, j, :].mean():.3f}")

small
linear regression: 5.172
logistic regression: 4.707
k-means: 2.445
medium
linear regression: 11.388
logistic regression: 8.244
k-means: 5.897
large
linear regression: 20.910
logistic regression: 16.473
k-means: 11.739
