In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.cluster import MiniBatchKMeans
from time import perf_counter
import os

# Directory that holds the "<size>.parquet" datasets.
# NOTE(review): this drops the last 6 characters of the working directory and
# appends a Windows-style "Data\\" suffix, so it presumably expects the notebook
# to be run on Windows from a folder with a 6-character name that sits next to
# "Data" -- confirm before running elsewhere.
data_path = os.getcwd()[:-6] + "Data\\"

# Elapsed perf_counter() time of one full streaming pass over a dataset,
# indexed as res[size, model, iteration].
res = np.zeros((3, 3, 10)) # Size, model, iteration
for i in range(10):
    for j, size in enumerate(["small", "medium", "large"]):
        # https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.files
        pqt_files = pq.ParquetDataset(data_path + f"{size}.parquet").files

        # Fresh estimators for every (iteration, size) pair so that no state
        # learned in an earlier run carries over into the next timing.
        lin_reg = SGDRegressor(loss="squared_error")
        log_reg = SGDClassifier(loss="log_loss")
        kmeans = MiniBatchKMeans(n_clusters=2)

        # Every timed loop below loads each file itself. Previously only the
        # linear regression loop read the data; the other two loops re-fitted
        # on whatever batch was left over from its last file, so they trained
        # on one batch repeatedly and their timings excluded the parquet read.

        # Linear regression
        start_time = perf_counter()
        for pqt_file in pqt_files:
            batch = pd.read_parquet(pqt_file)
            X = batch.loc[:, batch.columns != "label"]
            y = batch.loc[:, "label"]

            # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
            lin_reg.partial_fit(X, y)

        end_time = perf_counter()
        res[j, 0, i] = end_time - start_time

        # Logistic regression
        start_time = perf_counter()
        for pqt_file in pqt_files:
            batch = pd.read_parquet(pqt_file)
            X = batch.loc[:, batch.columns != "label"]
            y = batch.loc[:, "label"]

            # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
            # The full set of classes is passed on every call because a single
            # batch is not guaranteed to contain all of them.
            # NOTE(review): assumes "label" only takes the values 0 and 1 -- confirm.
            log_reg.partial_fit(X, y, classes=[0, 1])

        end_time = perf_counter()
        res[j, 1, i] = end_time - start_time

        # K-Means
        start_time = perf_counter()
        for pqt_file in pqt_files:
            batch = pd.read_parquet(pqt_file)
            X = batch.loc[:, batch.columns != "label"]

            # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans
            # Clustering is unsupervised, so only the feature columns are used.
            kmeans.partial_fit(X)

        end_time = perf_counter()
        res[j, 2, i] = end_time - start_time
        

In [None]:
# Print, for each dataset size, the timing of every model averaged over the
# last axis of `res` (the repeated iterations), formatted to three decimals.
for i, size in enumerate(["small", "medium", "large"]):
    print(size)
    for j, model in enumerate(["linear regression", "logistic regression", "k-means"]):
        mean_time = res[i, j, :].mean()
        print(f"{model}: {mean_time:.3f}")