In [1]:
# NOTE: the star import stays first so the explicit imports below win on any name clash.
from KMeans import *

import gc
import warnings

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from openml.datasets import list_datasets, get_datasets
from scipy import stats
from sklearn.linear_model import LinearRegression

## Flags

In [2]:
# When True, the dataset-retrieval cell queries OpenML again for candidate datasets
# and rewrites ./data/dataset_ids.csv before reading it back.
search = False
# When True, cells guarded by `if measure:` re-run their timing loops and overwrite
# the corresponding CSV files before the results are loaded and plotted.
measure = True

## Dataset retrieval

In [3]:
if search:
    # Candidate filter: between 100k and 1M instances, between 4 and 24 numeric
    # features, no missing values and no symbolic features.
    query = "NumberOfInstances > 100000 &\
             NumberOfInstances < 1000000 &\
             NumberOfNumericFeatures > 3 &\
             NumberOfNumericFeatures < 25 &\
             NumberOfMissingValues == 0 &\
             NumberOfSymbolicFeatures == 0"

    dataset_dataframe = list_datasets(output_format="dataframe").query(query)
    # Keep a single dataset per name and per feature count, ordered by feature count.
    dataset_dataframe = (
        dataset_dataframe
        .drop_duplicates(['name'])
        .drop_duplicates(['NumberOfNumericFeatures'])
        .sort_values(by=['NumberOfNumericFeatures'])
    )
    # Persist the first ten ids so later runs can skip the search.
    dataset_ids = dataset_dataframe['did'].head(10)
    dataset_ids.to_csv('./data/dataset_ids.csv', index=False)

# The ids are always read back from disk, whether or not the search just ran.
dataset_ids = pd.read_csv('./data/dataset_ids.csv')['did']
dataset_list = get_datasets(dataset_ids=dataset_ids)
# One numeric-only feature matrix per dataset.
X_list = [
    dataset.get_data()[0].select_dtypes([np.number]).to_numpy()
    for dataset in dataset_list
]

## Plot definitions

In [4]:
# Title offset passed as `pad` to `ax.set_title` by the plot helpers (matplotlib expresses it in points)
pad = 20
# Figure size passed as `figsize` to `plt.figure` by the plot helpers (matplotlib expresses it in inches)
figsize = (10,8)

In [5]:
def boxplot(df, columns, title, xlabel, ylabel, tick_freq=None):
    """Show a figure with one box per selected column of `df`.

    The frame is melted to long form, so each entry of `columns` becomes a
    category on the x axis and its values are summarised by a box.

    Parameters
    ----------
    df : DataFrame containing the columns to plot.
    columns : column names forwarded as `value_vars` to the melt.
    title, xlabel, ylabel : figure title and axis labels.
    tick_freq : spacing of the major y ticks; when None the default
        matplotlib locator is left untouched.
    """
    plt.figure(figsize=figsize)
    long_df = df.melt(value_vars=columns)
    ax = sns.boxplot(data=long_df, x="variable", y="value")
    if tick_freq is not None:
        ax.yaxis.set_major_locator(ticker.MultipleLocator(tick_freq))
    ax.set_title(title, pad=pad)
    ax.set(xlabel=xlabel, ylabel=ylabel)
    plt.show()

In [6]:
def scatterplot(df, x, y, title, xlabel, ylabel):
    """Show a scatter plot of column `y` against column `x` of `df`.

    Parameters
    ----------
    df : DataFrame containing both columns.
    x, y : names of the columns used for the horizontal and vertical axes.
    title, xlabel, ylabel : figure title and axis labels.
    """
    plt.figure(figsize=figsize)
    ax = sns.scatterplot(x=x, y=y, data=df)
    ax.set_title(title, pad=pad)
    ax.set(xlabel=xlabel, ylabel=ylabel)
    plt.show()

In [7]:
def getPlots(df, speculation=False):
    """Render the standard set of timing plots for one measurement table.

    Produces, in order: box-plots of the median phase times, box-plots of the
    derived time ratios, and scatter plots of the tA/tB ratio against the
    problem size (k, n, d) and against cluster-size statistics.

    Note: `df` is modified in place. The derived columns `ratio_AB_median`
    and `ratio_max_min_cluster_size` are added to it, plus
    `ratio_A_correction_median`, `ratio_B_correction_median` and
    `ratio_B_speculation_median` when `speculation` is True.

    Parameters
    ----------
    df : DataFrame with the columns `t_A_median`, `t_B_median`, `n_clusters`,
        `n_datapoints`, `n_features`, `median_cluster_size`,
        `min_cluster_size` and `max_cluster_size`. When `speculation` is True
        it must also contain `t_correction_median` and `t_speculation_median`.
    speculation : when True, additionally plot the correction and
        speculation timings and their ratios.
    """
    sns.set_context("paper", font_scale=1.5)

    # LaTeX fragments shared by several titles and labels
    ratio_ab = r'$\frac{median(t_A)}{median(t_B)}$'
    skew = r'$\frac{max(cluster-size)}{min(cluster-size)}$'

    df['ratio_AB_median'] = df['t_A_median']/df['t_B_median']

    # Box-plot of medians
    if speculation:
        boxplot(df, columns = ['t_A_median', 't_B_median', 't_correction_median', 't_speculation_median'], title = 'Average execution time of A, B, correction and speculation', xlabel = 'Task', ylabel = 'Execution time in ns', tick_freq=0.05)
    else:
        boxplot(df, columns = ['t_A_median', 't_B_median'],  title = 'Average execution time of A, B', xlabel = 'Task', ylabel = 'Execution time in ns', tick_freq=0.05)

    # Box-plot of ratios
    if speculation:
        # Derive the ratios on the frame that was passed in, not on a notebook-level global,
        # so the function works for any table and on a fresh kernel.
        df['ratio_A_correction_median'] = df['t_A_median']/df['t_correction_median']
        df['ratio_B_correction_median'] = df['t_B_median']/df['t_correction_median']
        df['ratio_B_speculation_median'] = df['t_B_median']/df['t_speculation_median']
        boxplot(df, columns = ['ratio_AB_median'], title = 'Ratio ' + ratio_ab, xlabel = 'Task', ylabel = 'Ratio value', tick_freq=20)
        boxplot(df, columns = ['ratio_A_correction_median'], title = r'Ratio $\frac{median(t_A)}{median(t_{correction})}$', xlabel = 'Task', ylabel = 'Ratio value', tick_freq=20)
        boxplot(df, columns = ['ratio_B_correction_median', 'ratio_B_speculation_median'], title = r'Ratio $\frac{median(t_B)}{median(t_{correction})}$ and Ratio $\frac{median(t_B)}{median(t_{speculation})}$', xlabel = 'Task', ylabel = 'Ratio value', tick_freq=5)
    else:
        boxplot(df, columns = ['ratio_AB_median'], title = 'Ratio ' + ratio_ab, xlabel = 'Task', ylabel = 'Ratio value', tick_freq=5)

    # Scatter plots - problem size (k, n, d)
    scatterplot(df, x='n_clusters', y='ratio_AB_median', title='Relation between n_clusters and ' + ratio_ab, xlabel='k', ylabel=ratio_ab)
    scatterplot(df, x='n_datapoints', y='ratio_AB_median', title='Relation between n_datapoints and ' + ratio_ab, xlabel='n', ylabel=ratio_ab)
    scatterplot(df, x='n_features', y='ratio_AB_median', title='Relation between n_features and ' + ratio_ab, xlabel='d', ylabel=ratio_ab)
    #scatterplot(df.groupby(['n_clusters']).mean(), x='n_clusters', y='ratio_AB_median', title=r'Relation between n_clusters and $\frac{median(t_A)}{median(t_B)} - aggregation over all datasets $', xlabel='Number of clusters', ylabel=r'$\frac{median(t_A)}{median(t_B)}$')
    #scatterplot(df[df.n_datapoints == df.iloc[0].n_datapoints], x='n_clusters', y='ratio_AB_median', title=r'Relation between n_clusters and $\frac{median(t_A)}{median(t_B)} - fixed dataset $', xlabel='Number of clusters', ylabel=r'$\frac{median(t_A)}{median(t_B)}$')

    # Scatter plots - cluster sizes
    df['ratio_max_min_cluster_size'] = df['max_cluster_size']/df['min_cluster_size']
    scatterplot(df, x='median_cluster_size', y='ratio_AB_median', title='Relation between median(cluster_size) and ' + ratio_ab, xlabel='Average cluster size', ylabel=ratio_ab)
    scatterplot(df, x='ratio_max_min_cluster_size', y='ratio_AB_median', title='Relation between skew in cluster size (i.e:  ' + skew + ') and  ' + ratio_ab, xlabel='Skew: ' + skew, ylabel=ratio_ab)
    scatterplot(df, x='ratio_max_min_cluster_size', y='t_A_median', title='Relation between skew in cluster size (i.e:  ' + skew + ') and median(t_A)', xlabel='Skew: ' + skew, ylabel='median(t_A)')
    # The original title stopped after the skew formula; name the plotted quantity like the t_A plot does.
    scatterplot(df, x='ratio_max_min_cluster_size', y='t_B_median', title='Relation between skew in cluster size (i.e: ' + skew + ') and median(t_B)', xlabel='Skew: ' + skew, ylabel='median(t_B)')

## Helpers

In [8]:
def fit_linear_regression(df, x_name, y_name):
    """Fit a `LinearRegression` of column `y_name` on column `x_name` and report it.

    The single predictor is reshaped into a one-column matrix before fitting.
    The fitted coefficient and intercept are printed.

    Parameters
    ----------
    df : DataFrame containing both columns.
    x_name : name of the predictor column.
    y_name : name of the target column.

    Returns
    -------
    The fitted `LinearRegression` model.
    """
    predictor = df[x_name].to_numpy().reshape(-1, 1)
    target = df[y_name].to_numpy()

    model = LinearRegression().fit(predictor, target)

    report = (
        f'Model with x: {x_name}, y: {y_name}',
        f'\tCoefficient: {model.coef_}',
        f'\tIntercept: {model.intercept_}',
    )
    for line in report:
        print(line)
    return model

In [9]:
def clean_dataset(df):
    """Return a copy of `df` without the rows that contain inf, -inf or NaN.

    Infinite values are first mapped to NaN so a single `dropna` removes
    every non-finite row. The input frame is left untouched.
    """
    return df.replace([np.inf, -np.inf], np.nan).dropna()

## Relation tA/tB, n, k

The implementations of kmeans should have the following complexities:
- Assignment: **O(ndk)**
- Update: **O(d(n+k))**

Ideally the complexity of the update should be O(nd), since we are only doing nd sums. However, we need some extra work to separate the datapoints using the labels, and this extra work depends on k. Depending on the implementation, the update may have complexity O(ndk) or O(d(n+k)). The fastest implementation in numpy is O(d(n+k)).

We therefore expect tA/tB = O(nk/(n+k)).
However, we usually have n >> k, and therefore O(nk/(n+k)) ~ O(k).

In this case we have that:
- for fixed n, the ratio tA/tB will grow like k. In particular:
    - The assignment phase will grow with k, since it is O(ndk)
    - The update phase should be constant with k, since O(d(n+k)) ~ O(dn)
- for fixed k, the ratio tA/tB stays constant as n grows, at a value proportional to k

### Relation tA/tB, k

We fix n (and d) and we plot the evolution of tA/tB for growing k.
We take the dataset with relatively large d = 17.

In [None]:
# Whether the sweep below is re-run is decided by the `measure` flag from the flags cell;
# it is deliberately not reassigned here, so that later measurement cells see the same setting.
if measure:
    # Fix n and d by working on a single dataset: the last one in X_list
    X = X_list[-1]
    measurement_columns = ['n_datapoints', 'n_features', 'n_clusters', 't_A_median', 't_A_min', 't_A_max', 't_B_median', 't_B_min', 't_B_max', 'median_cluster_size', 'min_cluster_size', 'max_cluster_size']
    # Collect one record per K and build the frame once, instead of growing it row by row
    records = []
    for K in range(3, 100):
        if K%10 == 0:
            print(K)
        labels, centroids, A_time, B_time = KMeans(X, K, num_iter=100, measure=True)
        # Number of points assigned to each cluster (counted on the labels, without copying rows of X)
        clusters_size = np.array([np.count_nonzero(labels == k) for k in range(K)])
        records.append({
            'n_datapoints': X.shape[0],
            'n_features': X.shape[1],
            'n_clusters': K,
            't_A_median': np.median(A_time),
            't_A_min': np.min(A_time),
            't_A_max': np.max(A_time),
            't_B_median': np.median(B_time),
            't_B_min': np.min(B_time),
            't_B_max': np.max(B_time),
            # The column is named "median": store the median (the mean is always n/K)
            'median_cluster_size': np.median(clusters_size),
            'min_cluster_size': np.min(clusters_size),
            'max_cluster_size': np.max(clusters_size),
        })
    df = pd.DataFrame(records, columns=measurement_columns)
    df.to_csv('./data/measurements_relation_k.csv', index=False)

# Import dataset
df = pd.read_csv('./data/measurements_relation_k.csv')
# Compute ratio
df['ratio_AB_median'] = df['t_A_median']/df['t_B_median']
# Clean dataset
df_clean = clean_dataset(df)
# Plot the ratio tA/tB (not tA alone) against k, as announced by the title and axis label
scatterplot(df_clean, 'n_clusters', 'ratio_AB_median', 'Relation k, ratio_AB', 'k', 'ratio_AB')

In [None]:
# Regress tA, tB and their ratio on k, in that order
model_k_1, model_k_2, model_k_3 = [
    fit_linear_regression(df_clean, 'n_clusters', target)
    for target in ('t_A_median', 't_B_median', 'ratio_AB_median')
]

### Relation tA/tB, n

In [None]:
if measure:
    # Fix d and k; vary n by taking growing prefixes (1% .. 100%) of the last dataset in X_list
    X_full = X_list[-1]
    measurement_columns = ['n_datapoints', 'n_features', 'n_clusters', 't_A_median', 't_A_min', 't_A_max', 't_B_median', 't_B_min', 't_B_max', 'median_cluster_size', 'min_cluster_size', 'max_cluster_size']
    K = 4
    # Collect one record per subset size and build the frame once, instead of growing it row by row
    records = []
    for j in range(1,101):
        if j%10 == 0:
            print(j)
        X = X_full[:int(X_full.shape[0]* j/100)]
        labels, centroids, A_time, B_time = KMeans(X, K, num_iter=100, measure=True)
        # Number of points assigned to each cluster (counted on the labels, without copying rows of X)
        clusters_size = np.array([np.count_nonzero(labels == k) for k in range(K)])
        records.append({
            'n_datapoints': X.shape[0],
            'n_features': X.shape[1],
            'n_clusters': K,
            't_A_median': np.median(A_time),
            't_A_min': np.min(A_time),
            't_A_max': np.max(A_time),
            't_B_median': np.median(B_time),
            't_B_min': np.min(B_time),
            't_B_max': np.max(B_time),
            # The column is named "median": store the median (the mean is always n/K)
            'median_cluster_size': np.median(clusters_size),
            'min_cluster_size': np.min(clusters_size),
            'max_cluster_size': np.max(clusters_size),
        })
    df = pd.DataFrame(records, columns=measurement_columns)
    df.to_csv('./data/measurements_relation_n.csv', index=False)

# Import dataset
df = pd.read_csv('./data/measurements_relation_n.csv')
# Compute ratio
df['ratio_AB_median'] = df['t_A_median']/df['t_B_median']
# Clean dataset
df_clean = clean_dataset(df)
# Scatterplot of the ratio against n (the title previously said "k", copied from the k sweep)
scatterplot(df_clean, 'n_datapoints', 'ratio_AB_median', 'Relation n, ratio_AB', 'n', 'ratio_AB')

In [None]:
# Fit on the cleaned frame, like the k regressions: the ratio column of the raw frame
# may contain inf/NaN values, which LinearRegression rejects.
model_n = fit_linear_regression(df_clean, 'n_datapoints', 'ratio_AB_median')

## Measurements basic k-means

In [None]:
if measure:
    # RuntimeWarnings raised while timing are silenced for the duration of the sweep
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        measurement_columns = ['n_datapoints', 'n_features', 'n_clusters', 't_A_median', 't_A_min', 't_A_max', 't_B_median', 't_B_min', 't_B_max', 'median_cluster_size', 'min_cluster_size', 'max_cluster_size']
        # Collect one record per (dataset, K) and build the frame once, instead of growing it row by row
        records = []
        # `dataset_idx` (not `k`) so the dataset counter is not confused with the cluster index below
        for dataset_idx, X in enumerate(X_list):
            print(f"\nProcessing {dataset_idx}° dataset...")
            for K in range(3, 15):
                # Fit kmeans
                print(f"{K} -- ", end='')
                labels, centroids, A_time, B_time = KMeans(X, K, num_iter=100, measure=True)
                # Number of points assigned to each cluster (counted on the labels, without copying rows of X)
                clusters_size = np.array([np.count_nonzero(labels == k) for k in range(K)])
                records.append({
                    'n_datapoints': X.shape[0],
                    'n_features': X.shape[1],
                    'n_clusters': K,
                    't_A_median': np.median(A_time),
                    't_A_min': np.min(A_time),
                    't_A_max': np.max(A_time),
                    't_B_median': np.median(B_time),
                    't_B_min': np.min(B_time),
                    't_B_max': np.max(B_time),
                    # The column is named "median": store the median (the mean is always n/K)
                    'median_cluster_size': np.median(clusters_size),
                    'min_cluster_size': np.min(clusters_size),
                    'max_cluster_size': np.max(clusters_size),
                })
        df_measurements = pd.DataFrame(records, columns=measurement_columns)
        df_measurements.to_csv('./data/measurements.csv', index=False)

# Always plot from the CSV on disk, whether or not it was just regenerated
df_measurements = pd.read_csv('./data/measurements.csv')
getPlots(df_measurements)


Processing 0° dataset...
3 -- 4 -- 5 -- 6 -- 7 -- 8 -- 9 -- 10 -- 11 -- 12 -- 13 -- 14 -- 
Processing 1° dataset...
3 -- 4 -- 

## Measurements speculated k-means

In [None]:
if measure:
    # RuntimeWarnings raised while timing are silenced for the duration of the sweep
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        # Value forwarded as `subsample_size` to KMeans_speculation and recorded with every row
        subsample_size=0.01
        measurement_columns = ['n_datapoints', 'n_features', 'n_clusters', 't_A_median', 't_A_min', 't_A_max', 't_B_median', 't_B_min', 't_B_max', 'median_cluster_size', 'min_cluster_size', 'max_cluster_size',
                               't_speculation_median', 't_speculation_min', 't_speculation_max', 't_correction_median', 't_correction_min', 't_correction_max', 'subsample_size']
        # Collect one record per (dataset, K) and build the frame once, instead of growing it row by row
        records = []
        # `dataset_idx` (not `k`) so the dataset counter is not confused with the cluster index below
        for dataset_idx, X in enumerate(X_list):
            print(f"\nProcessing {dataset_idx}° dataset...")
            for K in range(3, 15):
                # Fit kmeans
                print(f"{K} -- ", end='')
                labels, centroids, A_time, B_time, speculation_time, correction_time  = KMeans_speculation(X, K, num_iter=100, measure=True, subsample_size = subsample_size)
                # Number of points assigned to each cluster (counted on the labels, without copying rows of X)
                clusters_size = np.array([np.count_nonzero(labels == k) for k in range(K)])
                records.append({
                    'n_datapoints': X.shape[0],
                    'n_features': X.shape[1],
                    'n_clusters': K,
                    't_A_median': np.median(A_time),
                    't_A_min': np.min(A_time),
                    't_A_max': np.max(A_time),
                    't_B_median': np.median(B_time),
                    't_B_min': np.min(B_time),
                    't_B_max': np.max(B_time),
                    # The column is named "median": store the median (the mean is always n/K)
                    'median_cluster_size': np.median(clusters_size),
                    'min_cluster_size': np.min(clusters_size),
                    'max_cluster_size': np.max(clusters_size),
                    't_speculation_median': np.median(speculation_time),
                    't_speculation_min': np.min(speculation_time),
                    't_speculation_max': np.max(speculation_time),
                    't_correction_median': np.median(correction_time),
                    't_correction_min': np.min(correction_time),
                    't_correction_max': np.max(correction_time),
                    'subsample_size': subsample_size,
                })
        df_measurements_speculation = pd.DataFrame(records, columns=measurement_columns)
        df_measurements_speculation.to_csv('./data/measurements_speculation.csv', index=False)

In [None]:
# Load the speculation measurements from the CSV on disk
df_measurements_speculation = pd.read_csv('./data/measurements_speculation.csv')
# Draw the full plot set, including the correction/speculation box-plots
getPlots(df_measurements_speculation, speculation = True)