In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from tqdm import tqdm

from kmeans import StandardKMeans2, mpKMeans, allowKMeans2, chop
warnings.filterwarnings("ignore")

# Experiment grid. Each entry of `features` is used as `n_features` and each
# entry of `clusters` as both `centers` (data generation) and `n_clusters`
# (model) further down; every grid cell is repeated once per sample seed.
features = [2, 10, 20, 40, 60]
clusters = [10, 25, 50, 75]
sample_seeds = [0, 42, 2024]

# Clustering-quality metrics, all called as metric(true_labels, predicted_labels).
# The short keys are embedded in the output CSV file names, and the insertion
# order fixes the position of each metric on the last axis of the score arrays.
_eval = dict(
    hs=homogeneity_score,
    cs=completeness_score,
    vm=v_measure_score,
)

# Passed as `low_prec` to mpKMeans / allowKMeans2 in the first (fp16) sweep.
# NOTE(review): presumably `chop` builds a float16 rounding emulator - confirm
# against kmeans.chop.
LOW_PREC = chop(np.float16)

# Zero-initialised accumulators for the fp16 sweep. Every array is indexed by
# (feature index, cluster index); the score arrays carry one extra trailing
# axis with one slot per metric in `_eval`.
_score_shape = (len(features), len(clusters), len(_eval))
_grid_shape = _score_shape[:2]

# Metric scores on the raw data ...
native_kmeans, mp_kmeans, all_kmeans = (
    np.zeros(_score_shape) for _ in range(3)
)
# ... and on the standardised data.
norm_native_kmeans, norm_mp_kmeans, norm_all_kmeans = (
    np.zeros(_score_shape) for _ in range(3)
)

# Final inertia (SSE) of each fit, raw then standardised data.
native_kmeans_sse, mp_kmeans_sse, all_kmeans_sse = (
    np.zeros(_grid_shape) for _ in range(3)
)
norm_native_kmeans_sse, norm_mp_kmeans_sse, norm_all_kmeans_sse = (
    np.zeros(_grid_shape) for _ in range(3)
)

# Iteration counts of each fit, raw then standardised data.
native_kmeans_iter, mp_kmeans_iter, all_kmeans_iter = (
    np.zeros(_grid_shape) for _ in range(3)
)
norm_native_kmeans_iter, norm_mp_kmeans_iter, norm_all_kmeans_iter = (
    np.zeros(_grid_shape) for _ in range(3)
)

def _fit_variants(data, n_clusters, low_prec):
    """Fit StandardKMeans2, mpKMeans and allowKMeans2 on `data`.

    Each model is constructed and fitted before the next one is created, and
    the three fitted models are returned in that same order. All of them use
    'd2' seeding; `low_prec` is forwarded to the two non-standard variants.
    """
    standard = StandardKMeans2(n_clusters=n_clusters, seeding='d2')
    standard.fit(data)

    mixed = mpKMeans(n_clusters=n_clusters, seeding='d2', low_prec=low_prec)
    mixed.fit(data)

    allowed = allowKMeans2(n_clusters=n_clusters, seeding='d2', low_prec=low_prec)
    allowed.fit(data)

    return standard, mixed, allowed


# Every per-seed result is divided by the seed count before being added, so
# the accumulators end up holding the mean over `sample_seeds`.
n_seeds = len(sample_seeds)

for d, dim in enumerate(tqdm(features)):
    for c, ct in enumerate(clusters):
        for seed in sample_seeds:
            X, y = make_blobs(n_samples=2000, n_features=dim,
                              centers=ct, random_state=seed)

            # Standardised copy: zero mean and unit variance per column.
            norm_X = (X - X.mean(axis=0)) / X.std(axis=0)

            # Raw data first, standardised data second.
            kmeans, mpkmeans, allkmeans = _fit_variants(X, ct, LOW_PREC)
            norm_kmeans, norm_mpkmeans, norm_allkmeans = _fit_variants(
                norm_X, ct, LOW_PREC)

            # (fitted model, score array, SSE array, iteration array)
            runs = (
                (kmeans, native_kmeans,
                 native_kmeans_sse, native_kmeans_iter),
                (mpkmeans, mp_kmeans,
                 mp_kmeans_sse, mp_kmeans_iter),
                (allkmeans, all_kmeans,
                 all_kmeans_sse, all_kmeans_iter),
                (norm_kmeans, norm_native_kmeans,
                 norm_native_kmeans_sse, norm_native_kmeans_iter),
                (norm_mpkmeans, norm_mp_kmeans,
                 norm_mp_kmeans_sse, norm_mp_kmeans_iter),
                (norm_allkmeans, norm_all_kmeans,
                 norm_all_kmeans_sse, norm_all_kmeans_iter),
            )
            for model, scores, sse, n_iter in runs:
                for i, metric in enumerate(_eval.values()):
                    scores[d, c, i] += metric(y, model.labels) / n_seeds
                # Last recorded inertia of the fit.
                sse[d, c] += model.inertia[-1] / n_seeds
                n_iter[d, c] += model.iter / n_seeds

# Write the seed-averaged fp16 tables to results/ as CSV files, one file per
# (variant, quantity). Rows are labelled with `features`, columns with
# `clusters`.

# DataFrame.to_csv does not create missing directories; make sure the target
# exists so a long sweep is not lost to a missing folder at the very end.
os.makedirs('results', exist_ok=True)


def _write_table(values, name):
    """Save one (features x clusters) table as results/<name>.csv."""
    pd.DataFrame(values, index=features,
                 columns=clusters).to_csv('results/' + name + '.csv')


for prefix, scores, sse, iters in (
    ('native_kmeans', native_kmeans,
     native_kmeans_sse, native_kmeans_iter),
    ('mp_kmeans', mp_kmeans,
     mp_kmeans_sse, mp_kmeans_iter),
    ('all_kmeans', all_kmeans,
     all_kmeans_sse, all_kmeans_iter),
    ('norm_native_kmeans', norm_native_kmeans,
     norm_native_kmeans_sse, norm_native_kmeans_iter),
    ('norm_mp_kmeans', norm_mp_kmeans,
     norm_mp_kmeans_sse, norm_mp_kmeans_iter),
    ('norm_all_kmeans', norm_all_kmeans,
     norm_all_kmeans_sse, norm_all_kmeans_iter),
):
    # One file per metric; the metric key follows the prefix directly
    # (e.g. results/native_kmeanshs_fp16.csv).
    for i, key in enumerate(_eval):
        _write_table(scores[:, :, i], prefix + key + '_fp16')

    # The native SSE / iteration files use a single '.csv' extension here,
    # consistent with every other result file.
    _write_table(sse, prefix + '_sse_fp16')
    _write_table(iters, prefix + '_iter_fp16')
                                  
                                  
# Second sweep: same experiment with `low_prec` derived from float32.
# NOTE(review): presumably `chop` builds a float32 rounding emulator - confirm
# against kmeans.chop.
LOW_PREC = chop(np.float32)

# Fresh zero-initialised accumulators so the fp32 means start from scratch.
# Every array is indexed by (feature index, cluster index); the score arrays
# carry one extra trailing axis with one slot per metric in `_eval`.
_score_shape = (len(features), len(clusters), len(_eval))
_grid_shape = _score_shape[:2]

# Metric scores on the raw data ...
native_kmeans, mp_kmeans, all_kmeans = (
    np.zeros(_score_shape) for _ in range(3)
)
# ... and on the standardised data.
norm_native_kmeans, norm_mp_kmeans, norm_all_kmeans = (
    np.zeros(_score_shape) for _ in range(3)
)

# Final inertia (SSE) of each fit, raw then standardised data.
native_kmeans_sse, mp_kmeans_sse, all_kmeans_sse = (
    np.zeros(_grid_shape) for _ in range(3)
)
norm_native_kmeans_sse, norm_mp_kmeans_sse, norm_all_kmeans_sse = (
    np.zeros(_grid_shape) for _ in range(3)
)

# Iteration counts of each fit, raw then standardised data.
native_kmeans_iter, mp_kmeans_iter, all_kmeans_iter = (
    np.zeros(_grid_shape) for _ in range(3)
)
norm_native_kmeans_iter, norm_mp_kmeans_iter, norm_all_kmeans_iter = (
    np.zeros(_grid_shape) for _ in range(3)
)

def _fit_variants(data, n_clusters, low_prec):
    """Fit StandardKMeans2, mpKMeans and allowKMeans2 on `data`.

    Each model is constructed and fitted before the next one is created, and
    the three fitted models are returned in that same order. All of them use
    'd2' seeding; `low_prec` is forwarded to the two non-standard variants.
    """
    standard = StandardKMeans2(n_clusters=n_clusters, seeding='d2')
    standard.fit(data)

    mixed = mpKMeans(n_clusters=n_clusters, seeding='d2', low_prec=low_prec)
    mixed.fit(data)

    allowed = allowKMeans2(n_clusters=n_clusters, seeding='d2', low_prec=low_prec)
    allowed.fit(data)

    return standard, mixed, allowed


# Every per-seed result is divided by the seed count before being added, so
# the accumulators end up holding the mean over `sample_seeds`.
n_seeds = len(sample_seeds)

for d, dim in enumerate(tqdm(features)):
    for c, ct in enumerate(clusters):
        for seed in sample_seeds:
            X, y = make_blobs(n_samples=2000, n_features=dim,
                              centers=ct, random_state=seed)

            # Standardised copy: zero mean and unit variance per column.
            norm_X = (X - X.mean(axis=0)) / X.std(axis=0)

            # Raw data first, standardised data second.
            kmeans, mpkmeans, allkmeans = _fit_variants(X, ct, LOW_PREC)
            norm_kmeans, norm_mpkmeans, norm_allkmeans = _fit_variants(
                norm_X, ct, LOW_PREC)

            # (fitted model, score array, SSE array, iteration array)
            runs = (
                (kmeans, native_kmeans,
                 native_kmeans_sse, native_kmeans_iter),
                (mpkmeans, mp_kmeans,
                 mp_kmeans_sse, mp_kmeans_iter),
                (allkmeans, all_kmeans,
                 all_kmeans_sse, all_kmeans_iter),
                (norm_kmeans, norm_native_kmeans,
                 norm_native_kmeans_sse, norm_native_kmeans_iter),
                (norm_mpkmeans, norm_mp_kmeans,
                 norm_mp_kmeans_sse, norm_mp_kmeans_iter),
                (norm_allkmeans, norm_all_kmeans,
                 norm_all_kmeans_sse, norm_all_kmeans_iter),
            )
            for model, scores, sse, n_iter in runs:
                for i, metric in enumerate(_eval.values()):
                    scores[d, c, i] += metric(y, model.labels) / n_seeds
                # Last recorded inertia of the fit.
                sse[d, c] += model.inertia[-1] / n_seeds
                n_iter[d, c] += model.iter / n_seeds

# Write the seed-averaged fp32 tables to results/ as CSV files, one file per
# (variant, quantity). Rows are labelled with `features`, columns with
# `clusters`.

# DataFrame.to_csv does not create missing directories; make sure the target
# exists so a long sweep is not lost to a missing folder at the very end.
os.makedirs('results', exist_ok=True)


def _write_table(values, name):
    """Save one (features x clusters) table as results/<name>.csv."""
    pd.DataFrame(values, index=features,
                 columns=clusters).to_csv('results/' + name + '.csv')


for prefix, scores, sse, iters in (
    ('native_kmeans', native_kmeans,
     native_kmeans_sse, native_kmeans_iter),
    ('mp_kmeans', mp_kmeans,
     mp_kmeans_sse, mp_kmeans_iter),
    ('all_kmeans', all_kmeans,
     all_kmeans_sse, all_kmeans_iter),
    ('norm_native_kmeans', norm_native_kmeans,
     norm_native_kmeans_sse, norm_native_kmeans_iter),
    ('norm_mp_kmeans', norm_mp_kmeans,
     norm_mp_kmeans_sse, norm_mp_kmeans_iter),
    ('norm_all_kmeans', norm_all_kmeans,
     norm_all_kmeans_sse, norm_all_kmeans_iter),
):
    # One file per metric; the metric key follows the prefix directly
    # (e.g. results/native_kmeanshs_fp32.csv).
    for i, key in enumerate(_eval):
        _write_table(scores[:, :, i], prefix + key + '_fp32')

    # The native SSE / iteration files use a single '.csv' extension here,
    # consistent with every other result file.
    _write_table(sse, prefix + '_sse_fp32')
    _write_table(iters, prefix + '_iter_fp32')
                                  

100%|██████████| 5/5 [09:07<00:00, 109.57s/it]
100%|██████████| 5/5 [09:03<00:00, 108.77s/it]
