In [1]:
import math
import numpy as np
from decimal import Decimal
from classix import loadData
from pychop.numpy import chop
from kmeans import  StandardKMeans2, mpKMeans,  allowKMeans2, chop as kchop
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
import warnings
warnings.filterwarnings("ignore")

def sigificant_digit(number, digits=5):
    """Round ``number`` to ``digits`` significant digits.

    Parameters
    ----------
    number : int or float
        Value to round.
    digits : int, default=5
        Number of significant digits to keep.

    Returns
    -------
    The rounded value. Zero is returned as the integer ``0``; NaN and
    +/-inf are returned unchanged because they have no order of magnitude.
    """
    if number == 0:
        return 0
    # log10 of NaN/inf cannot be floored (ValueError / OverflowError), so a
    # single undefined score would otherwise abort the whole report.
    if not math.isfinite(number):
        return number
    # Position of the leading digit relative to the decimal point.
    magnitude = int(math.floor(math.log10(abs(number))))
    return round(number, digits - magnitude - 1)

In [2]:
UCI_DATA = ['Banknote', 'Glass', 'Dermatology', 'Ecoli', 'Phoneme', 'Wine', 'Iris']
# Low-precision setting handed to the mixed-precision k-means variants
# (built from np.float16).
LOW_PREC = kchop(np.float16)

# (row label, scorer) pairs for the external clustering metrics, in print order.
SCORERS = (
    ('ARI:', adjusted_rand_score),
    ('AMI:', adjusted_mutual_info_score),
    ('homogeneity:', homogeneity_score),
    ('completeness:', completeness_score),
    ('v_measure:', v_measure_score),
)

# NOTE(review): no random seed is set in this cell, so the d2 seeding may give
# different numbers on every run -- confirm whether the kmeans module exposes one.
for dname in UCI_DATA:
    X, y = loadData(dname)
    # Drop every sample that has at least one missing feature.
    nonans = np.isnan(X).sum(1) == 0
    X = X[nonans, :]
    y = y[nonans]
    print("shape:", X.shape)

    # z-score normalisation; a zero-variance feature would give 0/0 = NaN
    # (silently, since warnings are filtered), so its scale is replaced by 1.
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma = np.where(sigma == 0, 1.0, sigma)
    norm_X = (X - mu) / sigma

    clusters = len(np.unique(y))

    # Fit order is kept as raw/normalised pairs per algorithm so the sequence
    # of random draws matches the original cell.
    kmeans = StandardKMeans2(n_clusters=clusters, seeding='d2')
    kmeans.fit(X)

    norm_kmeans = StandardKMeans2(n_clusters=clusters, seeding='d2')
    norm_kmeans.fit(norm_X)

    alkmeans = allowKMeans2(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    alkmeans.fit(X)

    norm_alkmeans = allowKMeans2(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    norm_alkmeans.fit(norm_X)

    mpkmeans = mpKMeans(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    mpkmeans.fit(X)

    norm_mpkmeans = mpKMeans(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    norm_mpkmeans.fit(norm_X)

    # Column order of every table row printed below.
    fitted = (kmeans, norm_kmeans, alkmeans, norm_alkmeans, mpkmeans, norm_mpkmeans)

    # Header row; every label is its own argument (the last two were
    # previously fused by a missing comma).
    print("kmeans++ &",
          "kmeans++ (normalized) &",
          'mp1 k-means++ &',
          'mp1 k-means++ (normalized) &',
          'mp2 k-means++ &',
          'mp2 k-means++ (normalized) &'
         )

    # Raw string: "\%" is not a valid escape sequence; the emitted text is unchanged.
    print("trigger:", '-', '-', sigificant_digit(mpkmeans.low_prec_trigger * 100), r"\%;")
    print("(norm) trigger:", '-', '-', sigificant_digit(norm_mpkmeans.low_prec_trigger * 100), r"\%;")
    print("clusters:", kmeans.centers.shape[0], mpkmeans.centers.shape[0])

    # Last recorded inertia of each model, in scientific notation.
    print('SSE:', ' & '.join('%.3E' % Decimal(model.inertia[-1]) for model in fitted))

    # One "&"-separated row per metric, comparing predicted labels with y.
    for row_label, scorer in SCORERS:
        print(row_label,
              ' & '.join(str(sigificant_digit(scorer(y, model.labels))) for model in fitted))

shape: (1372, 4)
kmeans++ & kmeans++ (normalized) & mp1 k-means++ & mp1 k-means++ (normalized) & mp2 k-means++ &mp2 k-means++ (normalized) &
trigger: - - 74.93 \%;
(norm) trigger: - - 71.723 \%;
clusters: 2 2
SSE: 4.405E+04 & 3.453E+03 & 4.405E+04 & 3.453E+03 & 4.405E+04 & 3.453E+03
ARI: 0.048538 & 0.013568 & 0.048538 & 0.013568 & 0.048538 & 0.013568
AMI: 0.02977 & 0.010581 & 0.02977 & 0.010581 & 0.02977 & 0.010581
homogeneity: 0.029242 & 0.01115 & 0.029242 & 0.01115 & 0.029242 & 0.01115
completeness: 0.031446 & 0.011058 & 0.031446 & 0.011058 & 0.031446 & 0.011058
v_measure: 0.030304 & 0.011104 & 0.030304 & 0.011104 & 0.030304 & 0.011104
shape: (214, 9)
kmeans++ & kmeans++ (normalized) & mp1 k-means++ & mp1 k-means++ (normalized) & mp2 k-means++ &mp2 k-means++ (normalized) &
trigger: - - 0 \%;
(norm) trigger: - - 83.411 \%;
clusters: 6 6
SSE: 3.363E+02 & 7.927E+02 & 4.777E+02 & 7.927E+02 & 3.586E+02 & 7.927E+02
ARI: 0.27389 & 0.16799 & 0.21281 & 0.16799 & 0.2631 & 0.16799
AMI: 0.39901 

In [3]:
# Low-precision setting for this run, built with pychop's chop
# (prec='q52', rmode=1). NOTE(review): presumably an 8-bit "quarter precision"
# format with round-to-nearest -- confirm against the pychop documentation.
LOW_PREC = chop(prec='q52', rmode=1)

# (row label, scorer) pairs for the external clustering metrics, in print order.
SCORERS = (
    ('ARI:', adjusted_rand_score),
    ('AMI:', adjusted_mutual_info_score),
    ('homogeneity:', homogeneity_score),
    ('completeness:', completeness_score),
    ('v_measure:', v_measure_score),
)

# NOTE(review): no random seed is set in this cell, so the d2 seeding may give
# different numbers on every run -- confirm whether the kmeans module exposes one.
for dname in UCI_DATA:
    X, y = loadData(dname)
    # Drop every sample that has at least one missing feature.
    nonans = np.isnan(X).sum(1) == 0
    X = X[nonans, :]
    y = y[nonans]
    print("shape:", X.shape)

    # z-score normalisation; a zero-variance feature would give 0/0 = NaN
    # (silently, since warnings are filtered), so its scale is replaced by 1.
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma = np.where(sigma == 0, 1.0, sigma)
    norm_X = (X - mu) / sigma

    clusters = len(np.unique(y))

    # Fit order is kept as raw/normalised pairs per algorithm so the sequence
    # of random draws matches the original cell.
    kmeans = StandardKMeans2(n_clusters=clusters, seeding='d2')
    kmeans.fit(X)

    norm_kmeans = StandardKMeans2(n_clusters=clusters, seeding='d2')
    norm_kmeans.fit(norm_X)

    alkmeans = allowKMeans2(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    alkmeans.fit(X)

    norm_alkmeans = allowKMeans2(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    norm_alkmeans.fit(norm_X)

    mpkmeans = mpKMeans(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    mpkmeans.fit(X)

    norm_mpkmeans = mpKMeans(n_clusters=clusters, seeding='d2', low_prec=LOW_PREC, verbose=0)
    norm_mpkmeans.fit(norm_X)

    # Column order of every table row printed below.
    fitted = (kmeans, norm_kmeans, alkmeans, norm_alkmeans, mpkmeans, norm_mpkmeans)

    # Header row; every label is its own argument (the last two were
    # previously fused by a missing comma).
    print("kmeans++ &",
          "kmeans++ (normalized) &",
          'mp1 k-means++ &',
          'mp1 k-means++ (normalized) &',
          'mp2 k-means++ &',
          'mp2 k-means++ (normalized) &'
         )

    # Raw string: "\%" is not a valid escape sequence; the emitted text is unchanged.
    print("trigger:", '-', '-', sigificant_digit(mpkmeans.low_prec_trigger * 100), r"\%;")
    print("(norm) trigger:", '-', '-', sigificant_digit(norm_mpkmeans.low_prec_trigger * 100), r"\%;")
    print("clusters:", kmeans.centers.shape[0], mpkmeans.centers.shape[0])

    # Last recorded inertia of each model, in scientific notation.
    print('SSE:', ' & '.join('%.3E' % Decimal(model.inertia[-1]) for model in fitted))

    # One "&"-separated row per metric, comparing predicted labels with y.
    for row_label, scorer in SCORERS:
        print(row_label,
              ' & '.join(str(sigificant_digit(scorer(y, model.labels))) for model in fitted))

shape: (1372, 4)
kmeans++ & kmeans++ (normalized) & mp1 k-means++ & mp1 k-means++ (normalized) & mp2 k-means++ &mp2 k-means++ (normalized) &
trigger: - - 75.732 \%;
(norm) trigger: - - 71.667 \%;
clusters: 2 2
SSE: 4.405E+04 & 3.453E+03 & 4.467E+04 & 3.605E+03 & 4.405E+04 & 3.453E+03
ARI: 0.048538 & 0.013568 & 0.048375 & 0.052262 & 0.049206 & 0.01322
AMI: 0.02977 & 0.010581 & 0.030074 & 0.032546 & 0.030253 & 0.010348
homogeneity: 0.029242 & 0.01115 & 0.030037 & 0.031596 & 0.029719 & 0.010917
completeness: 0.031446 & 0.011058 & 0.031183 & 0.03472 & 0.031934 & 0.010827
v_measure: 0.030304 & 0.011104 & 0.030599 & 0.033084 & 0.030786 & 0.010872
shape: (214, 9)
kmeans++ & kmeans++ (normalized) & mp1 k-means++ & mp1 k-means++ (normalized) & mp2 k-means++ &mp2 k-means++ (normalized) &
trigger: - - 0 \%;
(norm) trigger: - - 80.997 \%;
clusters: 6 6
SSE: 3.363E+02 & 7.927E+02 & 1.878E+03 & 7.933E+02 & 3.842E+02 & 7.790E+02
ARI: 0.27389 & 0.16799 & 0 & 0.15441 & 0.25606 & 0.15985
AMI: 0.39901 & 