In [1]:
import numpy as np
from sklearn.datasets import make_blobs 
from sklearn.cluster import KMeans
import pandas as pd
from src.cagg_memview import aggregate
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import matplotlib
plt.style.use('ggplot')
# np.random.seed(0)

In [2]:
def wcss(data, labels, centers):
    """Return the within-cluster sum of squares (WCSS) of a labelled partition.

    For every distinct value ``k`` found in ``labels``, the rows of ``data``
    carrying that label are compared against ``centers[k]`` and their squared
    Euclidean distances are added to a running total.

    Parameters
    ----------
    data : numpy array
        Points, one per row; rows are selected with a boolean mask built
        from ``labels``.
    labels : numpy array
        One label per row of ``data``. Each label value is used directly as
        an index into ``centers``.
    centers : indexable
        Reference point for each label, looked up as ``centers[label]``.

    Returns
    -------
    The accumulated sum of squared distances (the integer ``0`` when
    ``labels`` is empty).
    """
    total = 0
    for label in np.unique(labels):
        members = data[labels == label]
        # Row-wise Euclidean distance to this cluster's reference point.
        distances = np.linalg.norm(members - centers[label], ord=2, axis=1)
        total += np.sum(distances ** 2)
    return total

In [3]:
# Synthetic benchmark: 2-D points generated by make_blobs with a fixed seed.
X, y = make_blobs(n_samples=1000, centers=10, n_features=2, random_state=0)
# Number of data points, taken from the data itself so the bound below stays
# correct if n_samples above is changed.
n_points = X.shape[0]

# One entry per tolerance value, appended in sweep order.
WCSS_Kmeans = list()      # k-means++ inertia with a matching number of clusters
WCSS_Agg_sp = list()      # aggregation, measured against the starting points
WCSS_Agg_center = list()  # aggregation, measured against per-group means
WCSS_Agg_median = list()  # aggregation, measured against per-group medians
upper_bound = list()      # 0.5 * tol^2 * (n_points - n_groups)

# Sweep the tolerance from 1.0 downwards in steps of 0.01 (np.arange excludes
# the stop value 0.0).
for TOL in tqdm(np.arange(1.0, 0.0, -0.01)):
    agg_labels, splist, nr_dist = aggregate(X, sorting="2-norm", tol=TOL)
    # Columns 3 onward of each starting-point record.
    # NOTE(review): presumably the starting-point coordinates -- confirm against `aggregate`.
    sps = np.array(splist)[:, 3:]

    # Compute the distinct group labels once and reuse them below.
    group_ids = np.unique(agg_labels)
    n_groups = len(group_ids)

    # Per-group mean and coordinate-wise median, ordered by sorted label value.
    # wcss() looks centers up by label value, so this assumes the labels are
    # 0..n_groups-1 -- TODO confirm against `aggregate`.
    centers = [X[agg_labels == g].mean(axis=0) for g in group_ids]
    m_centers = [np.median(X[agg_labels == g], axis=0) for g in group_ids]

    upper_bound.append(0.5*(TOL**2)*(n_points - n_groups))
    # Fit k-means with as many clusters as the aggregation produced so the
    # two WCSS values are compared at the same number of groups.
    kmeans = KMeans(n_clusters=n_groups, init='k-means++', random_state=0).fit(X)
    WCSS_Kmeans.append(kmeans.inertia_) # or WCSS_Kmeans.append(wcss(X, kmeans.labels_, kmeans.cluster_centers_))
    WCSS_Agg_sp.append(wcss(X, agg_labels, sps))
    WCSS_Agg_center.append(wcss(X, agg_labels, np.array(centers)))
    WCSS_Agg_median.append(wcss(X, agg_labels, np.array(m_centers)))

100%|██████████| 100/100 [04:11<00:00,  2.51s/it]


In [4]:
# Gather the per-tolerance series into one table (one row per tolerance value,
# in sweep order) and write it to disk without the index column.
WCSS = pd.DataFrame({
    'Kmeans++': WCSS_Kmeans,
    'Aggregation - starting points': WCSS_Agg_sp,
    'Aggregation - mean centers': WCSS_Agg_center,
    'Aggregation - median centers': WCSS_Agg_median,
    'Probabilistic estimate': upper_bound,
})
WCSS.to_csv('results/WCSS.csv', index=False)