In [1]:
import os

# NOTE: original import order is kept on purpose — moving the wildcard import
# could change which binding wins for any name it re-exports.
from sklearn.datasets import make_blobs
from seeding import *
from kmeans import StandardKMeans2, mp2KMeans, chop
from tqdm import tqdm
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
from sklearn.cluster._k_means_common import _inertia_dense
from sklearn.metrics.cluster import adjusted_mutual_info_score as ami
import pandas as pd

# Low-precision setting passed as `low_prec=` to every mp2KMeans instance and
# to all_low_pca_aggregate2 further down; built by kmeans.chop from np.float16.
# NOTE(review): `np` is never imported explicitly in this notebook — presumably
# it arrives through `from seeding import *`; confirm, or import numpy directly.
LOW_PREC = chop(np.float16)

In [2]:
# Numbers of blob centres handed to make_blobs (`centers=`); one result-table
# row per entry.
clusters = [2, 5, 10, 25, 50, 75]
# Values passed as `alpha=` to the PSA-seeded estimators; one result-table
# column per entry.
tols = [0.1, 0.3, 0.5, 0.7, 1]

# Feature dimensions of the synthetic data; each experiment cell below picks
# one entry by index.
ndim = [2, 5, 10]

In [3]:
# Benchmark 'psa' seeding against 'd2' seeding on synthetic blobs with
# ndim[0] features, for every (number of centres, alpha) pair in
# clusters x tols, with both StandardKMeans2 and mp2KMeans.
#
# Naming of the recorded tables (rows = clusters, columns = tols):
#   *_1   estimator seeded with 'psa' (driven by alpha)
#   *_2   estimator seeded with 'd2', given the cluster count PSA produced
#   mp_*  mp2KMeans run with low_prec=LOW_PREC instead of StandardKMeans2
#   num_records    centers.shape[0] of the PSA-seeded fit
#   iter_records   len(inertia) - 1 (presumably the iteration count)
#   start_inertia  inertia[0];  end_inertia  inertia[-1]
n_features = ndim[0]
out_dir = "results/benpsa"

# Create the output folder up front: a missing directory would otherwise only
# surface at the first to_csv call, after the whole multi-minute run.
os.makedirs(out_dir, exist_ok=True)

# The tuple order is also the order in which the CSV files are written.
metric_names = (
    "num_records", "mp_num_records",
    "iter_records_1", "start_inertia_1", "end_inertia_1",
    "iter_records_2", "start_inertia_2", "end_inertia_2",
    "mp_iter_records_1", "mp_start_inertia_1", "mp_end_inertia_1",
    "mp_iter_records_2", "mp_start_inertia_2", "mp_end_inertia_2",
)
grid_shape = (len(clusters), len(tols))
records = {name: np.zeros(grid_shape) for name in metric_names}

for i in tqdm(range(len(clusters))):
    cn = clusters[i]
    # The data depend only on the number of centres (fixed random_state), so
    # they are generated once per row instead of once per alpha.
    # Assumes fit() does not modify X in place — the original already shared
    # one X between the four fits of a grid point.
    X, y = make_blobs(n_samples=5000, centers=cn, n_features=n_features,
                      random_state=42)

    for j in range(len(tols)):
        kmeans_psa = StandardKMeans2(alpha=tols[j], seeding='psa')
        kmeans_psa.fit(X)

        # D2 seeding gets exactly as many clusters as PSA ended up with.
        kmeans_d2 = StandardKMeans2(n_clusters=kmeans_psa.centers.shape[0], seeding='d2')
        kmeans_d2.fit(X)

        mpkmeans_psa = mp2KMeans(alpha=tols[j], seeding='psa', low_prec=LOW_PREC)
        mpkmeans_psa.fit(X)

        mpkmeans_d2 = mp2KMeans(n_clusters=mpkmeans_psa.centers.shape[0], seeding='d2', low_prec=LOW_PREC)
        mpkmeans_d2.fit(X)

        records["num_records"][i, j] = kmeans_psa.centers.shape[0]
        records["mp_num_records"][i, j] = mpkmeans_psa.centers.shape[0]

        for prefix, suffix, model in (
            ("", "_1", kmeans_psa),
            ("", "_2", kmeans_d2),
            ("mp_", "_1", mpkmeans_psa),
            ("mp_", "_2", mpkmeans_d2),
        ):
            records[f"{prefix}iter_records{suffix}"][i, j] = len(model.inertia) - 1
            records[f"{prefix}start_inertia{suffix}"][i, j] = model.inertia[0]
            records[f"{prefix}end_inertia{suffix}"][i, j] = model.inertia[-1]

frames = {
    name: pd.DataFrame(values, index=clusters, columns=tols)
    for name, values in records.items()
}

# File names follow the feature dimension actually used, so the prefix cannot
# drift away from the ndim entry selected at the top of the cell.
for name, frame in frames.items():
    frame.to_csv(f"{out_dir}/dim{n_features}{name}.csv")

# Keep the original notebook-level names bound to the finished DataFrames.
(num_records, mp_num_records,
 iter_records_1, start_inertia_1, end_inertia_1,
 iter_records_2, start_inertia_2, end_inertia_2,
 mp_iter_records_1, mp_start_inertia_1, mp_end_inertia_1,
 mp_iter_records_2, mp_start_inertia_2, mp_end_inertia_2) = (
    frames[name] for name in metric_names
)

100%|██████████| 6/6 [09:53<00:00, 98.89s/it] 


In [None]:
# Benchmark 'psa' seeding against 'd2' seeding on synthetic blobs with
# ndim[1] features, for every (number of centres, alpha) pair in
# clusters x tols, with both StandardKMeans2 and mp2KMeans.
#
# Naming of the recorded tables (rows = clusters, columns = tols):
#   *_1   estimator seeded with 'psa' (driven by alpha)
#   *_2   estimator seeded with 'd2', given the cluster count PSA produced
#   mp_*  mp2KMeans run with low_prec=LOW_PREC instead of StandardKMeans2
#   num_records    centers.shape[0] of the PSA-seeded fit
#   iter_records   len(inertia) - 1 (presumably the iteration count)
#   start_inertia  inertia[0];  end_inertia  inertia[-1]
n_features = ndim[1]
out_dir = "results/benpsa"

# Create the output folder up front: a missing directory would otherwise only
# surface at the first to_csv call, after the whole multi-minute run.
os.makedirs(out_dir, exist_ok=True)

# The tuple order is also the order in which the CSV files are written.
metric_names = (
    "num_records", "mp_num_records",
    "iter_records_1", "start_inertia_1", "end_inertia_1",
    "iter_records_2", "start_inertia_2", "end_inertia_2",
    "mp_iter_records_1", "mp_start_inertia_1", "mp_end_inertia_1",
    "mp_iter_records_2", "mp_start_inertia_2", "mp_end_inertia_2",
)
grid_shape = (len(clusters), len(tols))
records = {name: np.zeros(grid_shape) for name in metric_names}

for i in tqdm(range(len(clusters))):
    cn = clusters[i]
    # The data depend only on the number of centres (fixed random_state), so
    # they are generated once per row instead of once per alpha.
    # Assumes fit() does not modify X in place — the original already shared
    # one X between the four fits of a grid point.
    X, y = make_blobs(n_samples=5000, centers=cn, n_features=n_features,
                      random_state=42)

    for j in range(len(tols)):
        kmeans_psa = StandardKMeans2(alpha=tols[j], seeding='psa')
        kmeans_psa.fit(X)

        # D2 seeding gets exactly as many clusters as PSA ended up with.
        kmeans_d2 = StandardKMeans2(n_clusters=kmeans_psa.centers.shape[0], seeding='d2')
        kmeans_d2.fit(X)

        mpkmeans_psa = mp2KMeans(alpha=tols[j], seeding='psa', low_prec=LOW_PREC)
        mpkmeans_psa.fit(X)

        mpkmeans_d2 = mp2KMeans(n_clusters=mpkmeans_psa.centers.shape[0], seeding='d2', low_prec=LOW_PREC)
        mpkmeans_d2.fit(X)

        records["num_records"][i, j] = kmeans_psa.centers.shape[0]
        records["mp_num_records"][i, j] = mpkmeans_psa.centers.shape[0]

        for prefix, suffix, model in (
            ("", "_1", kmeans_psa),
            ("", "_2", kmeans_d2),
            ("mp_", "_1", mpkmeans_psa),
            ("mp_", "_2", mpkmeans_d2),
        ):
            records[f"{prefix}iter_records{suffix}"][i, j] = len(model.inertia) - 1
            records[f"{prefix}start_inertia{suffix}"][i, j] = model.inertia[0]
            records[f"{prefix}end_inertia{suffix}"][i, j] = model.inertia[-1]

frames = {
    name: pd.DataFrame(values, index=clusters, columns=tols)
    for name, values in records.items()
}

# File names follow the feature dimension actually used, so the prefix cannot
# drift away from the ndim entry selected at the top of the cell.
for name, frame in frames.items():
    frame.to_csv(f"{out_dir}/dim{n_features}{name}.csv")

# Keep the original notebook-level names bound to the finished DataFrames.
(num_records, mp_num_records,
 iter_records_1, start_inertia_1, end_inertia_1,
 iter_records_2, start_inertia_2, end_inertia_2,
 mp_iter_records_1, mp_start_inertia_1, mp_end_inertia_1,
 mp_iter_records_2, mp_start_inertia_2, mp_end_inertia_2) = (
    frames[name] for name in metric_names
)

 67%|██████▋   | 4/6 [17:26<07:45, 232.81s/it]

In [None]:
# Benchmark 'psa' seeding against 'd2' seeding on synthetic blobs with
# ndim[2] features, for every (number of centres, alpha) pair in
# clusters x tols, with both StandardKMeans2 and mp2KMeans.
#
# Naming of the recorded tables (rows = clusters, columns = tols):
#   *_1   estimator seeded with 'psa' (driven by alpha)
#   *_2   estimator seeded with 'd2', given the cluster count PSA produced
#   mp_*  mp2KMeans run with low_prec=LOW_PREC instead of StandardKMeans2
#   num_records    centers.shape[0] of the PSA-seeded fit
#   iter_records   len(inertia) - 1 (presumably the iteration count)
#   start_inertia  inertia[0];  end_inertia  inertia[-1]
n_features = ndim[2]
out_dir = "results/benpsa"

# Create the output folder up front: a missing directory would otherwise only
# surface at the first to_csv call, after the whole multi-minute run.
os.makedirs(out_dir, exist_ok=True)

# The tuple order is also the order in which the CSV files are written.
metric_names = (
    "num_records", "mp_num_records",
    "iter_records_1", "start_inertia_1", "end_inertia_1",
    "iter_records_2", "start_inertia_2", "end_inertia_2",
    "mp_iter_records_1", "mp_start_inertia_1", "mp_end_inertia_1",
    "mp_iter_records_2", "mp_start_inertia_2", "mp_end_inertia_2",
)
grid_shape = (len(clusters), len(tols))
records = {name: np.zeros(grid_shape) for name in metric_names}

for i in tqdm(range(len(clusters))):
    cn = clusters[i]
    # The data depend only on the number of centres (fixed random_state), so
    # they are generated once per row instead of once per alpha.
    # Assumes fit() does not modify X in place — the original already shared
    # one X between the four fits of a grid point.
    X, y = make_blobs(n_samples=5000, centers=cn, n_features=n_features,
                      random_state=42)

    for j in range(len(tols)):
        kmeans_psa = StandardKMeans2(alpha=tols[j], seeding='psa')
        kmeans_psa.fit(X)

        # D2 seeding gets exactly as many clusters as PSA ended up with.
        kmeans_d2 = StandardKMeans2(n_clusters=kmeans_psa.centers.shape[0], seeding='d2')
        kmeans_d2.fit(X)

        mpkmeans_psa = mp2KMeans(alpha=tols[j], seeding='psa', low_prec=LOW_PREC)
        mpkmeans_psa.fit(X)

        mpkmeans_d2 = mp2KMeans(n_clusters=mpkmeans_psa.centers.shape[0], seeding='d2', low_prec=LOW_PREC)
        mpkmeans_d2.fit(X)

        records["num_records"][i, j] = kmeans_psa.centers.shape[0]
        records["mp_num_records"][i, j] = mpkmeans_psa.centers.shape[0]

        for prefix, suffix, model in (
            ("", "_1", kmeans_psa),
            ("", "_2", kmeans_d2),
            ("mp_", "_1", mpkmeans_psa),
            ("mp_", "_2", mpkmeans_d2),
        ):
            records[f"{prefix}iter_records{suffix}"][i, j] = len(model.inertia) - 1
            records[f"{prefix}start_inertia{suffix}"][i, j] = model.inertia[0]
            records[f"{prefix}end_inertia{suffix}"][i, j] = model.inertia[-1]

frames = {
    name: pd.DataFrame(values, index=clusters, columns=tols)
    for name, values in records.items()
}

# File names follow the feature dimension actually used, so the prefix cannot
# drift away from the ndim entry selected at the top of the cell.
for name, frame in frames.items():
    frame.to_csv(f"{out_dir}/dim{n_features}{name}.csv")

# Keep the original notebook-level names bound to the finished DataFrames.
(num_records, mp_num_records,
 iter_records_1, start_inertia_1, end_inertia_1,
 iter_records_2, start_inertia_2, end_inertia_2,
 mp_iter_records_1, mp_start_inertia_1, mp_end_inertia_1,
 mp_iter_records_2, mp_start_inertia_2, mp_end_inertia_2) = (
    frames[name] for name in metric_names
)

In [17]:

# Small high-dimensional sanity check: 500 points, 6 blobs, 100 features.
# This rebinds X and y, replacing the data left over from the experiment cells.
X, y = make_blobs(n_samples=500, centers=6, n_features=100,
                  random_state=0)

# Call the seeding helper directly on X with the LOW_PREC setting.
# NOTE(review): all_low_pca_aggregate2 is not imported by name — presumably it
# comes from `from seeding import *`; the meaning of the positional `1` is not
# visible here (possibly the alpha/tolerance) — confirm against its signature.
centers = all_low_pca_aggregate2(X, 1, LOW_PREC)

In [24]:
# Fit the mixed-precision estimator on the current X with alpha=0.8.
# NOTE(review): despite the `_d2` suffix this instance uses seeding='psa';
# the name is kept because the following cell reads `mpkmeans_d2`.
mpkmeans_d2 = mp2KMeans(alpha=0.8, seeding='psa', low_prec=LOW_PREC)
mpkmeans_d2.fit(X)

In [25]:
# Shape of the fitted centres array; its first axis is what the experiment
# cells record as the number of clusters (centers.shape[0]).
mpkmeans_d2.centers.shape

(23, 100)

In [42]:
# Per-feature minimum of X.
# NOTE(review): the saved output below has 3 entries, but the only X defined in
# the visible cells has n_features=100 — the output looks stale (X was
# presumably rebound in a cell that no longer exists); re-run on a fresh kernel.
X.min(axis=0)

array([-3.12768911, -1.82113055, -2.82066755])

In [43]:
# Per-feature maximum of X.
# NOTE(review): the saved output below has 3 entries, but the only X defined in
# the visible cells has n_features=100 — the output looks stale (X was
# presumably rebound in a cell that no longer exists); re-run on a fresh kernel.
X.max(axis=0)

array([2.75434898, 1.73284889, 2.51927051])