In [3]:
from repliclust import Archetype, DataGenerator, set_seed
import numpy as np
from scipy.stats import norm
from overlap_approximations import compute_exact_q
from overlap_approximations import compute_lda_q
from overlap_approximations import compute_heuristic_q
from overlap_approximations import compute_overlaps
from repliclust.utils import assemble_covariance_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score as ami

# Experiment: K-Means performance (AMI) as a function of cluster overlap.
#
# For every combination of target overlap, number of clusters, dimensionality
# and samples per cluster, the loop below
#   1. draws random shape parameters and builds an Archetype whose overlap
#      bounds are (1 -/+ PCT) * overlap,
#   2. samples a mixture model and records the min/max overlap reported by
#      compute_overlaps in its 'exact', 'lda' and 'heuristic' modes,
#   3. samples data from that mixture model, clusters it with K-Means and
#      scores the result with adjusted mutual information (AMI).
# One tuple per configuration is appended to data_ami_vs_overlap; the tuple
# fields are ordered exactly as the names in cols.

# Fix the sources of randomness so that Restart & Run All reproduces the table.
SEED = 0
set_seed(SEED)        # repliclust's seeding hook (imported above, previously never called)
np.random.seed(SEED)  # global NumPy RNG: used by np.random.uniform below and by
                      # scikit-learn estimators left at random_state=None

# Parameter grid.
overlap_vals = 0.95 * np.logspace(-4, 0, 50)  # 50 log-spaced targets, 0.95e-4 .. 0.95
dim_vals = [10, 100, 500]
k_vals = [5, 10, 30]
samples_per_cluster_vals = [100, 250]
PCT = 0.025  # relative half-width of the [min_overlap, max_overlap] band

data_ami_vs_overlap = []
cols = ["overlap", "k", "dim", "samples_per_cluster",
        "aspect_r", "aspect_mm", "radius_mm",
        "min_o_exact", "max_o_exact",
        "min_o_lda", "max_o_lda",
        "min_o_heuristic", "max_o_heuristic",
        "ami_kmeans"]

for overlap in overlap_vals:
    for k in k_vals:
        for dim in dim_vals:
            for samples_per_cluster in samples_per_cluster_vals:
                # Random shape parameters, each drawn uniformly from [1, 10).
                aspect_r = np.random.uniform(1, 10)
                aspect_mm = np.random.uniform(1, 10)
                radius_mm = np.random.uniform(1, 10)
                archie = Archetype(n_clusters=k, dim=dim,
                                   max_overlap=(1 + PCT) * overlap,
                                   min_overlap=(1 - PCT) * overlap,
                                   aspect_ref=aspect_r,
                                   aspect_maxmin=aspect_mm,
                                   radius_maxmin=radius_mm,
                                   overlap_mode='lda')

                mix_model = archie.sample_mixture_model()
                centers = mix_model.centers
                # One covariance matrix per cluster, rebuilt from its axes and
                # axis lengths.
                cov = [assemble_covariance_matrix(mix_model.axes_list[j],
                                                  mix_model.axis_lengths_list[j])
                       for j in range(centers.shape[0])]

                # Min and max observed overlap under each of the three
                # computation modes (exact, LDA, heuristic).
                min_o_exact, max_o_exact = compute_overlaps(centers, cov, mode='exact')
                min_o_lda, max_o_lda = compute_overlaps(centers, cov, mode='lda')
                min_o_heuristic, max_o_heuristic = compute_overlaps(centers, cov, mode='heuristic')

                # Sample a balanced data set from the mixture model.
                groupsizes = np.full(k, samples_per_cluster, dtype=int)
                X, y = mix_model.sample_data(groupsizes)

                # Cluster with K-Means and score against the true labels.
                kmeans = KMeans(n_clusters=k,
                                max_iter=500, n_init=10,
                                init="k-means++")
                y_hat_kmeans = kmeans.fit_predict(X)
                ami_kmeans = ami(y, y_hat_kmeans)

                data_ami_vs_overlap.append(
                    (overlap, k, dim, samples_per_cluster,
                     aspect_r, aspect_mm, radius_mm,
                     min_o_exact, max_o_exact,
                     min_o_lda, max_o_lda,
                     min_o_heuristic, max_o_heuristic,
                     ami_kmeans)
                )

# Sanity checks: one record per grid point, each as wide as the column list.
assert len(data_ami_vs_overlap) == (len(overlap_vals) * len(k_vals)
                                    * len(dim_vals) * len(samples_per_cluster_vals))
assert all(len(record) == len(cols) for record in data_ami_vs_overlap)





Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 5537.81it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1912.79it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 801.06it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 588.14it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 509.28it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1202.50it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 3161.15it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 350.99it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:01<00:00, 266.33it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:08<00:00, 33.95it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 30



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 6482.29it/s, Status=SUCCESS]




Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 527.18it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 6434.46it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 2110.27it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1141.69it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1416.10it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:01<00:00, 200.89it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 122.63it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:01<00:00, 186.44it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:04<00:00, 70.15it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:18<00:00, 16.60it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:06<00:00, 46.51it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 110.27it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:17<00:00, 16.70it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:18<00:00, 16.37it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [04:05<00:00,  1.22it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:33<00:00,  8.83it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 10455.53it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 12162.93it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 3932.31it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 7436.45it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:25<00:00, 11.99it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:36<00:00,  8.33it/s, Status=SUCCESS]




Optimizing Cluster Centers: 100%|██████████| 300/300 [04:49<00:00,  1.04it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 5488.90it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 10213.15it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 5147.12it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 6130.32it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 992.78it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 586.50it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1872.17it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 5632.41it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 988.81it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████|



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 8679.55it/s, Status=SUCCESS] 
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 3222.03it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1758.88it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1008.89it/s, Status=SUCCESS]




Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 4856.02it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1488.29it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 867.17it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:04<00:00, 72.09it/s, Status=SUCCESS] 
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:03<00:00, 94.87it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 110.64it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 120.52it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:10<00:00, 29.68it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:35<00:00,  8.46it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:46<00:00,  6.43it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 6223.31it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1445.22it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 4450.84it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 569.66it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 943.76it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1469.22it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 2360.83it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:01<00:00, 281.27it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 477.79it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:06<00:00, 47.08it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 3



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 3102.30it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 770.77it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1164.39it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 147.63it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 123.56it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 119.32it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:04<00:00, 64.92it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:13<00:00, 21.53it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:14<00:00, 20.23it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [02:32<00:00,  1.96it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:01<00:00, 180.68it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:01<00:00, 157.30it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 144.59it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 122.14it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:10<00:00, 28.52it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:19<00:00, 15.76it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [01:16<00:00,  3.90it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:47<00:00,  6.36it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 4907.57it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 12625.21it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/30



Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 6236.11it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1315.11it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 919.52it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 4962.19it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 8706.27it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 1491.54it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:00<00:00, 498.82it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:03<00:00, 86.64it/s, Status=SUCCESS] 
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:03<00:00, 97.79it/s, Status=SUCCESS] 
Optimizing Cluster Centers: 100%|██████████| 300/300 [00:02<00:00, 110.08it/s, Status=SUCCESS]
Optimizing Cluster Centers: 100%|██████████| 

In [4]:
import pandas as pd

# Turn the per-configuration result tuples into a table; `cols` supplies the
# column names in the same order as the tuple fields.
df_ami_vs_overlap = pd.DataFrame(data=data_ami_vs_overlap, columns=cols)

# Saving to disk is left disabled; uncomment to persist the results:
# df_ami_vs_overlap.to_csv('overlap-and-performance.csv', index=False)

In [5]:
# Show the results table as the cell's output (one row per experiment configuration).
df_ami_vs_overlap

Unnamed: 0,overlap,k,dim,samples_per_cluster,aspect_r,aspect_mm,radius_mm,min_o_exact,max_o_exact,min_o_lda,max_o_lda,min_o_heuristic,max_o_heuristic,ami_kmeans
0,0.000095,5,10,100,6.494817,9.976785,6.804347,0.000081,0.000515,0.000085,0.000556,0.001611,0.071928,0.840931
1,0.000095,5,10,250,9.240869,2.660269,6.229838,0.000069,0.000093,0.000095,0.000096,0.002977,0.023320,0.757704
2,0.000095,5,100,100,3.415063,7.221878,4.410352,0.000090,0.000096,0.000094,0.000097,0.000422,0.001010,0.754669
3,0.000095,5,100,250,1.270091,6.319799,6.962098,0.000092,0.000097,0.000093,0.000097,0.000100,0.000112,0.885441
4,0.000095,5,500,100,9.720850,5.335017,1.463916,0.000082,0.000092,0.000093,0.000095,0.000450,0.001422,0.568519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.950000,30,10,250,8.821992,8.813558,5.465036,0.920347,0.928906,0.926253,0.929251,0.925790,0.946055,0.229875
896,0.950000,30,100,100,7.027125,5.852036,3.225751,0.923726,0.929479,0.926250,0.929612,0.930123,0.942772,0.094305
897,0.950000,30,100,250,5.175280,6.741014,7.040062,0.924580,0.927503,0.926258,0.928258,0.929716,0.942263,0.151209
898,0.950000,30,500,100,9.151925,3.935476,4.108060,0.923360,0.942082,0.926288,0.942084,0.929940,0.952468,0.057057
