In [1]:
!pip install repliclust

import repliclust as rpl
import pickle

# Verbal descriptions of the benchmark scenarios; one archetype is built per entry,
# in this order.
ARCHETYPE_DESCRIPTIONS = (
    "twelve clusters of different distributions",
    "twelve clusters of different distributions and high class imbalance",
    "seven highly separated clusters in 10D with very different shapes",
    "seven clusters in 10D with very different shapes and significant overlap",
    "four clusters in 100D with 100 samples each",
    "four clusters in 100D with 1000 samples each",
)

archetypes = []
for description in ARCHETYPE_DESCRIPTIONS:
    archetypes.append(rpl.Archetype.from_verbal_description(description))

Defaulting to user installation because normal site-packages is not writeable
Processing /home/ubuntu/repliclust-revision/repliclust
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: repliclust
  Building wheel for repliclust (pyproject.toml) ... done
  Created wheel for repliclust: filename=repliclust-0.0.5-py3-none-any.whl size=40767 sha256=f6427158cd45f46d366701ac6b38b76762564faa25d1434389d7fcf02df27041
  Stored in directory: /tmp/pip-ephem-wheel-cache-i9al6oqk/wheels/12/60/99/b2b7f58bf25f0d95d5d3a799ae95badacaff7bfe6ea18fd728
Successfully built repliclust
Installing collected packages: repliclust
  Attempting uninstall: repliclust
    Found existing installation: repliclust 0.0.5
    Uninstalling repliclust-0.0.5:
      Successfully uninstalled repliclust-0.0.5
Successfully installed repliclust-0.0.5


In [1]:
!pip install ../repliclust

import repliclust as rpl
import pickle

# Restore previously saved archetypes instead of rebuilding them.
# pickle.load can execute arbitrary code: only load files produced by this project.
with open("./output/archetypes.pkl", mode="rb") as archetypes_file:
    archetypes = pickle.load(archetypes_file)

Defaulting to user installation because normal site-packages is not writeable
Processing /home/ubuntu/repliclust-revision/repliclust
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: repliclust
  Building wheel for repliclust (pyproject.toml) ... done
  Created wheel for repliclust: filename=repliclust-0.0.5-py3-none-any.whl size=40797 sha256=0784dafaed739186c95ecf3ba95cf789a586b75ea21f72f21ca91245c4600894
  Stored in directory: /tmp/pip-ephem-wheel-cache-5x9tz72p/wheels/12/60/99/b2b7f58bf25f0d95d5d3a799ae95badacaff7bfe6ea18fd728
Successfully built repliclust
Installing collected packages: repliclust
  Attempting uninstall: repliclust
    Found existing installation: repliclust 0.0.5
    Uninstalling repliclust-0.0.5:
      Successfully uninstalled repliclust-0.0.5
Successfully installed repliclust-0.0.5


In [2]:
from sklearn.cluster import KMeans, SpectralClustering, HDBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_mutual_info_score, adjusted_rand_score
from sklearn.mixture import GaussianMixture

def carry_out_benchmark(X, y, archetype, random_state=None):
    """Fit five clustering algorithms on ``X`` and score them against ``y``.

    The models are a Gaussian mixture, k-means, HDBSCAN, spectral clustering
    (nearest-neighbors affinity) and agglomerative clustering. All except
    HDBSCAN are told the number of clusters via ``archetype.n_clusters``.

    Parameters
    ----------
    X : array-like
        Data matrix handed to each model's ``fit_predict``.
    y : numpy.ndarray
        Ground-truth labels. Must support boolean-mask indexing, which is
        used to exclude HDBSCAN noise points.
    archetype : object
        Must expose ``n_clusters``; stored unchanged in the returned record.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the estimators that accept one (Gaussian mixture,
        k-means, spectral clustering). ``None`` keeps the unseeded behavior.

    Returns
    -------
    dict
        ``{'archetype', 'ami_scores', 'arand_scores', 'f_noise_scores'}``;
        each ``*_scores`` entry maps model name to a number. For HDBSCAN,
        AMI/ARI are computed on the points not labelled as noise (-1) and
        ``f_noise_scores`` is the fraction of points labelled as noise. If
        HDBSCAN labels every point as noise, its AMI/ARI are ``nan``.
    """
    # Local import so the function does not depend on a later cell having
    # imported numpy into the notebook namespace.
    import numpy as np

    n_clusters = archetype.n_clusters
    models = {
        "gmm": GaussianMixture(n_components=n_clusters,
                               max_iter=500, n_init=10,
                               init_params="k-means++",
                               random_state=random_state),
        "kmeans": KMeans(n_clusters=n_clusters,
                         max_iter=500, n_init=10,
                         init="k-means++",
                         random_state=random_state),
        "hdbscan": HDBSCAN(min_samples=5),
        "spectral": SpectralClustering(n_clusters=n_clusters,
                                       affinity='nearest_neighbors',
                                       random_state=random_state),
        "hac": AgglomerativeClustering(n_clusters=n_clusters),
    }

    ami_scores = {}
    arand_scores = {}
    f_noise_scores = {}

    for model_name, model in models.items():
        y_hat = model.fit_predict(X)

        if model_name == 'hdbscan':
            slxn = (y_hat != -1)
            f_noise = np.mean(y_hat == -1)
            if slxn.any():
                ami = adjusted_mutual_info_score(y[slxn], y_hat[slxn])
                arand = adjusted_rand_score(y[slxn], y_hat[slxn])
            else:
                # Every point is noise: scoring two empty labelings yields a
                # spurious perfect 1.0, so report the scores as undefined.
                ami = float("nan")
                arand = float("nan")
        else:
            ami = adjusted_mutual_info_score(y, y_hat)
            arand = adjusted_rand_score(y, y_hat)
            f_noise = 0
        ami_scores[model_name] = ami
        arand_scores[model_name] = arand
        f_noise_scores[model_name] = f_noise

    return { 'archetype': archetype, 'ami_scores': ami_scores, 'arand_scores': arand_scores, 'f_noise_scores': f_noise_scores }

In [3]:
from tqdm import tqdm
import numpy as np
import pandas as pd

# Benchmark records on the data as synthesized (`results`) and on the same
# data after rpl.distort (`results_tf`); both lists share the same ordering.
results = []
results_tf = []

# Number of datasets drawn per archetype. The report cells hard-code the same
# value in their sqrt(10) normalization, so keep them in sync when changing it.
N_REPLICATES = 10

for arch in archetypes:
    # Neither the loop counter nor the third return value is needed; they get
    # explicit throwaway names because binding `_` in a notebook shadows
    # IPython's last-output variable.
    for _replicate in tqdm(range(N_REPLICATES)):
        # untransformed
        X, y, _unused = arch.synthesize(quiet=True)
        results.append(carry_out_benchmark(X, y, arch))

        # transformed (same labels y, distorted coordinates)
        X_tf = rpl.distort(X)
        results_tf.append(carry_out_benchmark(X_tf, y, arch))

100%|██████████| 10/10 [00:43<00:00,  4.33s/it]
100%|██████████| 10/10 [00:42<00:00,  4.23s/it]
100%|██████████| 10/10 [00:49<00:00,  4.91s/it]
100%|██████████| 10/10 [04:08<00:00, 24.86s/it]
100%|██████████| 10/10 [04:14<00:00, 25.42s/it]
100%|██████████| 10/10 [27:33<00:00, 165.35s/it]


In [9]:
import pandas as pd
import numpy as np

# Reload saved benchmark records rather than re-running the benchmark.
# pickle.load can execute arbitrary code: only load files produced by this project.
with open("./output/results.pkl", mode="rb") as results_file:
    results = pickle.load(results_file)

with open("./output/results_tf.pkl", mode="rb") as results_tf_file:
    results_tf = pickle.load(results_tf_file)

In [7]:
# A notebook cell only auto-renders its final expression, so the first table
# used to be computed and silently thrown away. Both are rendered explicitly
# with `display`, which the Jupyter/IPython kernel provides as a builtin.

# Per-archetype sample standard deviation of the AMI scores (`results`).
display(
    pd.DataFrame([ record['ami_scores'] | {"archetype": record["archetype"]} for record in results ])
    .groupby(by="archetype")
    .std()
)
# pd.DataFrame([ record['ami_scores'] for record in results ]).mean()

# Pooled standard deviation of the AMI scores (`results_tf`), divided by
# sqrt(number of archetypes * 10).
display(
    pd.DataFrame([ record['ami_scores'] | {"archetype": record["archetype"]} for record in results_tf ])
    .drop(columns=["archetype"])
    .std() / np.sqrt(len(archetypes) * 10)
)
# pd.DataFrame([ record['ami_scores'] for record in results_tf ]).mean()

Unnamed: 0_level_0,gmm,kmeans,hdbscan,spectral,hac
archetype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
four_clusters_100d_1000_samples,0.192196,0.071925,0.421637,0.043482,0.07819
four_clusters_100d_100_samples,0.025535,0.088461,0.0,0.072695,0.069805
seven_highly_separated_10d_very_different_shapes,0.029666,0.015038,0.017563,0.008949,0.006035
seven_very_different_shapes_significant_overlap_10d,0.060868,0.027187,0.347019,0.045064,0.032753
twelve_clusters_different_distributions,0.020783,0.026388,0.141678,0.031506,0.02342
twelve_different_distributions_high_class_imbalance,0.025146,0.023449,0.171761,0.030163,0.028116


In [21]:
def _hdbscan_rows(records):
    """Return one flat row per record: archetype plus HDBSCAN's AMI, ARI and noise fraction."""
    rows = []
    for rec in records:
        rows.append({
            'archetype': rec['archetype'],
            'ami': rec['ami_scores']['hdbscan'],
            'ari': rec['arand_scores']['hdbscan'],
            'noise': rec['f_noise_scores']['hdbscan'],
        })
    return rows


# HDBSCAN-only views of the benchmark records from `results` and `results_tf`.
hdbscan_df = pd.DataFrame(_hdbscan_rows(results))
hdbscan_tf_df = pd.DataFrame(_hdbscan_rows(results_tf))

In [44]:
### HDBSCAN results

def _print_hdbscan_summaries(frame, headings):
    """Print four summaries of an HDBSCAN score frame, each under its heading.

    ``headings`` supplies, in order, the titles for: per-archetype means,
    per-archetype std / sqrt(10), pooled means, and pooled
    std / sqrt(len(archetypes) * 10).
    """
    summaries = (
        lambda: frame.groupby(by="archetype").mean(),
        lambda: frame.groupby(by="archetype").std()/np.sqrt(10),
        lambda: frame.drop(columns=['archetype']).mean(),
        lambda: frame.drop(columns=['archetype']).std()/np.sqrt(len(archetypes)*10),
    )
    # Each summary is evaluated only after its heading has been printed.
    for heading, summarize in zip(headings, summaries):
        print(heading)
        print(summarize())


_print_hdbscan_summaries(hdbscan_df, (
    "Mean Scores (Convex Clusters)",
    "\nStandard Deviations (Convex Clusters)",
    "\nAverages (Convex Clusters)",
    "\nStandard Deviation for Averages (Convex Clusters)",
))

_print_hdbscan_summaries(hdbscan_tf_df, (
    "\n\nMean Scores (Non-Convex Clusters)",
    "\nStandard Deviations (Non-Convex Clusters)",
    "\nAverages (Non-Convex Clusters)",
    "\nStandard Deviation for Averages (Non-Convex Clusters)",
))

Mean Scores (Convex Clusters)
                                                         ami       ari  \
archetype                                                                
four_clusters_100d_1000_samples                     0.800000  0.800000   
four_clusters_100d_100_samples                      1.000000  1.000000   
seven_highly_separated_10d_very_different_shapes    0.992324  0.983972   
seven_very_different_shapes_significant_overlap...  0.595328  0.616973   
twelve_clusters_different_distributions             0.795948  0.704565   
twelve_different_distributions_high_class_imbal...  0.733352  0.571849   

                                                       noise  
archetype                                                     
four_clusters_100d_1000_samples                     0.999400  
four_clusters_100d_100_samples                      1.000000  
seven_highly_separated_10d_very_different_shapes    0.175000  
seven_very_different_shapes_significant_overlap...  0.909286  

In [None]:
# Tabulate the benchmark records and drop a column named "hdbscan".
# NOTE(review): this cell has no execution count, so it appears never to have been run.
# NOTE(review): if the records are shaped like carry_out_benchmark's return value
# (keys 'archetype', 'ami_scores', 'arand_scores', 'f_noise_scores'), these frames have
# no top-level "hdbscan" column and drop(columns=["hdbscan"]) would raise KeyError.
# Confirm the intended schema before relying on results_df / results_tf_df.
results_df = pd.DataFrame(results).drop(columns=["hdbscan"])
results_tf_df = pd.DataFrame(results_tf).drop(columns=["hdbscan"])

In [60]:
def _score_frame(records, score_key):
    """Build a frame with one row per record: the per-model scores under ``score_key`` plus 'archetype'."""
    return pd.DataFrame([ rec[score_key] | {"archetype": rec["archetype"]} for rec in records ])


def _print_score_summaries(frame, headings):
    """Print four summaries of a score frame, each under its heading.

    ``headings`` supplies, in order, the titles for: per-archetype means,
    per-archetype std / sqrt(10), pooled means, and pooled
    std / sqrt(len(archetypes) * 10).
    """
    summaries = (
        lambda: frame.groupby(by="archetype").mean(),
        lambda: frame.groupby(by="archetype").std()/np.sqrt(10),
        lambda: frame.drop(columns=["archetype"]).mean(),
        lambda: frame.drop(columns=["archetype"]).std()/np.sqrt(len(archetypes)*10),
    )
    # Each summary is evaluated only after its heading has been printed.
    for heading, summarize in zip(headings, summaries):
        print(heading)
        print(summarize())


### AMI Scores

ami_df = _score_frame(results, 'ami_scores')
ami_tf_df = _score_frame(results_tf, 'ami_scores')

_print_score_summaries(ami_df, (
    "AMI Results (Convex)",
    "\nStandard Deviations for AMI Results (Convex)",
    "\nAverages for AMI Results (Convex)",
    "\nStandard Deviations for Averages of AMI Results (Convex)",
))

_print_score_summaries(ami_tf_df, (
    "\nAMI Results (Non-Convex)",
    "\nStandard Deviations for AMI Results (Non-Convex)",
    "\nAverages for AMI Results (Non-Convex)",
    "\nStandard Deviations for Averages of AMI Results (Non-Convex)",
))

####### ARI Scores

ari_df = _score_frame(results, 'arand_scores')
ari_tf_df = _score_frame(results_tf, 'arand_scores')

_print_score_summaries(ari_df, (
    "ARI Results (Convex)",
    "\nStandard Deviations for ARI Results (Convex)",
    "\nAverages for ARI Results (Convex)",
    "\nStandard Deviations for Averages of ARI Results (Convex)",
))

_print_score_summaries(ari_tf_df, (
    "\nARI Results (Non-Convex)",
    "\nStandard Deviations for ARI Results (Non-Convex)",
    "\nAverages for ARI Results (Non-Convex)",
    "\nStandard Deviations for Averages of ARI Results (Non-Convex)",
))

AMI Results (Convex)
                                                         gmm    kmeans  \
archetype                                                                
four_clusters_100d_1000_samples                     0.548380  0.664360   
four_clusters_100d_100_samples                      0.063142  0.511513   
seven_highly_separated_10d_very_different_shapes    0.982781  0.975678   
seven_very_different_shapes_significant_overlap...  0.349109  0.451903   
twelve_clusters_different_distributions             0.883547  0.847643   
twelve_different_distributions_high_class_imbal...  0.854256  0.836212   

                                                     hdbscan  spectral  \
archetype                                                                
four_clusters_100d_1000_samples                     0.800000  0.205358   
four_clusters_100d_100_samples                      1.000000  0.074268   
seven_highly_separated_10d_very_different_shapes    0.992324  0.986217   
seven_very_diffe