We compare the clustering algorithms k-means, HDBSCAN, Quickshift++, and CLASSIX.

## Gaussian blobs  

In [1]:
import hdbscan
from sklearn.cluster import KMeans
from quickshift.QuickshiftPP import *
from threadpoolctl import threadpool_limits

In [2]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from classix import CLASSIX, loadData
import classix
from time import time
import cv2
import os


import pandas as pd
import numpy as np

# Print the installed CLASSIX version so the results below can be tied to it.
print(classix.__version__)

from HiPart.clustering import DePDDP
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import adjusted_mutual_info_score as ami

import collections


1.0.5


In [3]:
def _fit_and_score(name, fit, get_labels, y_true):
    """Time one clustering fit and score its labels against the ground truth.

    Parameters
    ----------
    name : str
        Algorithm name, used as the prefix of the printed timing line.
    fit : callable
        Zero-argument callable performing the fit. Everything it does is
        included in the measured wall-clock time.
    get_labels : callable
        Zero-argument callable returning the predicted labels; it is called
        after ``fit`` and outside the timed region.
    y_true : array-like
        Ground-truth cluster labels.

    Returns
    -------
    tuple
        ``(ari_score, ami_score, elapsed_seconds)``.
    """
    st = time()
    fit()
    et = time()
    elapsed = et - st
    # Report the timing right away; the metric computations come afterwards.
    print(name + " uses time:", elapsed)

    predicted = get_labels()
    # Each metric is computed once and reused for both printing and storage.
    ari_score = ari(predicted, y_true)
    print("ARI:", ari_score)
    ami_score = ami(predicted, y_true)
    print("AMI:", ami_score)
    return ari_score, ami_score, elapsed


# Experiment configuration.
fdim = 5             # number of features per sample
size = int(2 * 1e5)  # number of samples
clusterNum = 5       # number of blob centers passed to make_blobs

# Create the output directory up front: np.save does not create missing
# directories, and failing here is far cheaper than failing after the
# benchmarks below have finished.
os.makedirs("result/exp1", exist_ok=True)

lari = list()
lami = list()
runtime = list()
X, y = make_blobs(n_samples=size, centers=clusterNum, n_features=fdim,
                  random_state=42)
# Size of the smallest ground-truth cluster. Not used further in this cell
# (HDBSCAN's min_cluster_size below is set explicitly).
min_cluster = min(collections.Counter(y).items(), key=lambda x: x[1])[1]

# Restrict BLAS to a single thread while the benchmarks run.
# NOTE(review): only the 'blas' API is limited here; confirm whether other
# thread pools (e.g. OpenMP) should be limited too for a like-for-like timing.
with threadpool_limits(limits=1, user_api='blas'):
    clx = CLASSIX(radius=0.17, verbose=0)
    _hdbscan = hdbscan.HDBSCAN(min_cluster_size=5000, algorithm='best')
    quicks = QuickshiftPP(k=80, beta=0.9)
    kmeans = KMeans(n_clusters=clusterNum, init='k-means++', random_state=1)

    # (printed name, timed fit call, accessor for the fitted labels).
    # The order determines the order of entries in the saved result arrays.
    benchmarks = [
        ("CLASSIX", lambda: clx.fit(X), lambda: clx.labels_),
        ("HDBSCAN", lambda: _hdbscan.fit(X), lambda: _hdbscan.labels_),
        # The C-ordered copy is made inside the timed call, as before.
        ("Quickshift", lambda: quicks.fit(X.copy(order='C')),
         lambda: quicks.memberships),
        ("kmeans", lambda: kmeans.fit(X), lambda: kmeans.labels_),
    ]

    for algo_name, run_fit, read_labels in benchmarks:
        ari_score, ami_score, elapsed = _fit_and_score(
            algo_name, run_fit, read_labels, y)
        lari.append(ari_score)
        lami.append(ami_score)
        runtime.append(elapsed)

    # DePDDP comparison is currently disabled. If re-enabled, it needs its own
    # st/et timing around fit_predict (the lines below reuse stale values).
    # depddp_class = DePDDP(max_clusters_number=clusterNum).fit_predict(X)
    # print("DePDDP uses time:", et - st)
    # print("ARI:", ari(depddp_class, y))
    # print("AMI:", ami(depddp_class, y))
    # lari.append(ari(depddp_class, y))
    # lami.append(ami(depddp_class, y))
    # runtime.append(et - st)

lari = np.asarray(lari)
lami = np.asarray(lami)
runtime = np.asarray(runtime)

# One label vector per algorithm, in the same order as the scores above.
labels_ = [clx.labels_, _hdbscan.labels_, quicks.memberships, kmeans.labels_]# , depddp_class]
np.save("result/exp1/blobs_ari.npy", lari)
np.save("result/exp1/blobs_ami.npy", lami)
np.save("result/exp1/blobs_runtime.npy", runtime)
np.save("result/exp1/blobs_labels.npy", labels_)

CLASSIX uses time: 0.6888551712036133
ARI: 1.0
AMI: 1.0
HDBSCAN uses time: 190.40062499046326
ARI: 0.9967470647150922
AMI: 0.9936118389400329
Quickshift uses time: 43.935232162475586
ARI: 1.0
AMI: 1.0




kmeans uses time: 2.293975353240967
ARI: 1.0
AMI: 1.0
