This notebook compares the clustering quality (ARI and AMI) and the runtime of k-means, HDBSCAN, Quickshift++, and CLASSIX.

## CovidENV

In [1]:
import hdbscan
from sklearn.cluster import KMeans
from quickshift.QuickshiftPP import *
from threadpoolctl import threadpool_limits

In [2]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from classix import CLASSIX, loadData
import classix
from time import time
import cv2
import os


import pandas as pd
import numpy as np

print(classix.__version__)

from HiPart.clustering import DePDDP
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import adjusted_mutual_info_score as ami

import collections


1.0.4


In [3]:
# Benchmark CLASSIX, HDBSCAN, Quickshift++ and k-means on the CovidENV data.
# Each model is fitted once and only the `fit` call is timed. ARI/AMI against
# the ground-truth labels and the runtimes are collected in the order
# [CLASSIX, HDBSCAN, Quickshift++, k-means] and saved under RESULT_DIR.

RESULT_DIR = "result/exp1"
# np.save does not create missing parent directories. Create the output folder
# before the long-running fits so the cell cannot fail at the very end and
# discard hours of computation.
os.makedirs(RESULT_DIR, exist_ok=True)

CovidENV = loadData('CovidENV')
X, y = CovidENV
np.random.seed(42)
# Indices of a 10% subsample drawn without replacement (used for Quickshift++).
sampling = np.random.choice(np.arange(X.shape[0]), int(X.shape[0]*0.1), replace=False)
X, y = np.array(X), np.array(y)
X_sample, y_sample = X[sampling], y[sampling] # for Quickshift++
clusterNum = len(np.unique(y))

lari = list()
lami = list()
runtime = list()
# Size of the smallest ground-truth class. Not used in this cell; kept because
# later cells may read it.
min_cluster = min(collections.Counter(y).values())


def _fit_and_score(name, model, data, truth, labels_attr="labels_"):
    """Fit ``model`` on ``data``, time the fit, and record its scores.

    Prints the elapsed time, ARI and AMI, and appends them to the
    module-level lists ``runtime``, ``lari`` and ``lami`` (one entry each).

    Parameters
    ----------
    name : str
        Label used in the "<name> uses time:" message.
    model : object
        Estimator exposing ``fit(data)`` and, after fitting, the attribute
        named by ``labels_attr``.
    data : array-like
        Samples passed to ``model.fit``.
    truth : array-like
        Ground-truth labels the predicted labels are compared against.
    labels_attr : str, default "labels_"
        Name of the attribute holding the predicted labels after ``fit``.
    """
    st = time()
    model.fit(data)
    et = time()
    elapsed = et - st
    print(f"{name} uses time:", elapsed)

    predicted = getattr(model, labels_attr)
    # Compute each metric once and reuse it for both printing and recording.
    score_ari = ari(predicted, truth)
    score_ami = ami(predicted, truth)
    print("ARI:", score_ari)
    print("AMI:", score_ami)

    lari.append(score_ari)
    lami.append(score_ami)
    runtime.append(elapsed)


# Limit BLAS to a single thread for all timed fits.
with threadpool_limits(limits=1, user_api='blas'):
    clx = classix.CLASSIX(radius=0.2, minPts=500, verbose=0)
    _fit_and_score("CLASSIX", clx, X, y)

    _hdbscan = hdbscan.HDBSCAN(min_cluster_size=180000, min_samples=5)
    _fit_and_score("HDBSCAN", _hdbscan, X, y)

    # Quickshift++ runs on the 10% subsample and exposes its labels as
    # `memberships`. The C-ordered copy is made before the timer starts, so
    # the reported time covers `fit` only.
    quicks = QuickshiftPP(k=3500, beta=0.9)
    _fit_and_score("Quickshift", quicks, X_sample.copy(order='C'), y_sample,
                   labels_attr="memberships")

    kmeans = KMeans(n_clusters=clusterNum, init='k-means++', random_state=1)
    _fit_and_score("kmeans", kmeans, X, y)

    # DePDDP is currently disabled. If re-enabled it needs its own timing
    # (shown below); without it the reported time would be a stale value.
    # st = time()
    # depddp_class = DePDDP(max_clusters_number=clusterNum).fit_predict(X)
    # et = time()
    # print("DePDDP uses time:", et - st)
    # print("ARI:", ari(depddp_class, y))
    # print("AMI:", ami(depddp_class, y))
    # lari.append(ari(depddp_class, y))
    # lami.append(ami(depddp_class, y))
    # runtime.append(et - st)


# Full-data label vectors only; the Quickshift++ labels cover the subsample
# and are therefore saved to a separate file.
labels_ = [clx.labels_, _hdbscan.labels_, kmeans.labels_]# , depddp_class]
lari = np.asarray(lari)
lami = np.asarray(lami)
runtime = np.asarray(runtime)
np.save(f"{RESULT_DIR}/CovidENV_ari.npy", lari)
np.save(f"{RESULT_DIR}/CovidENV_ami.npy", lami)
np.save(f"{RESULT_DIR}/CovidENV_runtime.npy", runtime)
np.save(f"{RESULT_DIR}/CovidENV_labels.npy", labels_)
np.save(f"{RESULT_DIR}/CovidENV_quicks_labels.npy", quicks.memberships)

CLASSIX uses time: 8.066692590713501
ARI: 0.3394674199932332
AMI: 0.5051838946088059
HDBSCAN uses time: 4332.441341876984
ARI: 0.33852678600121944
AMI: 0.4903916598721745
Quickshift uses time: 7621.237954378128
ARI: 0.2878034517576962
AMI: 0.4862109032346981




kmeans uses time: 135.60977172851562
ARI: 0.12623990744063812
AMI: 0.4385715619602608
