This notebook compares k-means, HDBSCAN, Quickshift++, and CLASSIX.

## Face clustering

In [1]:
import hdbscan
from sklearn.cluster import KMeans
from quickshift.QuickshiftPP import *
from threadpoolctl import threadpool_limits

In [2]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from classix import CLASSIX, loadData
import classix
from time import time
import cv2
import os


import pandas as pd
import numpy as np

# Print the installed CLASSIX version so the results below can be tied to a specific release.
print(classix.__version__)

from HiPart.clustering import DePDDP
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import adjusted_mutual_info_score as ami

import collections


1.0.5


In [3]:
def order_pics(figs, num_subjects=40):
    """Order image file names by the subject number embedded in each name.

    The subject number is read from the second underscore-separated field of the
    name after removing any ``.jpg`` (e.g. ``"face_12.jpg"`` -> ``12``). Names that
    have no such field, whose field is not an integer, or whose number lies outside
    ``1..num_subjects`` are skipped.

    Parameters
    ----------
    figs : iterable of str
        Candidate file names.
    num_subjects : int, default 40
        Highest subject number to keep.

    Returns
    -------
    images : list of str
        Accepted file names in ascending subject order; names that share a subject
        number keep their relative order from ``figs``.
    labels : list of int
        Subject number (1-based) of each entry in ``images``.
    """
    numbered = []
    for img in figs:
        try:
            num = int(img.split('_')[1].replace('.jpg', ''))
        except (IndexError, ValueError):
            # No second '_' field, or the field is not an integer: not a face image.
            # Only these two errors are expected; anything else should surface.
            continue
        if 1 <= num <= num_subjects:
            numbered.append((num, img))

    # list.sort is stable, so ties keep their input order (same result as scanning
    # the whole list once per subject number, but with a single pass).
    numbered.sort(key=lambda pair: pair[0])
    images = [img for _, img in numbered]
    labels = [num for num, _ in numbered]
    return images, labels
    
def load_images(folder, shape=(100, 100)):
    """Load the face images found in ``folder`` as flattened grayscale vectors.

    File names are filtered and ordered by ``order_pics``. Each readable image is
    converted to grayscale, resized to ``shape`` and flattened to one row. Files
    that OpenCV cannot read are skipped together with their label, so rows and
    labels always stay aligned.

    Parameters
    ----------
    folder : str
        Directory containing the image files.
    shape : tuple of int, default (100, 100)
        Target size handed to ``cv2.resize``.

    Returns
    -------
    images : numpy.ndarray
        One row per loaded image, with ``shape[0] * shape[1]`` columns.
    targets : numpy.ndarray
        Zero-based subject label for each row (subject number minus one).
    """
    figs, subject_numbers = order_pics(os.listdir(folder))
    images = list()
    kept_numbers = list()
    for filename, subject in zip(figs, subject_numbers):
        img = cv2.imread(os.path.join(folder, filename))
        if img is None:
            # cv2.imread returns None for an unreadable file; check before any other
            # cv2 call (which would raise) and drop the matching label as well.
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # transform to grayscale
        img = cv2.resize(img, shape, interpolation=cv2.INTER_AREA)  # resize to `shape`
        images.append(img)
        kept_numbers.append(subject)
    images, targets = np.array(images), np.array(kept_numbers) - 1
    # Use the known pixel count instead of images.shape[1] * images.shape[2] so that
    # an empty result reshapes to (0, n_pixels) rather than raising IndexError.
    images = images.reshape(len(images), shape[0] * shape[1])
    return images, targets

# Per-algorithm results, appended in the order CLASSIX, HDBSCAN, Quickshift, k-means.
lari = list()
lami = list()
runtime = list()

folder = 'OlivettiFaces'
X, y = load_images(folder)
clusterNum = len(np.unique(y))

# Size of the smallest ground-truth class.
min_cluster = min(collections.Counter(y).items(), key=lambda x: x[1])[1]


def _record_result(name, predicted, elapsed):
    """Print and store the timing and agreement scores of one clustering run.

    ARI and AMI are computed once against the module-level ground truth ``y``,
    printed, and appended to the module-level ``lari`` / ``lami`` / ``runtime`` lists.

    Parameters
    ----------
    name : str
        Algorithm name used as the prefix of the timing line.
    predicted : array-like
        Cluster label assigned to each sample.
    elapsed : float
        Wall-clock fit time in seconds.
    """
    score_ari = ari(predicted, y)
    score_ami = ami(predicted, y)
    print(f"{name} uses time:", elapsed)
    print("ARI:", score_ari)
    print("AMI:", score_ami)
    lari.append(score_ari)
    lami.append(score_ami)
    runtime.append(elapsed)


# Restrict BLAS to a single thread for every algorithm in the comparison.
with threadpool_limits(limits=1, user_api='blas'):
    clx = CLASSIX(sorting='pca', radius=0.54, minPts=3, verbose=0, group_merging='distance')
    st = time()
    clx.fit(X)
    et = time()
    _record_result("CLASSIX", clx.labels_, et - st)

    _hdbscan = hdbscan.HDBSCAN(min_cluster_size=2, algorithm='best')
    st = time()
    _hdbscan.fit(X)
    et = time()
    _record_result("HDBSCAN", _hdbscan.labels_, et - st)

    quicks = QuickshiftPP(k=9, beta=0.7)
    st = time()
    # The C-ordered copy is made inside the timed region, so it counts towards the runtime.
    quicks.fit(X.copy(order='C'))
    et = time()
    _record_result("Quickshift", quicks.memberships, et - st)

    # NOTE(review): n_init is left at the library default, which changed from 10 to
    # 'auto' in scikit-learn 1.4 - consider pinning it so the scores are reproducible.
    kmeans = KMeans(n_clusters=clusterNum, init='k-means++', random_state=1)
    st = time()
    kmeans.fit(X)
    et = time()
    _record_result("kmeans", kmeans.labels_, et - st)

    # DePDDP is disabled for this experiment. To re-enable it, time the call explicitly
    # (otherwise the stale st/et of the k-means run would be reported):
    # st = time()
    # depddp_class = DePDDP(max_clusters_number=clusterNum).fit_predict(X)
    # et = time()
    # _record_result("DePDDP", depddp_class, et - st)

labels_ = [clx.labels_, _hdbscan.labels_, quicks.memberships, kmeans.labels_]# , depddp_class]
lari = np.asarray(lari)
lami = np.asarray(lami)
runtime = np.asarray(runtime)

# np.save does not create missing directories; make sure the output folder exists first.
os.makedirs("result/exp1", exist_ok=True)
np.save("result/exp1/olivetti_ari.npy", lari)
np.save("result/exp1/olivetti_ami.npy", lami)
np.save("result/exp1/olivetti_runtime.npy", runtime)
np.save("result/exp1/olivetti_labels.npy", labels_)

CLASSIX uses time: 1.13997220993042
ARI: 0.6655494516761021
AMI: 0.839551568838854
HDBSCAN uses time: 3.9601776599884033
ARI: 0.3940236570910377
AMI: 0.7273864449199154
Quickshift uses time: 3.698025703430176
ARI: 0.6588833850300853
AMI: 0.8399985246684827




kmeans uses time: 5.143144369125366
ARI: 0.6370368301567606
AMI: 0.7810840623704158
