# $k$-means-based approximate spectral clustering

This algorithm is from "[Fast Approximate Spectral Clustering](https://people.eecs.berkeley.edu/~jordan/papers/yan-huang-jordan-kdd09.pdf)" by Yan et el.

In [1]:
import numpy as np
import scipy as sp
import scipy.sparse.linalg
import scipy.cluster.vq

In [2]:
A = np.loadtxt('../data/processed/usps.csv', delimiter=',')

In [192]:
inds = A[:, -1] < 3
X = A[inds, :-2]
Y = A[inds, -1].astype(int)
n, d = X.shape
n, d

(2199, 255)

In [207]:
# data reduction ratio
gamma = 2
k_prime = n // gamma

In [208]:
centroids_prime, distortion_prime = sp.cluster.vq.kmeans(X, k_prime)

In [209]:
y_prime = np.empty(n)
for i in range(n):
    dists = np.array([np.linalg.norm(X[i] - c) for c in centroids_prime])
    y_prime[i] = np.argmin(dists)

In [211]:
X_prime = centroids_prime
n_prime, d_prime = X_prime.shape
n_prime, d_prime

(871, 255)

In [212]:
W = np.zeros((n_prime, n_prime))
for i in range(n_prime):
    for j in range(i, n_prime):
        val = np.e ** (-1 * np.linalg.norm(X_prime[i] - X_prime[j]) ** 2)
        W[i, j] = val
        W[j, i] = val

In [213]:
D = np.diag(W.sum(axis=1))
D_ = np.diag(1 / np.sqrt(W.sum(axis=1)))
L = np.identity(n_prime) - D_.dot(W).dot(D_)

In [214]:
k = len(np.unique(Y))

In [215]:
V, Z = sp.sparse.linalg.eigsh(L, k=k, mode='SM')

In [224]:
Z_ = sp.cluster.vq.whiten(Z)
centroids, distortion = sp.cluster.vq.kmeans(Z_, k)
centroids

array([[-0.10526097,  0.02261042],
       [ 0.78493158, -0.16552445]])

In [229]:
y_hat_prime = np.empty(n_prime)
for i in range(n_prime):
    dists = np.array([np.linalg.norm(Z[i] - c) for c in centroids])
    y_hat_prime[i] = np.argmin(dists)

In [230]:
y_hat = np.zeros(n)
for i in range(n):
    y_hat[i] = y_hat_prime[ int(y_prime[i]) ]

In [231]:
accuracy = np.zeros(k)
for i in range(k):
    y_hat = (y_hat + i) % (k + 1)
    y_hat[y_hat == 0] = 1
    accuracy[i] = (Y == y_hat).sum() / n * 100
accuracy.max()

54.297407912687589