# Column Sampling Spectral Clustering

This algorithm is from "Time and Space Effecient Spectral Clustering via Column Sampling" by Li et al.

In [1]:
import numpy as np
import scipy as sp
import scipy.sparse.linalg
import scipy.cluster.vq

In [2]:
A = np.loadtxt('../data/processed/usps.csv', delimiter=',')

In [3]:
inds = A[:, -1] < 3
X = A[inds, :-2]
Y = A[inds, -1].astype(int)
n, d = X.shape
n, d

(2199, 255)

In [55]:
m = 1000
inds = np.random.choice(n, m, replace=False)
Z = X[inds, :]

In [56]:
A_11 = np.empty((m, m))
for i in range(m):
    for j in range(i, m):
        val = np.e ** (-1 * np.linalg.norm(Z[i] - Z[j]) ** 2)
        A_11[i, j] = val
        A_11[j, i] = val

In [57]:
D_star = np.diag(A_11.dot(np.ones(m)))

In [58]:
D_star_ = np.diag(1 / np.sqrt(A_11.dot(np.ones(m))))
M_star = D_star_.dot(A_11).dot(D_star_)

In [59]:
k = 2

In [60]:
M_star = sp.cluster.vq.whiten(M_star)

In [61]:
Lam, V = sp.sparse.linalg.eigsh(M_star, k=k, which='LM')

In [62]:
Lam = np.diag(Lam)
B = D_star_.dot(V).dot(np.linalg.inv(Lam))

In [63]:
Q = np.empty((n,k))
for i in range(n):
    a = np.array([np.linalg.norm(X[i] - Z[j]) for j in range(m)])
    Q[i] = a.dot(B)

In [64]:
dd = Q.dot(Lam).dot(Q.T).dot(np.ones(n))
D_hat = np.diag(dd)
U = np.diag(1 / np.sqrt(dd)).dot(Q)

In [65]:
U_ = sp.cluster.vq.whiten(U)
centroids, distortion = sp.cluster.vq.kmeans(U_, k)
centroids

array([[-0.81143518, -1.6708477 ],
       [-2.39151246, -3.55292014]])

In [66]:
y_hat = np.zeros(n, dtype=int)
for i in range(n):
    dists = np.array([np.linalg.norm(U_[i] - centroids[c]) for c in range(k)])
    y_hat[i] = np.argmin(dists) + 1

In [67]:
accuracy = np.zeros(k)
for i in range(k):
    y_hat = (y_hat + i) % (k + 1)
    y_hat[y_hat == 0] = 1
    accuracy[i] = (Y == y_hat).sum() / n * 100
accuracy.max()

99.636198271941794