# Column Sampling Spectral Clustering

This algorithm is from "Time and Space Effecient Spectral Clustering via Column Sampling" by Li et al.

In [2]:
import numpy as np
import scipy as sp
import scipy.sparse.linalg
import scipy.cluster.vq

In [3]:
A = np.loadtxt('../data/processed/usps.csv', delimiter=',')

In [105]:
inds = A[:, -1] < 5
X = A[inds, :-2]
Y = A[inds, -1].astype(int)
k = len(np.unique(Y))
n, d = X.shape
n, d

(3588, 255)

In [106]:
m = 1000
inds = np.random.choice(n, m, replace=False)
Z = X[inds, :]

In [107]:
np.bincount(Y[inds])

array([  0, 341, 248, 208, 203], dtype=int64)

In [108]:
mu = 0
for i in range(m):
    for j in range(m):
        mu += np.linalg.norm(Z[i] - Z[j]) ** 2
mu /= (m ** 2)
mu = 1 / mu

In [109]:
A_11 = np.empty((m, m))
for i in range(m):
    for j in range(i, m):
        val = np.e ** (-mu * np.linalg.norm(Z[i] - Z[j]) ** 2)
        A_11[i, j] = val
        A_11[j, i] = val

In [110]:
D_star = np.diag(A_11.dot(np.ones(m)))

In [111]:
D_star_ = np.diag(1 / np.sqrt(A_11.dot(np.ones(m))))
M_star = D_star_.dot(A_11).dot(D_star_)

In [112]:
M_star = sp.cluster.vq.whiten(M_star)

In [113]:
Lam, V = sp.sparse.linalg.eigsh(M_star, k=k, which='LM')

In [114]:
Lam = np.diag(Lam)
B = D_star_.dot(V).dot(np.linalg.inv(Lam))

In [115]:
Q = np.empty((n,k))
for i in range(n):
    a = np.array([np.linalg.norm(X[i] - Z[j]) for j in range(m)])
    Q[i] = a.dot(B)

In [116]:
dd = Q.dot(Lam).dot(Q.T).dot(np.ones(n))
D_hat = np.diag(dd)
U = np.diag(1 / np.sqrt(dd)).dot(Q)

In [117]:
P = U.T.dot(U)
Sig, Vp = sp.linalg.eigh(P)
Sig_ = np.diag(np.sqrt(Sig))
B = Sig_.dot(Vp.T).dot(Lam).dot(Vp).dot(Sig_)
Lam_tilde, V_tilde = sp.linalg.eigh(B)
U = U.dot(Vp).dot(np.diag(1 / np.sqrt(Sig))).dot(V_tilde)

In [118]:
centroids, distortion = sp.cluster.vq.kmeans(U, k)
centroids

array([[ 0.00190233,  0.00979719,  0.02155122,  0.01740117],
       [ 0.01470648, -0.01967388, -0.00418508,  0.0167114 ],
       [ 0.00487298,  0.01658585, -0.02047612,  0.01583117],
       [-0.02335306, -0.0093726 , -0.00321419,  0.0166447 ]])

In [119]:
y_hat = np.zeros(n, dtype=int)
for i in range(n):
    dists = np.array([np.linalg.norm(U[i] - centroids[c]) for c in range(k)])
    y_hat[i] = np.argmin(dists) + 1

In [120]:
accuracy = np.zeros(k)
for i in range(k):
    y_hat = (y_hat + i) % (k + 1)
    y_hat[y_hat == 0] = 1
    accuracy[i] = (Y == y_hat).sum() / n * 100
accuracy.max()

48.996655518394647