# Standard Nystrom-based Spectral Clustering

This algorithm is from "[Fast Spectral Clustering via the Nystrom Method](http://www.cs.columbia.edu/~jebara/papers/ALT2013FSCVTNM.pdf)" by Choromanska et el.

In [1]:
import numpy as np
import scipy as sp
import scipy.sparse.linalg
import scipy.cluster.vq

%matplotlib inline

In [2]:
A = np.loadtxt('../data/processed/usps.csv', delimiter=',')

In [84]:
inds = A[:, -1] < 3
X = A[inds, :-2]
Y = A[inds, -1].astype(int)
n, d = X.shape
n, d

(2199, 255)

In [85]:
l = 50
inds = np.random.choice(d, l, replace=False)
inds

array([ 74, 149, 244, 138,  36, 251, 128,  55, 137, 245, 223,  23, 124,
        50, 250,   2,   5, 179, 129,  79, 247, 150,  94, 241,  75, 122,
        20,  67, 166,  14, 203, 169, 118,  58, 189,  89, 107,  12, 108,
       180, 130,  27,  29,  47, 146, 225, 243,  85,  66,  90])

In [86]:
W_hat = np.empty((n, l))
for i in range(n):
    for j in range(l):
        val = np.e ** (-1 * np.linalg.norm(X[i] - X[inds[j]]) ** 2)
        W_hat[i, j] = val

In [87]:
D = np.diag(1 / np.sqrt(W_hat.sum(axis=1)))
Delta = np.diag(1 / np.sqrt(W_hat.sum(axis=0)))

In [88]:
C = np.identity(n)[:, inds] - np.sqrt(l / n) * D.dot(W_hat).dot(Delta)

In [89]:
B = C[inds, :]

In [90]:
r = l // 2
U, s, V = sp.sparse.linalg.svds(B, r)
B_r = U.dot(np.diag(s)).dot(V)

In [91]:
s_Br, U_Br = np.linalg.eigh(B_r)
S_Br = np.diag(s_Br)

In [92]:
S_tilde = (n / l) * S_Br
U_tilde = np.sqrt(n / l) * C.dot(U_Br).dot(np.linalg.pinv(S_Br))

In [93]:
k = 2
Z = U_tilde[:, :k]
z = np.sqrt((Z ** 2).sum(axis=1))
for i in range(n):
    if z[i] > 0:
        Z[i] /= z[i]

In [136]:
centroids, distortion = sp.cluster.vq.kmeans(Z, k)
centroids

array([[-1.,  0.],
       [ 1.,  0.]])

In [137]:
y_hat = np.zeros(n, dtype=int)
for i in range(n):
    d = np.array([np.linalg.norm(Z[i] - c) for c in centroids])
    y_hat[i] = np.argmin(d) + 1

In [138]:
accuracy = np.zeros(k)
for i in range(k):
    y_hat = (y_hat + i) % (k + 1)
    y_hat[y_hat == 0] = 1
    accuracy[i] = (Y == y_hat).sum() / n * 100
accuracy.max()

56.525693497044116