# Spectral Clustering

In [1]:
import numpy as np
import scipy as sp
import scipy.sparse.linalg
import scipy.cluster.vq

In [2]:
# Read the whole comma-separated file into a 2-D float array.
A = np.loadtxt(fname='../data/processed/usps.csv', delimiter=',')

In [3]:
# The last column of A is used as the class label; keep only rows whose
# label is below 3.
label_col = A[:, -1]
inds = label_col < 3
# NOTE(review): `:-2` drops the last TWO columns although only the last one
# is read as the label -- confirm the second-to-last column is really meant
# to be excluded from the features.
X = A[inds, :-2]
Y = label_col[inds].astype(int)

In [4]:
# k: number of distinct labels among the selected rows (used as the number
# of clusters); n: number of samples; d: number of feature columns.
k = np.unique(Y).size
n, d = np.shape(X)
(n, d)

(2199, 255)

In [77]:
# Build the symmetric n-by-n matrix W of pairwise squared Euclidean
# distances between the rows of X.
# W holds the raw squared distance; the original cell kept a Gaussian-kernel
# variant, np.e ** (-||x_i - x_j|| ** 2), commented out, and it is not used.
W = np.empty((n, n))
for i in range(n):
    # Differences between sample i and every sample from i onwards: only the
    # upper triangle is computed, then mirrored into the lower triangle.
    diff = X[i:] - X[i]
    # Row-wise sum of squares, i.e. ||x_i - x_j||^2 for all j >= i at once,
    # without the sqrt-then-square round trip of norm(...) ** 2.
    sq_dist = np.einsum('ij,ij->i', diff, diff)
    W[i, i:] = sq_dist
    W[i:, i] = sq_dist

In [78]:
# Normalised Laplacian  L = I - D^{-1/2} W D^{-1/2}.
# ww is the vector of column sums of W; D is the diagonal matrix built from
# it and D_ the diagonal matrix of its inverse square roots.
# NOTE: an all-zero column of W would make 1 / sqrt(ww) infinite.
ww = W.sum(axis=0)
inv_sqrt_ww = 1 / np.sqrt(ww)
# D and D_ are not needed to compute L below; they stay defined in case
# other cells read them.
D = np.diag(ww)
D_ = np.diag(inv_sqrt_ww)
# Multiplying by a diagonal matrix on the left/right just scales the
# rows/columns, so broadcasting yields the same entries as
# D_.dot(W).dot(D_) without two dense n x n matrix products.
L = np.identity(n) - inv_sqrt_ww[:, np.newaxis] * W * inv_sqrt_ww[np.newaxis, :]

In [79]:
# Eigen-decompose the symmetric matrix L, keeping only the k eigenpairs with
# the largest eigenvalues (eigh orders eigenvalues ascending, so these are
# indices n-k .. n-1).  V receives the eigenvalues, Z the matching
# eigenvectors as columns, giving one k-dimensional row per sample.
# `subset_by_index` replaces the deprecated `eigvals` keyword, which newer
# SciPy releases no longer accept; k replaces the hard-coded 2 so the
# embedding dimension follows the number of clusters.
V, Z = sp.linalg.eigh(L, subset_by_index=[n - k, n - 1])

In [80]:
# Rescale each column of the spectral embedding to unit variance, as the
# SciPy k-means routine expects, then cluster the rows into k groups.
# NOTE: per the SciPy docs, kmeans picks its starting centroids randomly when
# given an integer k, so results may differ between runs -- confirm if
# reproducibility matters.
Z_ = sp.cluster.vq.whiten(obs=Z)
kmeans_result = sp.cluster.vq.kmeans(obs=Z_, k_or_guess=k)
centroids, distortion = kmeans_result

In [81]:
# Show the fitted centroids and the distortion value returned by kmeans.
(centroids, distortion)

(array([[-0.07674317,  0.82391454],
        [ 0.06117486, -1.09696632]]), 0.81845751458051619)

In [82]:
# Assign every embedded sample to its nearest centroid.  vq() returns, for
# each row of Z_, the index of the closest row of `centroids`; adding 1 makes
# the cluster ids start at 1, as the original loop did.
# Using vq() over all rows of `centroids` (instead of indexing range(k)) also
# works when kmeans returns fewer than k centroids.
codes = sp.cluster.vq.vq(Z_, centroids)[0]
y_hat = codes.astype(int) + 1

In [83]:
from itertools import permutations

# Cluster ids are arbitrary, so score every one-to-one assignment of cluster
# ids to true labels and report the best accuracy (in percent).
# Unlike the previous cyclic-shift loop, this is correct for any k, does not
# assume the labels are exactly 1..k, and leaves y_hat unmodified.
# NOTE: the number of assignments grows factorially with k; fine for a
# handful of clusters only.
true_labels = np.unique(Y)
cluster_ids = np.unique(y_hat)
# contingency[a, b] = number of samples placed in cluster cluster_ids[a]
# whose true label is true_labels[b].
contingency = np.array([
    [np.count_nonzero((y_hat == c) & (Y == t)) for t in true_labels]
    for c in cluster_ids
])
cluster_rows = np.arange(len(cluster_ids))
# accuracy[p] is the percentage of samples matched under the p-th assignment;
# for two clusters the order is (identity, swapped), as before.
accuracy = np.array([
    contingency[cluster_rows, list(perm)].sum() / n * 100
    for perm in permutations(range(len(true_labels)), len(cluster_ids))
])
accuracy.max()

99.090495679854484