# Spectral Clustering

In [29]:
import numpy as np
import scipy as sp
import scipy.sparse.linalg
import scipy.cluster.vq

In [3]:
A = np.loadtxt('../data/processed/usps.csv', delimiter=',')

In [117]:
inds = A[:, -1] < 3
X = A[inds, :-2]
Y = A[inds, -1].astype(int)

In [106]:
n, d = X.shape
n, d

(2199, 255)

In [107]:
W = np.zeros((n, n))
for i in range(n):
    for j in range(i,n):
        val = np.e ** (-1 * np.linalg.norm(X[i] - X[j]) ** 2)
        W[i, j] = val
        W[j, i] = val

In [126]:
ww = W.sum(axis=1)
D = np.diag(ww)

In [127]:
D_ = np.diag(1 / np.sqrt(ww))
L = np.identity(n) - D_.dot(W).dot(D_)

In [128]:
k = len(np.unique(Y))

In [135]:
V, Z = sp.sparse.linalg.eigsh(L, k=k, mode='SM')

In [136]:
Z.shape

(2199, 2)

In [137]:
Z_ = sp.cluster.vq.whiten(Z)
centroids, distortion = sp.cluster.vq.kmeans(Z_, k)

In [138]:
centroids

array([[-0.06288981, -0.03224486],
       [ 0.51777364,  0.26840626]])

In [139]:
y_hat = np.zeros(n, dtype=int)
for i in range(n):
    dists = np.array([np.linalg.norm(Z_[i] - centroids[c]) for c in range(k)])
    y_hat[i] = np.argmin(dists) + 1

In [140]:
accuracy = np.zeros(k)
for i in range(k):
    y_hat = (y_hat + i) % (k + 1)
    y_hat[y_hat == 0] = 1
    accuracy[i] = (Y == y_hat).sum() / n * 100
accuracy.max()

57.298772169167798