# Experiment: classification with learned graph filters

We want to classify data by first extracting meaningful features from learned filters.

In [None]:
import time
import numpy as np
import scipy.sparse, scipy.sparse.linalg, scipy.spatial.distance
from sklearn import datasets, linear_model
import matplotlib.pyplot as plt
%matplotlib inline

# Dataset

* Two-digit version of MNIST (binary classification) with N samples of each class.

In [None]:
def mnist(a, b, N):
    """Prepare data for binary classification of MNIST.

    Parameters
    ----------
    a, b : int
        The two digit classes to keep. Samples of ``a`` get label -1,
        samples of ``b`` get label +1.
    N : int
        Number of samples taken from each class (the first ``N`` found).

    Returns
    -------
    X : ndarray, shape (M, 2*N)
        One sample per column: first the ``N`` samples of ``a``, then the
        ``N`` samples of ``b``. ``M`` is the number of features per sample.
    y : ndarray, shape (2*N, 1)
        Labels in {-1, +1}, in the same order as the columns of ``X``.

    Raises
    ------
    ValueError
        If one of the classes holds fewer than ``N`` samples.
    """
    # ``fetch_mldata`` was removed from scikit-learn (and mldata.org is gone).
    # Use it only where it still exists, otherwise fall back to OpenML.
    legacy_fetch = getattr(datasets, 'fetch_mldata', None)
    if legacy_fetch is not None:
        bunch = legacy_fetch('MNIST original', data_home='.')
        data, target = bunch.data, bunch.target
    else:
        data, target = datasets.fetch_openml(
            'mnist_784', version=1, data_home='.',
            return_X_y=True, as_frame=False)
        # OpenML stores the labels as strings.
        target = target.astype(int)

    in_a = target == a
    in_b = target == b
    available = min(np.count_nonzero(in_a), np.count_nonzero(in_b))
    # Taking exactly all samples of the smaller class is fine.
    if N > available:
        raise ValueError(
            'N={} exceeds the {} samples available per class'.format(N, available))
    M = data.shape[1]

    X = np.empty((M, 2, N))
    X[:, 0, :] = data[in_a][:N].T
    X[:, 1, :] = data[in_b][:N].T

    y = np.empty((2, N))
    y[0, :] = -1
    y[1, :] = +1

    # Flatten the (class, sample) axes: column index = class * N + sample.
    return X.reshape(M, 2*N), y.reshape(2*N, 1)

# Two-class problem: digit 5 (label -1) versus digit 1 (label +1),
# 1000 samples per class.
X, y = mnist(5, 1, 1000)

# NOTE: from here on N is the total number of samples (columns of X), i.e.
# twice the per-class count passed to mnist(); M is the number of features.
M, N = X.shape
print('Dimensionality: N={} samples, M={} features'.format(N, M))

# Shift by a constant; presumably the raw pixels lie in [0, 255] so that this
# centres the range on zero -- the print below shows the actual bounds.
X -= 127.5
print('X in [{}, {}]'.format(np.min(X), np.max(X)))

def plot_digit(nn):
    """Show the samples of the global ``X`` selected by ``nn`` side by side.

    Parameters
    ----------
    nn : sequence of numbers
        Column indices into ``X``. Each one is cast with ``int`` so float
        literals such as ``1e3`` are accepted.
    """
    # Samples are square images stored as flat columns of length M.
    m = int(np.sqrt(M))
    # squeeze=False keeps ``axes`` two-dimensional, so a single index works
    # too (otherwise a bare Axes is returned and cannot be indexed).
    fig, axes = plt.subplots(1, len(nn), figsize=(15, 5), squeeze=False)
    for ax, n in zip(axes[0], nn):
        n = int(n)
        ax.imshow(X[:, n].reshape((m, m)))
        ax.set_title('Label: y = {:.0f}'.format(y[n, 0]))

# Columns 0, 1, 100, 101 belong to the first class and columns 1000, 1001 are
# the first two samples of the second class (float indices are cast to int
# inside plot_digit).
plot_digit([0, 1, 1e2, 1e2+1, 1e3, 1e3+1])

# Optional: restrict the problem to a subset of features / samples.
#M, N = 784, 1000
#X = X[:M, :N]
#y = y[:N]

# Regularized least-square

## Loss and gradient

In [None]:
def L(w, b=0):
    """Ridge objective on the global data ``X``, ``y``.

    Returns the squared norm of the residual ``X.T @ w + b - y`` divided by
    the global ``N``, plus ``tauR`` times the squared norm of ``w``.
    """
    residual = X.T @ w + b - y
    data_term = np.linalg.norm(residual)**2 / N
    penalty = tauR * np.linalg.norm(w)**2
    return data_term + penalty

def dL(w, nn=None):
    """Gradient with respect to ``w`` of the bias-free ridge objective.

    ``nn`` is accepted for signature compatibility but is not used.
    """
    n_samples = len(y)
    residual = X.T @ w - y
    return 2 / n_samples * X @ residual + 2 * tauR * w

def print_perf(w, L, dL):
    """Print the loss and the gradient norm at a named point.

    Parameters
    ----------
    w : str
        Python expression (typically a variable name such as ``'w_skl'``)
        that evaluates to the point to inspect. It is also used as the
        label in the printed lines.
    L : callable
        Loss function, called with the evaluated point.
    dL : callable
        Gradient function, called with the evaluated point.
    """
    # NOTE(review): eval() executes arbitrary code; only pass trusted,
    # hard-coded expressions. The expression is evaluated once and the result
    # reused for both the loss and the gradient.
    point = eval(w)
    print('L({}) = {}'.format(w, L(point)))
    print('|dL({})| = {}'.format(w, np.linalg.norm(dL(point))))

## Reference: sklearn ridge regression

* With normalized (centered) data, the objective is the same with or without a bias term.

In [None]:
# Weight of the ||w||^2 penalty in L() and dL().
tauR = 1e0

# Ridge's data term is presumably not divided by the number of samples (see
# the sklearn documentation), hence alpha = tauR * N to match L() above.
clf = linear_model.Ridge(alpha=tauR*N, fit_intercept=False)
clf.fit(X.T, y)
# Transposed so that w_skl is a column vector like y.
w_skl = clf.coef_.T

print('L(w_skl) = {}'.format(L(w_skl, clf.intercept_)))
print_perf('w_skl', L, dL)

# Normalized data: intercept should be small.
print('bias: {}'.format(abs(np.mean(y - X.T @ w_skl))))

## Linear classifier

In [None]:
# Closed-form ridge solution from the normal equations
#     (X X^T + tauR N I) w = X y.
# Solving the linear system avoids forming the explicit inverse (and the
# M x M by M x N product that followed it): cheaper and numerically safer.
w_d = np.linalg.solve(X @ X.T + tauR * N * np.identity(M), X @ y)
print_perf('w_d', L, dL)
# The direct solution must agree with the sklearn reference.
np.testing.assert_allclose(w_d, w_skl, atol=1e-8)

# Feature graph

In [None]:
def graph_grid(k=4):
    """Construct a kNN graph arranged on a 2D grid.

    The ``M`` features (global ``M``, which must be a perfect square) are
    placed on a regular sqrt(M) x sqrt(M) grid over the unit square. Each
    vertex is linked to its ``k`` nearest neighbours with Gaussian-kernel
    weights, and the result is passed through ``graph_regularize``.

    Parameters
    ----------
    k : int
        Number of nearest neighbours per vertex.

    Returns
    -------
    W : sparse matrix, shape (M, M)
        Weight matrix as returned by ``graph_regularize``.
    """
    # Construct a grid. (``np.int`` no longer exists in NumPy; the builtin
    # ``int`` is what it aliased.)
    m = int(np.sqrt(M))
    ticks = np.linspace(0, 1, m)
    xx, yy = np.meshgrid(ticks, ticks)
    z = np.empty((M, 2))
    z[:, 0] = xx.reshape(M)
    z[:, 1] = yy.reshape(M)

    # Compute pairwise distances.
    d = scipy.spatial.distance.pdist(z, 'euclidean')
    d = scipy.spatial.distance.squareform(d)

    # k-NN graph. Column 0 of the ordering is the vertex itself (distance 0),
    # so it is skipped. Gathering the distances through the same index array
    # guarantees that each weight is paired with its neighbour.
    idx = np.argsort(d)[:, 1:k+1]
    d = np.take_along_axis(d, idx, axis=1)

    # Weights: Gaussian kernel whose width is the mean distance to the k-th
    # neighbour.
    sigma2 = np.mean(d[:, -1])**2
    d = np.exp(- d**2 / sigma2)

    # Weight matrix.
    I = np.arange(0, M).repeat(k)
    J = idx.reshape(M*k)
    V = d.reshape(M*k)
    W = scipy.sparse.coo_matrix((V, (I, J)), shape=(M, M))

    W = graph_regularize(W)
    print("{} > {} edges".format(W.nnz, M*k))
    return W

def graph_regularize(W):
    """Return a symmetric, self-loop-free CSR version of the weight matrix.

    For every pair (i, j) the larger of ``W[i, j]`` and ``W[j, i]`` is kept
    in both positions. The diagonal of the matrix passed in is zeroed in
    place; explicit zeros are dropped from the returned matrix.
    """
    # Drop self-connections.
    W.setdiag(0)

    # Make the graph undirected: wherever the transposed entry is larger,
    # swap it in for the original one.
    Wt = W.T
    take_transposed = Wt > W
    sym = W - W.multiply(take_transposed) + Wt.multiply(take_transposed)
    asymmetry = np.abs(sym - sym.T).mean()
    assert asymmetry < 1e-10

    # CSR is the format of choice for fast matrix products.
    sym = sym.tocsr()
    sym.eliminate_zeros()
    return sym

# Feature graph: every pixel is linked to its k=4 nearest grid neighbours.
W = graph_grid(k=4)

In [None]:
def laplacian(W, normalized=True):
    """Return the Laplacian of the weight matrix.

    Parameters
    ----------
    W : scipy.sparse matrix, shape (n, n)
        Symmetric weight matrix. With ``normalized=True`` every vertex must
        have a non-zero degree (the degrees are inverted).
    normalized : bool
        If True return ``I - D^-1/2 W D^-1/2``, otherwise ``D - W``, where
        ``D`` is the diagonal degree matrix.

    Returns
    -------
    L : scipy.sparse matrix, shape (n, n)
        The graph Laplacian.
    lmax : float
        Upper bound on the spectrum: the constant 2 for the normalized
        Laplacian, the largest-magnitude eigenvalue (computed with ``eigsh``)
        otherwise.
    """
    # The size comes from W itself rather than from a notebook global.
    n = W.shape[0]

    # Degree of every vertex as a flat array (``sum`` returns a 1 x n matrix).
    d = np.asarray(W.sum(axis=0)).ravel()

    if normalized:
        D = scipy.sparse.diags(1 / np.sqrt(d), 0)
        I = scipy.sparse.identity(n, dtype=D.dtype)
        lap = I - D @ W @ D
        lmax = 2
    else:
        D = scipy.sparse.diags(d, 0)
        lap = D - W
        lmax = scipy.sparse.linalg.eigsh(
            lap, k=1, which='LM', return_eigenvectors=False)[0]

    # A symmetric W must yield a symmetric Laplacian.
    assert np.abs(lap - lap.T).mean() < 1e-10
    return lap, lmax

# Normalized Laplacian of the feature graph and the bound on its spectrum.
# (Named LL, presumably so that it does not clash with the loss function L.)
LL, lmax = laplacian(W, True)

# Lanczos basis

In [None]:
# Number of Lanczos vectors computed per signal (first axis of the basis).
K = 3

def lanczos(L, X, K):
    """Run K steps of the Lanczos iteration on every column of X at once.

    Parameters
    ----------
    L : object with a ``dot`` method, (M, M)
        The operator (e.g. a sparse Laplacian).
    X : ndarray, shape (M, N)
        Starting vectors, one per column. They are normalized internally.
    K : int
        Number of Lanczos vectors to build.

    Returns
    -------
    V : ndarray, shape (K, M, N)
        Lanczos vectors; ``V[0]`` holds the normalized columns of X.
    a : ndarray, shape (K, N)
        Diagonal of the tridiagonal matrix, per column of X.
    b : ndarray, shape (K, N)
        Off-diagonal: ``b[k]`` couples vectors ``k-1`` and ``k``; ``b[0]`` is 0.
    """
    n_features, n_signals = X.shape
    alpha = np.empty((K, n_signals))
    beta = np.zeros((K, n_signals))
    basis = np.empty((K, n_features, n_signals))

    basis[0] = X / np.linalg.norm(X, axis=0)
    for k in range(K - 1):
        current = basis[k]
        r = L.dot(current)
        alpha[k] = np.sum(r * current, axis=0)
        # Three-term recurrence: remove the components along the current
        # and (from the second step on) the previous Lanczos vector.
        r = r - alpha[k] * current
        if k > 0:
            r = r - beta[k] * basis[k - 1]
        else:
            r = r - 0
        beta[k + 1] = np.linalg.norm(r, axis=0)
        basis[k + 1] = r / beta[k + 1]

    last = basis[K - 1]
    alpha[K - 1] = np.sum(L.dot(last) * last, axis=0)
    return basis, alpha, beta

def lanczos_H_diag(a, b):
    """Diagonalize the tridiagonal Lanczos matrix of every signal.

    Parameters
    ----------
    a : ndarray, shape (K, N)
        Diagonal entries, one column per signal.
    b : ndarray, shape (K, N)
        Off-diagonal entries; row 0 is ignored.

    Returns
    -------
    Q : ndarray, shape (K, K, N)
        ``Q[:, :, n]`` holds the eigenvectors (as columns) of the n-th
        tridiagonal matrix, as returned by ``np.linalg.eigh``.
    """
    K, N = a.shape
    H = np.zeros((K, K, N))
    rows = np.arange(K)
    H[rows, rows, :] = a
    # Only one triangle is filled; after the transpose below it is the lower
    # one, which is the triangle eigh is told to read.
    H[rows[:-1], rows[1:], :] = b[1:, :]
    eigvecs = np.linalg.eigh(H.T, UPLO='L')[1]
    # Move the signal axis from the front to the back.
    return eigvecs.transpose(1, 2, 0)

def lanczos_basis_eval(L, X, K):
    """Express every column of X in the eigenbasis of its Lanczos matrix.

    Runs ``lanczos`` and ``lanczos_H_diag``, rotates the Lanczos vectors of
    each signal by the transposed eigenvector matrix, then scales the result
    by the first row of the eigenvector matrices and by the column norms of X.

    Returns
    -------
    Xt : ndarray, shape (K, M, N)
        The transformed signals.
    q : ndarray, shape (K, N)
        First row of the eigenvector matrix of every signal.
    """
    V, a, b = lanczos(L, X, K)
    Q = lanczos_H_diag(a, b)
    n_features, n_signals = X.shape

    Xt = np.empty((K, n_features, n_signals))
    for n in range(n_signals):
        eigvecs = Q[:, :, n]
        Xt[:, :, n] = eigvecs.T @ V[:, :, n]

    first_row = Q[0]
    Xt *= first_row[:, np.newaxis, :]
    Xt *= np.linalg.norm(X, axis=0)
    return Xt, first_row

# Xt has shape (K, M, N); q has shape (K, N) and is reused by direct_split().
Xt, q = lanczos_basis_eval(LL, X, K)

# GFL classification with weights

* Memory arrangement for fastest computations: put the largest dimensions last, i.e. on the fastest-varying indices (e.g. `Xt` is stored as K x M x N).
* `np.einsum` seems to be efficient for three operands.

In [None]:
# Second dimension of the coefficient matrix C (K x F) and first dimension of
# the weights W (F x M) -- presumably the number of learned filters.
F = 2

In [None]:
def test():
    """Test the speed of filtering and weighting.

    Four equivalent ways of contracting the global ``Xt`` (K x M x N) with
    random coefficients ``C`` (K x F) and weights ``W`` (F x M) into a vector
    of length N are timed over 1000 runs each and checked against the first
    implementation.
    """

    def mult(impl=3):
        # Integers are compared by value: ``is`` tests identity and only
        # happened to work because CPython caches small ints.
        if impl == 0:
            Xb = Xt.view()
            Xb.shape = (K, M*N)
            XCb = Xb.T @ C  # in MN x F
            XCb = XCb.T.reshape((F*M, N))
            return (XCb.T @ w).squeeze()
        elif impl == 1:
            tmp = np.tensordot(Xt, C, (0, 0))
            return np.tensordot(tmp, W, ((0, 2), (1, 0)))
        elif impl == 2:
            tmp = np.tensordot(Xt, C, (0, 0))
            return np.einsum('ijk,ki->j', tmp, W)
        elif impl == 3:
            return np.einsum('kmn,fm,kf->n', Xt, W, C)
        # Fail loudly instead of silently returning None.
        raise ValueError('unknown implementation: {}'.format(impl))

    C = np.random.normal(0, 1, (K, F))
    W = np.random.normal(0, 1, (F, M))
    w = W.reshape((F*M, 1))
    reference = mult(impl=0)
    for impl in range(4):
        tstart = time.process_time()
        for _ in range(1000):
            result = mult(impl)
        print('Execution time (impl={}): {}'.format(impl, time.process_time() - tstart))
        np.testing.assert_allclose(reference, result)
#test()

In [None]:
#class gflc_weights:

def L(C, W):
    """Objective of the filter-learning model with weights.

    The prediction contracts the global ``Xt`` with the coefficients ``C``
    and the weights ``W``; the loss is the squared residual norm divided by
    the global ``N`` plus ``tauR`` times the squared norm of ``W``.
    """
    prediction = np.einsum('kmn,kf,fm->n', Xt, C, W)
    residual = prediction - y.squeeze()
    fit = np.linalg.norm(residual)**2 / N
    return fit + tauR * np.linalg.norm(W)**2

def dLw(C, W):
    """Gradient of the objective with respect to the weights ``W`` (F x M)."""
    residual = np.einsum('kmn,kf,fm->n', Xt, C, W) - y.squeeze()
    data_grad = np.einsum('kmn,kf,n->fm', Xt, C, residual)
    # Flattened equivalent, kept for reference:
    #return 2 / N * XCb @ (XCb.T @ w - y) + 2 * tauR * w
    return 2 / N * data_grad + 2 * tauR * W

def dLc(C, W):
    """Gradient of the objective with respect to the coefficients ``C`` (K x F).

    The penalty only involves ``W``, so there is no regularization term here.
    """
    residual = np.einsum('kmn,kf,fm->n', Xt, C, W) - y.squeeze()
    data_grad = np.einsum('kmn,n,fm->kf', Xt, residual, W)
    return 2 / N * data_grad

def print_perf(C, W, loss):
    """Print the loss and both gradient norms at (C, W), then plot ``loss``.

    ``loss`` is the sequence of objective values recorded by a solver; it is
    drawn on a logarithmic y-axis.
    """
    objective = L(C, W)
    print('L = {}'.format(objective))
    grad_w_norm = np.linalg.norm(dLw(C, W))
    print('|dLw| = {}'.format(grad_w_norm))
    grad_c_norm = np.linalg.norm(dLc(C, W))
    print('|dLc| = {}'.format(grad_c_norm))
    plt.semilogy(loss)

In [None]:
def test_optim(algo):
    """Run a solver, print its CPU time and hand its result to print_perf.

    ``algo`` is called without arguments; whatever tuple it returns is
    unpacked as the arguments of the current ``print_perf``.
    """
    started = time.process_time()
    result = algo()
    elapsed = time.process_time() - started
    print('Processing time: {}'.format(elapsed))
    print_perf(*result)

In [None]:
def sgd():
    """Alternating gradient descent on C and W from a random start.

    Performs 100 iterations with a fixed step of 1e-8; each iteration updates
    ``C`` first and then ``W`` using the already updated ``C``.

    Returns
    -------
    C, W : ndarray
        Final coefficients (K x F) and weights (F x M).
    loss : list of float
        Objective at the start and after every iteration.
    """
    step = 1e-8
    C = np.random.normal(0, 1, (K, F))
    W = np.random.normal(0, 1, (F, M))

    history = [L(C, W)]
    for _ in range(100):
        C -= step * dLc(C, W)
        W -= step * dLw(C, W)
        history.append(L(C, W))

    return C, W, history

# Time the gradient-descent solver and report its final loss and gradient norms.
test_optim(sgd)

In [None]:
def direct():
    """Alternating closed-form updates of C and W from a random start.

    Each of the 5 iterations solves a least-squares problem for the
    coefficients with the weights fixed, then a ridge problem for the weights
    with the coefficients fixed.

    Returns
    -------
    C, W : ndarray
        Final coefficients (K x F) and weights (F x M).
    loss : list of float
        Objective at the start and after every iteration.
    """
    C = np.random.normal(0, 1, (K, F))
    W = np.random.normal(0, 1, (F, M))
    # Column-vector views: writing into them updates C and W in place.
    c_flat = C.reshape((K*F, 1))
    w_flat = W.reshape((F*M, 1))
    ridge = tauR * N * np.identity(F*M)

    history = [L(C, W)]

    for _ in range(5):
        # Coefficient step: design matrix with the weights folded in.
        design_c = np.einsum('kmn,fm->kfn', Xt, W).reshape((K*F, N))
        c_flat[:] = np.linalg.solve(design_c @ design_c.T, design_c @ y)

        # Weight step: design matrix with the coefficients folded in.
        design_w = np.einsum('kmn,kf->fmn', Xt, C).reshape((F*M, N))
        w_flat[:] = np.linalg.solve(design_w @ design_w.T + ridge, design_w @ y)

        history.append(L(C, W))
    return C, W, history

# Time the closed-form alternating solver and report its final loss and gradient norms.
test_optim(direct)

## GFL classification with splitting

Solvers
* Closed-form solution.
* Stochastic gradient descent.

In [None]:
# Weight of the splitting penalty ||XCb - Z||^2 in Lsplit and its gradients.
tauF = 1e0

In [None]:
def L(C, w, Z, XCb):
    """Original (non-split) objective evaluated on the filtered signals XCb.

    ``C`` and ``Z`` are part of the common solver-state signature but are not
    used here.
    """
    residual = XCb.T @ w - y
    return np.linalg.norm(residual)**2 / N + tauR * np.linalg.norm(w)**2

def Lsplit(C, w, Z, XCb):
    """Split objective: fit on Z, coupling of Z to XCb, and ridge on w.

    ``C`` is part of the common solver-state signature but is not used here.
    """
    fit = np.linalg.norm(Z.T @ w - y)**2 / N
    coupling = tauF / N * np.linalg.norm(XCb - Z)**2
    ridge = tauR * np.linalg.norm(w)**2
    return fit + coupling + ridge

def dLw(C, w, Z, XCb):
    """Gradient of the split objective with respect to ``w``.

    Only ``w`` and ``Z`` are used; ``C`` and ``XCb`` are ignored.
    """
    residual = Z.T @ w - y
    return 2 / N * Z @ residual + 2 * tauR * w

def dLc(C, w, Z, XCb):
    """Gradient of the split objective with respect to ``C`` (K x F).

    The filtered signals are recomputed from the global ``Xt``; the ``w`` and
    ``XCb`` arguments are ignored.
    """
    basis = Xt.reshape((K, M*N)).T    # MN x K
    target = Z.reshape((F, M*N)).T    # MN x F
    mismatch = basis @ C - target
    return 2 * tauF / N * basis.T @ mismatch

def dLz(C, w, Z, XCb):
    """Gradient of the split objective with respect to ``Z``.

    ``C`` is part of the common solver-state signature but is not used here.
    """
    fit_grad = 2 / N * w @ (w.T @ Z - y.T)
    coupling_grad = 2 * tauF / N * (Z - XCb)
    return fit_grad + coupling_grad

def print_perf(*args):
    """Print diagnostics for a splitting solver and plot its loss curves.

    ``args`` is the tuple returned by a solver: the solver state followed by
    two loss histories. Everything except the last two items is forwarded to
    the loss and gradient functions; the last two are drawn on a logarithmic
    y-axis in the same figure.
    """
    state = args[:-2]
    print('L = {}'.format(L(*state)))
    print('Lsplit = {}'.format(Lsplit(*state)))
    for_w = np.linalg.norm(dLw(*state))
    print('|dLw| = {}'.format(for_w))
    for_c = np.linalg.norm(dLc(*state))
    print('|dLc| = {}'.format(for_c))
    for_z = np.linalg.norm(dLz(*state))
    print('|dLz| = {}'.format(for_z))
    plt.semilogy(args[-2])
    #plt.figure()
    plt.semilogy(args[-1])

In [None]:
def lanczos_filter(C):
    """Apply the coefficients ``C`` (K x F) to the global basis ``Xt``.

    Returns the filtered signals as an (F*M, N) array.
    """
    basis = Xt.reshape((K, M*N)).T
    filtered = basis @ C  # MN x F
    # Going from MN x F to F*M x N requires a copy of the data.
    return filtered.T.reshape((F*M, N))

In [None]:
def sgd_split():
    """Alternating gradient descent on the split objective.

    Starts from ``C = 0``, ``w = 0`` and a random ``Z``, then performs 100
    iterations updating ``C`` (step 1e-7), the filtered signals ``XCb``,
    ``Z`` and ``w`` (step 1e-3 each), in that order. Despite the name, every
    step uses the full gradient.

    Returns
    -------
    C, w, Z, XCb : ndarray
        Final solver state.
    loss, loss_split : list of float
        Values of ``L`` and ``Lsplit`` at the start and after every iteration.
    """
    C = np.zeros((K, F))
    w = np.zeros((F*M, 1))
    Z = np.random.normal(0, 1, (F*M, N))

    # Filtered signals for the initial C. An np.empty buffer would feed
    # uninitialized memory into the first loss values.
    XCb = lanczos_filter(C)

    loss = [L(C, w, Z, XCb)]
    loss_split = [Lsplit(C, w, Z, XCb)]

    for _ in range(100):
        C -= 1e-7 * dLc(C, w, Z, XCb)
        XCb[:] = lanczos_filter(C)
        Z -= 1e-3 * dLz(C, w, Z, XCb)
        w -= 1e-3 * dLw(C, w, Z, XCb)
        loss.append(L(C, w, Z, XCb))
        loss_split.append(Lsplit(C, w, Z, XCb))

    return C, w, Z, XCb, loss, loss_split

# Time the gradient-descent splitting solver and report its diagnostics.
test_optim(sgd_split)

In [None]:
def direct_split():
    """Alternating closed-form updates of the split objective.

    Starts from ``C = 0``, ``w = 0`` and a random ``Z``, then performs 8
    iterations, each updating ``C``, the filtered signals ``XCb``, ``Z`` and
    ``w`` in turn.

    Returns
    -------
    C, w, Z, XCb : ndarray
        Final solver state.
    loss, loss_split : list of float
        Values of ``L`` and ``Lsplit`` at the start and after every iteration.
    """
    C = np.zeros((K, F))
    w = np.zeros((F*M, 1))
    Z = np.random.normal(0, 1, (F*M, N))

    # Filtered signals for the initial C. An np.empty buffer would feed
    # uninitialized memory into the first loss values.
    XCb = lanczos_filter(C)
    Xb = Xt.reshape((K, M*N)).T
    # View of Z: it follows the in-place updates of Z below.
    Zb = Z.reshape((F, M*N)).T

    # Loop invariants.
    c_scale = np.sum((np.linalg.norm(X, axis=0) * q)**2, axis=1)[:, np.newaxis]
    eye = np.identity(F*M)

    loss = [L(C, w, Z, XCb)]
    loss_split = [Lsplit(C, w, Z, XCb)]

    for _ in range(8):

        C[:] = Xb.T @ Zb / c_scale
        XCb[:] = lanczos_filter(C)

        # Linear systems are solved directly rather than through an explicit
        # matrix inverse.
        Z[:] = np.linalg.solve(tauF * eye + w @ w.T, tauF * XCb + w @ y.T)
        w[:] = np.linalg.solve(Z @ Z.T + tauR * N * eye, Z @ y)

        loss.append(L(C, w, Z, XCb))
        loss_split.append(Lsplit(C, w, Z, XCb))

    return C, w, Z, XCb, loss, loss_split

# Time the closed-form splitting solver and report its diagnostics.
test_optim(direct_split)