# Experiment: learning graph filters

1. Random kNN graph: $W \in \mathbb{R}^{M \times M} \rightarrow L \rightarrow U, \Lambda$.
2. Random graph signals: $X = \{x_i\}_{i=1}^N \in \mathbb{R}^{M \times N}$.
3. Linear mapping: $f(x_i, c) = U \operatorname{diag}(c) U^T x_i$.
4. Noisy target signals: $Y = \{y_i\}_{i=1}^N \in \mathbb{R}^{M \times N}, y_i = f(x_i, c_{gt}) + \mathcal{N}_M(0,\epsilon)$.
    1. With randomly generated coefficients $c_{gt} \sim \mathcal{N}_M(0,1)$.
5. Convex and smooth loss function: $L = \frac{1}{N} \sum_{i=1}^N \|f(x_i, c) - y_i\|_2^2 = \frac{1}{N} \|U \operatorname{diag}(c) U^TX - Y\|_F^2$.
    1. Gradient: $\nabla_{c} L = \frac{2}{N} \left(U^T X \circ ( c \circ U^T X - U^T Y ) \right) 1_N$.
6. Optimization: $c^* = \operatorname{arg min}_c L(c)$.
7. Verification.
    1. $c^*$ should converge to $c_{gt}$.
    2. The loss $L(c^*)$ should converge to $L(c_{gt})$.

In [None]:
import random, time
import numpy as np
import scipy.sparse, scipy.sparse.linalg
import matplotlib.pyplot as plt
%matplotlib inline
tol = 1e-10

## Random graph

In [None]:
M = 100  # number of nodes
degree = 5  # edges drawn per vertex

# Draw `degree` random targets, each with a random weight, for every source vertex.
I = np.repeat(np.arange(M), degree)
J = np.random.randint(0, M, size=M * degree)
V = np.random.uniform(0, 1, size=M * degree)
W = scipy.sparse.coo_matrix((V, (I, J)), shape=(M, M))

# Drop self-connections.
W.setdiag(0)

# Make the graph non-directed: wherever the transpose holds the larger weight,
# replace the entry of W by it, so that W ends up equal to its transpose.
bigger = W.T > W
W = W - W.multiply(bigger) + W.T.multiply(bigger)
del bigger
assert np.abs(W - W.T).mean() < tol

# Switch to CSR (efficient multiplications) and drop the explicitly stored zeros.
W = W.tocsr()
W.eliminate_zeros()

print("{} > {} edges".format(W.nnz, M * degree))

* $L^\text{unnormalized} = D - W$
* $L^\text{normalized} = I - D^{-1/2} W D^{-1/2}$

In [None]:
normalized_laplacian = False

# Weighted degree of every vertex (column sums of W).
d = W.sum(axis=0)

# Build the requested Laplacian.
if normalized_laplacian:
    # L = I - D^{-1/2} W D^{-1/2}
    d = 1 / np.sqrt(d)
    D = scipy.sparse.diags(d.A.squeeze(), 0)
    I = scipy.sparse.identity(M, dtype=D.dtype)
    L = I - D * W * D
    del I
else:
    # L = D - W
    D = scipy.sparse.diags(d.A.squeeze(), 0)
    L = D - W
del d, D

# The Laplacian has to be symmetric.
assert np.abs(L - L.T).mean() < tol

$L = U \Lambda U^T$ where $\Lambda$ is a diagonal matrix of eigenvalues and the columns of $U$ are the corresponding eigenvectors.

In [None]:
# Cross-check several eigensolvers on the Laplacian before settling on one.

# Sparse symmetric solver, asked for the M-1 eigenpairs of smallest magnitude.
(lamb_s, U_s) = scipy.sparse.linalg.eigsh(L, k=M-1, which='SM')
# NOTE(review): this call is identical to the previous one; presumably one of the
# two was meant to use a different sparse solver -- confirm.
(lamb_sh, U_sh) = scipy.sparse.linalg.eigsh(L, k=M-1, which='SM')
# Dense symmetric solver on the full matrix.
(lamb_h, U_h) = np.linalg.eigh(L.toarray())
# Dense general solver.
# NOTE(review): probably not broken -- np.linalg.eig does not return its eigenvalues
# in any particular order, so a direct comparison with eigh would need a sort first.
(lamb, U) = np.linalg.eig(L.toarray())
# Eigenvalues: the sparse results must match all but the last eigh eigenvalue.
np.testing.assert_allclose(lamb_h[:-1], lamb_s, atol=tol)
np.testing.assert_allclose(lamb_h[:-1], lamb_sh, atol=tol)
# Disabled: comparison against the np.linalg.eig eigenvalues (see the note above).
#np.testing.assert_allclose(lamb_h, lamb, atol=tol)
# Eigenvectors are compared through their absolute values, which ignores sign flips.
np.testing.assert_allclose(np.abs(U_h[:,:-1]), np.abs(U_s), atol=tol)
np.testing.assert_allclose(np.abs(U_h[:,:-1]), np.abs(U_sh), atol=tol)
# Disabled: comparison against the np.linalg.eig eigenvectors (see the note above).
#np.testing.assert_allclose(np.abs(U_h), np.abs(U), atol=tol)

# Keep the eigh decomposition for the rest of the experiment.
(lamb, U) = (lamb_h, U_h)

## Linear mapping: graph filter

Linear mapping $f(x_i, c) = U C U^T x_i$, $C$ is the diagonal matrix $C = \operatorname{diag}(c)$, i.e. $c = C 1_M$.

* Parametrized low-pass filter coefficients $(c_{gt})_i = \operatorname{e}^{-t \lambda_i}$
* Random filter coefficients $c_{gt} \sim \mathcal{N}_M(0,1)$

In [None]:
# Choose the ground-truth filter coefficients c_gt (one per eigenvalue).
parametrized = False

if parametrized:
    # Low-pass filter exp(-t * lambda_i).
    t = 2
    c_gt = np.exp(-t * lamb)
    # `lamb` comes from np.linalg.eigh, hence in ascending order: the filter
    # response has to decrease from one coefficient to the next.
    # (The check used to read the undefined name `c` instead of `c_gt`.)
    assert np.all(c_gt[:-1] - c_gt[1:] > 0)
else:
    # Random coefficients drawn from N(0, 1).
    c_gt = np.random.normal(0, 1, M)

## Signals

* Random input signals $X \sim \mathcal{N}_{M \times N}(0,1)$
  * Low-pass signals ?
* Noisy target signals $y_i = f(x_i, c_{gt}) + \mathcal{N}_M(0,\epsilon)$

In [None]:
N = 200  # number of signals
eps = 0.1  # standard deviation of the noise added to the targets

# Inputs: one random signal per column.
X = np.random.normal(0, 1, (M, N))

# Targets: inputs filtered with the ground-truth coefficients, plus optional noise.
Y = U @ np.diag(c_gt) @ U.T @ X
if eps > 0:
    Y += np.random.normal(0, eps, (M, N))

## Loss function

* Loss function $L = \frac{1}{N} \sum_{i=1}^N \|f(x_i, c) - y_i\|_2^2 = \frac{1}{N} \|UCU^TX - Y\|_F^2$.
    * Spectral domain: $L = \frac{1}{N} \| C U^T X - U^T Y \|_F^2$.
    * Independent coefficients: $L = \frac{1}{N} \sum_{i=1}^M \| c_i (U^T X)_{i,\cdot} - (U^T Y)_{i,\cdot} \|_2^2$.
    * Convex and smooth w.r.t. $c$.
* Gradient:
    * Independent coefficients: $\nabla_{c_i} L = \frac{2}{N} ( c_i (U^T X)_{i,\cdot} - (U^T Y)_{i,\cdot} ) (X^T U)_{\cdot,i}$.
    * $\nabla_{c} L = \frac{2}{N} \left(U^T X \circ ( c \circ U^T X - U^T Y ) \right) 1_N$.
* Optimization $c^* = \operatorname{arg min}_{c} L(c)$

In [None]:
def L(c):
    """Mean squared error of the graph filter with coefficients c.

    Filters the global inputs X with U diag(c) U^T, subtracts the global
    targets Y and returns the squared Frobenius norm of the residual divided
    by the number of signals (columns of X).
    """
    _, n_signals = X.shape
    filtered = U @ np.diag(c) @ U.T @ X
    residual = filtered - Y
    return np.linalg.norm(residual, 'fro')**2 / n_signals

# Sanity check: at the ground truth the residual is the added noise only, so the
# loss is the sum of M*N squared noise samples divided by N. Its expectation is
# M * eps**2 and its relative standard deviation is sqrt(2 / (M*N)). A fixed 1%
# relative tolerance is a one-sigma test for M*N = 20000 and fails at random, so
# five standard deviations are allowed instead. atol covers the noiseless case,
# where the expected value is 0.
print("L(c_gt) = {}".format(L(c_gt)))
np.testing.assert_allclose(L(c_gt), M * eps**2, rtol=5 * np.sqrt(2 / (M * N)), atol=tol)

def dL(X, Y, c, variant=None):
    """Gradient of the loss with respect to the filter coefficients c.

    Evaluates 2/N * ((U^T X) o (c o U^T X - U^T Y)) 1_N, where N is the number
    of columns of X and U is the global Fourier basis.

    Parameters
    ----------
    X, Y : 2-D arrays of identical shape, one signal per column.
    c : 1-D array with one coefficient per row of X.
    variant : selects the implementation. 1 is the matrix form, 2 is an
        explicit loop over the coefficients, anything else (default) is the
        vectorized form. All three return the same gradient.

    Returns
    -------
    1-D array with one partial derivative per coefficient.
    """
    M, N = X.shape
    # Signals expressed in the spectral domain.
    A = U.T @ X
    B = U.T @ Y
    # Speed: v3 >> v1 > v2.
    # Compare by value: identity tests (`is`) on integers are unreliable.
    if variant == 1:
        return 2 / N * np.diag(A @ (A.T @ np.diag(c) - B.T))
    elif variant == 2:
        dc = np.empty(M)
        for i in range(M):
            dc[i] = 2 / N * (c[i] * A[i,:] - B[i,:]) @ A.T[:,i]
        return dc
    else:
        # Speed: .sum(axis=1) is faster than multiplying by np.ones(N).
        return 2 / N * (A * (c[:,np.newaxis] * A - B)).sum(axis=1)

# The gradient vanishes at the global minimum. With noise, c_gt is not necessarily
# the optimum, so this is only checked for noiseless targets.
if eps <= 0:
    np.testing.assert_allclose(dL(X, Y, c_gt), 0, atol=tol)

# Every implementation of the gradient has to agree with the default one.
for _variant in (1, 2):
    np.testing.assert_allclose(dL(X, Y, c_gt), dL(X, Y, c_gt, _variant))
del _variant
## Optimization: optimality condition

* Only possible because $L$ is convex and smooth.
* Optimality condition $\nabla_c L = 0$ gives $(U^T X \circ U^T X) 1_N \circ c = (U^T X \circ U^T Y) 1_N$.

In [None]:
# Closed-form minimizer given by the optimality condition, timed.
tstart = time.process_time()
A = U.T @ X  # inputs in the spectral domain
B = U.T @ Y  # targets in the spectral domain
c_opt = np.sum(A * B, axis=1) / np.sum(A * A, axis=1)
print('Execution time: {:1.0f}ms'.format((time.process_time() - tstart) * 1000))

print("L(c_opt) = {}".format(L(c_opt)))

# The gradient has to vanish at the optimum.
np.testing.assert_allclose(dL(X, Y, c_opt), 0, atol=tol)

# Without noise the optimum coincides with the ground truth.
if eps <= 0:
    np.testing.assert_allclose(c_opt, c_gt)
    np.testing.assert_allclose(L(c_opt), L(c_gt), atol=tol)

## Optimization: stochastic (mini-batch) gradient descent

* Works also for $L$ which are non-smooth (with sub-gradient) or non-convex.
* Idea: descend the gradient of the loss function.
* Efficiency: compute the gradient $\nabla_c L$ with a sub-set (mini-batch) of the training data.
    * Extreme case: one sample at a time. Very inefficient.
* Update rule (gradient descent) $c^{n+1} = c^n - \lambda_n \nabla_c L$.
* Note: objective (loss on training set) and error (on validation set) are usually evaluated after each epoch. The algorithm is thus stopped after a maximum number of epochs rather than iterations.
* Hyper-parameters.
    * Learning rate (step size) $\lambda_n$. Bigger the batch size, smaller the learning rate.
        * Tradeoff.
            * Small: progress is steady but slow.
            * Big: risks of oscillations or divergence.
        * There are tricks, e.g. vanishing step (like simulated annealing).
    * Size of the mini-batch.
        * We want the one that minimizes the *training time*.
        * Trade-off: should be limited by the available memory, somewhere around 100.
            * Larger is more stable, but computationally more expensive.
            * Smaller demands more accesses to memory, which is slow.
            * Larger exploits the parallelism of modern hardware architectures (SIMD on CPU, GPU).
        * Extreme cases:
            * $1$: stochastic gradient descent.
            * $N$: gradient descent.
    * Stopping criterion.
        * Convergence of the loss function $L$.
        * Convergence of the parameters $c$.
        * Maximum number of iterations.

In [None]:
def sgd(learning_rate=.1, batch_size=100, conv=1e-3, maxit=100, window=10):
    """Stochastic (mini-batch) gradient descent on the loss L.

    Starts from coefficients drawn uniformly in [0, 1) and repeatedly applies
    c <- c - learning_rate * dL(batch), using the global data X, Y.

    Parameters
    ----------
    learning_rate : step size of the update.
    batch_size : number of signals (columns of X and Y) per update.
    conv : stop once the mean loss over the last `window` iterations improved
        by less than `conv` compared to the `window` iterations before.
    maxit : maximum number of updates.
    window : number of iterations averaged by the convergence test.

    Returns
    -------
    (c, loss) : the final coefficients and the list of losses, i.e. the
        initial loss followed by one entry per update.
    """
    indices = []
    c = np.random.uniform(0, 1, M)
    loss = [L(c)]

    def stop(loss):
        """Stop after convergence of the loss or when out of iterations."""
        if len(loss) > maxit:
            return True
        if len(loss) < 2 * window:
            # Not enough history to compare two windows yet.
            return False
        recent = np.mean(loss[-window:])
        previous = np.mean(loss[-2*window:-window])
        return bool(previous - recent < conv)

    while not stop(loss):

        # Be sure to have used all the samples before using one a second time.
        # Refill in a loop: a single shuffled pass is not enough when
        # batch_size > N, and would silently yield a shorter batch.
        while len(indices) < batch_size:
            indices.extend(np.random.permutation(N))
        idx = indices[:batch_size]
        del indices[:batch_size]

        c -= learning_rate * dL(X[:,idx], Y[:,idx], c)
        loss.append(L(c))

    return c, loss

In [None]:
maxit = 40  # iteration budget of every SGD run


def plot(learning_rate, batch_size):
    """Run SGD with the given hyper-parameters and plot its loss curve.

    The legend entry reports the learning rate, the batch size, the final
    loss and the CPU time spent in the run, in milliseconds.
    """
    started = time.process_time()
    coefficients, history = sgd(learning_rate, batch_size, 1e-3, maxit)
    elapsed_ms = 1000 * (time.process_time() - started)
    legend = 'rate={}, size={}, L(c)={:1.2e}, time={:1.0f}ms'.format(
        learning_rate, batch_size, L(coefficients), elapsed_ms)
    plt.plot(history, label=legend)

plt.figure(figsize=(15, 5))

# One loss curve per (learning rate, batch size) pair.
for _rate, _size in ((0.2, 1), (0.2, 5), (0.2, 50), (0.2, 100), (0.6, 100)):
    plot(_rate, _size)
del _rate, _size

# Horizontal reference levels: closed-form optimum and ground truth.
plt.plot(np.full(maxit + 1, L(c_opt)), label='optimality condition')
plt.plot(np.full(maxit + 1, L(c_gt)), label='ground truth')

plt.title('Convergence, M={}, N={}, eps={}'.format(M, N, eps))
plt.xlabel('iteration n')
plt.ylabel('loss L(c^n)')
plt.legend()
plt.show()

Questions:
* Noise: why don't we find the same loss as the ground truth, but the same as the closed-form solution (optimality condition)?
    * The gradient was incorrectly set to $\nabla_c L = \frac{2}{N} U^T X (X^T U c - Y^T U 1_M)$.
* More samples, e.g. $N=2000$: why don't we find the same loss as the closed-form solution (optimality condition)?
    * Learning rate too high.