# Experiment: learning graph filters

1. Random kNN graph: $W \in \mathbb{R}^{M \times M} \rightarrow L \rightarrow U, \Lambda$.
2. Random graph signals: $X = \{x_i\}_{i=1}^N \in \mathbb{R}^{M \times N}$.
3. Linear mapping: $f(x_i, c) = U \operatorname{diag}(c) U^T x_i$.
4. Noisy target signals: $Y = \{y_i\}_{i=1}^N \in \mathbb{R}^{M \times N}, y_i = f(x_i, c_{gt}) + \mathcal{N}_M(0,\epsilon)$.
    1. With randomly generated coefficients $c_{gt} \sim \mathcal{N}_M(0,1)$.
5. Convex and smooth loss function: $L = \frac{1}{N} \sum_{i=1}^N \|f(x_i, c) - y_i\|_2^2 = \frac{1}{N} \|U \operatorname{diag}(c) U^TX - Y\|_F^2$.
    1. Gradient: $\nabla_{c} L = \frac{2}{N} \left(U^T X \circ ( c \circ U^T X - U^T Y ) \right) 1_N$.
6. Optimization: $c^* = \operatorname{arg min}_c L(c)$.
7. Verification.
    1. $c^*$ should converge to $c_{gt}$.
    2. The loss $L(c^*)$ should converge to $L(c_{gt})$.

In [None]:
import random, time
import numpy as np
import scipy.sparse, scipy.sparse.linalg, scipy.spatial.distance
import matplotlib.pyplot as plt
%matplotlib inline
tol = 1e-10

## Setting

### Graph

* A completely random graph is not smooth at all and will thus have a large spectral gap, i.e. $\lambda_1 \gg \lambda_0$.
* A grid, on the contrary, is very regular.

In [None]:
M = 100  # nodes (graph_grid reshapes a sqrt(M) x sqrt(M) grid, so M must be a perfect square)
k = 4  # edges per vertex (number of nearest neighbours kept for each node)

def graph_random():
    """Build a random directed graph with k weighted out-edges per node.

    Every one of the M nodes gets k edges towards uniformly drawn targets
    (duplicates and self-loops are possible), with weights drawn uniformly
    in [0, 1). Returns the (M, M) weight matrix in sparse COO format.
    """
    n_edges = M * k
    sources = np.repeat(np.arange(M), k)
    targets = np.random.randint(0, M, n_edges)
    weights = np.random.uniform(0, 1, n_edges)
    return scipy.sparse.coo_matrix((weights, (sources, targets)), shape=(M, M))

def graph_grid():
    """Construct a kNN graph arranged on a 2D grid.

    The M nodes are placed on a regular sqrt(M) x sqrt(M) grid covering the
    unit square. Each node is connected to its k nearest neighbours (itself
    excluded) with Gaussian weights exp(-d^2 / sigma^2), where sigma is the
    mean distance to the k-th neighbour.

    Returns
    -------
    scipy.sparse.coo_matrix
        The (M, M) directed weight matrix with k entries per row.

    Raises
    ------
    ValueError
        If M is not a perfect square.
    """

    # Construct a grid.
    # The builtin int replaces the np.int alias, which was removed from NumPy.
    m = int(round(np.sqrt(M)))
    if m * m != M:
        raise ValueError('M={} is not a perfect square'.format(M))
    x = np.linspace(0,1,m)
    y = np.linspace(0,1,m)
    xx, yy = np.meshgrid(x, y)
    z = np.empty((M,2))
    z[:,0] = xx.reshape(M)
    z[:,1] = yy.reshape(M)

    # Compute pairwise distances.
    d = scipy.spatial.distance.pdist(z, 'euclidean')
    d = scipy.spatial.distance.squareform(d)

    # k-NN graph. Column 0 of the sorted rows is the node itself (distance 0),
    # hence the 1:k+1 slices.
    idx = np.argsort(d)[:,1:k+1]
    d.sort()
    d = d[:,1:k+1]

    # Weights: Gaussian kernel scaled by the mean distance to the k-th neighbour.
    sigma2 = np.mean(d[:,-1])**2
    d = np.exp(- d**2 / sigma2)

    # Weight matrix.
    I = np.arange(0, M).repeat(k)
    J = idx.reshape(M*k)
    V = d.reshape(M*k)
    return scipy.sparse.coo_matrix((V, (I, J)), shape=(M, M))

# Directed kNN weight matrix (COO format).
W = graph_grid()

# No self-connections.
W.setdiag(0)

# Non-directed graph: wherever the transposed entry is larger, replace the
# entry by it, i.e. keep the element-wise maximum of W and W.T.
bigger = W.T > W
W = W - W.multiply(bigger) + W.T.multiply(bigger)
del bigger
# The result must be symmetric.
assert np.abs(W - W.T).mean() < tol

# CSR sparse matrix format for efficient multiplications.
W = W.tocsr()
# Drop the explicitly stored zeros so that W.nnz counts actual edges.
W.eliminate_zeros()

print("{} > {} edges".format(W.nnz, M*k))

* $L^\text{unnormalized} = D - W$
* $L^\text{normalized} = I - D^{-1/2} W D^{-1/2}$

In [None]:
# Selects the normalized Laplacian (True) or the unnormalized one (False).
normalized_laplacian = True

def laplacian(W, normalized=True):
    """Return the Laplacian of the weight matrix.

    Parameters
    ----------
    W : scipy.sparse matrix, shape (n, n)
        Weight (adjacency) matrix of the graph.
    normalized : bool
        If True, return I - D^{-1/2} W D^{-1/2}; otherwise return D - W,
        where D is the diagonal degree matrix.

    Returns
    -------
    scipy.sparse matrix, shape (n, n)
        The graph Laplacian.
    """

    # Degree vector (column sums), flattened to 1D whatever type sum() returns.
    d = np.asarray(W.sum(axis=0)).ravel()

    # Laplacian matrix.
    if not normalized:
        D = scipy.sparse.diags(d, 0)
        return D - W

    # A node of degree 0 yields an infinite entry here, as it always did.
    D = scipy.sparse.diags(1 / np.sqrt(d), 0)
    # Size the identity from W itself rather than from a notebook global.
    I = scipy.sparse.identity(W.shape[0], dtype=D.dtype)
    # '@' is a matrix product for every sparse container type.
    return I - D @ W @ D

# Build the graph Laplacian and check that it is symmetric.
LL = laplacian(W, normalized_laplacian)
assert np.abs(LL - LL.T).mean() < tol

$L = U \Lambda U^T$ where $\Lambda$ is a diagonal matrix of eigenvalues and the columns of $U$ are the corresponding eigenvectors.
Compare the results of four algorithms.

In [None]:
def sort(lamb, U):
    """Order an eigendecomposition by increasing eigenvalue.

    Returns the sorted eigenvalues together with the columns of U permuted
    in the same way.
    """
    order = np.argsort(lamb)
    return lamb[order], U[:, order]

# 1. Dense general eigensolver. Used as the reference; its eigenvalues are
#    not returned in any particular order, hence the explicit sort.
lamb, U = np.linalg.eig(LL.toarray())
lamb, U = sort(lamb, U)

# 2. Dense symmetric eigensolver, compared to the reference without re-sorting.
#    Eigenvectors are compared in absolute value because their sign is arbitrary.
lamb_, U_ = np.linalg.eigh(LL.toarray())
np.testing.assert_allclose(lamb_, lamb, atol=tol)
np.testing.assert_allclose(np.abs(U_), np.abs(U), atol=tol)

# 3. Sparse general solver: only the M-2 eigenvalues of smallest magnitude
#    are requested, so the two largest reference values are left out.
lamb_, U_ = scipy.sparse.linalg.eigs(LL, k=M-2, which='SM')
lamb_, U_ = sort(lamb_, U_)
np.testing.assert_allclose(lamb[:-2], lamb_, atol=tol)
np.testing.assert_allclose(np.abs(U[:,:-2]), np.abs(U_), atol=tol)

# 4. Sparse symmetric solver: the M-1 eigenvalues of smallest magnitude.
lamb_, U_ = scipy.sparse.linalg.eigsh(LL, k=M-1, which='SM')
np.testing.assert_allclose(lamb[:-1], lamb_, atol=tol)
np.testing.assert_allclose(np.abs(U[:,:-1]), np.abs(U_), atol=tol)

# Only the reference decomposition (lamb, U) is kept.
del lamb_, U_

Upper-bound approximation of the spectrum.

* Computed by the Implicitly Restarted Lanczos Method (IRLM), which is a reduction of a variant of the Arnoldi iteration. It is faster than the Power method.
* Normalized graph Laplacian has a bounded spectrum $0 \leq \lambda \leq 2$.

In [None]:
# Largest-magnitude eigenvalue only, without computing the eigenvectors.
lmax = scipy.sparse.linalg.eigsh(LL, k=1, which='LM', return_eigenvectors=False)[0]
if normalized_laplacian:
    # The spectrum of the normalized Laplacian is bounded by 2.
    assert lmax <= 2
print('Spectrum: [{:1.2e}, {:1.2e}]'.format(lamb[0], lmax))
# The smallest eigenvalue must be 0 and the largest must agree with the
# full decomposition computed earlier.
np.testing.assert_allclose(lamb[0], 0, atol=tol)
np.testing.assert_allclose(lamb[-1], lmax, atol=tol)

### Ground truth graph filter

Linear mapping $f(x_i, c) = U C U^T x_i$, $C$ is the diagonal matrix $C = \operatorname{diag}(c)$, i.e. $c = C 1_M$.

* Parametrized low-pass filter coefficients $(c_{gt})_i = \operatorname{e}^{-t \lambda_i}$
* Random filter coefficients $c_{gt} \sim \mathcal{N}_M(0,1)$

In [None]:
# True: the filter is a function g of the eigenvalues.
# False: the filter coefficients are drawn at random.
parametrized = True

if parametrized:
    def g(x, t=.5):
        """Ground-truth filter kernel evaluated at the frequencies x.

        The kernel is the oscillating function sin(2 (x - 2)^2). The
        parameter t is unused by this kernel; it is kept so that the
        signature stays compatible with the heat kernel exp(-t x), which
        previously sat here as an unreachable second return statement.
        """
        return np.sin(2 * (x-2)**2)
    c_gt = g(lamb)
    # No monotonicity check here: c_gt is only strictly decreasing for a
    # low-pass kernel such as exp(-t x), not for the kernel above.
else:
    c_gt = np.random.normal(0, 1, M)

### Signals

* Random input signals $X \sim \mathcal{N}_{M \times N}(0,1)$
  * Low-pass signals ?
* Noisy target signals $y_i = f(x_i, c_{gt}) + \mathcal{N}_M(0,\epsilon)$

In [None]:
N = 200  # signals
eps = 0.1  # noise (standard deviation of the Gaussian noise added to the targets)

# Inputs: one random signal per column.
X = np.random.normal(0, 1, (M,N))
# Targets: inputs filtered in the spectral domain by c_gt, plus noise when eps > 0.
Y = U @ np.diag(c_gt) @ U.T @ X + (np.random.normal(0, eps, (M,N)) if eps > 0 else 0)

## Non-parametrized filter learning

### Loss function

* Loss function $L = \frac{1}{N} \sum_{i=1}^N \|f(x_i, c) - y_i\|_2^2 = \frac{1}{N} \|UCU^TX - Y\|_F^2$.
    * Spectral domain: $L = \frac{1}{N} \| C U^T X - U^T Y \|_F^2$.
    * Independent coefficients: $L = \frac{1}{N} \sum_{i=1}^M \| c_i (U^T X)_{i,\cdot} - (U^T Y)_{i,\cdot} \|_2^2$.
    * Convex and smooth w.r.t. $c$.
* Gradient:
    * Independent coefficients: $\nabla_{c_i} L = \frac{2}{N} ( c_i (U^T X)_{i,\cdot} - (U^T Y)_{i,\cdot} ) (X^T U)_{\cdot,i}$.
    * $\nabla_{c} L = \frac{2}{N} \left(U^T X \circ ( c \circ U^T X - U^T Y ) \right) 1_N$.
* Optimization $c^* = \operatorname{arg min}_{c} L(c)$

In [None]:
def L(c):
    """Mean squared reconstruction error of the filter with coefficients c.

    Filters the global signals X with U diag(c) U^T and returns the squared
    Frobenius distance to the global targets Y, divided by the number of
    signals (columns of X).
    """
    n_signals = X.shape[1]
    residual = U @ np.diag(c) @ U.T @ X - Y
    return np.linalg.norm(residual, 'fro')**2 / n_signals

print("L(c_gt) = {}".format(L(c_gt)))
# At the ground truth the residual is only the added noise, whose expected
# energy per signal is M * eps**2. Checked with a 2% relative tolerance.
np.testing.assert_allclose(L(c_gt), M * eps**2, 2e-2)

def dL(X, Y, c, variant=None):
    """Gradient of the loss L with respect to the filter coefficients c.

    Parameters
    ----------
    X, Y : ndarray, shape (M, N)
        Input and target signals (one signal per column), possibly a
        mini-batch of the full data set.
    c : ndarray, shape (M,)
        Filter coefficients at which the gradient is evaluated.
    variant : {None, 1, 2}
        Implementation to use. 1 is a matrix formulation, 2 an explicit
        loop over the coefficients, anything else the vectorized default.
        All variants compute the same gradient.

    Returns
    -------
    ndarray, shape (M,)
        The gradient 2/N * ((U^T X) o (c o U^T X - U^T Y)) 1_N.
    """
    M, N = X.shape
    # Signals expressed in the spectral (graph Fourier) domain.
    A = U.T @ X
    B = U.T @ Y
    # Speed: v3 >> v1 > v2.
    # Compare by value: identity tests against int literals are unreliable.
    if variant == 1:
        return 2 / N * np.diag(A @ (A.T @ np.diag(c) - B.T))
    elif variant == 2:
        dc = np.empty(M)
        for i in range(M):
            dc[i] = 2 / N * (c[i] * A[i,:] - B[i,:]) @ A.T[:,i]
        return dc
    else:
        # Speed: .sum(axis=1) is faster than multiplying by np.ones(N).
        return 2 / N * (A * (c[:,np.newaxis] * A - B)).sum(axis=1)

print("|dL(c_gt)| = {}".format(np.linalg.norm(dL(X, Y, c_gt))))
# The gradient should vanish at the global minimum. With noise, c_gt is not
# necessarily the optimum, so this is only checked in the noiseless case.
if eps <= 0:
    np.testing.assert_allclose(dL(X, Y, c_gt), 0, atol=tol)
# The three implementations of the gradient must agree.
np.testing.assert_allclose(dL(X, Y, c_gt), dL(X, Y, c_gt, 1))
np.testing.assert_allclose(dL(X, Y, c_gt), dL(X, Y, c_gt, 2))

### Optimization: optimality condition

* Only possible because $L$ is convex and smooth.
* Optimality condition $\nabla_c L = 0$ gives $(U^T X \circ U^T X) 1_N \circ c = (U^T X \circ U^T Y) 1_N$.

In [None]:
tstart = time.process_time()
# Signals in the spectral domain.
A = U.T @ X
B = U.T @ Y
# Closed-form solution of the optimality condition, one coefficient per row.
c_opt = (A * B).sum(axis=1) / (A * A).sum(axis=1)
print('Execution time: {:1.0f}ms'.format((time.process_time() - tstart) * 1000))

# The optimum cannot have a larger loss than the ground truth.
print("L(c_opt) = {}".format(L(c_opt)))
assert L(c_opt) < L(c_gt) + tol
# The gradient must vanish at the optimum.
print("|dL(c_opt)| = {}".format(np.linalg.norm(dL(X, Y, c_opt))))
assert np.linalg.norm(dL(X, Y, c_opt)) < np.linalg.norm(dL(X, Y, c_gt))
np.testing.assert_allclose(dL(X, Y, c_opt), 0, atol=tol)
# Without noise the ground truth is recovered exactly.
if eps <= 0:
    np.testing.assert_allclose(c_opt, c_gt, atol=tol)
    np.testing.assert_allclose(L(c_opt), L(c_gt), atol=tol)

### Optimization: stochastic (mini-batch) gradient descent

* Works also for $L$ which are non-smooth (with sub-gradient) or non-convex.
* Idea: descend the gradient of the loss function.
* Efficiency: compute the gradient $\nabla_c L$ with a sub-set (mini-batch) of the training data.
    * Extreme case: one sample at a time. Very inefficient.
* Update rule (gradient descent) $c^{n+1} = c^n - \lambda_n \nabla_c L$.
* Note: objective (loss on training set) and error (on validation set) are usually evaluated after each epoch. The algorithm is thus stopped after a maximum number of epochs rather than iterations.
* Hyper-parameters.
    * Learning rate (step size) $\lambda_n$. The bigger the batch size, the smaller the learning rate.
        * Tradeoff.
            * Small: progress is steady but slow.
            * Big: risks of oscillations or divergence.
        * There are tricks, e.g. vanishing step (like simulated annealing).
    * Size of the mini-batch.
        * We want the one that minimizes the *training time*.
        * Trade-off: should be limited by the available memory, somewhere around 100.
            * Larger is more stable, but computationally more expensive.
            * Smaller demands more accesses to memory, which is slow.
            * Larger exploits the parallelism of modern hardware architectures (SIMD on CPU, GPU).
        * Extreme cases:
            * $1$: stochastic gradient descent.
            * $N$: gradient descent.
    * Stopping criterion.
        * Convergence of the loss function $L$.
        * Convergence of the parameters $c$.
        * Maximum number of iterations.

In [None]:
def sgd(c0, L, dL, learning_rate=.1, batch_size=100, conv=1e-3, maxit=100, window=10):
    """Stochastic (mini-batch) gradient descent.

    Parameters
    ----------
    c0 : array_like
        Initial coefficients. The array is copied and never modified.
    L : callable
        Loss, called as L(c) after every update.
    dL : callable
        Gradient, called as dL(X_batch, Y_batch, c) on column subsets of
        the global signals X and Y.
    learning_rate : float
        Step size of the update c <- c - learning_rate * gradient.
    batch_size : int
        Number of signals used per update.
    conv : float
        Stop when the mean loss over the last `window` iterations improved
        by less than `conv` compared to the `window` iterations before.
    maxit : int
        Maximum number of updates.
    window : int
        Length of the two averaging windows of the convergence test.

    Returns
    -------
    c : ndarray
        The final coefficients.
    loss : list
        Loss before the first update followed by the loss after each update.
    """
    # Work on a private copy: the in-place update below must not alter the
    # caller's initial guess.
    c = np.array(c0, dtype=float)
    loss = [L(c)]
    n_samples = X.shape[1]
    indices = []

    def stop(loss):
        """Stop after convergence of the loss."""
        if len(loss) > maxit:
            return True
        if len(loss) >= 2 * window:
            avg1 = np.mean(loss[-window:])
            avg2 = np.mean(loss[-2*window:-window])
            return avg2 - avg1 < conv
        return False

    while not stop(loss):

        # Be sure to have used all the samples before using one a second time.
        if len(indices) < batch_size:
            new_indices = np.arange(n_samples)
            np.random.shuffle(new_indices)
            indices.extend(new_indices)
        idx = indices[:batch_size]
        del indices[:batch_size]

        c -= learning_rate * dL(X[:,idx], Y[:,idx], c)
        loss.append(L(c))

    return c, loss

### Results

Observations:
* Noise: why don't we find the same loss as the ground truth, but the same as linear programming ?
    * The gradient was incorrectly set to $\nabla_c L = \frac{2}{N} U^T X (X^T U c - Y^T U 1_M)$.
* More samples, e.g. $N=2000$: why don't we find the same loss as the linear program?
    * Learning rate too high.
* The spectral gap $\lambda_1$ is large for a random graph.
* Without noise, the recovered filter is exact.

In [None]:
def plots(c0, L_, dL_, params, conv, maxit, cs, T=None):
    """Run SGD for several hyper-parameter settings and plot the results.

    Two figures are produced: the loss as a function of the iteration, and
    the learned filter as a function of the eigenvalues lamb.

    Parameters
    ----------
    c0 : initial coefficients; a fresh copy is handed to every SGD run.
    L_, dL_ : loss and gradient callables forwarded to sgd.
    params : iterable of (learning_rate, batch_size) pairs, one run each.
    conv, maxit : stopping parameters forwarded to sgd.
    cs : list of str, Python expressions evaluated with eval() that yield
        reference filters of length M. They are drawn as baselines using
        the global L and dL (not L_ and dL_).
    T : optional basis; when given, the learned coefficients c are mapped
        to the filter T @ c before being plotted.
    """
    
    fig_conv = plt.figure(figsize=(15,5))
    fig_filt = plt.figure(figsize=(15,5))
    ax_conv = fig_conv.add_subplot(1,1,1)
    ax_filt = fig_filt.add_subplot(1,1,1)

    def plot(learning_rate, batch_size):
        """Run one SGD and add its loss curve and filter to the figures."""
        # Time the optimization only, not the plotting.
        tstart = time.process_time()
        (c, loss) = sgd(np.array(c0), L_, dL_, learning_rate, batch_size, conv, maxit)
        t = (time.process_time() - tstart) * 1000
        label = 'rate={}, size={}, L(c)={:1.2e}, |dL(c)|={:1.2e}, time={:1.0f}ms'.format(
            learning_rate, batch_size, L_(c), np.linalg.norm(dL_(X, Y, c)), t)
        ax_conv.plot(loss, label=label)
        ax_filt.plot(lamb, c if T is None else T @ c, '.-', label=label)

    for param in params:
        plot(param[0], param[1])

    # NOTE(review): eval() executes arbitrary code; only pass trusted
    # expressions in cs. The expressions are evaluated in this function's
    # scope, so a name such as T resolves to the argument T first.
    for c in cs:
        label = '{0}, L({0})={1:1.2e}, |dL({0})|={2:1.2e})'.format(c, L(eval(c)), np.linalg.norm(dL(X,Y,eval(c))))
        # Constant line at the reference loss.
        ax_conv.plot(L(eval(c)) * np.ones(maxit+1), label=label)
        ax_filt.plot(lamb, eval(c), '.-', label=label)

    ax_conv.set_title('Convergence, M={}, N={}, eps={}'.format(M, N, eps))
    ax_conv.set_xlabel('iteration n')
    ax_conv.set_ylabel('loss L(c^n)')
    ax_conv.legend(loc='best')
#    fig_conv.show()
    
    ax_filt.set_xlim(lamb[0], lamb[-1])
    ax_filt.set_title('Filters, M={}, N={}, eps={}'.format(M, N, eps))
    ax_filt.set_xlabel('frequency lamb')
    ax_filt.set_ylabel('filter coefficients c')
    ax_filt.legend(loc='best')
#    fig_filt.show()
    
# Hyper-parameters to compare, as [learning rate, mini-batch size] pairs.
params = []
params.append([0.2, 1])
params.append([0.2, 5])
params.append([0.2, 50])
params.append([0.2, 100])
params.append([0.6, 100])
# Random initial guess, one coefficient per eigenvalue.
c0 = np.random.uniform(0, 1, M)
# Convergence threshold 1e-3, at most 40 iterations; c_opt and c_gt are
# drawn as baselines.
plots(c0, L, dL, params, 1e-3, 40, ['c_opt', 'c_gt'])

## Filter learning: truncated Chebyshev expansion

* Use a $K$th order polynomial approximation of the filter.
* Fewer free parameters: $K \ll M$.
* Good approximation for smooth, i.e. localized, filters.

### Basis of Chebyshev polynomials

* Compute the Chebyshev basis $T$ of order $K$.
* This basis will allow us to construct and observe the filter from the inferred polynomial coefficients.
* The figure shows that we indeed generate the Chebyshev polynomials of the first kind.

In [None]:
# Number of Chebyshev polynomials generated and plotted below.
K = 5

def cheby_pol(K, x):
    """Return the Chebyshev basis of order K (composed of the
    first K polynomials) evaluated at x. Polynomials are generated
    by their recursive formulation.

    Parameters
    ----------
    K : int
        Number of polynomials T_0, ..., T_{K-1}. K = 0 yields an empty basis.
    x : array_like, 1D
        Evaluation points.

    Returns
    -------
    ndarray, shape (x.size, K)
        Column k holds T_k(x).
    """
    # Accept any array-like (e.g. a list), not only ndarrays.
    x = np.asarray(x)
    T = np.empty((x.size, K))
    # Guard the first column so that K = 0 returns an (x.size, 0) array.
    if K >= 1:
        T[:,0] = 1
    if K >= 2:
        T[:,1] = x
    # Recurrence T_k(x) = 2 x T_{k-1}(x) - T_{k-2}(x).
    for k in range(2, K):
        T[:,k] = 2 * x * T[:,k-1] - T[:,k-2]
    return T

# Plot the first K Chebyshev polynomials on their natural domain [-1, 1].
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(1,1,1)
x = np.linspace(-1,1,100)
T = cheby_pol(K, x)
# One curve per polynomial (column of T).
for k in range(K):
    ax.plot(x, T[:,k], label='T_{}'.format(k))
ax.set_title('Chebyshev polynomials of the first kind')
ax.set_xlabel('x')
ax.set_ylabel('T_n(x)')
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1.1)
ax.legend(loc='best')
plt.show()

### Ground-truth Chebyshev coefficients

Given the filter $g$ with a vector $c_{gt} \in \mathbb{R}^M$ of evaluations, find the Chebyshev coefficients $c_{cgt} \in \mathbb{R}^K$. Truncated Chebyshev series closely approximate the minimax polynomial, i.e. $c_{cgt} \approx \operatorname{arg min}_c \| c_{gt} - \sum_k c_k T_k \|_\infty$ where $T_k$ is the Chebyshev polynomial of order $k$. Given that the polynomials form an orthogonal basis for $L^2([-1,1],\frac{dy}{\sqrt{1-y^2}})$, the coefficients can be retrieved by two methods.

1. Analytical projection.
    * $c_k = \frac{2}{\pi} \int_0^\pi \cos(k\theta) g( \frac{\lambda_{max}}{2} (\cos(\theta) + 1)) d\theta$
    * Need the analytic function.
2. Numerical projection (discrete orthogonality condition).
    * $c_k = \frac{2}{K} \sum_j g(x_j) T_k(x_j)$ where the $x_j$ are the $K$ Chebyshev nodes, because the approximation error is null only at these points.
    * Needs function evaluations at the Chebyshev nodes, but only those. Far fewer points than least mean squares.

In our setting, the generative filter is the function to learn. We have however access to some evaluations of the filter (at the eigenvalues of the Laplacian) via convex optimization of the loss function $L$ (described above). From those, given the Chebyshev basis, we can retrieve the coefficients that minimize the reconstruction error of this filter.

Results:

* Playing with the order $K$ shows that the approximation converges to the filter $g$.
* The approximation constructed by minimizing the filter l2 reconstruction error is no longer a Chebyshev polynomial (there are errors at the Chebyshev nodes) but it provides a smaller loss $L$ (our final measure of quality). It however requires the full Chebyshev basis, which requires the eigenvalues of the Laplacian.

In [None]:
# Number of Chebyshev polynomials used for the approximation plotted below.
K = 10

def rescale(x, reverse=False):
    """Affine map between [-1, 1] and the spectral domain.

    By default x is mapped from [-1, 1] to [lamb[0], lamb[-1]]. With
    reverse=True the map goes from the spectral domain back to [-1, 1].
    """
    source = (-1, 1)
    target = (lamb[0], lamb[-1])
    if reverse:
        source, target = target, source
    # Normalize to [0, 1] first, then stretch to the target interval.
    unit = (x - source[0]) / (source[1] - source[0])
    return unit * (target[1] - target[0]) + target[0]
# Round trip: mapping the spectrum to [-1, 1] and back must be the identity.
np.testing.assert_allclose(lamb, rescale(rescale(lamb, True)))

def cheby_nodes(K):
    """Return the K Chebyshev nodes cos(pi (j + 1/2) / K), j = 0..K-1.

    The nodes lie in [-1, 1] and are returned in decreasing order.
    """
    j = np.arange(K)
    angles = np.pi * (j + 1/2) / K
    return np.cos(angles)
    
def cheby_coeff(K, f):
    """Compute the coefficients of the Chebyshev polynomial approximation.

    Parameters
    ----------
    K : int
        Number of coefficients (and of Chebyshev nodes used).
    f : callable
        Function on [-1, 1] to approximate; called once with the array of
        the K Chebyshev nodes.

    Returns
    -------
    ndarray, shape (K,)
        Coefficients c such that sum_k c[k] T_k approximates f.
    """
    # Coefficients from discrete orthogonality condition.
    # It can be done faster via the discrete cosine transform.
    x = cheby_nodes(K)
    T = cheby_pol(K, x)
    # f does not depend on k: evaluate it once instead of once per coefficient.
    fx = f(x)
    c = np.array([2 / K * np.sum(fx * T[:,k]) for k in range(K)])
    # Halve the constant term so that the series is a plain sum of c[k] T_k.
    c[0] /= 2
    return c

# Laplacian eigenvalues mapped to the Chebyshev domain [-1, 1].
# (A linspace grid used to be assigned to x first and was immediately
# overwritten; that dead store has been removed.)
x = rescale(lamb, True)
# The filter g expressed on [-1, 1] instead of the spectral domain.
f = lambda x: g(rescale(x))

fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(1,1,1)
# Evaluating f at the rescaled eigenvalues must give back the ground truth.
np.testing.assert_allclose(f(x), c_gt)
ax.plot(rescale(x), c_gt, '.-', label='target function, L={:1.2e}'.format(L(c_gt)))
ax.plot(rescale(cheby_nodes(K)), f(cheby_nodes(K)), '.', markersize=15, label='Chebyshev nodes')

# Approximation 1: coefficients from the discrete orthogonality condition.
c = cheby_coeff(K, f)
T = cheby_pol(K, x)
ax.plot(rescale(x), T @ c, '.-', label='discrete orthogonality condition, L={:1.2e}'.format(L(T @ c)))

# The error should be zero at the Chebyshev nodes.
np.testing.assert_allclose(f(cheby_nodes(K)), cheby_pol(K, cheby_nodes(K)) @ c)

# Approximation 2: least-squares fit of the basis to the filter evaluations.
c = np.linalg.lstsq(T, c_gt)[0]
ax.plot(rescale(x), T @ c, '.-', label='least mean square, L={:1.2e}'.format(L(T @ c)))

ax.set_title('Chebyshev approximation of order {}'.format(K))
ax.set_xlabel('Frequency')
ax.set_ylabel('Amplitude')
ax.legend(loc='best')
ax.set_xlim(lamb[0], lamb[-1])
plt.show()

Determine the polynomial order by filtering the data with Chebyshev approximations of order $1 \leq k \leq K$ and monitoring the reconstruction loss $L$.

* The result shows that the approximation does indeed converge.
* The approximation loss arrives at a plateau (the round-off error ?) given a high enough order.
* As anticipated on the figure above, the coefficients provided by least square reconstruction have smaller loss than the *correct* ones.

In [None]:
# Largest polynomial order tested.
K = 60
# Loss of the approximation for each order 1..K, for the coefficients
# obtained by discrete orthogonality (gt) and by least squares (opt).
loss_gt = np.empty((K))
loss_opt = np.empty((K))
for k in range(1, K+1):
    T = cheby_pol(k, x)
    c = cheby_coeff(k, f)
    loss_gt[k-1] = L(T @ c)
    c = np.linalg.lstsq(T, f(x))[0]
    loss_opt[k-1] = L(T @ c)
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(1,1,1)
ax.semilogy(range(1,K+1), loss_gt, label='L(T @ c_cgt)')
ax.semilogy(range(1,K+1), loss_opt, label='L(T @ c_copt)')
# Reference levels. No abscissa is given, so these constant lines are drawn
# over 0..K while the curves above use 1..K.
ax.semilogy(L(c_gt) * np.ones(K+1), label='L(c_gt)')
ax.semilogy(L(c_opt) * np.ones(K+1), label='L(c_opt)')
ax.set_title('Loss due to Chebyshev approximation')
ax.set_xlabel('Polynomial order')
ax.set_ylabel('Loss L')
ax.set_xlim(1, K)
ax.legend(loc='best')
plt.show()

Choose the polynomial order $K$ and compute the basis $T$ with its associated coefficients $c_{cgt}$.

In [None]:
# Polynomial order retained for the rest of the experiment.
K = 15
# Ground-truth Chebyshev coefficients and the basis evaluated at the
# rescaled eigenvalues x.
c_cgt = cheby_coeff(K, f)
T = cheby_pol(K, x)

# If the order is sufficient for a perfect (as good as c_gt) reconstruction (test only).
pol_order_is_sufficient = False

### Loss function

* Independent coefficients: $L = \frac{1}{N} \sum_{i=1}^M \| (Tc)_i (U^T X)_{i,\cdot} - (U^T Y)_{i,\cdot} \|_2^2$.
* $L = \frac{1}{N} \| \operatorname{diag}(Tc) U^T X - U^T Y \|_F^2$.
* $\nabla_{c} L = \frac{2}{N} \left(T^T \left( U^T X \circ ( Tc \circ U^T X - U^T Y ) \right) \right) 1_N$.

In [None]:
def Lc(c):
    """Loss of the filter parametrized by the Chebyshev coefficients c.

    The filter evaluations are T @ c, with T the global Chebyshev basis.
    Returns the squared Frobenius distance between the filtered global
    signals X and the global targets Y, divided by the number of signals.
    """
    n_signals = X.shape[1]
    filt = T @ c
    residual = U @ np.diag(filt) @ U.T @ X - Y
    return np.linalg.norm(residual, 'fro')**2 / n_signals

print("L(c_cgt) = {}".format(Lc(c_cgt)))
# Lc on the coefficients must equal L on the filter they generate.
np.testing.assert_allclose(Lc(c_cgt), L(T @ c_cgt), atol=tol)
# Only when the polynomial order is flagged as sufficient: the loss should
# then match the noise level and the loss of the exact filter.
if pol_order_is_sufficient:
    np.testing.assert_allclose(Lc(c_cgt), M * eps**2, 2e-2)
    np.testing.assert_allclose(Lc(c_cgt), L(c_gt), atol=tol)

def dLc(X, Y, c):
    """Gradient of the loss Lc with respect to the Chebyshev coefficients c.

    X and Y hold one signal per column (possibly a mini-batch). The
    gradient with respect to the filter evaluations T @ c is computed in
    the spectral domain and pulled back to the coefficients with T^T.
    """
    n_signals = X.shape[1]
    X_hat = U.T @ X
    Y_hat = U.T @ Y
    filt = (T @ c)[:, np.newaxis]
    per_frequency = (X_hat * (filt * X_hat - Y_hat)).sum(axis=1)
    return 2 / n_signals * T.T @ per_frequency

print('|dL(c_cgt)| = {}'.format(np.linalg.norm(dLc(X, Y, c_cgt))))
# The gradient should vanish at the global minimum. With noise, c_gt is not
# necessarily the optimum, so this is only checked without noise and when
# the polynomial order is flagged as sufficient.
if eps <= 0 and pol_order_is_sufficient:
    np.testing.assert_allclose(dLc(X, Y, c_cgt), 0, atol=tol)

### Optimization: optimality condition

* Given the signals $X$, $Y$ and the Chebyshev basis $T$, find the Chebyshev coefficients $c_{copt}$.
* Optimality condition $\nabla_c L = 0$ gives $(U^T X \circ U^T X) 1_N \circ Tc = (U^T X \circ U^T Y) 1_N$.
* Why do we not always reach the minimum, i.e. $\nabla_c L = 0$ ? E.g. for small polynomial orders, when we are not able to sufficiently approximate the filter.

In [None]:
# Unweighted least-squares fit of the basis T to the optimal filter c_opt.
# NOTE(review): this minimizes |T c - c_opt|, which is presumably not the
# minimizer of Lc in general -- consistent with the zero-gradient assertion
# being disabled below. To be confirmed.
c_copt = np.linalg.lstsq(T, c_opt)[0]
print("L(c_copt) = {}".format(Lc(c_copt)))
assert Lc(c_copt) < Lc(c_cgt) + tol
print('|dL(c_copt)| = {}'.format(np.linalg.norm(dLc(X, Y, c_copt))))
assert np.linalg.norm(dLc(X, Y, c_copt)) < np.linalg.norm(dLc(X, Y, c_cgt))
#np.testing.assert_allclose(dLc(X, Y, c_copt), 0, atol=tol)
# Without noise and with a sufficient order, both sets of coefficients coincide.
if eps <= 0 and pol_order_is_sufficient:
    np.testing.assert_allclose(c_copt, c_cgt, atol=tol)
    np.testing.assert_allclose(Lc(c_copt), Lc(c_cgt), atol=tol)

* Fast decay of the coefficients: good for approximation.
* The *ground truth* and *optimal* coefficients are similar, given a sufficiently high order $K$. Otherwise the *optimal* ones are better.

In [None]:
# Magnitude of the Chebyshev coefficients on a logarithmic scale.
fig = plt.figure(figsize=(15,5))
ax = fig.add_subplot(1,1,1)
ax.semilogy(abs(c_cgt), '.', label='c_cgt')
ax.semilogy(abs(c_copt), 'x', label='c_copt')
ax.set_title('Chebyshev coefficients')
ax.legend(loc='best')
plt.show()

### Results

* Why does |dL(c)| not converge to zero? The gradient should vanish at the optimum.
* Convergence seems harder than before.
* The coefficients c_copt always provide a smaller loss than c_cgt.

In [None]:
# A single run: learning rate 0.005, mini-batches of 100 signals.
params = [[0.005, 100]]
# Random initial guess, one value per Chebyshev coefficient.
c0 = np.random.uniform(0, 1, K)
# Convergence threshold 1e-3, at most 80 iterations. T maps the learned
# coefficients to filter evaluations for the plot.
plots(c0, Lc, dLc, params, 1e-3, 80, ['T @ c_cgt', 'T @ c_copt', 'c_opt'], T)