[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pingpong402/networks101/blob/main/network_quadratic.ipynb)[![Open In Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pingpong402/networks101/HEAD?filepath=network_quadratic.ipynb)

In [None]:
import itertools
import copy
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output, display

# close every currently open matplotlib figure before this cell creates a new one
plt.close('all')

## training data
# Each array has a single row, i.e. shape (1, 6): one (x, y) sample per column.
X = np.atleast_2d([1.6, -2.3, -1.1, 0, 3.5, 4.3])
Y = np.atleast_2d([-5.3, -41.8, -27.1, -14.3, -3, -4.3])

def zeros_like_dict(d):
  """Return a dict with the same (possibly nested) keys as d and zero-filled leaves.

  Values that are themselves dicts are mirrored recursively; every other value
  is replaced by np.zeros_like(value). The input dict is not modified.
  """
  zeroed = {}
  for key, value in d.items():
    if isinstance(value, dict):
      zeroed[key] = zeros_like_dict(value)
    else:
      zeroed[key] = np.zeros_like(value)
  return zeroed

def logistic(x):
  """Evaluate the logistic sigmoid 1 / (1 + exp(-x)) elementwise.

  x: scalar or array-like.
  Returns a numpy scalar for scalar input, otherwise an array with the shape of x.

  Only exp(-|x|) is ever evaluated, so large negative inputs no longer overflow
  inside np.exp (the direct formula computes exp(-x), which overflows there).
  """
  x = np.asarray(x)
  # exp(-|x|) lies in [0, 1] and therefore cannot overflow
  e = np.exp(-np.abs(x))
  # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same function
  z = np.where(x >= 0, 1.0 / (1 + e), e / (1 + e))
  # indexing with () turns a 0-d result back into a scalar and leaves arrays unchanged
  return z[()]

def grad_logistic(x):
  """Return the derivative of the logistic function at x.

  Uses the identity s'(x) = s(x) * (1 - s(x)) with s = logistic, so only one
  evaluation of logistic(x) is needed.
  """
  sigma = logistic(x)
  complement = 1 - sigma
  return sigma * complement

def compute_linear_transform(x, W, b):
  """Apply the affine map W @ x + b to the columns of x.

  x: input matrix, one sample per column.
  W: weight matrix multiplied from the left.
  b: bias; np.atleast_2d(b).T turns a 1-D bias into a column vector so that it
    broadcasts across all columns of W @ x.
  """
  bias_column = np.atleast_2d(b).T
  return np.matmul(W, x) + bias_column

def compute_hidden_layer(x, p):
  """Compute the hidden-layer output for input x.

  Applies the affine transform with p['W']['hidden'] and p['b']['hidden'],
  followed by the logistic nonlinearity.
  """
  pre_activation = compute_linear_transform(x, p['W']['hidden'], p['b']['hidden'])
  return logistic(pre_activation)

def compute_output_layer(x, p):
  """Compute the output-layer response for input x.

  The output layer is purely affine (no nonlinearity): it applies
  p['W']['output'] and p['b']['output'] via compute_linear_transform.
  """
  return compute_linear_transform(x, p['W']['output'], p['b']['output'])

def forward(x, p):
  """Run the network on x: hidden layer followed by the output layer.

  x: input array; a 1-D vector of shape (N,) is treated as a single sample and
    reshaped to a column vector of shape (N, 1).
  p: parameter dict read by compute_hidden_layer and compute_output_layer.
  Returns the result of compute_output_layer.
  """
  samples = x if x.ndim >= 2 else np.atleast_2d(x).T
  hidden_output = compute_hidden_layer(samples, p)
  return compute_output_layer(hidden_output, p)

def compute_loss(x, y, p):
  """Return half the sum of squared prediction errors divided by the sample count.

  x, y: inputs and targets; 1-D vectors are reshaped to column vectors first.
  p: parameter dict passed to forward().
  The number of samples N is the number of columns of x.
  """
  x = x if x.ndim >= 2 else np.atleast_2d(x).T
  y = y if y.ndim >= 2 else np.atleast_2d(y).T
  num_samples = x.shape[1]
  residual = forward(x, p) - y
  squared_error = residual**2
  return 0.5 * np.sum(squared_error) / num_samples

def compute_grad_loss(x, y, p):
  """Return the gradient of the loss with respect to all parameters (backpropagation).

  x, y: inputs and targets; 1-D vectors are reshaped to column vectors first.
  p: parameter dict with entries p['W'][layer] and p['b'][layer] for the layers
    'hidden' and 'output'.
  Returns a dict with the same nested keys as p holding the gradient blocks.
  """
  x = x if x.ndim >= 2 else np.atleast_2d(x).T
  y = y if y.ndim >= 2 else np.atleast_2d(y).T
  grad = zeros_like_dict(p)
  num_samples = x.shape[1]

  # forward pass, keeping the hidden pre-activation and output for the backward pass
  pre_activation = compute_linear_transform(x, p['W']['hidden'], p['b']['hidden'])
  hidden_output = logistic(pre_activation)
  residual = compute_output_layer(hidden_output, p) - y
  scale = 1/num_samples

  # output layer: with e and h denoting matching columns (single samples) of
  # residual and hidden_output, the sum of the outer products e h^T over all
  # samples equals residual @ hidden_output.T
  grad['W']['output'] = scale * residual @ hidden_output.T
  grad['b']['output'] = scale * np.sum(residual, axis=1)

  # backpropagate the error through the output weights and the logistic nonlinearity
  hidden_delta = (p['W']['output'].T @ residual) * grad_logistic(pre_activation)

  grad['W']['hidden'] = scale * hidden_delta @ x.T
  grad['b']['hidden'] = scale * np.sum(hidden_delta, axis=1)
  return grad

def update_params(p, grad, tau):
  """Perform one gradient-descent step on the parameters.

  p: two-level dict of parameter arrays; the arrays are modified in place.
  grad: dict with the same two-level keys holding the gradient blocks.
  tau: step size.
  Returns p (the same dict object that was passed in).
  """
  for group in p:
    for name in p[group]:
      block = p[group][name]
      step = tau * grad[group][name]
      # in-place subtraction: the array stored in p is updated directly
      block -= step
  return p

def p2vec(p):
  """Flatten all parameter arrays of the two-level dict p into one 1-D vector.

  The blocks are concatenated in dict iteration order, each one flattened with
  ravel(). An empty dict yields an empty float array.
  """
  # the leading empty float array fixes the result for an empty dict and takes
  # part in the dtype promotion of the concatenated blocks
  pieces = [np.array([])]
  for blocks in p.values():
    pieces.extend(block.ravel() for block in blocks.values())
  return np.concatenate(pieces)

def update_visualization(axes, visualization_data, redraw=False):
  """Redraw the loss curve and the current fit of the network.

  axes: indexable pair of matplotlib axes; axes[0] shows the loss history on a
    logarithmic y-axis, axes[1] the training data and the learned function.
  visualization_data: tuple (loss, p) of the loss history and the parameter
    dict passed to forward(). loss must be non-empty because its last entry is
    shown in the title.
  redraw: if True, clear the notebook output cell and display the figure again.

  The training samples are read from the module-level arrays X and Y.
  """
  loss, p = visualization_data
  axes[0].clear()
  axes[1].clear()
  axes[0].semilogy(loss)
  axes[0].set_xlabel('Iteration')
  axes[0].set_ylabel('Loss')
  axes[0].set_title(f'loss: {loss[-1]:.4f}')

  # evaluate the network on a dense grid to draw the learned function
  x_all = np.atleast_2d(np.arange(-5, 5, 0.1))
  prediction = np.squeeze(forward(x_all, p))
  axes[1].plot(np.squeeze(X),np.squeeze(Y),'rx', label='Training data')
  axes[1].plot(np.squeeze(x_all), prediction, label='Learned function')
  axes[1].set_xlim([-5,5])
  axes[1].set_ylim([-50,50])
  axes[1].legend()
  if redraw:
    clear_output(wait=True)     # reset output cell
    # show the figure that owns the given axes rather than a module-level `fig`,
    # so the function also works for axes created elsewhere
    display(axes[0].figure)

## number of hidden units
N_hidden = 2


## initialize parameters (Glorot)
# Glorot uniform limit sqrt(6 / (n_in + n_out)). The hidden layer (n_in = 1,
# n_out = N_hidden) and the output layer (n_in = N_hidden, n_out = 1) have the
# same n_in + n_out, so a single limit serves both.
limit = np.sqrt(6 / (1 + N_hidden))
# the weights are drawn in the order hidden, output; the biases start at zero
p = {
  'W': {
    'hidden': np.random.uniform(-limit, limit, size=(N_hidden, 1)),
    'output': np.random.uniform(-limit, limit, size=(1, N_hidden)),
  },
  'b': {
    'hidden': np.zeros(N_hidden),
    'output': np.zeros(1),
  },
}
p0 = copy.deepcopy(p)     # independent copy of the initial parameters

num_iter = 2000
tau = 0.02     # static descent stepsize

loss = []

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for k in range(num_iter):
  # one gradient-descent step on the full training set, then record the new loss
  grad = compute_grad_loss(X, Y, p)
  p = update_params(p, grad, tau)
  loss.append(compute_loss(X, Y, p))
  if k % 100:
    continue     # report progress only every 100th iteration
  print(f'iter {k:04d}, loss {loss[-1]:.4f}')

  ## uncomment next line for interactive visualization
  #update_visualization(axes, (loss, p), True)


print(f'Network with {p2vec(p).size} parameters, final loss {loss[-1]:.4f}')
update_visualization(axes, (loss, p))
