In [1]:
#| echo: false
#| warning: false
import matplotlib
from matplotlib  import pyplot as plt

import cycler

# Five-colour palette installed as the default colour cycle for plotted series.
colors = [
    "#91CCCC",
    "#FF8FA9",
    "#CC91BC",
    "#3F9999",
    "#A5FFB8",
]
matplotlib.pyplot.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors)


def set_square_figures():
    """Make 2.0 x 2.0 the default ``figure.figsize`` for subsequently created figures."""
    matplotlib.pyplot.rcParams.update({"figure.figsize": (2.0, 2.0)})


def set_rectangular_figures():
    """Make 5.0 x 2.0 the default ``figure.figsize`` for subsequently created figures."""
    matplotlib.pyplot.rcParams.update({"figure.figsize": (5.0, 2.0)})


# Rectangular figures are the default; individual slides switch to square ones.
set_rectangular_figures()

# Remaining global plot defaults: resolution, tight bounding box when saving,
# serif text, and no top/right axes spines.
matplotlib.pyplot.rcParams.update(
    {
        "figure.dpi": 350,
        "savefig.bbox": "tight",
        "font.family": "serif",
        "axes.spines.right": False,
        "axes.spines.top": False,
    }
)


def squareFig():
    """Create a new figure with ``figsize=(2, 2)`` and ``dpi=350`` and return its current axes."""
    fig = matplotlib.pyplot.figure(figsize=(2, 2), dpi=350)
    return fig.gca()


def add_diagonal_line():
    """Plot a dashed black line from (0, 0) to (s, s) via pyplot.

    ``s`` is the smaller of the second values returned by ``xlim()`` and
    ``ylim()``, so the line stops at whichever axis limit is reached first.
    """
    _, x_end = matplotlib.pyplot.xlim()
    _, y_end = matplotlib.pyplot.ylim()
    end = min(x_end, y_end)
    diagonal = [0, end]
    matplotlib.pyplot.plot(diagonal, diagonal, color="black", linestyle="--")


import pandas

# Show at most six rows whenever a DataFrame is displayed.
pandas.set_option("display.max_rows", 6)

import numpy

# Print arrays with two decimals and fix the seed of NumPy's global generator.
numpy.set_printoptions(precision=2)
numpy.random.seed(123)

import tensorflow

# Fix TensorFlow's global seed and make no GPU devices visible to it.
tensorflow.random.set_seed(1)
tensorflow.config.set_visible_devices([], "GPU")


def skip_empty(line):
    """Print ``line`` stripped of surrounding whitespace, unless nothing is left."""
    stripped = line.strip()
    if stripped == "":
        return
    print(stripped)

2024-09-05 16:26:02.634188: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/oneapi/tbb/2021.12/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/2021.12/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.12/lib:/opt/intel/oneapi/mkl/2024.1/lib:/opt/intel/oneapi/dpl/2022.5/lib:/opt/intel/oneapi/debugger/2024.1/opt/debugger/lib:/opt/intel/oneapi/compiler/2024.1/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/2024.1/opt/compiler/lib:/opt/intel/oneapi/compiler/2024.1/lib:/home/unimelb.edu.au/lcampoli/Downloads/tcl8.6.12-src/install/lib:/home/unimelb.edu.au/lcampoli/OpenFOAM/ThirdParty-7/platforms/linux64Gcc/gperftools-svn/lib:/home/unimelb.edu.au/lcampoli/OpenFOAM/lcampoli-7/platforms/linux64GccDPInt32Opt/lib:/home/unimelb.edu.au/lcampoli/OpenFOAM/site/7/platforms/linux64GccDPInt32Opt/lib:/home/unimelb.edu

AttributeError: module 'numpy' has no attribute 'typeDict'

::: {.content-visible unless-format="revealjs"}


In [None]:
#| code-fold: true
#| code-summary: Show the package imports
import random
import numpy as np
import pandas as pd

:::


# Dense Layers in Matrices {data-visibility="uncounted"}

## Logistic regression


In [None]:
#| echo: false
set_square_figures()

::: columns
::: column

Observations: $\mathbf{x}_{i,\bullet} \in \mathbb{R}^{2}$.

Target: $y_i \in \{0, 1\}$.

Predict: $\hat{y}_i = \mathbb{P}(Y_i = 1)$.

<br>

__The model__

For $\mathbf{x}_{i,\bullet} = (x_{i,1}, x_{i,2})$:
$$
z_i = x_{i,1} w_1 + x_{i,2} w_2 + b
$$

$$
\hat{y}_i = \sigma(z_i) = \frac{1}{1 + \mathrm{e}^{-z_i}} .
$$

:::
::: column


In [None]:
import sympy
#sympy.plot("1/(1 + exp(-z))");

:::
:::


In [None]:
#| echo: false
set_rectangular_figures()

## Multiple observations


In [None]:
data = pd.DataFrame({"x_1": [1, 3, 5], "x_2": [2, 4, 6], "y": [0, 1, 1]})
data

Let $w_1 = 1$, $w_2 = 2$ and $b = -10$.


In [None]:
w_1 = 1; w_2 = 2; b = -10
data["x_1"] * w_1 + data["x_2"] * w_2 + b 

## Matrix notation

::: columns
::: column
Have $\mathbf{X} \in \mathbb{R}^{3 \times 2}$.


In [None]:
X_df = data[["x_1", "x_2"]]
X = X_df.to_numpy()
X

:::
::: column
Let $\mathbf{w} = (w_1, w_2)^\top \in \mathbb{R}^{2 \times 1}$.


In [None]:
w = np.array([[1], [2]])
w

:::
:::

$$
\mathbf{z} = \mathbf{X} \mathbf{w} + b , \quad \mathbf{a} = \sigma(\mathbf{z})
$$

::: columns
::: column

In [None]:
# z = Xw + b for all observations at once.
z = X @ w + b
z

:::
::: column

In [None]:
1 / (1 + np.exp(-z))

:::
:::

## Using a softmax output

::: columns
::: column
Observations: $\mathbf{x}_{i,\bullet} \in \mathbb{R}^{2}$.
Predict: $\hat{y}_{i,j} = \mathbb{P}(Y_i = j)$.
:::
::: column
Target: $\mathbf{y}_{i,\bullet} \in \{(1, 0), (0, 1)\}$.
:::
:::

__The model__: For $\mathbf{x}_{i,\bullet} = (x_{i,1}, x_{i,2})$
$$
\begin{aligned}
z_{i,1} &= x_{i,1} w_{1,1} + x_{i,2} w_{2,1} + b_1 , \\
z_{i,2} &= x_{i,1} w_{1,2} + x_{i,2} w_{2,2} + b_2 .
\end{aligned}
$$

$$
\begin{aligned}
\hat{y}_{i,1} &= \text{Softmax}_1(\mathbf{z}_i) = \frac{\mathrm{e}^{z_{i,1}}}{\mathrm{e}^{z_{i,1}} + \mathrm{e}^{z_{i,2}}} , \\
\hat{y}_{i,2} &= \text{Softmax}_2(\mathbf{z}_i) = \frac{\mathrm{e}^{z_{i,2}}}{\mathrm{e}^{z_{i,1}} + \mathrm{e}^{z_{i,2}}} .
\end{aligned}
$$

## Multiple observations

::: columns
::: column

In [None]:
#| echo: false
# Same two features as the logistic-regression example; the target is now
# split across the two indicator columns y_1 and y_2.
data = pd.DataFrame(
    {
        "x_1": [1, 3, 5],
        "x_2": [2, 4, 6],
        "y_1": [1, 0, 0],
        "y_2": [0, 1, 1],
    }
)

In [None]:
data

:::
::: column
Choose:

$w_{1,1} = 1$, $w_{2,1} = 2$,

$w_{1,2} = 3$, $w_{2,2} = 4$, and

$b_1 = -10$, $b_2 = -20$.

:::
:::


In [None]:
w_11 = 1; w_21 = 2; b_1 = -10
w_12 = 3; w_22 = 4; b_2 = -20
data["x_1"] * w_11 + data["x_2"] * w_21 + b_1

## Matrix notation

::: columns
::: column
Have $\mathbf{X} \in \mathbb{R}^{3 \times 2}$.


In [None]:
X

:::
::: column
$\mathbf{W}\in \mathbb{R}^{2\times2}$, $\mathbf{b}\in \mathbb{R}^{2}$


In [None]:
W = np.array([[1, 3], [2, 4]])
b = np.array([-10, -20])
display(W); b

:::
:::

$$
  \mathbf{Z} = \mathbf{X} \mathbf{W} + \mathbf{b} , \quad \mathbf{A} = \text{Softmax}(\mathbf{Z}) .
$$

::: columns
::: column

In [None]:
Z = X @ W + b
Z

:::
::: column

In [None]:
np.exp(Z) / np.sum(np.exp(Z),
  axis=1, keepdims=True)

:::
:::



# Optimisation {data-visibility="uncounted"}

## Gradient-based learning



```{=html}
<div style="font-size: 0px;">
  <py-config>packages = ["matplotlib"]</py-config>
  </div>
<div>
  <!-- Source for slider with current value shown: https://stackoverflow.com/a/18936328 -->
  Make a guess: <input type="range" min="1" max="100" value="50" class="slider" id="new_guess" oninput="this.nextElementSibling.value = this.value">
  <output>50</output><br>
  Show derivatives: <input type="checkbox" id="derivs">
  Reveal function: <input type="checkbox" id="reveal">
</div>
<div id="mpl" style="text-align: center;"></div>
<py-script output="mpl" src="minimise-with-gradients.py" />
```



## Gradient descent pitfalls

![Potential problems with gradient descent.](Geron-mls2_0406.png)

::: footer
Source: Aurélien Géron (2019), _Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow_, 2nd Edition, Figure 4-6.
:::

## Go over all the training data

<br>

Called _batch gradient descent_.

<br>

```python
for i in range(num_epochs):
    gradient = evaluate_gradient(loss_function, data, weights)
    weights = weights - learning_rate * gradient
```

## Pick a random training example

<br>

Called _stochastic gradient descent_.

<br>

```python
for i in range(num_epochs):
    rnd.shuffle(data)
    for example in data:
        gradient = evaluate_gradient(loss_function, example, weights)
        weights = weights - learning_rate * gradient
```

## Take a group of training examples

<br>

Called _mini-batch gradient descent_.

<br>

```python
for i in range(num_epochs):
    rnd.shuffle(data)
    for b in range(num_batches):
        batch = data[b * batch_size : (b + 1) * batch_size]
        gradient = evaluate_gradient(loss_function, batch, weights)
        weights = weights - learning_rate * gradient
```

## Mini-batch gradient descent

::: columns
::: column

Why?

1. Because we have to (data is too big)
2. Because it is faster (lots of quick noisy steps > a few slow super accurate steps)
3. The noise helps us jump out of local minima

:::
::: column
![Example of jumping out of a local minimum.](Geron-mls2_0406.png)
:::
:::

::: footer
Source: Aurélien Géron (2019), _Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow_, 2nd Edition, Figure 4-6.
:::

## Learning rates

::: columns
::: column

![The learning rate is too small](Geron-mls2_0404.png)
:::
::: column
![The learning rate is too large](Geron-mls2_0405.png)
:::
:::

::: footer
Source: Aurélien Géron (2019), _Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow_, 2nd Edition, Figures 4-4 and 4-5.
:::

## Learning rates #2

![Changing the learning rates for a robot arm.](matt-henderson-learning-rates-animation.mov){width=60%}

::: {.content-visible unless-format="revealjs"}

> "a nice way to see how the learning rate affects Stochastic Gradient Descent.
> we can use SGD to control a robot arm - minimizing the distance to the target as a function of the angles θᵢ. Too low a learning rate gives slow inefficient learning, too high and we see instability"

:::

::: footer
Source: Matt Henderson (2021), [Twitter post](https://twitter.com/matthen2/status/1520427516997025792)
:::

## Learning rate schedule

![Learning curves for various learning rates η](Geron-mls2_1108.png)

In training the learning rate may be tweaked manually.

::: footer
Source: Aurélien Géron (2019), _Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow_, 2nd Edition, Figure 11-8.
:::

## We need non-zero derivatives {.smaller}

This is why we can't use accuracy as the loss function for classification.

Also why we can have the _dead ReLU_ problem.

::: {.content-hidden unless-format="revealjs"}


{{< video https://www.youtube.com/embed/KpKog-L9veg width="100%" height="80%" >}}




:::
::: {.content-visible unless-format="revealjs"}


{{< video https://www.youtube.com/embed/KpKog-L9veg >}}




:::

# Loss and derivatives {data-visibility="uncounted"}

## Example: linear regression

$$
\hat{y}(x) = w x + b
$$

For some observation $\{ x_i, y_i \}$, the (MSE) loss is

$$ 
\text{Loss}_i = (\hat{y}(x_i) - y_i)^2
$$

For a batch of the first $n$ observations the loss is

$$ 
\text{Loss}_{1:n} = \frac{1}{n} \sum_{i=1}^n (\hat{y}(x_i) - y_i)^2
$$

## Derivatives

Since $\hat{y}(x) = w x + b$,

$$
\frac{\partial \hat{y}(x)}{\partial w} = x \text{ and }
\frac{\partial \hat{y}(x)}{\partial b} = 1 .
$$

As $\text{Loss}_i = (\hat{y}(x_i) - y_i)^2$, we know
$$
\frac{\partial \text{Loss}_i}{\partial \hat{y}(x_i) } = 2 (\hat{y}(x_i) - y_i) .
$$

## Chain rule

$$
\frac{\partial \text{Loss}_i}{\partial \hat{y}(x_i) } = 2 (\hat{y}(x_i) - y_i), \,\,
\frac{\partial \hat{y}(x)}{\partial w} = x , \, \text{ and } \,
\frac{\partial \hat{y}(x)}{\partial b} = 1 .
$$

Putting this together, we have

$$
\frac{\partial \text{Loss}_i}{\partial w}
= \frac{\partial \text{Loss}_i}{\partial \hat{y}(x_i) }
  \times \frac{\partial \hat{y}(x_i)}{\partial w}
= 2 (\hat{y}(x_i) - y_i) \, x_i 
$$

and
$$
\frac{\partial \text{Loss}_i}{\partial b}
= \frac{\partial \text{Loss}_i}{\partial \hat{y}(x_i) }
  \times \frac{\partial \hat{y}(x_i)}{\partial b}
= 2 (\hat{y}(x_i) - y_i) .
$$

## Stochastic gradient descent (SGD)

Start with $\boldsymbol{\theta}_0 = (w, b)^\top = (0, 0)^\top$.

Randomly pick $i=5$, say $x_i = 5$ and $y_i = 5$.

::: fragment
$$
\hat{y}(x_i) = 0 \times 5 + 0 = 0 \Rightarrow \text{Loss}_i = (0 - 5)^2 = 25.
$$
:::
::: fragment
The partial derivatives are
$$
\begin{aligned}
\frac{\partial \text{Loss}_i}{\partial w} 
&= 2 (\hat{y}(x_i) - y_i) \, x_i = 2 \cdot (0 - 5) \cdot 5 = -50, \text{ and} \\
\frac{\partial \text{Loss}_i}{\partial b}
&= 2 (0 - 5) = - 10.
\end{aligned}
$$
The gradient is $\nabla \text{Loss}_i = (-50, -10)^\top$.
:::

## SGD, first iteration

Start with $\boldsymbol{\theta}_0 = (w, b)^\top = (0, 0)^\top$.

Randomly pick $i=5$, say $x_i = 5$ and $y_i = 5$.

The gradient is $\nabla \text{Loss}_i = (-50, -10)^\top$.

Use learning rate $\eta = 0.01$ to update 
$$
\begin{aligned}
\boldsymbol{\theta}_1
&= \boldsymbol{\theta}_0 - \eta \nabla \text{Loss}_i \\
&= \begin{pmatrix} 0 \\ 0 \end{pmatrix} - 0.01 \begin{pmatrix} -50 \\ -10 \end{pmatrix} \\
&= \begin{pmatrix} 0 \\ 0 \end{pmatrix} + \begin{pmatrix} 0.5 \\ 0.1 \end{pmatrix} = \begin{pmatrix} 0.5 \\ 0.1 \end{pmatrix}.
\end{aligned}
$$

## SGD, second iteration

Start with $\boldsymbol{\theta}_1 = (w, b)^\top = (0.5, 0.1)^\top$.

Randomly pick $i=9$, say $x_i = 9$ and $y_i = 17$.

The gradient is $\nabla \text{Loss}_i = (-223.2, -24.8)^\top$.

Use learning rate $\eta = 0.01$ to update 
$$
\begin{aligned}
\boldsymbol{\theta}_2
&= \boldsymbol{\theta}_1 - \eta \nabla \text{Loss}_i \\
&= \begin{pmatrix} 0.5 \\ 0.1 \end{pmatrix} - 0.01 \begin{pmatrix} -223.2 \\ -24.8 \end{pmatrix} \\
&= \begin{pmatrix} 0.5 \\ 0.1 \end{pmatrix} + \begin{pmatrix} 2.232 \\ 0.248 \end{pmatrix} = \begin{pmatrix} 2.732 \\ 0.348 \end{pmatrix}.
\end{aligned}
$$

## Batch gradient descent (BGD) {.smaller}

For the first $n$ observations 
$\text{Loss}_{1:n} = \frac{1}{n} \sum_{i=1}^n \text{Loss}_i$
so

$$
\begin{aligned}
\frac{\partial \text{Loss}_{1:n}}{\partial w}
&= \frac{1}{n} \sum_{i=1}^n \frac{\partial \text{Loss}_{i}}{\partial w}
= \frac{1}{n} \sum_{i=1}^n \frac{\partial \text{Loss}_{i}}{\partial \hat{y}(x_i)} \frac{\partial \hat{y}(x_i)}{\partial w} \\
&= \frac{1}{n} \sum_{i=1}^n 2 (\hat{y}(x_i) - y_i) \, x_i .
\end{aligned}
$$

$$
\begin{aligned}
\frac{\partial \text{Loss}_{1:n}}{\partial b}
&= \frac{1}{n} \sum_{i=1}^n \frac{\partial \text{Loss}_{i}}{\partial b}
= \frac{1}{n} \sum_{i=1}^n \frac{\partial \text{Loss}_{i}}{\partial \hat{y}(x_i)} \frac{\partial \hat{y}(x_i)}{\partial b} \\
&= \frac{1}{n} \sum_{i=1}^n 2 (\hat{y}(x_i) - y_i) .
\end{aligned}
$$

## BGD, first iteration ($\boldsymbol{\theta}_0 = \boldsymbol{0}$) {.smaller}


In [None]:
#| echo: false
# Three synthetic observations close to y = 2x - 1, with small Gaussian noise.
numpy.random.seed(111)
n = 3
x = numpy.arange(1, n + 1)
noise = 0.01 * numpy.random.randn(n)
y = 2 * x - 1 + noise

# Predictions y_hat = w * x + b at the starting parameters (w, b) = (0, 0).
theta_0 = numpy.array([0, 0])
slope, intercept = theta_0
yhat = slope * x + intercept

# Per-observation squared error and its partial derivatives w.r.t. w and b.
residual = yhat - y
loss = residual**2
dLossdw = 2 * residual * x
dLossdb = 2 * residual

df = pandas.DataFrame(
    {
        "x": x,
        "y": y,
        "y_hat": yhat,
        "loss": loss,
        "dL/dw": dLossdw,
        "dL/db": dLossdb,
    }
)

In [None]:
#| echo: false
df.round(2)

So $\nabla \text{Loss}_{1:3}$ is

In [None]:
nabla = np.array([df["dL/dw"].mean(), df["dL/db"].mean()])
nabla 

so with $\eta = 0.1$ then $\boldsymbol{\theta}_1$ becomes

In [None]:
theta_1 = theta_0 - 0.1 * nabla
theta_1

## BGD, second iteration {.smaller}


In [None]:
#| echo: false
# Predictions y_hat = w * x + b at the updated parameters theta_1 = (w, b).
slope, intercept = theta_1
yhat = slope * x + intercept

# Per-observation squared error and its partial derivatives w.r.t. w and b.
residual = yhat - y
loss = residual**2
dLossdw = 2 * residual * x
dLossdb = 2 * residual

df = pandas.DataFrame(
    {
        "x": x,
        "y": y,
        "y_hat": yhat,
        "loss": loss,
        "dL/dw": dLossdw,
        "dL/db": dLossdb,
    }
)

In [None]:
#| echo: false
df.round(2)

So $\nabla \text{Loss}_{1:3}$ is

In [None]:
nabla = np.array([df["dL/dw"].mean(), df["dL/db"].mean()])
nabla 

so with $\eta = 0.1$ then $\boldsymbol{\theta}_2$ becomes

In [None]:
theta_2 = theta_1 - 0.1 * nabla
theta_2

## Glossary {.appendix data-visibility="uncounted"}

- batches, batch size
- gradient-based learning, hill-climbing
- stochastic (mini-batch) gradient descent