In [2]:
import numpy as np

np.random.seed(0)

![backpropagation](imgs/backpropagation.png)

Mean squared error:

$$
\begin{align*}
    MSE &= \frac{1}{2} \sum^{1}_{n=0} (t_i - y_i)^2 \\
    &= \frac{1}{2} (t_0 - y_0)^2 + \frac{1}{2} (t_1 - y_1)^2
\end{align*}
$$

In general:
$$
\begin{align*}
    \frac{\partial MSE}{\partial y_{i}} &= -(t_i - y_i) \\
    &= -y_i + t_i 
\end{align*}
$$

In specific case:
$$
\begin{align*}
    \frac{\partial MSE}{\partial y_{0}} &= -(t_0 - y_0) \\
    &= -y_0 + t_0 
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial y_{1}} &= -(t_1 - y_1) \\
    &= -y_1 + t_1 
\end{align*}
$$

Linear activation (before final output):
$$
\begin{align*}
    y_i &= z_i^{(L)}
    \hspace{35pt}
    where
    \hspace{10pt}
    \{0, 1, ..., l, ..., L-1, L\} \,\, layers
\end{align*}
$$

In general:
$$
\begin{align*}
    \frac{\partial y_i}{\partial z^{(L)}_{i}} &= \frac{\partial a^{(L)}_i}{\partial z^{(L)}_{i}} = 1 \\
\end{align*}
$$

In specific case:
$$
\begin{align*}
    \frac{\partial y_0}{\partial z^{(L)}_{0}} &= \frac{\partial a^{(L)}_0}{\partial z^{(L)}_{i}} = 1 \\
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial y_1}{\partial z^{(L)}_{1}} &= \frac{\partial a^{(L)}_1}{\partial z^{(L)}_{i}} = 1 \\
\end{align*}
$$

Before activation function:

- In matrix form:
$$
\begin{align*}
    z^{(l)} &= W^{(l)} \, a^{l-1} \\
\end{align*}
$$

- For instante, we fix $l=2$ (that is last layer L), therefore we want compute between last layer and middle layer:
$$
\begin{align*}

    z^{(l)} &= W^{(l)} \, a^{l-1} \\
    &= W^{(2)} \, a^{1} \\
    
    &=
    \begin{bmatrix}
        w^{(2)}_{00} & w^{(2)}_{01} & w^{(2)}_{02} \\
        w^{(2)}_{10} & w^{(2)}_{11} & w^{(2)}_{12}
    \end{bmatrix}

    \begin{bmatrix}
        a^{(1)}_{0} \\
        a^{(1)}_{1} \\
        a^{(1)}_{2}
    \end{bmatrix} \\

    &=
    \begin{bmatrix}
        w^{(2)}_{00} a^{(1)}_{0} + w^{(2)}_{01} a^{(1)}_{01} + w^{(2)}_{02} a^{(1)}_{2} \\
        w^{(2)}_{10} a^{(1)}_{0} + w^{(2)}_{11} a^{(1)}_{01} + w^{(2)}_{12} a^{(1)}_{2}
    \end{bmatrix} \\

    &=
    \begin{bmatrix}
        z^{(2)}_{0} \\
        z^{(2)}_{1}
    \end{bmatrix}

\end{align*}
$$

- For instante, we fix $l=1$, therefore we want compute between middle layer and input:
$$
\begin{align*}

    z^{(l)} &= W^{(l)} \, a^{l-1} \\
    &= W^{(1)} \, a^{0} \\
    &= W^{(1)} \, x \\
    
    &=
    \begin{bmatrix}
        w^{(1)}_{00} & w^{(1)}_{01 } \\
        w^{(1)}_{10} & w^{(1)}_{11} \\
        w^{(1)}_{20} & w^{(1)}_{21}
    \end{bmatrix}

    \begin{bmatrix}
        x^{(1)}_{0} \\
        x^{(1)}_{1}
    \end{bmatrix} \\

    &=
    \begin{bmatrix}
        w^{(1)}_{00} x_{0} + w^{(1)}_{01} x_{1} \\
        w^{(1)}_{10} x_{0} + w^{(1)}_{11} x_{1} \\
        w^{(1)}_{20} x_{0} + w^{(1)}_{21} x_{1}
    \end{bmatrix} \\

    &=
    \begin{bmatrix}
        z^{(1)}_{0} \\
        z^{(1)}_{1} \\
        z^{(1)}_{2}
    \end{bmatrix}

\end{align*}
$$

- In scalar form:
$$
\begin{align*}
    z^{(l)}_{i} &= \sum^{len(l)}_{i=0} \sum^{len(l-1)}_{j=0} w^{(l)}_{ij} \, a^{(l-1)}_{j} \\
\end{align*}
$$

$$
\begin{align*}
    z^{(2)}_{0} &= w^{(2)}_{00} a^{(1)}_{0} + w^{(2)}_{01} a^{(1)}_{1} + w^{(2)}_{02} a^{(1)}_{2} \\
    z^{(2)}_{1} &= w^{(2)}_{10} a^{(1)}_{0} + w^{(2)}_{11} a^{(1)}_{1} + w^{(2)}_{12} a^{(1)}_{2} \\
    \\
    z^{(1)}_{0} &= w^{(1)}_{00} x_{0} + w^{(1)}_{01} x_{1} \\
    z^{(1)}_{1} &= w^{(1)}_{10} x_{0} + w^{(1)}_{11} x_{1} \\
    z^{(1)}_{2} &= w^{(1)}_{20} x_{0} + w^{(1)}_{21} x_{1}
\end{align*}
$$

In general:

$$
\begin{align*}
    \frac{\partial z^{(l)}_{i}}{\partial a^{(l-1)}_{j}} = w_{ij}^{(l)}
    \hspace{35pt}
    where
    \hspace{10pt}
    a^{(0)}_{j} = x_j
\end{align*}
$$

In specific case:

$$
\begin{align*}
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{0}} = w_{00}^{(2)} \\
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{1}} = w_{01}^{(2)} \\
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{2}} = w_{02}^{(2)} \\
    \\
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{0}} = w_{10}^{(2)} \\
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{1}} = w_{11}^{(2)} \\
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{2}} = w_{12}^{(2)} \\
    \\
    \frac{\partial z^{(1)}_{0}}{\partial a^{(1)}_{0}} = \frac{\partial z^{(1)}_{0}}{\partial x_{0}} = w_{00}^{(1)} \\
    \frac{\partial z^{(1)}_{0}}{\partial a^{(1)}_{1}} = \frac{\partial z^{(1)}_{0}}{\partial x_{1}} = w_{01}^{(1)} \\
    \\
    \frac{\partial z^{(1)}_{1}}{\partial a^{(1)}_{0}} = \frac{\partial z^{(1)}_{1}}{\partial x_{0}} = w_{10}^{(1)} \\
    \frac{\partial z^{(1)}_{1}}{\partial a^{(1)}_{1}} = \frac{\partial z^{(1)}_{1}}{\partial x_{1}} = w_{11}^{(1)} \\
    \\
    \frac{\partial z^{(1)}_{2}}{\partial a^{(1)}_{0}} = \frac{\partial z^{(1)}_{2}}{\partial x_{0}} = w_{20}^{(1)} \\
    \frac{\partial z^{(1)}_{2}}{\partial a^{(1)}_{1}} = \frac{\partial z^{(1)}_{2}}{\partial x_{1}} = w_{21}^{(1)} \\
\end{align*}

In general:

$$
\begin{align*}
    \frac{\partial z^{(l)}_{i}}{\partial w^{(l)}_{ij}} = a^{(l-1)}_{j}
    \hspace{35pt}
    where
    \hspace{10pt}
    a^{(0)}_{j} = x_j
\end{align*}
$$

In specific case:

$$
\begin{align*}
    \frac{\partial z^{(2)}_{0}}{\partial w_{00}^{(2)}} = a^{(1)}_{0} \\
    \frac{\partial z^{(2)}_{0}}{\partial w_{01}^{(2)}} = a^{(1)}_{1}  \\
    \frac{\partial z^{(2)}_{0}}{\partial w_{02}^{(2)}} = a^{(1)}_{2} \\
    \\
    \frac{\partial z^{(2)}_{1}}{\partial w_{10}^{(2)}} = a^{(1)}_{0} \\
    \frac{\partial z^{(2)}_{1}}{\partial w_{11}^{(2)}} = a^{(1)}_{1} \\
    \frac{\partial z^{(2)}_{1}}{\partial w_{12}^{(2)}} = a^{(1)}_{2} \\
    \\
    \frac{\partial z^{(1)}_{0}}{\partial w_{00}^{(1)}} = a^{(1)}_{0} = x_{0}\\
    \frac{\partial z^{(1)}_{0}}{\partial w_{01}^{(1)}} = a^{(1)}_{1} = x_{1}\\
    \\
    \frac{\partial z^{(1)}_{1}}{\partial w_{10}^{(1)}} = a^{(1)}_{0} = x_{0} \\
    \frac{\partial z^{(1)}_{1}}{\partial w_{11}^{(1)}} = a^{(1)}_{1} = x_{1} \\
    \\
    \frac{\partial z^{(1)}_{2}}{\partial w_{20}^{(1)}} = a^{(1)}_{0} = x_{0} \\
    \frac{\partial z^{(1)}_{2}}{\partial w_{21}^{(1)}} = a^{(1)}_{1} = x_{1}
\end{align*}

Activation function:
$$
\begin{align*}
    a^{(l)}_{i} = \frac{1}{1 + e^{-z^{(l)}_{i}}} \\
\end{align*}
$$

In general:

$$
\begin{align*}
    \frac{\partial a^{(l)}_{i}}{\partial z^{(l)}_{i}} = a^{(l)}_{i} (1 - a^{(l)}_{i})
    \hspace{35pt}
    where
    \hspace{10pt}
    1 \le l < L
\end{align*}
$$

Note that for $L=2$ we have linear activation that we compute in a different way, as we have already seen before. But for the hidden layer we have the no linear activation functions.

Update weights for $l=2$:
$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(2)}_{00}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \,
    \frac{\partial z^{(2)}_{0}}{\partial w^{(2)}_{00}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(2)}_{01}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \,
    \frac{\partial z^{(2)}_{0}}{\partial w^{(2)}_{01}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(2)}_{02}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \,
    \frac{\partial z^{(2)}_{0}}{\partial w^{(2)}_{02}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(2)}_{10}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \,
    \frac{\partial z^{(2)}_{1}}{\partial w^{(2)}_{10}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(2)}_{11}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \,
    \frac{\partial z^{(2)}_{1}}{\partial w^{(2)}_{11}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(2)}_{12}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \,
    \frac{\partial z^{(2)}_{1}}{\partial w^{(2)}_{12}}
\end{align*}
$$

Update weights for $l=1$:
$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(1)}_{00}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \, 
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{0}} \,
    \frac{\partial a^{(1)}_{0}}{\partial z^{(1)}_{0}} \,
    \frac{\partial z^{(1)}_{0}}{\partial w^{(1)}_{00}} + \\
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \, 
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{0}} \,
    \frac{\partial a^{(1)}_{0}}{\partial z^{(1)}_{0}} \,
    \frac{\partial z^{(1)}_{0}}{\partial w^{(1)}_{00}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(1)}_{01}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \, 
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{0}} \,
    \frac{\partial a^{(1)}_{0}}{\partial z^{(1)}_{0}} \,
    \frac{\partial z^{(1)}_{0}}{\partial w^{(1)}_{01}} + \\
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \, 
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{0}} \,
    \frac{\partial a^{(1)}_{0}}{\partial z^{(1)}_{0}} \,
    \frac{\partial z^{(1)}_{0}}{\partial w^{(1)}_{01}}
\end{align*}
$$



$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(1)}_{10}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \, 
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{1}} \,
    \frac{\partial a^{(1)}_{1}}{\partial z^{(1)}_{1}} \,
    \frac{\partial z^{(1)}_{1}}{\partial w^{(1)}_{10}} + \\
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \, 
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{1}} \,
    \frac{\partial a^{(1)}_{1}}{\partial z^{(1)}_{1}} \,
    \frac{\partial z^{(1)}_{1}}{\partial w^{(1)}_{10}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(1)}_{11}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \, 
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{1}} \,
    \frac{\partial a^{(1)}_{1}}{\partial z^{(1)}_{1}} \,
    \frac{\partial z^{(1)}_{1}}{\partial w^{(1)}_{11}} + \\
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \, 
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{1}} \,
    \frac{\partial a^{(1)}_{1}}{\partial z^{(1)}_{1}} \,
    \frac{\partial z^{(1)}_{1}}{\partial w^{(1)}_{11}}
\end{align*}
$$


$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(1)}_{20}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \, 
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{2}} \,
    \frac{\partial a^{(1)}_{2}}{\partial z^{(1)}_{2}} \,
    \frac{\partial z^{(1)}_{2}}{\partial w^{(1)}_{10}} + \\
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \, 
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{2}} \,
    \frac{\partial a^{(1)}_{2}}{\partial z^{(1)}_{2}} \,
    \frac{\partial z^{(1)}_{2}}{\partial w^{(1)}_{20}}
\end{align*}
$$

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(1)}_{21}} = 
    \frac{\partial MSE}{\partial y^{(2)}_{0}} \,
    \frac{\partial y^{(2)}_{0}}{\partial z^{(2)}_{0}} \, 
    \frac{\partial z^{(2)}_{0}}{\partial a^{(1)}_{2}} \,
    \frac{\partial a^{(1)}_{2}}{\partial z^{(1)}_{2}} \,
    \frac{\partial z^{(1)}_{2}}{\partial w^{(1)}_{11}} + \\
    \frac{\partial MSE}{\partial y^{(2)}_{1}} \,
    \frac{\partial y^{(2)}_{1}}{\partial z^{(2)}_{1}} \, 
    \frac{\partial z^{(2)}_{1}}{\partial a^{(1)}_{2}} \,
    \frac{\partial a^{(1)}_{2}}{\partial z^{(1)}_{2}} \,
    \frac{\partial z^{(1)}_{2}}{\partial w^{(1)}_{21}}
\end{align*}
$$

Decompose

$$
\begin{align*}
    \delta^{(L)} &= \frac{\partial MSE}{\partial y} \,
    \frac{\partial y}{\partial z^{(L)}} \\
    &= 
    \frac{\partial MSE}{\partial a^{(L)}} \,
    \frac{\partial a^{(L)}}{\partial z^{(L)}} \\

    \delta^{(L-1)} &= \delta^{(L)} \,
    \frac{\partial z^{(L)}}{\partial a^{(L-1)}} \,
    \frac{\partial a^{(L-1)}}{\partial z^{(L-1)}} \\

    \delta^{(L-2)} &= \delta^{(L-1)} \,
    \frac{\partial z^{(L-1)}}{\partial a^{(L-2)}} \,
    \frac{\partial a^{(L-2)}}{\partial z^{(L-2)}} \\

    &\vdots\\
    \delta^{(l-1)} &= \delta^{(l)} \,
    \frac{\partial z^{(l)}}{\partial a^{(l-1)}} \,
    \frac{\partial a^{(l-1)}}{\partial z^{(l-1)}}
\end{align*}
$$

Therefore

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(l)}_{ij}} = \delta^{(l)} \, \frac{\partial z^{(l)}}{\partial w^{(l)}_{ij}}
\end{align*}
$$

Scalar mode:

$$
\begin{align*}
    \delta^{(L)}_{i} &= 
    \frac{\partial MSE}{\partial a^{(L)}_{i}} \,
    \frac{\partial a^{(L)}_{i}}{\partial z^{(L)}_{i}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le i < len(L) \\

    \delta^{(L-1)}_{k} &= 
    \sum^{len(L)}_{i=0} \delta^{(L)}_{i} \,
    \frac{\partial z^{(L)}_{i}}{\partial a^{(L-1)}_{k}} \,
    \frac{\partial a^{(L-1)}_{k}}{\partial z^{(L-1)}_{k}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le k < len(L-1) \\

    &\vdots\\
    \delta^{(l-1)}_{k} &= 
    \sum^{len(l)}_{i=0} \delta^{(l)}_{i} \,
    \frac{\partial z^{(l)}_{i}}{\partial a^{(l-1)}_{k}} \,
    \frac{\partial a^{(l-1)}_{k}}{\partial z^{(l-1)}_{k}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le k < len(l-1)
\end{align*}
$$

Therefore:

$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(L)}_{ij}} &= \delta^{(L)}_{i} \, \frac{\partial z^{(L)}_{i}}{\partial w^{(L)}_{ij}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le i < len(L) &
    0 \le j < len(L-1) \\

    \frac{\partial MSE}{\partial w^{(L-1)}_{ij}} &= \delta^{(L-1)}_{i} \, \frac{\partial z^{(L-1)}_{i}}{\partial w^{(L-1)}_{ij}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le i < len(L-1) &
    0 \le j < len(L-2) \\

    & \vdots \\
    \frac{\partial MSE}{\partial w^{(l-1)}_{ij}} &= \delta^{(l-1)}_{i} \, \frac{\partial z^{(l-1)}_{i}}{\partial w^{(l-1)}_{ij}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le i < len(l-1) &
    0 \le j < len(l-2)
\end{align*}
$$

## Forward pass

In [234]:
print("Input x:")
x = np.array([[1,3]]).T
print(x, "\n")

print("Layer 1:")
W_l1 = np.random.rand(3,2)
z_l1 = W_l1 @ x
a_l1 = 1 / (1 + np.exp(-z_l1))
print("W_l1", "\n", W_l1, "\n")
print("z_l1", "\n", z_l1, "\n")
print("a_l1", "\n", a_l1, "\n")

print("Layer 2:")
W_l2 = np.random.rand(2,3)
z_l2 = W_l2 @ z_l1
print("W_l2", "\n", W_l2, "\n")
print("z_l2", "\n", z_l2, "\n")

y = z_l2
print("y", "\n", y, "\n")

t = np.random.randint(5, size=(2,1))
print("t" "\n", t, "\n")
MSE = 0.5 * ( (t[0] - y[0])** 2 + (t[1] - y[1])** 2 )
print("MSE", "\n", MSE, "\n")

Input x:
[[1]
 [3]] 

Layer 1:
W_l1 
 [[0.97007005 0.61840113]
 [0.92420913 0.97500156]
 [0.72183716 0.53289514]] 

z_l1 
 [[2.82527343]
 [3.8492138 ]
 [2.32052258]] 

a_l1 
 [[0.94402637]
 [0.97914761]
 [0.91056251]] 

Layer 2:
W_l2 
 [[0.85264027 0.97336448 0.51301245]
 [0.2620828  0.97489056 0.90059674]] 

z_l2 
 [[7.34608685]
 [6.58287285]] 

y 
 [[7.34608685]
 [6.58287285]] 

t
 [[4]
 [1]] 

MSE 
 [21.18238321] 



## Backward pass

$$
\begin{align*}
    \delta^{(L)}_{i} &= 
    \frac{\partial MSE}{\partial a^{(L)}_{i}} \,
    \frac{\partial a^{(L)}_{i}}{\partial z^{(L)}_{i}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le i < len(L) \\

    &= (-y_i + t_i) \, 1 \\
    &= (-y_i + t_i)
\end{align*}
$$

In [235]:
L2_size = z_l2.shape[0]

delta_L2 = np.zeros(L2_size)
for i in range(L2_size):
    delta_L2[i] = -y[i] + t[i]

print(f"Dimension of L-layer: {L2_size}")
print(f"delta_L2: {delta_L2}")

Dimension of L-layer: 2
delta_L2: [-3.34608685 -5.58287285]


$$
\begin{align*}    
    \frac{\partial MSE}{\partial w^{(L)}_{ij}} &= \delta^{(L)}_{i} \, \frac{\partial z^{(L)}_{i}}{\partial w^{(L)}_{ij}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le i < len(L) &
    0 \le j < len(L-1) \\

    &= \delta^{(L)}_{i} \, a^{(L-1)}_{j}
\end{align*}
$$

In [236]:
l1_size = z_l1.shape[0]

W_l2_grad = np.zeros_like(W_l2)
for i in range(L2_size):
    for j in range(l1_size):
        W_l2_grad[i,j] = delta_L2[i] * a_l1[j]

print("W_grad of layers 2:")
print(W_l2_grad)

W_grad of layers 2:
[[-3.15879422 -3.27631294 -3.04682123]
 [-5.27037919 -5.4664566  -5.0835547 ]]


 $$
\begin{align*}   
    \delta^{(L-1)}_{k} &= 
    \sum^{len(L)}_{i=0} \delta^{(L)}_{i} \,
    \frac{\partial z^{(L)}_{i}}{\partial a^{(L-1)}_{k}} \,
    \frac{\partial a^{(L-1)}_{k}}{\partial z^{(L-1)}_{k}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le k < len(L-1) \\

    &= \sum^{len(L)}_{i=0} \delta^{(L)}_{i} \, w^{(L)}_{ik} \, a^{(L-1)}_{k} \, (1 - a^{(L-1)}_{k})
\end{align*}
$$

In [237]:
delta_l1 = np.zeros(l1_size)
for k in range(l1_size):
    acc = 0
    for i in range(L2_size):
        acc += delta_L2[i] * W_l2[i,k] * a_l1[k] * (1 - a_l1[k])
    delta_l1[k] = acc

print(f"Dimension of L-1 layer: {l1_size}")
print(f"delta_L-1: {delta_l1}")

Dimension of L-1 layer: 3
delta_L-1: [-0.22806964 -0.17762574 -0.54926157]


$$
\begin{align*}
    \frac{\partial MSE}{\partial w^{(L-1)}_{ij}} &= \delta^{(L-1)}_{i} \, \frac{\partial z^{(L-1)}_{i}}{\partial w^{(L-1)}_{ij}}
    \hspace{35pt}
    where
    \hspace{10pt}
    0 \le i < len(L-1) &
    0 \le j < len(L-2) \\

    &= \delta^{(L-1)}_{i} \, a^{(L-2)}_{j} \\
    &= \delta^{(L-1)}_{i} \, x_{j} \\
\end{align*}
$$

In [238]:
l0_size = x.shape[0]

W_l1_grad = np.zeros_like(W_l1)
for i in range(l1_size):
    for j in range(l0_size):
        W_l1_grad[i,j] = delta_l1[i] * x[j]

print("W_grad of layers 1:")
print(W_l1_grad)

W_grad of layers 1:
[[-0.22806964 -0.68420891]
 [-0.17762574 -0.53287723]
 [-0.54926157 -1.64778472]]


## Gradient descent

In [239]:
mu = 0.01

In [240]:
print("w_l1 before:")
print(W_l1)
W_l1 = W_l1 + mu * W_l1_grad

print("\n")

print("w_l1 new:")
print(W_l1)

w_l1 before:
[[0.97007005 0.61840113]
 [0.92420913 0.97500156]
 [0.72183716 0.53289514]]


w_l1 new:
[[0.96778936 0.61155904]
 [0.92243287 0.96967279]
 [0.71634454 0.51641729]]


In [241]:
print("w_l2 before:")
print(W_l2)
W_l2 = W_l2 - mu * W_l2_grad

print("\n")

print("w_l2 new:")
print(W_l2)

w_l2 before:
[[0.85264027 0.97336448 0.51301245]
 [0.2620828  0.97489056 0.90059674]]


w_l2 new:
[[0.82105233 0.94060135 0.48254424]
 [0.20937901 0.920226   0.84976119]]


In [242]:
print("BEFORE")
print("y", "\n", y, "\n")
print("t" "\n", t, "\n")
print("MSE", "\n", MSE, "\n")

x = np.array([[1,3]]).T
z_l1 = W_l1 @ x
a_l1 = 1 / (1 + np.exp(-z_l1))
z_l2 = W_l2 @ z_l1
y = z_l2
MSE = 0.5 * ( (t[0] - y[0])** 2 + (t[1] - y[1])** 2 )

print("AFTER")
print("y", "\n", y, "\n")
print("t" "\n", t, "\n")
print("MSE", "\n", MSE, "\n")

BEFORE
y 
 [[7.34608685]
 [6.58287285]] 

t
 [[4]
 [1]] 

MSE 
 [21.18238321] 

AFTER
y 
 [[6.9980903 ]
 [6.03779459]] 

t
 [[4]
 [1]] 

MSE 
 [17.18395987] 



## Problem

In [368]:
X = np.random.rand(10000, 2)
y = 300 * np.exp(np.sin(X))

In [369]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [370]:
X_train.shape, y_train.shape

((6700, 2), (6700, 2))

In [371]:
in_size = 2
hid_1_size = 3
out_size = 2

sizes = [in_size, hid_1_size, out_size]

In [372]:
W = []

W.append(np.random.rand(sizes[1], sizes[0]))
print("W_l1 | Layer 1, with index 0")
print(W[0])

print("\n")

W.append(np.random.rand(sizes[2], sizes[1]))
print("W_l2 | Layer 2, with index 1")
print(W[1])

W_l1 | Layer 1, with index 0
[[0.1119346  0.41278924]
 [0.0773129  0.04300956]
 [0.95629953 0.7044888 ]]


W_l2 | Layer 2, with index 1
[[0.08559176 0.67401742 0.95201433]
 [0.73588922 0.44185315 0.74352088]]


In [373]:
z = []
a = []

# Layer 1, with index 0
z.append(np.zeros(sizes[1]))
a.append(np.zeros(sizes[1]))
print("z_l1 | Layer 1, with index 0:")
print(z[0])
print("a_l1 | Layer 1, with index 0:")
print(a[0])

print("\n")

# Layer 2, with index 1
a.append(np.zeros(sizes[2]))
z.append(np.zeros(sizes[2]))
print("z_l2 | Layer 2, with index 1:")
print(z[1])
print("a_l2 = y | Layer 2, with index 1:")
print(a[1])

z_l1 | Layer 1, with index 0:
[0. 0. 0.]
a_l1 | Layer 1, with index 0:
[0. 0. 0.]


z_l2 | Layer 2, with index 1:
[0. 0.]
a_l2 = y | Layer 2, with index 1:
[0. 0.]


In [374]:
def forward(xi, log=False):

    # Input
    if log: print("Input x:")
    x = np.array([xi]).T
    if log: print(x, "\n")

    # Layer 1
    if log: print("Layer 1:")
    z[0] = W[0] @ x
    a[0] = 1 / (1 + np.exp(-z[0]))
    if log: print("W_l1", "\n", W[0], "\n")
    if log: print("z_l1", "\n", z[0], "\n")
    if log: print("a_l1", "\n", a[0], "\n")

    # Layer 2
    if log: print("Layer 2:")
    z[1] = W[1] @ z[0]
    a[1] = z[1]
    if log: print("W_l2", "\n", W[1], "\n")
    if log: print("z_l2", "\n", z[1], "\n")
    if log: print("y", "\n", a[1], "\n")

    # MSE
    t = np.random.randint(5, size=(2,1))
    if log: print("t" "\n", t, "\n")
    MSE = 0.5 * ( (t[0] - a[1][0])** 2 + (t[1] - a[1][1])** 2 )
    return MSE


def backward():

    # Initialization
    W_grad = []
    for w in W:
        W_grad.append(np.zeros_like(w))

    delta = []
    for i in range(len(sizes)-1, 0, -1):
        delta.append(np.zeros(sizes[i]))

    # -------------------------------
    # LAYER 2
    # compute delta
    for i in range(sizes[2]):
        delta[0][i] = -a[1][i] + t[i]

    # compute W_grad
    for i in range(sizes[2]):
        for j in range(sizes[1]):
            W_grad[1][i,j] = delta[0][i] * a[0][j]

    # -------------------------------
    # LAYER 1
    # compute delta
    for k in range(sizes[1]):
        acc = 0
        for i in range(sizes[2]):
            acc += delta[0][i] * W[1][i,k] * a[0][k] * (1 - a[0][k])
        delta[1][k] = acc

    # compute W_grad
    for i in range(sizes[1]):
        for j in range(sizes[0]):
            W_grad[0][i,j] = delta[1][i] * x[j]

    # UPDATE WEIGHTS
    mu = 0.001
    for idx, w_grad in enumerate(W_grad):
        W[idx] = W[idx] + mu * w_grad # NOTE WITH "-" EXPLODING GRADIENTS, THERE MUST EXISTS SOME ERROR

In [375]:
EPOCHS = 50
for ep in range(EPOCHS):
    for i in range(X_train.shape[0]):
        forward(X_train[i])
        backward()
    
    loss = 0
    for j in range(X_train.shape[0]):
        loss += forward(X_train[j])
    print(f"Epoch: {ep}, Train loss: {loss}")

    loss = 0
    for j in range(X_test.shape[0]):
        loss += forward(X_test[j])
    print(f"Epoch: {ep}, Test loss: {loss}")

Epoch: 0, Train loss: [40778.84947518]
Epoch: 0, Test loss: [20047.53456471]
Epoch: 1, Train loss: [41694.15444421]
Epoch: 1, Test loss: [20476.62903596]
Epoch: 2, Train loss: [41717.28322423]
Epoch: 2, Test loss: [20890.64052609]
Epoch: 3, Train loss: [43523.63314933]
Epoch: 3, Test loss: [21942.02686374]
Epoch: 4, Train loss: [43334.72402015]
Epoch: 4, Test loss: [21387.0804549]
Epoch: 5, Train loss: [43874.65806443]
Epoch: 5, Test loss: [21659.51677369]
Epoch: 6, Train loss: [44009.26624979]
Epoch: 6, Test loss: [21663.25510824]
Epoch: 7, Train loss: [43927.3506858]
Epoch: 7, Test loss: [21534.51274113]
Epoch: 8, Train loss: [43384.98844775]
Epoch: 8, Test loss: [21499.80313462]
Epoch: 9, Train loss: [43752.17746886]
Epoch: 9, Test loss: [21671.96727445]
Epoch: 10, Train loss: [44066.98790726]
Epoch: 10, Test loss: [22099.602941]
Epoch: 11, Train loss: [44268.05357934]
Epoch: 11, Test loss: [21548.4276201]
Epoch: 12, Train loss: [44647.57467009]
Epoch: 12, Test loss: [22322.10562657

KeyboardInterrupt: 