# Demo FNN Training in Numpy

## Data Preparation

In [1]:
# importing the dataset
from sklearn.datasets import load_breast_cancer
import numpy as np
# NOTE(review): a seed of None does not pin the random stream, so the shuffling
# done during training presumably differs from run to run -- confirm intent.
np.random.seed(None)
data = load_breast_cancer()
# keep the first 400 samples and use columns 1 and 2 of the feature matrix
x, y = data['data'][:400, 1:3], data['target'][:400]
print(f"shape of x: {x.shape}, shape of y: {y.shape}")

# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)
# pack everything into one array: column 0 is the label, the remaining columns
# are the scaled features (this rebinds `data`, replacing the loaded dataset)
data = np.hstack((y[:, np.newaxis], x))

shape of x: (400, 2), shape of y: (400,)


## Define FNN Model
- NN architecture

<img src="images/FNN_simple.png" alt="FNN" width="400"/>

- Activation function

<img src="images/activationfunction1.png" alt="FNN" width="600"/>

The forward pass is
$$
\boldsymbol z_1=\sigma(W_0\boldsymbol x\!+\!\boldsymbol \beta_0)  =\!\boldsymbol z_1(\boldsymbol x;\!\boldsymbol\theta),
$$
$$
\boldsymbol z_2=\sigma(W_1\boldsymbol z_1\!+\!\boldsymbol \beta_1)\!=\!\boldsymbol z_2(\boldsymbol x;\!\boldsymbol\theta),
$$
$$
\boldsymbol z_3=\sigma(W_2\boldsymbol z_2\!+\!\boldsymbol \beta_2)\!=\!\boldsymbol z_3(\boldsymbol x;\!\boldsymbol\theta).
$$



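where the forward pass is implemented as `forward(x,params)`: for each sample it returns the list of activations $[\boldsymbol z_0,\boldsymbol z_1,\boldsymbol z_2,\boldsymbol z_3]$ with $\boldsymbol z_0=\boldsymbol x$, so the network output $\boldsymbol z_3(\boldsymbol x;\!\boldsymbol\theta)$ is the last entry.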

In [2]:
from scipy.special import expit
def forward(x, params):
    """Run the sigmoid feed-forward pass separately for every sample in ``x``.

    Parameters
    ----------
    x : iterable of 1-D arrays
        Input samples; each element is propagated through the network on
        its own.
    params : sequence of dict
        One dict per layer holding the weight matrix under ``'w'`` and the
        bias vector under ``'b'``.

    Returns
    -------
    list of list
        One inner list per sample containing the input followed by the
        activation of every layer, i.e. ``[z_0, z_1, ..., z_L]``.
    """
    z_lists = []
    for z in x:
        z_list = [z]
        for param in params:
            # Look the entries up by key instead of unpacking ``.values()``,
            # which silently depends on the insertion order of the dict.
            w, b = param['w'], param['b']
            z = expit(w @ z + b)
            z_list.append(z)
        z_lists.append(z_list)
    return z_lists

Define FNN class with the above forward pass and create FNN instance

In [3]:
import pickle
# Define FNN model class
class ModelNP:
    """Callable wrapper around ``forward`` that remembers the last activations.

    ``params`` is stored as given and handed to ``forward`` on every call.
    """

    def __init__(self, params):
        self.params = params

    def __call__(self, x):
        """Run the forward pass on ``x`` and return the final-layer outputs.

        The full per-sample activation lists are cached on ``self.z_lists``;
        the returned array stacks the last entry of each of those lists.
        """
        self.z_lists = forward(x, self.params)
        return np.array([activations[-1] for activations in self.z_lists])

# Load initial parameters from a pytorch nn model
# NOTE(review): unpickling can execute arbitrary code, so 'params_np.pickle'
# must come from a trusted source.
# Other cells read each entry of the loaded object with the keys 'w' and 'b',
# so it is presumably a list of per-layer {'w': ..., 'b': ...} dicts -- confirm.
import pickle
with open('params_np.pickle', 'rb') as f:
    params_np = pickle.load(f)

# Create a FNN model instance
model_np = ModelNP(params_np)

## Train FNN Model

### Loss Gradient

#### Loss Function

Use binary cross-entropy
$$
H(y,p)= -y\ln p - (1-y) \ln(1-p).
$$
Loss function
$$
J(\boldsymbol\theta;\boldsymbol x,y) = H[y,\boldsymbol z_3(\boldsymbol x;\!\boldsymbol\theta)].
$$

#### Loss Function Gradient

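Let $\boldsymbol h_{j+1}=W_j\boldsymbol z_j+\boldsymbol\beta_j$ be the pre-activation of layer $j+1$, so that $\boldsymbol z_{j+1}=\sigma(\boldsymbol h_{j+1})$ and $\boldsymbol z_0=\boldsymbol x$. Denote  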
$$D(\boldsymbol z)\!=\! \text{diag}[\boldsymbol z\!\odot\!(\boldsymbol 1 \!-\!\boldsymbol z)], \ \ \ g= \frac{\partial J}{\partial h_3}=z_3-y.$$
We have
$$
\frac{\partial J}{\partial \boldsymbol{\beta}_2} =\frac{\partial J}{\partial h_3}= g
\ \ \ \  \ \ \ \ \ \ \ \ \ \ \ \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
\frac{\partial J}{\partial W_2}
=\frac{\partial J}{\partial h_3} \boldsymbol{z}^T_2,
$$
$$
\frac{\partial J}{\partial \boldsymbol\beta_1}
=\frac{\partial J}{\partial \boldsymbol{h}_2}
= D(\boldsymbol z_2) W_2^T\frac{\partial J}{\partial h_3}
\ \ \ \  \ \ \ \ \ \ \ \ \ \    
\frac{\partial J}{\partial W_1} 
=\frac{\partial J}{\partial \boldsymbol{h}_2} \boldsymbol{z}_1^T,
$$
$$
\frac{\partial J}{\partial \boldsymbol\beta_0}=
\frac{\partial J}{\partial \boldsymbol{h}_1}
\!=\! D(\boldsymbol z_1)W_1^T\frac{\partial J}{\partial \boldsymbol{h}_2}
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 
\frac{\partial J}{\partial W_0} =\frac{\partial J}{\partial \boldsymbol{h}_1}\boldsymbol{z}_0^T.
$$


<div style="text-align: center">
<img src="../images/FNN_simple.PNG" width=300 align=center>
</div>

The gradients are calculated iteratively as

\begin{align}
&g \leftarrow z_3\!-\!y& \\
&\text{for} \ j:=2,1,0 \\
& \ \ \ \ \ \ \  \text{Grad}({W_j}),\text{Grad}({\boldsymbol\beta_j}) \leftarrow\! g\boldsymbol z_j^T,\boldsymbol  g\\
& \ \ \ \ \ \ \  g \leftarrow  D(\boldsymbol z_j){W_{j}^{*}}^T\boldsymbol  g
\end{align}

which is implemented as `gradients(y,z_list,w_list)` as follows

In [4]:
def gradients(y, z_list, w_list):
    """Back-propagate the loss gradient of one sample through the network.

    Parameters
    ----------
    y : scalar
        Target label of the sample.
    z_list : sequence of 1-D arrays
        Activations ``[z_0, ..., z_L]`` of the sample (``z_0`` is the input).
    w_list : sequence of 2-D arrays
        Weight matrices ``[W_0, ..., W_{L-1}]``.

    Returns
    -------
    list of dict
        One ``{'w': ..., 'b': ...}`` dict per layer, ordered from the LAST
        layer to the first. ``'b'`` is a column vector and ``'w'`` has the
        shape of the corresponding weight matrix.
    """
    # Gradient w.r.t. the output pre-activation: g = z_L - y.
    g = z_list[-1].reshape(-1, 1) - y
    grads = []
    for z, w in reversed(list(zip(z_list[:-1], w_list))):
        grads.append({'w': g @ z.reshape(1, -1), 'b': g})
        # D(z) W^T g with D(z) = diag(z * (1 - z)); scaling the rows
        # element-wise gives the same result without building the dense
        # diagonal matrix.
        g = (z * (1 - z)).reshape(-1, 1) * (w.T @ g)
    return grads

### Model Training

Gradient descent algorithm for optimization
\begin{align}
& \text{Input}: \text{learning rate } \eta,\text{loss function } J,N_{\text{epochs}}\\
&\text{return:}\ \arg\!\min J(\boldsymbol\theta)\\
&\ \ \ \ \ \ \ \  \boldsymbol \theta \leftarrow \boldsymbol \theta_{\text{initial}}\\
&\ \ \ \ \ \ \ \ \text{for}\ i:=1,\dots,N_{\text{epochs}}\\
&\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \boldsymbol \theta \leftarrow \boldsymbol \theta -\eta \frac{\partial }{\partial\boldsymbol\theta}J(\boldsymbol\theta)\\
&\ \ \ \ \ \ \  \ \text{ return}\ \boldsymbol \theta 
\end{align}

Model training algorithm
\begin{align}
&\boldsymbol \theta\leftarrow \left[W_0,W_1,W_2,\boldsymbol\beta_0,\boldsymbol\beta_1,\boldsymbol\beta_2\right]_{\text{init}}\\
&\text {for} \text{ epoch}:=1,\dots,N_{\text{epochs}}\\
&\ \  \ \ \  \ \text {for} \  \{X,\boldsymbol{y}\}\  := \{X,\boldsymbol{y}\}_1,\dots,\!\{X,\boldsymbol{y}\}_m \\
&\ \  \ \ \  \ \ \  \ \ \ \ W_0,W_1,W_2,\boldsymbol\beta_0,\boldsymbol\beta_1,\boldsymbol\beta_2\leftarrow \boldsymbol\theta \\
&\ \  \ \ \ \  \ \  \ \ \ \ W_0^*,W_1^*,W_2^*\leftarrow W_0,W_1,W_2 \\
&\ \ \ \ \ \ \ \ \ \ \ \ \text {for} \ {\boldsymbol x },y \  \text{in}\  \{X,\boldsymbol{y}\} \\
&\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \boldsymbol z_1,\boldsymbol z_2,z_3 \leftarrow 
\boldsymbol z_1(\boldsymbol x;\!\boldsymbol \theta),
\boldsymbol z_2(\boldsymbol x;\!\boldsymbol \theta),  
z_3(\boldsymbol x;\!\boldsymbol \theta)  \\
&\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \boldsymbol g \leftarrow z_3\!-\!y \\
&\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \    \text {for} \ j:=2,1,0\\
&\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \   
\ \ \ \ \ \  W_j,\boldsymbol\beta_j  \leftarrow W_j\!-\!\frac{\eta}{N}\boldsymbol  g\boldsymbol z_j^T,\boldsymbol\beta_j\!-\!\frac{\eta}{N}\boldsymbol  g \\
&\ \ \ \ \ \ \ \ \  \ \ \ \ \ \  \ \ \   \ \ \  \ \ \   
\boldsymbol  g \leftarrow  D(\boldsymbol z_j){W_{j}^{*}}^T\boldsymbol  g\\
&\ \ \ \ \ \ \ \ \ \ \ \   \boldsymbol \theta\leftarrow \left[W_0,W_1,W_2,\boldsymbol\beta_0,\boldsymbol\beta_1,\boldsymbol\beta_2\right]\\
&\text{return } \boldsymbol \theta
\end{align}
which is implemented as

In [5]:
import numpy as np

class SGD:
    """Mini-batch gradient-descent optimizer for a sigmoid feed-forward model.

    The optimizer reads ``model.params`` (a sequence of per-layer dicts with
    keys ``'w'`` and ``'b'``) and ``model.z_lists`` (the per-sample activation
    lists cached by the model's last forward pass).

    Typical use per mini-batch: call the model, then ``backward(y)``, then
    ``step()``.
    """

    def __init__(self, model, lr):
        self.model = model
        self.lr = lr
        self.grads = None  # set by backward(), consumed by step()

    @staticmethod
    def gradients(y, z_lst, w_lst):
        """Back-propagate the loss gradient of a single sample.

        Returns one ``{'w': ..., 'b': ...}`` dict per layer, ordered from the
        LAST layer to the first; ``'b'`` is a column vector.
        """
        # Gradient w.r.t. the output pre-activation: g = z_L - y.
        g = z_lst[-1].reshape(-1, 1) - y
        g_lst = []
        for z, w in reversed(list(zip(z_lst[:-1], w_lst))):
            g_lst.append({'w': g @ z.reshape(1, -1), 'b': g})
            # D(z) W^T g with D(z) = diag(z * (1 - z)); row-wise scaling
            # avoids materialising the dense diagonal matrix.
            g = (z * (1 - z)).reshape(-1, 1) * (w.T @ g)
        return g_lst

    @staticmethod
    def _mean_grad(grads):
        """Average the per-sample gradients of one layer.

        The bias gradient is flattened from a column vector to 1-D.
        """
        n = len(grads)
        w = sum(g['w'] for g in grads) / n
        b = sum(g['b'] for g in grads) / n
        return {'w': w, 'b': b[:, 0]}

    @staticmethod
    def _backward(y_vec, z_lists, w_list):
        """Return batch-averaged gradients ordered from first to last layer.

        Raises
        ------
        ValueError
            If the number of targets differs from the number of cached
            activation lists (e.g. the model was last called on other data).
        """
        targets = list(y_vec)
        if len(targets) != len(z_lists):
            # zip() would silently drop the surplus entries and average the
            # gradients over the wrong set of samples.
            raise ValueError(
                f"got {len(targets)} targets for {len(z_lists)} cached samples"
            )
        grads_all = [
            SGD.gradients(y, z_list, w_list)
            for z_list, y in zip(z_lists, targets)
        ]
        # Transpose samples x layers -> layers x samples; gradients() yields
        # the last layer first, so reverse to line up with the params order.
        grads_zip = reversed([list(layer) for layer in zip(*grads_all)])
        return [SGD._mean_grad(g) for g in grads_zip]

    def backward(self, y_vec):  # calculate gradients
        """Compute and store the batch-averaged gradients in ``self.grads``.

        Raises
        ------
        RuntimeError
            If the model has no cached activations yet.
        """
        z_lists = getattr(self.model, 'z_lists', None)
        if z_lists is None:
            raise RuntimeError("call the model on a batch before backward()")
        w_list = [param['w'] for param in self.model.params]
        self.grads = self._backward(y_vec, z_lists, w_list)
        return

    def step(self):  # optimize parameters with gradients
        """Apply one gradient-descent update to the model parameters in place.

        Raises
        ------
        RuntimeError
            If ``backward()`` has not been called yet.
        """
        if self.grads is None:
            raise RuntimeError("call backward() before step()")
        lr = self.lr
        for param, grad in zip(self.model.params, self.grads):
            param['w'] -= lr * grad['w']
            param['b'] -= lr * grad['b']

# minibatch data parameters
batch_size = 50
n_batch = int(np.ceil(len(data) / batch_size))

# hyper parameters
learning_rate = 0.1
epochs = 400
sdg = SGD(model_np, learning_rate)

# training loop
for epoch in range(epochs):
    # shuffle the rows in place, then walk through them in consecutive slices
    np.random.shuffle(data)
    for i in range(n_batch):
        batch = data[i * batch_size:][:batch_size]
        # column 0 holds the label, the remaining columns hold the features
        y = batch[:, 0]
        x = batch[:, 1:]
        # the forward pass must come first: backward() reads the activations
        # the model cached during this call
        z_out = model_np(x)
        sdg.backward(y)
        sdg.step()
    # report accuracy over the whole data array every 50th epoch
    if not epoch % 50:
        predict = model_np(data[:, 1:]).round()
        print(f'epoch:{epoch},accuracy:{(predict[:, 0] == data[:, 0]).mean()}')


epoch:0,accuracy:0.5675
epoch:50,accuracy:0.5675
epoch:100,accuracy:0.7175
epoch:150,accuracy:0.89
epoch:200,accuracy:0.9
epoch:250,accuracy:0.9025
epoch:300,accuracy:0.9025
epoch:350,accuracy:0.9025
