# Feedforward Neural Networks

A full, general FNN function

In [115]:
import numpy as np
import copy
from typing import Callable

## Activation Functions

First, we'll implement the activation functions and their derivatives:

In [116]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    """
    Element-wise sigmoid function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return 1.0 / (1.0 + np.exp(-x))

def deriv_sigmoid(x: np.ndarray) -> np.ndarray:
    """
    Element-wise sigmoid derivative function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return sigmoid(x)*(1-sigmoid(x))

In [117]:
def tanh(x: np.ndarray) -> np.ndarray:
    """
    Element-wise tanh function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return (np.exp(x) - np.exp(-x))/(np.exp(x)+np.exp(-x))

def deriv_tanh(x: np.ndarray) -> np.ndarray:
    """
    Element-wise tanh derivative function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return 1 - np.power(tanh(x),2)

In [118]:
def relu(x: np.ndarray) -> np.ndarray:
    """
    Element-wise ReLU function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return x * (x>=0)

def deriv_relu(x: np.ndarray) -> np.ndarray:
    """
    Element-wise ReLU derivative function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return (x>=0).astype("int")

In [119]:
def linear(x: np.ndarray) -> np.ndarray:
    """
    Element-wise linear function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return x

def deriv_linear(x: np.ndarray) -> np.ndarray:
    """
    Element-wise linear derivative function

    Args:
        x: array of values

    Returns: array of transformed values of the same shape as x
    """
    return np.ones(x.shape)

## Cost Functions

Next, we'll implement the activation functions and their derivatives:

In [120]:
def mse(yhat: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    MSE cost function

    Args:
        yhat: Array of predicted values
        y: Array of true values

    Returns: An array of costs
    """
    return np.power(yhat-y,2)

def deriv_mse(yhat: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    MSE cost derivative function

    Args:
        yhat: Array of predicted values
        y: Array of true values

    Returns: An array of cost derviatives
    """
    return 2*(yhat-y)

In [121]:
def binary_cross_entropy(yhat: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    Binary cross entropy cost function

    Args:
        yhat: Array of predicted values
        y: Array of true values

    Returns: An array of costs
    """
    return -y * np.log(yhat) - (1-y)*np.log(1-yhat)

def deriv_binary_cross_entropy(yhat: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    Binary cross entropy cost derivatives function

    Args:
        yhat: Array of predicted values
        y: Array of true values

    Returns: An array of cost derivatives
    """
    return -(y/yhat) + (1-y)/(1-yhat)

## Managing Activation and Cost Function Choice

In [122]:
def get_activation(name: str) -> Callable:
    """
    A function to manage which activation function and
    derivative to use given input string

    Args:
        name: the name of the activation function

    Returns: The activation function and its derivative
    """
    if name == "sigmoid":
        return sigmoid, deriv_sigmoid
    if name == "tanh":
        return tanh, deriv_tanh
    if name == "relu":
        return relu, deriv_relu
    if name == "linear":
        return linear, deriv_linear

def get_cost(name: str) -> Callable:
    """
    A function to manage which cost function and
    derivative to use given input string

    Args:
        name: the name of the cost function

    Returns: The cost function and its derivative
    """
    if name == "mse":
        return mse, deriv_mse
    if name == "bce":
        return binary_cross_entropy, deriv_binary_cross_entropy

## Weight Initialisation

In [123]:
def xavier_init(shape: tuple[int]) -> np.ndarray:
    """
    Xavier initialisation of weights

    Args:
        shape: tuple giving the shape of the weights

    Returns: np.ndarray of initialised weights
    """
    num_input = shape[1]
    num_output = shape[0]
    W = np.random.randn(num_output, num_input) / num_input
    return W

def he_init(shape: tuple[int]):
    """
    He initialisation of weights

    Args:
        shape: tuple giving the shape of the weights

    Returns: np.ndarray of initialised weights
    """
    num_input = shape[1]
    num_output = shape[0]
    W = np.random.randn(num_output, num_input) * np.sqrt(2/num_input)
    return W

def linear_init(shape: tuple[int]) -> np.ndarray:
    """
    Simple initialisation of weights

    Args:
        shape: tuple giving the shape of the weights

    Returns: np.ndarray of initialised weights
    """
    num_input = shape[1]
    num_output = shape[0]
    W = np.random.randn(num_output, num_input)/num_input
    return W

def get_init(name: str) -> Callable:
    """
    A function to manage which weights initalisation
    to use

    Args:
        name: the name of the activation function

    Returns: An initialisation function
    """
    if name == "sigmoid" or name == "tanh":
        return xavier_init
    if name == "relu":
        return he_init
    if name == "linear":
        return linear_init
    

## Forward Propagation

Note Z\[i+1\] is because we're storing X in the Z_list as well. i+1 should be understood as layer i!

In [124]:
def forward_pass(X: np.ndarray,
                 W_list: list[np.ndarray],
                 B_list: list[np.ndarray],
                 activation_hidden: Callable,
                 activation_output: Callable,
                 num_pass: int):
    """
    A function that performs forward-propagation through a network

    Args:
        X: matrix of input features
        weight_list: A list with num_features_out by num_features_in weight
                     matrices, where num_features_out is the number of outputs
                     to the layer and num_features_in is the number of inputs to
                     the layer
        bias_list: A list with num_features_out by 1 bias matrices, where
                   num_features_out is the number of outputs to the layer
        activation_hidden: Activation function for hidden layers
        activation_output: Activation function for the output layer
        num_pass: Number of forward passes

    Returns: Z_list, a list of layer weighted inputs, and A_list, a list of
             activated weighted inputs. Both contain X as their first entry
    """
    Z_list = [X]
    A_list = [X] #for use in backprop
    for i in range(num_pass):
        Z_list.append(np.dot(W_list[i],A_list[i]) + B_list[i])
        if i < (num_pass-1):
            A_list.append(activation_hidden(Z_list[i+1]))
        elif i == (num_pass-1):
            A_list.append(activation_output(Z_list[i+1]))

    return Z_list, A_list

## Back Propagation

In [125]:
def back_pass(Y: np.ndarray,
              W_list: list[np.ndarray],
              Z_list: list[np.ndarray],
              A_list: list[np.ndarray],
              cost_deriv: Callable,
              activation_hidden_deriv: Callable,
              activation_output_deriv: Callable,
              num_pass: int,
              num_obs: int):
    """
    A function to perform backpropagation through the network

    Args:
        Y: array of output variables
        W_list: list of layer weights
        Z_list: list of layer weighted inputs
        A_list: list of layer activations
        cost_deriv: Cost derivative function
        activation_hidden_deriv: Hidden layer activation func derivative
        activation_output_deriv: Output layer activation func derivative
        num_pass: Number of forward/backward passes
        num_obs: Number of training examples/observations

    Returns: dW_list, a list of weight derivatives, and dB_list, a list of
             bias derivatives
    """
    dZ = np.multiply(cost_deriv(A_list[num_pass], Y), activation_output_deriv(Z_list[num_pass]))
    dW_list = [np.dot(dZ, A_list[num_pass-1].T)/num_obs]
    dB_list = [np.sum(dZ, axis=1, keepdims=True)/num_obs]
    for i in range(num_pass-1, 0, -1):
        dZ = np.multiply(np.dot(W_list[i].T, dZ), activation_output_deriv(Z_list[i]))
        dW_list = [np.dot(dZ, A_list[i-1].T)/num_obs] + dW_list
        dB_list = [np.sum(dZ, axis=1, keepdims=True)/num_obs] + dB_list
    return dW_list, dB_list

## A Feedforward Neural Network Function

In [126]:
def fnn(X, Y, hidden, learn_rate=0.03, hidden_activation="relu",
        output_activation="linear", loss="mse", batch_size=2, tol=0.0001, max_iter=10000):
    """
    A feedforward neural network function that allows an arbitary number of 
    hidden layers and 

    Args:
        X (np.ndarray): A num_features by num_obs matrix
        Y (np.ndarray): A num_obs by num_outputs matrix
        hidden (list[int]): A list of length num_hidden_layers where each
                            entry describes the number of nodes for the nth
                            hidden layer.
        learn_rate (float): Learning rate of the network.
        hidden_activation (str): Activation function for hidden layers.
        output_activation (str): Activation function for output layer.
        loss: loss function
        tol (float): Minimum difference for ending training loop.
        max_iter (int): Maximum number of training iterations.
    """
    
    # Data will be aligned in a different format to the network
    X = X.T
    Y = Y.T
    
    # Initialise network: core numbers
    shape_X = X.shape
    num_obs = shape_X[1]
    indices = list(range(num_obs))
    num_features = shape_X[0]
    shape_Y = Y.shape
    num_output = shape_Y[0]
    num_hidden = len(hidden)
    num_pass = num_hidden + 1 #passes between layers
    
    # Initialise network: weighted outputs of nodes Z
    Z_list = [X]
    for i in range(num_hidden):
        Z_list.append(np.zeros((hidden[i], num_obs)))
    Z_list.append(np.zeros((num_output, num_obs)))
    
    # Initialise network: weights W:
    hidden_init, output_init = get_init(hidden_activation), get_init(output_activation)
    W_list =  [hidden_init((hidden[0], num_features))]
    for i in range(1, num_hidden):
        W_list.append(hidden_init((hidden[i], hidden[i-1])))
    W_list.append(output_init((num_output, hidden[num_hidden-1])))
    
    # Initialise network: biases B
    B_list = []
    for i in range(num_hidden):
        B_list.append(np.zeros((hidden[i],1)))
    B_list.append(np.zeros((num_output, 1)))
    
    # Initalise network: activation + cost funcs and their derivatives
    activation_hidden, activation_hidden_deriv = get_activation(hidden_activation)
    activation_output, activation_output_deriv = get_activation(output_activation)
    cost_func, cost_deriv = get_cost(loss)
    
    
    # Training loop
    iter = 0
    diff = np.inf
    while (iter <= max_iter and diff > tol):
        # Compute forward pass
        Z_list, A_list = forward_pass(X=X,
                                      W_list=copy.deepcopy(W_list),
                                      B_list=copy.deepcopy(B_list),
                                      activation_hidden=activation_hidden,
                                      activation_output=activation_output,
                                      num_pass=num_pass)
        
        # Compute cost
        cost = cost_func(A_list[num_pass], Y) #individual costs
        
        # Backpropagate to find gradients
        batch_indices = list(np.random.choice(num_obs, batch_size, replace=False))
        dW_list, dB_list = back_pass(cost_deriv=cost_deriv,
                                     Y=copy.deepcopy(Y[:,batch_indices]),
                                     activation_output_deriv=activation_output_deriv,
                                     activation_hidden_deriv=activation_hidden_deriv,
                                     W_list=copy.deepcopy(W_list),
                                     A_list=copy.deepcopy([A[:,batch_indices] for A in A_list]),
                                     Z_list=copy.deepcopy([Z[:,batch_indices] for Z in Z_list]),
                                     num_pass=num_pass,
                                     num_obs=batch_size)
        
        # Update weights (w_new = w_old - learn_rate * gradient(w_old))
        W_old = copy.deepcopy(W_list)
        B_old = copy.deepcopy(B_list)
        new_diff = 0
        for i in range(num_pass):
            W_list[i] = W_list[i] - (learn_rate * dW_list[i]) #(learn_rate * dW_list[i])
            B_list[i] = B_list[i] - (learn_rate * dB_list[i])
            new_diff += np.sum(np.abs(W_list[i] - W_old[i])) + np.sum(np.abs(B_list[i] - B_old[i]))
        diff = new_diff

        # Increment iterations
        iter += 1
    
    # Predicted values of the network
    prediction = A_list[num_pass].T
    
    # Prediction function
    def fnn_pred_func(X):
        X = X.T,
        _, A_list = forward_pass(X=X,
                                      W_list=copy.deepcopy(W_list),
                                      B_list=copy.deepcopy(B_list),
                                      activation_hidden=activation_hidden,
                                      activation_output=activation_output,
                                      num_pass=num_pass)
        return A_list[num_pass]
    
    # Return Yhat
    return prediction, iter, diff, np.sum(cost), fnn_pred_func
    
    
    

## Example 1: Learning XOR

In [127]:
X = np.array([[0,0],
              [0,1],
              [1,0],
              [1,1]])
Y = np.array([[0],[1],[1],[0]])

print(X)
print(Y)

[[0 0]
 [0 1]
 [1 0]
 [1 1]]
[[0]
 [1]
 [1]
 [0]]


In [128]:
np.random.seed(42)
pred, iter, diff, cost, func = fnn(X=X,
                                   Y=Y,
                                   hidden=[2],
                                   learn_rate=0.9,
                                   batch_size=4,
                                   hidden_activation="sigmoid",
                                   output_activation="sigmoid",
                                   loss="mse",
                                   max_iter=10000)
print(np.round(pred,2))

[[0.02]
 [0.98]
 [0.98]
 [0.02]]


In [129]:
print(func(np.array([[0,1]])))
# print(func(np.array([[1,1]])))


[[[0.94134682]
  [0.74656039]]]


## Example 2: 