https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795

In [1]:
import numpy as np
from pprint import pprint

In [63]:
class Sigmoid:
    def __call__(self, Z):
        return 1 / (1 + np.exp(-Z))
    
    def grad(self, dA, Z):
        sig = self(Z)
        return dA * sig * (1 - sig)
    
class Relu:
    def __call__(self, Z):
        return np.maximum(0, Z)
    
    def grad(self, dA, Z):
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0
        return dZ
    
class LogLoss():
    def __call__(self, Y_hat, Y):
        m = Y_hat.shape[1]
        cost = Y @ np.log(Y_hat).T + (1 - Y) @ np.log(1 - Y_hat).T
        return np.squeeze(-cost / m)
    
    def grad(self, Y_hat, Y):
        return -(np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))
    
class MSE():
    def __call__(self, Y_hat, Y):
        cost = (Y_hat - Y) ** 2
        return np.mean(cost)
    
    def grad(self, Y_hat, Y):
        return Y_hat - Y
    
ACT = {'relu': Relu(), 'sigmoid': Sigmoid()}
LOSS = {'logloss': LogLoss(), 'mse': MSE()}

class Layer:
    def __init__(self, in_size, out_size, act='relu'):
        self.W = np.random.normal(scale=0.1, size=(out_size, in_size))
        self.b = np.random.normal(scale=0.1, size=(out_size, 1))
        self.act = ACT[act]
    
    def forward(self, A):
        Z = self.W @ A + self.b
        return self.act(Z), Z
    
    def backward(self, dA, Z, A):
        m = A.shape[1]
        # print('dA', dA.shape)
        dZ = self.act.grad(dA, Z)
        dW = (dZ @ A.T) / m
        db = np.sum(dZ, axis=1, keepdims=True) / m
        dA = self.W.T @ dZ
#         pprint({
#             'Z': Z.shape,
#             'A': A.shape,
#             'dZ': dZ.shape,
#             'W': self.W.shape,
#             'dW': dW.shape,
#             'db': db.shape,
#             'dA': dA.shape
#         })
        
        return dA, dW, db
        
class Model:
    def __init__(self, layers, loss='mse', seed=42):
        np.random.seed(42)
        num_layers = len(layers) - 1
        self.layers = [[]] * num_layers
        self.A = [[]] * num_layers
        self.Z = [[]] * num_layers
        self.dW = [[]] * num_layers
        self.db = [[]] * num_layers
        self.loss = LOSS[loss]
        for i in range(num_layers):
            in_size, out_size = layers[i: i + 2]
            self.layers[i] = Layer(in_size, out_size)
        # self.layers[-1] = Layer(sizes[-1], 1, 'sigmoid')
        
    def forward(self, X):
        for i, l in enumerate(self.layers):
            self.A[i] = X
            X, self.Z[i] = l.forward(X)
            # pprint({'X': X})
        return X
    
    def backward(self, Y_hat, Y):
        dA = self.loss.grad(Y_hat, Y)
        for i, l in reversed(list(enumerate(self.layers))):
            dA, self.dW[i], self.db[i] = l.backward(dA, self.Z[i], self.A[i])
        # pprint({'dW': self.dW})
        
    def update(self, learning_rate):
        for i, l in enumerate(self.layers):
            l.W -= learning_rate * self.dW[i]
            l.b -= learning_rate * self.db[i]
            
    def train(self, X, Y, epochs, learning_rate):
        for i in range(epochs):
            Y_hat = self.forward(X)
            self.backward(Y_hat, Y)
            self.update(learning_rate)
            print(self.loss(Y_hat, Y))
            
    
        
m = Model([3, 2])

X = inputs = np.array([
    [73, 67, 43], 
    [91, 88, 64], 
    [87, 134, 58], 
    [102, 43, 37], 
    [69, 96, 70]
], dtype='float32').T
Y = np.array([
    [56, 70], 
    [81, 101], 
    [119, 133], 
    [22, 37], 
    [103, 119]
], dtype='float32').T

m.layers[0].W
#m.loss(Y, m.forward(X))
# pprint({'W': m.layers[0].W, 'b': m.layers[0].b})
m.train(X, Y, 5, 1e-5)
# Y_hat = m.forward(X)
# m.backward(Y_hat, Y)
#m.backward()

7063.5814226954535
4947.771130422629
3519.6786823196294
2555.058010474672
1902.792550472873


In [95]:
X = np.array([[1], [2]])
m.forward(X)

array([[0.51058178]])

In [157]:
def get_cost_value(Y_hat, Y):
    m = Y_hat.shape[1]
    cost = Y @ np.log(Y_hat).T + (1 - Y) @ np.log(1 - Y_hat).T
    return np.squeeze(-cost / m)

get_cost_value(np.array([[0.3, 0.1]]), np.array([[1, 0]]))

array(0.65466666)

In [38]:
def init_layers(nn_architecture, seed = 42):
    np.random.seed(seed)
    number_of_layers = len(nn_architecture)
    params_values = {}

    for idx, layer in enumerate(nn_architecture):
        layer_idx = idx + 1
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]
        
        params_values['W' + str(layer_idx)] = np.random.randn(
            layer_output_size, layer_input_size) * 0.1
        params_values['b' + str(layer_idx)] = np.random.randn(
            layer_output_size, 1) * 0.1
        
    return params_values

def sigmoid(Z):
    return 1/(1+np.exp(-Z))

def relu(Z):
    return np.maximum(0,Z)

def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1 - sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0;
    return dZ;

def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation="relu"):
    Z_curr = np.dot(W_curr, A_prev) + b_curr
    
    if activation is "relu":
        activation_func = relu
    elif activation is "sigmoid":
        activation_func = sigmoid
    else:
        raise Exception('Non-supported activation function')
        
    return activation_func(Z_curr), Z_curr

def full_forward_propagation(X, params_values, nn_architecture):
    memory = {}
    A_curr = X
    
    for idx, layer in enumerate(nn_architecture):
        layer_idx = idx + 1
        A_prev = A_curr
        
        activ_function_curr = layer["activation"]
        W_curr = params_values["W" + str(layer_idx)]
        b_curr = params_values["b" + str(layer_idx)]
        A_curr, Z_curr = single_layer_forward_propagation(A_prev, W_curr, b_curr, activ_function_curr)
        
        memory["A" + str(idx)] = A_prev
        memory["Z" + str(layer_idx)] = Z_curr
       
    return A_curr, memory

def single_layer_backward_propagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation="relu"):
    m = A_prev.shape[1]
    
    if activation is "relu":
        backward_activation_func = relu_backward
    elif activation is "sigmoid":
        backward_activation_func = sigmoid_backward
    else:
        raise Exception('Non-supported activation function')
    
    dZ_curr = backward_activation_func(dA_curr, Z_curr)
    dW_curr = np.dot(dZ_curr, A_prev.T) / m
    db_curr = np.sum(dZ_curr, axis=1, keepdims=True) / m
    dA_prev = np.dot(W_curr.T, dZ_curr)

    return dA_prev, dW_curr, db_curr

def full_backward_propagation(Y_hat, Y, memory, params_values, nn_architecture):
    grads_values = {}
    m = Y.shape[1]
    Y = Y.reshape(Y_hat.shape)
   
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat));
    print(dA_prev)
    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        layer_idx_curr = layer_idx_prev + 1
        activ_function_curr = layer["activation"]
        
        dA_curr = dA_prev
        
        A_prev = memory["A" + str(layer_idx_prev)]
        Z_curr = memory["Z" + str(layer_idx_curr)]
        W_curr = params_values["W" + str(layer_idx_curr)]
        b_curr = params_values["b" + str(layer_idx_curr)]
        
        dA_prev, dW_curr, db_curr = single_layer_backward_propagation(
            dA_curr, W_curr, b_curr, Z_curr, A_prev, activ_function_curr)
        
        grads_values["dW" + str(layer_idx_curr)] = dW_curr
        grads_values["db" + str(layer_idx_curr)] = db_curr
    
    return grads_values


nn_architecture = [
    {"input_dim": 2, "output_dim": 2, "activation": "relu"},
    {"input_dim": 2, "output_dim": 1, "activation": "sigmoid"},
]
param_values = init_layers(nn_architecture)
pprint(param_values)

X = np.array([[1], [2]])
Y = np.array([[1]])
Y_hat, memory = full_forward_propagation(X, param_values, nn_architecture)
pprint({'Y_hat': Y_hat, 'Y': Y})
pprint(full_backward_propagation(Y_hat, Y, memory, param_values, nn_architecture))


{'W1': array([[ 0.04967142, -0.01382643],
       [ 0.06476885,  0.15230299]]),
 'W2': array([[0.15792128, 0.07674347]]),
 'b1': array([[-0.02341534],
       [-0.0234137 ]]),
 'b2': array([[-0.04694744]])}
{'Y': array([[1]]), 'Y_hat': array([[0.49490088]])}
[[-2.02060662]]
{'dW1': array([[ 0.        ,  0.        ],
       [-0.03876306, -0.07752612]]),
 'dW2': array([[ 0.        , -0.17474466]]),
 'db1': array([[ 0.        ],
       [-0.03876306]]),
 'db2': array([[-0.50509912]])}


In [37]:
m = Model([2, 2])
X = np.array([[1], [2]])
Y_hat = m.forward(X)
Y = np.array([[1]])
pprint({'Y_hat': Y_hat, 'Y': Y})
m.backward(Y_hat, Y)

{'Y': array([[1]]), 'Y_hat': array([[0.49490088]])}
[[-2.02060662]]
[array([[ 0.        ,  0.        ],
       [-0.03876306, -0.07752612]]), array([[ 0.        , -0.17474466]])]
[array([[ 0.        ],
       [-0.03876306]]), array([[-0.50509912]])]


In [122]:
m.forward(X)

array([[0.49490088]])

In [36]:
m.layers[0].W

array([[ 0.04967142, -0.01382643]])

In [35]:
m.layers[0].b

array([[0.06476885]])