In [1]:
import numpy as np
from pprint import pprint

# Activation functions
## Sigmoid

$$
\begin{align}
S(x) &= \frac{1}{1+e^{-x}} \\ 
\frac{d}{dx}S(x) &= \frac{d}{dx}\frac{1}{1+e^{-x}} \\
&= \frac{e^{-x}}{(1+e^{-x})^2} \\ 
&= \frac{1+e^{-x}}{(1+e^{-x})^2}-\frac{1}{(1+e^{-x})^2} \\ 
&= \frac{1}{1+e^{-x}}-\frac{1}{(1+e^{-x})^2} \\ 
&= \frac{1}{1+e^{-x}}\big(1-\frac{1}{1+e^{-x}}\big) \\ 
&= S(x)\big(1-S(x)\big)
\end{align}
$$

In [119]:
class Sigmoid:
    """Logistic sigmoid activation, S(x) = 1 / (1 + exp(-x)), applied element-wise."""

    def __call__(self, X):
        """
        Evaluate the sigmoid in a numerically stable way.

        The textbook form 1 / (1 + exp(-x)) overflows inside exp() for large
        negative x. Here exp() is only ever taken of a non-positive number, and
        the algebraically equivalent form exp(x) / (1 + exp(x)) is used for x < 0.

        Shapes: [m, n] -> [m, n]
        Example:
            [[0,  1, -1]       [[0.5 , 0.73, 0.27]
        f (  [2, -2,  3]] ) ->  [0.88, 0.12, 0.95]]
        """
        X = np.asarray(X)
        # exp(-|x|) always lies in (0, 1], so it can never overflow.
        decay = np.exp(-np.abs(X))
        # x >= 0: 1 / (1 + e^-x)    x < 0: e^x / (1 + e^x)  (same function)
        out = np.where(X >= 0, 1 / (1 + decay), decay / (1 + decay))
        # `[()]` hands back a scalar for 0-d input and the array itself otherwise.
        return out[()]

    def grad(self, X):
        """
        Derivative of the sigmoid at X: S(X) * (1 - S(X)).

        Shapes: [m, n] -> [m, n]
        Example:
            [[0,  1, -1]       [[0.25, 0.2, 0.2 ]
        f (  [2, -2,  3]] ) ->  [0.1 , 0.1, 0.05]]
        """
        sig = self(X)
        return sig * (1 - sig)

## Relu

In [120]:
class Relu:
    """Rectified linear unit: positive entries pass through, the rest become 0."""

    def __call__(self, X):
        """
        Element-wise max(x, 0).

        Shapes: [m, n] -> [m, n]
        Example:
            [[0,  1, -1]       [[0, 1, 0]
        f (  [2, -2,  3]] ) ->  [2, 0, 3]]
        """
        return np.maximum(X, 0)

    def grad(self, X):
        """
        Element-wise derivative: 1 where x > 0, otherwise 0 (including at x == 0).

        Shapes: [m, n] -> [m, n]
        Example:
            [[0,  1, -1]       [[0, 1, 0]
        f (  [2, -2,  3]] ) ->  [1, 0, 1]]
        """
        # sign(x) is -1 / 0 / +1; squeezing it into [0, 1] leaves only the +1s.
        return np.clip(np.sign(X), 0, 1)

In [121]:
# Registry of activation instances, looked up by name in Layer.__init__.
ACT = dict(relu=Relu(), sigmoid=Sigmoid())

# Losses
* https://stats.stackexchange.com/questions/154879/a-list-of-cost-functions-used-in-neural-networks-alongside-applications/154880#154880

## LogLoss

$$
\begin{align}
\text{LogLoss}(y_t, y_p) &= -\frac{1}{n}\sum{\big(y_t\log{y_p} + (1-y_t)\log{(1-y_p)}\big)} \\
\frac{d}{dy_p}\text{LogLoss}(y_t, y_p) &= -\big(\frac{y_t}{y_p} - \frac{1-y_t}{1-y_p}\big) \\ 
&= \frac{1-y_t}{1-y_p} - \frac{y_t}{y_p}
\end{align}
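$$

The gradient is written per element, without the $\frac{1}{n}$ factor of the loss; the averaging over the batch is applied later in `Layer.backward`.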



In [122]:
class LogLoss():
    """Binary cross-entropy (log loss) and its derivative w.r.t. the predictions."""

    @staticmethod
    def _clip(Y_hat):
        """
        Keep predictions strictly inside (0, 1).

        A prediction of exactly 0 or 1 would otherwise yield log(0) = -inf
        (and 0 * -inf = nan) in the loss and a division by zero in the gradient.
        The bound is the machine epsilon of the prediction dtype, so values
        already inside the open interval are returned unchanged.
        """
        Y_hat = np.asarray(Y_hat)
        if not np.issubdtype(Y_hat.dtype, np.floating):
            Y_hat = Y_hat.astype(float)
        eps = np.finfo(Y_hat.dtype).eps
        return np.clip(Y_hat, eps, 1 - eps)

    def __call__(self, Y, Y_hat):
        """
        Mean over all elements of -(Y*log(Y_hat) + (1-Y)*log(1-Y_hat)).

        Shapes: [m, n] -> 1
        """
        Y = np.asarray(Y)
        Y_hat = self._clip(Y_hat)
        loss = Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)
        return np.mean(-loss)

    def grad(self, Y, Y_hat):
        """
        Element-wise derivative (1-Y)/(1-Y_hat) - Y/Y_hat (no 1/n factor).

        Shapes: [m, n] -> [m, n]
        """
        Y = np.asarray(Y)
        Y_hat = self._clip(Y_hat)
        return np.divide(1 - Y, 1 - Y_hat) - np.divide(Y, Y_hat)

## MSE

$$
\begin{align}
MSE(y_t, y_p) &= \frac{1}{n}\sum{(y_p-y_t)^2} \\
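\frac{d}{dy_p}MSE(y_t, y_p) &= \frac{2}{n}(y_p - y_t) \;\propto\; y_p - y_t
\end{align}
$$

The code uses the unscaled residual $y_p - y_t$ as the gradient: the constant factor $2$ only rescales the learning rate, and the averaging over the batch is applied in `Layer.backward`.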

In [123]:
class MSE():
    """Mean squared error between targets Y and predictions Y_hat."""

    @staticmethod
    def _residual(Y, Y_hat):
        """Prediction error, Y_hat - Y, element by element."""
        return Y_hat - Y

    def __call__(self, Y, Y_hat):
        """
        Average of the squared residuals over every element.

        Shapes: [m, n] -> 1
        """
        return np.mean(np.square(self._residual(Y, Y_hat)))

    def grad(self, Y, Y_hat):
        """
        Residual Y_hat - Y, returned as-is (no 2/n scaling is applied here).

        Shapes: [m, n] -> [m, n]
        """
        return self._residual(Y, Y_hat)

In [124]:
# Registry of loss instances, looked up by name in Model.__init__.
LOSS = dict(logloss=LogLoss(), mse=MSE())

# Layer

In [165]:
class Layer:
    """Fully connected layer computing Y = act(X @ W + b)."""

    def __init__(self, in_size, out_size, act='relu'):
        """
        Draw W and b from a standard normal and look the activation up in ACT.

        Shapes:
        W: [in, out]
        b: [1, out]
        """
        self.W = np.random.randn(in_size, out_size)
        self.b = np.random.randn(1, out_size)
        self.act = ACT[act]

    def forward(self, X):
        """
        Affine transform followed by the activation.

        Returns (Y, Z): the activated output and the pre-activation values,
        both of shape [m, out].

        Shapes: [m, in] × [in, out] + [1, out] = [m, out]
        Example:
        X × W + b = Z
        [[0,  1, -1]    [[0, 1]               [[ 0,  1]
             ...      ×  [2, 3]  + [[2, 3]] =    ...
         [2, -2,  0]]    [4, 5]]               [-2, -1]]
        """
        pre_activation = X @ self.W + self.b
        return self.act(pre_activation), pre_activation

    def backward(self, X, Z, dX):
        """
        Back-propagate the gradient dX arriving at this layer's output.

        Returns (dX_prev, dW, db): the gradient for the layer's input, and the
        batch-averaged gradients for W and b.

        Shapes:
        X: [m, in]
        Z: [m, out]
        dX: [m, out]
        dZ: [m, out]
        dW: [in, m] × [m, out] -> [in, out]
        db: [m, out] -> [1, out]
        dX_prev: [m, out] × [out, in] -> [m, in]
        """
        batch_size = X.shape[0]
        # Chain rule through the activation.
        delta = dX * self.act.grad(Z)
        grad_W = (X.T @ delta) / batch_size
        grad_b = delta.mean(axis=0, keepdims=True)
        grad_input = delta @ self.W.T
        return grad_input, grad_W, grad_b

# Model

In [219]:
class Model:
    """Stack of `Layer`s trained with full-batch gradient descent."""

    def __init__(self, layers, loss='mse', seed=42):
        """
        Build one Layer per consecutive pair of sizes in `layers`.

        layers: sequence of layer widths, e.g. [3, 4, 2] -> Layer(3, 4), Layer(4, 2).
                Every layer uses Layer's default activation.
        loss:   key into LOSS.
        seed:   passed to np.random.seed() before the weights are drawn
                (this reseeds NumPy's global random state).
        """
        num_layers = len(layers) - 1
        # Per-layer caches filled in by forward() / backward(). Separate list
        # objects per slot, rather than `[[]] * n`, which aliases a single list.
        self.X = [[] for _ in range(num_layers)]
        self.Z = [[] for _ in range(num_layers)]
        self.dW = [[] for _ in range(num_layers)]
        self.db = [[] for _ in range(num_layers)]
        self.loss = LOSS[loss]
        np.random.seed(seed)
        self.layers = [
            Layer(in_size, out_size)
            for in_size, out_size in zip(layers[:-1], layers[1:])
        ]

    def forward(self, X):
        """Run X through every layer, caching each layer's input and pre-activation."""
        for i, l in enumerate(self.layers):
            self.X[i] = X
            X, self.Z[i] = l.forward(X)
        return X

    def backward(self, Y, Y_hat):
        """Back-propagate the loss gradient, storing dW / db for every layer."""
        dX = self.loss.grad(Y, Y_hat)
        for i in reversed(range(len(self.layers))):
            dX, self.dW[i], self.db[i] = self.layers[i].backward(
                self.X[i], self.Z[i], dX)

    def update(self, learning_rate):
        """Apply one gradient-descent step in place, using the stored dW / db."""
        for l, dW, db in zip(self.layers, self.dW, self.db):
            l.W -= learning_rate * dW
            l.b -= learning_rate * db

    def train(self, X, Y, epochs, verbose=100, learning_rate=1e-5):
        """
        Run `epochs` full-batch steps of forward, backward and update.

        verbose: print loss and mean weight gradients every `verbose` epochs.
                 A falsy value (0 or None) disables printing.
        """
        for i in range(epochs):
            Y_hat = self.forward(X)
            self.backward(Y, Y_hat)
            # Guard against `i % 0` (ZeroDivisionError) when logging is disabled.
            if verbose and i % verbose == 0:
                # NOTE: the 'eposh' key spelling is kept to match the saved output.
                print({'eposh': i, 'loss': self.loss(Y, Y_hat), 
                       'grad': [np.mean(d) for d in self.dW]})
            self.update(learning_rate)

In [229]:
# Toy regression data: 5 samples, 3 input features each.
X = inputs = np.asarray(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
    ],
    dtype=np.float32,
)

# Two target values per sample.
Y = np.asarray(
    [
        [56, 70],
        [81, 101],
        [119, 133],
        [22, 37],
        [103, 119],
    ],
    dtype=np.float32,
)

# Single 3 -> 2 layer with the default loss; log every 1000 epochs.
m = Model([3, 2])
m.train(X, Y, epochs=10000, verbose=1000, learning_rate=1e-5)

{'eposh': 0, 'loss': 6599.363005885239, 'grad': [-5925.799150994647]}
{'eposh': 1000, 'loss': 5.40122380186291, 'grad': [-2.317954562129738]}
{'eposh': 2000, 'loss': 1.2720614736794869, 'grad': [-0.9136500455442104]}
{'eposh': 3000, 'loss': 0.6344518005362033, 'grad': [-0.3591325039280515]}
{'eposh': 4000, 'loss': 0.5359601597938106, 'grad': [-0.14119302348173338]}
{'eposh': 5000, 'loss': 0.5207420630081447, 'grad': [-0.05553904003193775]}
{'eposh': 6000, 'loss': 0.5183866106088224, 'grad': [-0.02187554449889954]}
{'eposh': 7000, 'loss': 0.5180179557975881, 'grad': [-0.008645206705092787]}
{'eposh': 8000, 'loss': 0.517956187590354, 'grad': [-0.003445452037034139]}
{'eposh': 9000, 'loss': 0.517941822659721, 'grad': [-0.0014018549320747792]}


In [187]:
# Weights of the first layer.
# NOTE(review): `Model([3, 2])` in the training cell builds a single Layer(3, 2),
# so a fresh run shows a [3, 2] matrix here; the saved 3x4 output looks like it
# came from an earlier model configuration — re-run to confirm.
m.layers[0].W

array([[ 0.19267347, -0.01633728, -0.34794431, -0.00444718],
       [-0.03196527, -0.02690705,  0.73858809,  0.309309  ],
       [-0.12634051,  0.05170876,  0.59796808,  0.18072154]])

In [188]:
# Weights of the second layer.
# NOTE(review): with `Model([3, 2])` only `layers[0]` exists, so this index
# raises IndexError on a fresh kernel; the saved output appears to come from an
# earlier two-layer model — confirm the intended architecture.
m.layers[1].W

array([[-0.24320262,  0.03142473],
       [-0.09070897, -0.14123037],
       [ 1.0060721 , -0.02257763],
       [ 0.31407386, -0.14247482]])

In [189]:
# Bias of the second layer.
# NOTE(review): same caveat as for `m.layers[1].W` — `Model([3, 2])` has no
# second layer, so this only works with state left over from an earlier model.
m.layers[1].b

array([[-0.04790052,  0.01109226]])

In [200]:
# np.zeros() requires a shape argument; (3, 0) matches the saved output
# (three rows, zero columns).
x = np.zeros((3, 0))
x

array([], shape=(3, 0), dtype=float64)

In [202]:
# With x of shape (3, 0) as in the saved output, x[1] is a zero-length row: the
# one-element array broadcasts to length 0, nothing is written and x is unchanged.
x[1] = np.array([2])
x

array([], shape=(3, 0), dtype=float64)