In [1]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
import timeit
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (7,5)

In [89]:
class neural_net:
    """
    https://nbviewer.jupyter.org/github/ImadDabbura/blog-posts/blob/master/notebooks/Coding-Neural-Network-Forwad-Back-Propagation.ipynb
    
    link_function: specifies the link function. Possible values are
                   "sigmoid" and "softmax"
    
    batch_size: int. Specify the sample size of each bin. Uses np.array_split to avoid
                exception due to not even split possible.  Set to None to avoid it.
                   
    note: working under the assumption that the rows and columns of
          the data corresponds to observations and variables respectively
    """
    def __init__(self, step_size = 0.01, epochs = 10000, random_init = False, 
                activation_function = "sigmoid", batch_size = None):
        self.step_size = step_size
        self.epochs = epochs
        self.random_init = random_init
        self.activation_function = activation_function
        self.batch_size = batch_size
        
    def sigmoid(self, z):
        h = 1/(1+np.exp(-z))
        return h, z
    
    def relu(self, z):
        h = np.maximum(z, 0)
        return h, z
    
    def softmax(self, z):
        """
        Using normalization as done here
        https://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
        """
        s = np.max(z, axis=1).reshape(z.shape[0], 1)
        e_x = np.exp(z - s)
        div = np.sum(e_x, axis=1).reshape(z.shape[0], 1)
        h = e_x / div 
        return h, z

    def loss(self, hL, y):
        return (-1 / y.shape[1]) * np.sum(y * np.log(hL))
            
    def add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis = 1)
    
    def undo_one_hot(self, y):
        """
        Undoes one-hot-encoding of target vector
        to calculate prediction accuracy
        """
        return np.argmax(y, axis  = 1)
    
    def one_hot(self, y):
        """
        One-hot encodes a cases x 1 vector of labels
        """
        return np.eye(np.max(y) + 1)[y]
        
    
    def get_preds_and_probs(self, X):
        """
        Returns [predicted probabilties, predictions]
        """
        X = self.add_intercept(X)
        
        if self.activation_function.lower() == "softmax":
            probs = self.softmax(np.dot(X, self.w))
            preds = np.argmax(probs,axis=1)
        else:
            probs = self.sigmoid(np.dot(X, self.w))
            preds = probs >= .5
        
        return probs, preds
    
    def h(self, X):
        """
        Performs activation function check and computes the output based on X
        """
        z = X.dot(self.w)
        # activation function
        if self.activation_function.lower() == "sigmoid":
            p = self.sigmoid(z)
        else:
            p = self.softmax(z)
        return p

    def initialize_parameters(self, layers_dims, k):
        """
        Returns a dict of initialized parameters
        
        Arguments
        --------
        layers_dims: [X.shape[0], # nodes layer 1, ..., # nodes final layer]
        k: int. # unique values of y.
        
        Returns
        --------
        dict. Final weight matrix is k x nodes in last layer 
        
        """
        np.random.seed(1)
        parameters = {}
        L = len(layers_dims)
        
        if L == 1:
            parameters["W" + str(L)] = np.random.normal(
                0, .01, layers_dims[0] * k).reshape(k, layers_dims[0])
            parameters["b" + str(L)] = np.zeros((k,1))
            return parameters
        else:
            for l in range(1, L):
                parameters["W" + str(l)] = np.random.normal(0, .01, 
                    layers_dims[l]*layers_dims[l-1]).reshape(layers_dims[l-1],layers_dims[l])
                parameters["b" + str(l)] = np.zeros((1,layers_dims[l]))

            dim_l = parameters["W" + str(L-1)].shape[0]
            parameters["W" + str(L)] = np.random.normal(
                0, .01, dim_l * k).reshape(dim_l, k)
            parameters["b" + str(L)] = np.zeros((k,1))

            return parameters
    
    def linear_forward(self, h_prev, W, b):
        """
        Computes transformation of the input
        
        Arguments
        ---------
        h_prev: np.array with output values from previous layer
        W: np.array with weights, 
            shape: size of current layer x size of prev layer
        b: np.array, 
            shape: size of current layer x 1
            
        Returns
        ---------
        Z: np.array with transformation output
        cached_objects: tuple
            stores h_prev, W, b for backprop
        """
        Z =  h_prev.dot(W) + b
        cache = (h_prev, W, b)
        
        return Z, cache
        
    def linear_activation_forward(self, h_prev, W, b, activation_fun):
        """
        Computes output from activation function
        
        Arguments
        ---------
        activation_fun : str
            "sigmoid", "relu" or "softmax"
        """
        if activation_fun == "sigmoid":
            Z, lin_cache = self.linear_forward(h_prev, W, b)
            h, activation_cache = self.sigmoid(Z)
        
        elif activation_fun == "relu":
            Z, lin_cache = self.linear_forward(h_prev, W, b)
            h, activation_cache = self.relu(Z)
            
        elif activation_fun == "softmax":
            Z, lin_cache = self.linear_forward(h_prev, W, b)
            h, activation_cache = self.softmax(Z)
        
        cache = (lin_cache, activation_cache)
        
        return h, cache
    
    def L_model_forward(self, X, parameters, hidden_layers_acv_fun = "relu"):
        """
        Arguments
        parameters : dict
            output from self.initialize_parameters
        hidden_layers_acv_fun : str or [str]
            if str: uses the activation funtcion on all layers
            if [str] : allows for different activation functions
                       over the layers
        """
        h = X
        caches = []
        L = len(parameters) // 2 
        
        if isinstance(hidden_layers_acv_fun, str):
            hidden_layers_acv_fun = [hidden_layers_acv_fun] * (L)
            
        for i in range(1, L):
            h_prev = h
            h, cache = self.linear_activation_forward(
                h_prev, parameters["W" + str(i)], parameters["b" + str(i)],
                activation_fun = hidden_layers_acv_fun[i])
            caches.append(cache)
        
        hL, cache = self.linear_activation_forward(
            h, parameters["W" + str(L)], parameters["b" + str(L)],
            activation_fun = "softmax")
        caches.append(cache)
        
        return hL, caches
    
    def batch_data(self, X, y):
        X_list = []
        y_list = []
        # create a permutation of the row ids and 
        # split them using array_split
        ids = np.random.permutation(X.shape[0])
        ids_list = np.array_split(ids, np.round(X.shape[0]/self.batch_size))
        for batches in ids_list:
            X_list.append(X[batches, :])
            y_list.append(y[batches, :])
        return X_list, y_list
    
    def sigmoid_grad(self, dh, z):
        """
        computes gradient of the sigmoid function wrt z
        
        Arguments
        ---------
        dh : np.array, post-activation gradient 
        """
        h, _ = self.sigmoid(z)
        dz = dh * h * (1-h)
        
        return dz
    
    def relu_grad(self, dh, z):
        """
        computes gradient of the relu function wrt z
        
        Arguments
        ---------
        dh : np.array, post-activation gradient 
        """
        h, _ = self.relu(z)
        dz = np.multiply(dh, np.int64(h > 0))
        return dz
    
    def softmax_grad(self, h_prev, z, y):
        """
        Note that I have not made any extension that allows for the 
        softmax to be the activation of any intermediate layer.
        
        Arguments
        ---------
        h_prev: activation output from second to last layer 
        """
        h, _ = self.softmax(z)
        dz = h_prev * (h - y)
        return dz
  
    def compute_cost(self, hL, y):
        return (-1 / y.shape[1]) * np.sum(y * np.log(hL))
    
    def linear_backward(self, dZ, cache):
        """
        Computes gradient wrt weight, bias and post-activation output of (l-1)
        layers at layer l.
        
        Arguments
        ---------
        dZ: np.array, gradient of the cost wrt linear output
        cache: tuple, values of (h_prev, W, b)
        
        Returns
        dh_prev : np.array, gradient of cost wrt activation (previous l-1 layers)
        dW : np.array, gradient of cost wrt W (current layer)
        db : np.array, gradient of cost wrt b (current layer)
        """
        h_prev, W, b = cache
        m = h_prev.shape[1]
        
        dW = (1/m) * np.dot(dZ, h_prev.T)
        db = (1/m) * np.sum(dZ, axis = 1, keepdims = True)
        dh_prev = np.dot(W.T, dZ)
        
        assert dh_prev.shape == h_prev.shape
        assert dW.shape == W.shape
        assert db.shape == b.shape
        
        return dh_prev, dW, db
    
    def linear_activation_backward(self, dh, cache, activation_fun):
        """
        Arguments
        ---------
        dh: np.array, post-activation gradient for current layer l
        cache: tuple, (lin_cache, activation_cache)
        """
        lin_cache, activation_cache = cache
        if activation_fun == "sigmoid":
            dZ = self.sigmoid_gradient(dh, activation_cache)
            dh_prev, dW, db = linear_backward(dZ, lin_cache)
        elif activation_fun == "relu":
            dZ = self.relu_gradient(dh, activation_cache)
            dh_prev, dW, db = linear_backward(dZ, lin_cache)
        
        return dh_prev, dW, db
    
    def L_model_backward(hL, y, caches, hidden_layers_activation_fn = "relu"):
        """
        Computes gradient of output layer wrt weights etc. 
        
        Arguments
        ---------
        AL: np.array, shape = [# nodes in last layer x k]
        y: np.array, one-hot encoded class labels
        """
        L = len(caches)
        grads = {}
        
        if isinstance(hidden_layers_acv_fun, str):
            hidden_layers_acv_fun = [hidden_layers_acv_fun] * (L)
    
        dhL = np.divide()
        


In [58]:
X = np.array([1,2,-2,
              2,3,4, 
              5,6,2,
             -2,-1,2]).reshape(4,3)
y = np.array([0,1,0,1,0,0,0,0,1,0,1,0]).reshape(4,3)

y

array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0]])

In [90]:
test = neural_net()
w = test.initialize_parameters([X.shape[1], 2], y.shape[1])
AL, _ = test.L_model_forward(X, w, "sigmoid")

ValueError: shapes (4,2) and (3,3) not aligned: 2 (dim 1) != 3 (dim 0)

In [75]:
h1 = test.sigmoid(X.dot(w["W1"].T) + w["b1"].T)[0]

In [76]:
h2 = test.softmax(h1.dot(w["W2"].T) + w["b2"].T)[0]
h2

array([[0.33473931, 0.33321826, 0.33204243],
       [0.33469846, 0.33314349, 0.33215805],
       [0.33472663, 0.33313986, 0.33213351],
       [0.33469606, 0.33319979, 0.33210415]])

In [73]:
AL

array([[0.24997938, 0.25001267, 0.24995363, 0.25005433],
       [0.25004044, 0.24997069, 0.24998366, 0.2500052 ],
       [0.24997437, 0.25001626, 0.24995467, 0.2500547 ]])

In [86]:
X.dot(w["W1"])

array([[-0.01162813,  0.01845384],
       [ 0.05125806, -0.13648573],
       [ 0.06683512, -0.14099671],
       [-0.00989704, -0.02306596]])

Need the following,

$$
\begin{aligned}
\delta^L = \nabla_a C \odot \sigma'\big(z^L\big),
\end{aligned}
$$
where
$$
\begin{aligned}
a^l = \sigma(w^l a^{l-1} + b^l)
\end{aligned}
$$

In [31]:
probs = test.softmax(AL.T)[0] - y
probs

array([[ 0.33340699, -0.6665725 ,  0.33316551],
       [-0.66675657,  0.33312733,  0.33362925],
       [ 0.33335597,  0.33311322, -0.6664692 ],
       [ 0.33323417, -0.666647  ,  0.33341283]])

In [28]:
y_vec = test.undo_one_hot(y)
probs[range(4), y_vec] -= 1

In [29]:
probs

array([[ 0.33340699, -1.6665725 ,  0.33316551],
       [-1.66675657,  0.33312733,  0.33362925],
       [ 0.33335597,  0.33311322, -1.6664692 ],
       [ 0.33323417, -1.666647  ,  0.33341283]])

In [32]:
probs - y

array([[ 0.33340699, -1.6665725 ,  0.33316551],
       [-1.66675657,  0.33312733,  0.33362925],
       [ 0.33335597,  0.33311322, -1.6664692 ],
       [ 0.33323417, -1.666647  ,  0.33341283]])

### New attempt

In [2]:
class neural_net:
    """
    https://nbviewer.jupyter.org/github/ImadDabbura/blog-posts/blob/master/notebooks/Coding-Neural-Network-Forwad-Back-Propagation.ipynb
    
    link_function: specifies the link function. Possible values are
                   "sigmoid" and "softmax"
    
    batch_size: int. Specify the sample size of each bin. Uses np.array_split to avoid
                exception due to not even split possible.  Set to None to avoid it.
                   
    note: working under the assumption that the rows and columns of
          the data corresponds to observations and variables respectively
    """
    def __init__(self, step_size = 0.01, epochs = 10000, random_init = False, 
                activation_function = "sigmoid", batch_size = None):
        self.step_size = step_size
        self.epochs = epochs
        self.random_init = random_init
        self.activation_function = activation_function
        self.batch_size = batch_size
        
    def sigmoid(self, z):
        h = 1/(1+np.exp(-z))
        return h, z
    
    def relu(self, z):
        h = np.maximum(z, 0)
        return h, z
    
    def softmax(self, z):
        """
        Using normalization as done here
        https://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
        """
        s = np.max(z, axis=1).reshape(z.shape[0], 1)
        e_x = np.exp(z - s)
        div = np.sum(e_x, axis=1).reshape(z.shape[0], 1)
        h = e_x / div 
        return h, z

    def loss(self, hL, y):
        return (-1 / y.shape[1]) * np.sum(y * np.log(hL))
    
    def initialize_parameters(self, layers_dims, k):
        """
        Returns a dict of initialized parameters
        
        Arguments
        --------
        layers_dims: [X.shape[0], # nodes layer 1, ..., # nodes final layer]
        k: int. # unique values of y.
        
        Returns
        --------
        dict. Final weight matrix is k x nodes in last layer 
        
        """
        np.random.seed(1)
        parameters = {}
        L = len(layers_dims)
        
        if L == 1:
            parameters["W" + str(L)] = np.random.normal(
                0, .01, layers_dims[0] * k).reshape(layers_dims[0],k)
            parameters["b" + str(L)] = np.zeros((k,1))
            return parameters
        else:
            for l in range(1, L):
                parameters["W" + str(l)] = np.random.normal(0, .01, 
                    layers_dims[l]*layers_dims[l-1]).reshape(layers_dims[l],layers_dims[l-1])
                parameters["b" + str(l)] = np.zeros((layers_dims[l], 1))

            dim_l = parameters["W" + str(L-1)].shape[0]
            parameters["W" + str(L)] = np.random.normal(
                0, .01, dim_l * k).reshape(k, dim_l)
            parameters["b" + str(L)] = np.zeros((k, 1))

            return parameters
    
    ## forward methods ##
    def linear_forward(self, h_prev, W, b):
        """
        Computes affine transformation of input previous hidden
        """
        Z = h_prev.dot(W.T) + b.T
        cache = (h_prev, W, b)
        return Z, cache
    
    def linear_activation_forward(self, h_prev, W, b, activation_fun):
        """
        Computes output from activation function
        
        Arguments
        ---------
        activation_fun : str
            "sigmoid", "relu" or "softmax"
        """
        if activation_fun == "sigmoid":
            Z, lin_cache = self.linear_forward(h_prev, W, b)
            h, activation_cache = self.sigmoid(Z)
        
        elif activation_fun == "relu":
            Z, lin_cache = self.linear_forward(h_prev, W, b)
            h, activation_cache = self.relu(Z)
            
        elif activation_fun == "softmax":
            Z, lin_cache = self.linear_forward(h_prev, W, b)
            h, activation_cache = self.softmax(Z)
        
        cache = (lin_cache, activation_cache)
        
        return h, cache
    
    def L_model_forward(self, X, parameters, hidden_layers_acv_fun = "relu"):
        """
        Goes through the network and returns the final output aswell as storing 
        information in the cache along the way for backpropagation
        
        Arguments
        ---------
        parameters : dict
            output from self.initialize_parameters
        hidden_layers_acv_fun : str or [str]
            if str: uses the activation funtcion on all layers
            if [str] : allows for different activation functions
                       over the layers
        """
        h = X
        caches = []
        L = len(parameters) // 2 
        
        if isinstance(hidden_layers_acv_fun, str):
            hidden_layers_acv_fun = [hidden_layers_acv_fun] * (L)
            
        for i in range(1, L):
            h_prev = h
            h, cache = self.linear_activation_forward(
                h_prev, parameters["W" + str(i)], parameters["b" + str(i)],
                activation_fun = hidden_layers_acv_fun[i-1])
            caches.append(cache)
        
        hL, cache = self.linear_activation_forward(
            h, parameters["W" + str(L)], parameters["b" + str(L)],
            activation_fun = "softmax")
        caches.append(cache)
        
        return hL, caches
    
    def compute_cost(self, hL, y):
        """
        Arguments
        ---------
        hL : np.array, output layer
        y : np.array, one-hot encoded classes
        """
        return (-1 / y.shape[0]) * np.sum(y * np.log(hL))
    
    ## Gradients ##
    def sigmoid_grad(self, dh, z):
        """
        computes gradient of the sigmoid function wrt z
        
        Arguments
        ---------
        dh : np.array, post-activation gradient 
        """
        h, _ = self.sigmoid(z)
        dz = dh * h * (1-h)
        
        return dz
    
    def relu_grad(self, dh, z):
        """
        computes gradient of the relu function wrt z
        
        Arguments
        ---------
        dh : np.array, post-activation gradient 
        """
        h, _ = self.relu(z)
        dz = np.multiply(dh, np.int64(h > 0))
        
        return dz
    
    def softmax_grad(self, dh, z):
        """
        Computes gradient of the relu function wrt z
        
        Arguments
        ---------
        dh : np.array, gradient of cost-function
        """
        h,_  = self.softmax(z)
        dz = dh 
    
    ## Back prop ##
    def linear_backward(dZ, cache):
        """
        Computes gradient of output wrt weight, bias and post-act output
        of layer (l-1) at layer (l)
        
        Arguments
        ---------
        dZ : np.array, gradient of cost wrt linear output of current layer
        cache : tuple, values of (h_prev, W, b) from forwards step at current layer
        
        Returns
        ---------
        dh_prev : np.array, gradient of cost wrt activation of previous layer
        dW : np.array, gradient of cost wrt W of current layer
        db : np.array, gradient of cost wrt b of current layer
        """
        h_prev, W, b = cache
        m = h_prev.shape[1]
        
        dW = 1/m * dZ.dot(h_prev.T)
        db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
        dh_prev = np.dot(W.T, dZ)
        
        return dh_prev, dW, db

    def linear_activation_backward(self, dh, cache, activation_fun):
        """
        Arguments
        ---------
        dh: np.array, post-activation gradient for current layer l
        cache: tuple, (lin_cache, activation_cache)
        """
        lin_cache, activation_cache = cache
        
        if activation_fun == "sigmoid":
            dZ = self.sigmoid_gradient(dh, activation_cache)
            dh_prev, dW, db = linear_backward(dZ, lin_cache)
            
        elif activation_fun == "relu":
            dZ = self.relu_gradient(dh, activation_cache)
            dh_prev, dW, db = linear_backward(dZ, lin_cache)
        
        return dh_prev, dW, db
    
    def L_model_backward(hL, y, caches, hidden_layers_activation_fn="relu"):
        """
        Computes gradient of output layer wrt weights etc. 
        
        Arguments
        ---------
        AL: np.array, shape = [# nodes in last layer x k]
        y: np.array, one-hot encoded class labels
        """
        L = len(caches)
        grads = {}
        
        if isinstance(hidden_layers_acv_fun, str):
            hidden_layers_acv_fun = [hidden_layers_acv_fun] * (L)
        
        # compute gradient of the cost function wrt final hidden layer output 
        # http://cs231n.github.io/neural-networks-case-study/
        dhL = (hL - y) / y.shape[0]
        
    
        

In [5]:
X = np.array([1,2,-2,2,3,4,5,6,2,-2,-1,2]).reshape(4,3)
y = np.array([0,1,0,1,0,0,0,0,1,0,1,0]).reshape(4,3)
test = neural_net()
w = test.initialize_parameters([X.shape[1], 2, 3], y.shape[1])
hL, c = test.L_model_forward(X, w, ["relu", "sigmoid"])

In [6]:
c

[((array([[ 1,  2, -2],
          [ 2,  3,  4],
          [ 5,  6,  2],
          [-2, -1,  2]]), array([[ 0.01624345, -0.00611756, -0.00528172],
          [-0.01072969,  0.00865408, -0.02301539]]), array([[0.],
          [0.]])), array([[ 0.01457176,  0.05260924],
         [-0.00699266, -0.08755869],
         [ 0.03394845, -0.04775475],
         [-0.03693278, -0.03322548]])), ((array([[0.01457176, 0.05260924],
          [0.        , 0.        ],
          [0.03394845, 0.        ],
          [0.        , 0.        ]]), array([[ 0.01744812, -0.00761207],
          [ 0.00319039, -0.0024937 ],
          [ 0.01462108, -0.02060141]]), array([[0.],
          [0.],
          [0.]])), array([[-1.46215378e-04, -8.47022474e-05, -8.70769511e-04],
         [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
         [ 5.92336520e-04,  1.08308823e-04,  4.96362957e-04],
         [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])), ((array([[0.49996345, 0.49997882, 0.49978231],
          [0.5    