## Homework 5
1. Implement feed-forward
2. Hand-derived backpropagation for small neural network
3. General-case backpropagation

In [1]:
%matplotlib inline
# standard library
import itertools

# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
import sklearn.datasets

# this styling is purely my preference
# less chartjunk
# Apply the theme first: sns.set() also resets the plotting context to its
# defaults, so calling it after set_context() would throw away the font
# scale and line width requested below.
sns.set(style='ticks', palette='Set2')
# 'lines.linewidth' (plural) is the matplotlib rc parameter for line width
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

In [2]:
def feed_forward(X, betas, activ_func, out_activ_func=None):
    """Feeds the inputs through the network and returns every layer's
    preactivation and activation.

    X is the input, one sample per row; a 1-D input is treated as a
    single sample.
    betas is a sequence of weight matrices, one per layer. Each matrix
    has one row per unit of that layer and one column per unit of the
    previous layer, plus a leading column for the bias weight.
    activ_func is applied to every layer except the last one.
    out_activ_func is applied to the last layer only; if it is not
    specified, activ_func is used there as well.

    Returns (preactivs, activs): preactivs holds one matrix per layer,
    activs starts with the (2-D) input followed by one matrix per layer.
    """
    if out_activ_func is None:
        out_activ_func = activ_func  # same activation for the last layer
    X = np.asarray(X)  # also accept nested lists
    if X.ndim < 2:
        X = X[np.newaxis, :]  # make it two dim

    # index of the output layer; only this one gets out_activ_func
    last_layer = len(betas) - 1

    # the input counts as the first "activation"
    preactivs, activs = [], [X]
    for layer, beta in enumerate(betas):
        # take the previous activation and pad it with a column of ones
        # so that the first column of beta acts as the bias
        prev_activ = activs[-1]
        padded = np.hstack((np.ones((prev_activ.shape[0], 1)), prev_activ))
        preactivs.append(padded.dot(beta.T))

        # hidden layers use activ_func, the final layer out_activ_func
        layer_func = activ_func if layer < last_layer else out_activ_func
        activs.append(layer_func(preactivs[-1]))

    # return the preactivations and activations
    return preactivs, activs
    
    
def sigm(x):
    """Applies the logistic sigmoid 1 / (1 + exp(-x)) elementwise"""
    denominator = 1. + np.exp(-x)
    return 1. / denominator

In [3]:
# test out a 4, 3, 1 network: 4 inputs, 3 hidden units, 1 output
# (each row of a beta matrix holds one unit's weights, bias weight first)
X = np.array([[1, 1, 0, 1],
              [0, 1, 1, 3]])
betas = [np.array([[0.2, -1, -1, 0.3, -0.5],
                   [0.1, -1, 0, 0.4, 1],
                   [0, 0, 0, .5, 3]]),
         np.array([[-0.3, 0.2, 0.5, 1]])]

feed_forward(X, betas, sigm)

([array([[-2.3,  0.1,  3. ],
         [-2. ,  3.5,  9.5]]), array([[ 0.93328831],
         [ 1.20910962]])], [array([[1, 1, 0, 1],
         [0, 1, 1, 3]]), array([[ 0.09112296,  0.52497919,  0.95257413],
         [ 0.11920292,  0.97068777,  0.99992515]]), array([[ 0.71774194],
         [ 0.77014137]])])

In [10]:
def back_prop(expected, preactivs, activs, betas, activ_grad_func, out_grad_func=None):
    """Computes the gradient of the loss with respect to every weight
    matrix of the MLP and returns them as a list, ordered like betas.

    expected holds the target values, preactivs and activs are the lists
    produced by the forward pass, betas are the weight matrices (bias
    weight in the first column) and activ_grad_func is the derivative of
    the hidden-layer activation, evaluated on preactivations.
    Everything must be passed in matrix (2-D) form!

    The output-layer error signal is taken to be (prediction - expected),
    so out_grad_func is accepted but never used in the computation.
    The gradients are averaged over the rows of expected.
    """
    # error signal of the output nodes
    # (same for squared error and cross-entropy)
    deltas = [activs[-1] - expected]

    # walk the hidden layers from last to first; each one is paired with
    # the weights of the layer that follows it (the input layer is skipped)
    hidden_preactivs = reversed(preactivs[:-1])
    downstream_betas = reversed(betas[1:])
    for hidden_preactiv, next_beta in zip(hidden_preactivs, downstream_betas):
        slope = activ_grad_func(hidden_preactiv)
        # drop the bias column: the bias unit receives no error signal
        deltas.insert(0, slope * deltas[0].dot(next_beta[:, 1:]))

    # turn each layer's error signal into a weight gradient by combining
    # it with the bias-padded activation that fed into that layer
    return [
        delta.T.dot(np.hstack((np.ones((layer_input.shape[0], 1)), layer_input)))
        / expected.shape[0]
        for delta, layer_input in zip(deltas, activs[:-1])
    ]


def sigm_grad(x):
    """Returns the gradient of the sigmoid function evaluated at x"""
    activation = sigm(x)
    return activation * (1 - activation)

In [11]:
# hand-checkable case: one input, one hidden unit, one output,
# identity activation everywhere (so its gradient is all ones)
X = np.array([[0.4]])
betas =  [np.array([[0.25, 0.5]]), np.array([[0.75, 0.60]])]
preactivs, activs = feed_forward(X, betas, lambda x: x)

# the result should be the gradients [[0.192, 0.0768]] and [[0.32, 0.144]]:
# output error 1.02 - 0.7 = 0.32, hidden error 0.32 * 0.6 = 0.192
back_prop(np.array([[0.7]]), preactivs, activs, betas, lambda x: np.ones(x.shape)), activs

([array([[ 0.192 ,  0.0768]]), array([[ 0.32 ,  0.144]])],
 [array([[ 0.4]]), array([[ 0.45]]), array([[ 1.02]])])

In [27]:
class MLP(object):
    """Class for a multilayer neural net.

    layer_sizes is a sequence with the number of units of each hidden
    layer; the input and output sizes are taken from the training data.
    activ_func / activ_func_grad are the hidden-layer activation and its
    derivative. out_activ_func / out_activ_func_grad are the output-layer
    counterparts and are handed unchanged (possibly as None) to
    feed_forward and back_prop.
    """
    def __init__(self, layer_sizes, activ_func, activ_func_grad, out_activ_func=None, out_activ_func_grad=None):
        self.layer_sizes = layer_sizes
        self.activ_func = activ_func
        self.out_activ_func = out_activ_func
        self.activ_func_grad = activ_func_grad
        self.out_activ_func_grad = out_activ_func_grad

    def init_weights(self, X, Y, minibatch_size=1):
        """Draws fresh random weights for every layer and stores them in
        self.betas. X and Y must be 2-D; only their column counts are used.
        minibatch_size is accepted but currently unused."""
        # list() so that layer_sizes may be any sequence (e.g. a tuple),
        # not only a list
        nodes_per_layer = [X.shape[1]] + list(self.layer_sizes) + [Y.shape[1]]
        # one matrix per layer: a row per unit, a column per incoming unit
        # plus one extra column for the bias; small values around zero
        self.betas = [
            np.random.normal(size=(out_layer, in_layer + 1), scale=.001)
            for in_layer, out_layer in zip(nodes_per_layer[:-1], nodes_per_layer[1:])
        ]

    def predict(self, X):
        """Returns the predicted values for the given input, i.e. the
        activation of the last layer. Requires the weights to exist
        (train or init_weights must have been called)."""
        preactivs, activs = feed_forward(X, self.betas, self.activ_func, self.out_activ_func)
        return activs[-1]

    def get_grads(self, X, Y):
        """Runs a forward pass on X and returns the gradients of the
        current weights with respect to the targets Y, one per layer."""
        preactivs, activs = feed_forward(X, self.betas, self.activ_func, self.out_activ_func)
        return back_prop(Y, preactivs, activs, self.betas, self.activ_func_grad, self.out_activ_func_grad)

    def train(self, X, Y, epochs=100, alpha=0.001):
        """Re-initializes the weights and runs gradient descent with step
        size alpha.

        Note that every "epoch" is a single update computed from a single
        sample: update i uses row i % len(X) of X and Y, so the rows are
        visited in order, wrapping around. A full pass over the data
        therefore takes len(X) epochs.
        """
        self.init_weights(X, Y)
        n_samples = X.shape[0]
        for i in range(epochs):
            pos = i % n_samples
            # slices (not indices) keep the one-sample batch two dimensional
            X_batch, Y_batch = X[pos: pos+1], Y[pos: pos+1]
            grads = self.get_grads(X_batch, Y_batch)
            self.betas = [beta - alpha * grad for beta, grad in zip(self.betas, grads)]  # gradient descent
    

In [28]:
# alternative datasets, left commented out for experimentation:
# an XOR-style binary problem, the sklearn diabetes regression set
# (with Y reshaped to a column), and a small integer-valued regression set
# X = np.array([[1, 0], [0, 1], [0, 0], [1, 1], [1, 0], [0, 1], [0, 0], [1, 1]])
# Y = np.array([[1], [1], [0], [0], [1], [1], [0], [0]])
# X, Y = sklearn.datasets.load_diabetes(return_X_y=True)
# Y = Y[:, np.newaxis]
# X = np.array([[1, 0], [0, 1], [0, 0], [1, 1], [1, 0], [0, 1], [0, 0], [1, 1]])
# Y = np.array([[3], [4], [1], [6], [3], [4], [1], [6]])

# the data actually used: two samples with two features and one target each
X = np.array([[0.4, 0.2], [0.5, 0.3]])
Y = np.array([[0.7], [0.9]])

# two hidden layers of one unit each, identity activation
# (whose gradient is all ones)
mlp = MLP([1,1], lambda x: x, lambda x: np.ones(x.shape))
mlp.train(X, Y, epochs=1000, alpha=0.03)
# show the learned weights together with the predictions on the training data
mlp.betas, mlp.predict(X)

([array([[ -1.16512147e-03,  -6.34220542e-04,   9.02079215e-05]]),
  array([[-0.0055869 , -0.00045346]]),
  array([[ 0.80149321, -0.0053202 ]])],
 array([[ 0.80152293],
        [ 0.80152293]]))