In [2820]:
import numpy as np

class NN(object):
    def __init__(self, size_layers=[784, 16, 16, 10], datapath=None, modelpath=None):
        self.data = np.load(datapath)
        self.size_layers = size_layers
        
        # The cache would be used to store forward feed activations and preactivations values 
        self.cache = {
            'activations': [],
            'preactivations': []
        }
        
        # Data structure to hold the params of the model
        self.theta = {
            'b': [],
            'w': []
        }
        
        # Data structure to hold the gradients calculated during backprop
        self.grads = {
            'b': [],
            'w': []
        }

In [2821]:
def initialize_weights(self):
    size_next_layers = self.size_layers.copy()
    size_next_layers.pop(0)
    
    for size_layer, size_next_layer in zip(self.size_layers, size_next_layers):
        self.theta['w'].append(np.zeros((size_next_layer, size_layer)))
        self.theta['b'].append(np.zeros((1, size_next_layer)))
        
        self.grads['w'].append(np.zeros((size_next_layer, size_layer)))
        self.grads['b'].append(np.zeros((1, size_next_layer)))

In [2822]:
def forward(self, X):
    '''
    Forward propagation
    params:
        X: N by k matrix
        where
            N: number of samples
            k: dimension of sample
    returns: output is a N by o matrix where o is the output dimension specified
    in mlp initialization
    '''
    n_layers = len(self.size_layers)
    input_layer = X
    
    # Adding input in activations needed for gradient descent
    self.cache['activations'].append(X)
    
    for layer_idx in range(n_layers - 1):
        n_examples = input_layer.shape[0]
        
        # Multiply the input by the weights
        pre_act_layer = np.matmul(input_layer,  self.theta['w'][layer_idx].transpose()) + self.theta['b'][layer_idx]
        # Apply activation function
        output_layer = activation(self, pre_act_layer)
        
        self.cache['preactivations'].append(pre_act_layer)
        self.cache['activations'].append(output_layer)
        
        input_layer = output_layer
    
    return output_layer

In [2823]:
def activation(self, input):
    '''
    Rectified Linear function
    z can be an numpy array or scalar
    '''
    if np.isscalar(input):
        result = np.max((input, 0))
    else:
        zero_aux = np.zeros(input.shape)
        meta_z = np.stack((input , zero_aux), axis = -1)
        result = np.max(meta_z, axis = -1)
    return result

In [2824]:
def activation_prime(self, input):
    input[input<0] = 0
    input[input>=0] = 1

    return input

In [2825]:
def loss(self, predictions, targets):
    '''
    Takes a batch of predictions and gets the cross entropy
    params:
        predictions: N by k matrix
        where
            N: number of samples
            k: dimension of each sample
        targets: N by 1 matrix containing label of each one
        where 
            N: number of samples
    returns: a scalar which is the loss value            
    '''
    N = predictions.shape[0] # batch size
    k = predictions.shape[1] # numb of classes
    
    return -np.sum(np.eye(k)[targets]*np.log(predictions))/N

In [2826]:
def loss_prime(self, predictions, targets):
    N = predictions.shape[0] # batch size
    k = predictions.shape[1] # numb of classes
    
    return np.sum(predictions - np.eye(k)[targets], axis=0, keepdims=1)/N

In [2827]:
def backward(self, Y):
    '''
    Backward propagation
    params:
        Y: N by 1 matrix
        where
            N: number of samples
            column: is the label of the samples
    '''
    N = Y.shape[0]
    n_layers = len(self.size_layers)
    
    delta = loss_prime(self, self.cache['activations'][-1], Y)
    
    self.grads['b'][-1] = delta
    self.grads['w'][-1] = np.dot(delta.transpose(), np.sum(self.cache['activations'][-2], axis=0, keepdims=1)/N)
    
    for l in range(2, n_layers):
        z = self.cache['preactivations'][-l]
        o = activation_prime(self, z)
        
        delta = np.dot(delta, self.theta['w'][-l+1])*(np.sum(o, axis=0, keepdims=1)/N)
        
        self.grads['b'][-l] = delta
        self.grads['w'][-l] = np.dot(delta.transpose(),np.sum(self.cache['activations'][-l-1], axis=0, keepdims=1)/N)

In [2828]:
def update(self, eta):
    n_layers = len(self.size_layers)
    
    for layer_idx in range(n_layers - 1):
        self.theta['w'][layer_idx] = self.theta['w'][layer_idx] - eta*self.grads['w'][layer_idx]
        self.theta['b'][layer_idx] = self.theta['b'][layer_idx] - eta*self.grads['b'][layer_idx]

In [2829]:
def train(self, eta):
    train_set, valid_set, test_set = self.data
    
    X, Y = train_set
    
    forward(mlp, X)
    backward(mlp, Y)
    update(mlp, eta)

In [2830]:
def test(self):
    pass

In [2831]:
datapath = 'data/mnist.pkl.npy'
modelpath = 'model/mlp.pkl.npy'

eta = 0.5
mlp = NN(datapath=datapath, modelpath=modelpath)

initialize_weights(mlp)
train(mlp, eta)

(1, 16) (1, 16)
(1, 16) (1, 16)
(1, 10) (1, 10)


{'b': [array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
  array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
  array([[0.04932, 0.05678, 0.04968, 0.05101, 0.04859, 0.04506, 0.04951,
          0.05175, 0.04842, 0.04988]])],
 'w': [array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0.