In [597]:
import numpy as np

class NN(object):
    def __init__(self, size_layers=[784, 16, 16, 10], datapath=None, modelpath=None):
        self.data = np.load(datapath)
        self.size_layers = size_layers
        
        # The cache would be used to store forward feed activations and preactivations values 
        self.cache = {
            'activations': [],
            'preactivations': []
        }
        
        # Data structure to hold the params of the model
        self.theta = {
            'b': [],
            'w': []
        }
        
        # Data structure to hold the gradients calculated during backprop
        self.grads = {
            'b': [],
            'w': []
        }

In [598]:
def initialize_weights(self):
    size_next_layers = self.size_layers.copy()
    size_next_layers.pop(0)
    
    for size_layer, size_next_layer in zip(self.size_layers, size_next_layers):
        self.theta['w'].append(np.zeros((size_next_layer, size_layer)))
        self.theta['b'].append(np.zeros((size_next_layer, 1)))
        
        self.grads['w'].append(np.zeros((size_next_layer, size_layer)))
        self.grads['b'].append(np.zeros((size_next_layer, 1)))

In [599]:
def forward(self, X):
    '''
    Forward propagation
    params:
        X: N by k matrix
        where
            N: number of samples
            k: dimension of sample
    returns: output is a N by o matrix where o is the output dimension specified
    in mlp initialization
    '''
    n_layers = len(self.size_layers)
    input_layer = X
    
    # Adding input in activations needed for gradient descent
    self.cache['activations'].append(X)
    
    for layer_idx in range(n_layers - 1):
        n_examples = input_layer.shape[0]
        
        # Multiply the input by the weights
        pre_act_layer = np.matmul(input_layer,  self.theta['w'][layer_idx].transpose()) + self.theta['b'][layer_idx].transpose()
        # Apply activation function
        output_layer = activation(self, pre_act_layer)
        
        self.cache['preactivations'].append(pre_act_layer)
        self.cache['activations'].append(output_layer)
        
        input_layer = output_layer
    
    return output_layer

In [600]:
def activation(self, input):
    '''
    Rectified Linear function
    z can be an numpy array or scalar
    '''
    if np.isscalar(input):
        result = np.max((input, 0))
    else:
        zero_aux = np.zeros(input.shape)
        meta_z = np.stack((input , zero_aux), axis = -1)
        result = np.max(meta_z, axis = -1)
    return result

In [601]:
def activation_prime(self, input):
    input[input<=0] = 0
    input[input>0] = 1
    return input

In [602]:
def loss(self, predictions, targets):
    '''
    Takes a batch of predictions and gets the cross entropy
    params:
        predictions: N by k matrix
        where
            N: number of samples
            k: dimension of each sample
        targets: N by 1 matrix containing label of each one
        where 
            N: number of samples
    returns: a scalar which is the loss value            
    '''
    N = predictions.shape[0] # batch size
    k = predictions.shape[1] # numb of classes
    
    return -np.sum(np.eye(k)[targets]*np.log(predictions))/N

In [603]:
def loss_prime(self, predictions, targets):
    '''
    Derivative of the cross entropy
    params:
        predictions: N by k matrix
        where
            N: number of samples
            k: dimension of each sample
        targets: N by 1 matrix containing label of each one
        where 
            N: number of samples
    returns: a scalar which is the loss value            
    '''
    N = predictions.shape[0] # batch size
    k = predictions.shape[1] # numb of classes
    epsilon = 0.000001 # needed when weights are all zeros
    
    return -np.sum(1/(epsilon + np.eye(k)[targets]*predictions))/N

In [604]:
def softmax(self, input):
    '''
    params:
        input: N by k matrix
        where
            N: number of samples
            k: dimension of each sample
    returns: a N by k matrix representing the probability distribution 
        of each sample
    '''
    e_x = np.exp(input.T - np.max(input))
    return (e_x / e_x.sum(axis=0)).T

In [605]:
def softmax_prime(self, input):
    return np.diag(softmax(self, input)- np.outer(softmax(self, input), softmax(self, input)))

In [606]:
def backward(self, Y):
    N = Y.shape[0]
    n_layers = len(self.size_layers)
    
    delta = np.sum(loss_prime(self, self.cache['activations'][-1], Y)*activation_prime(self, self.cache['preactivations'][-1]))/N

    self.grads['b'][-1] = delta
    self.grads['w'][-1] = np.sum(delta*self.cache['activations'][-2])/N
    
    for l in range(2, n_layers):
        z = self.cache['preactivations'][-l]
        o = activation_prime(self, z)
        
        delta = np.sum(np.matmul(o, self.theta['w'][-l+1].transpose() * delta))
        
        self.grads['b'][l] = delta
        self.grads['w'][l] = delta * self.cache['activations'][-l-1]

In [607]:
def update(self,grads):
    pass

In [608]:
def train(self):
    train_set, valid_set, test_set = self.data
    
    X, Y = train_set
    
    forward(mlp, X)
    backward(mlp, Y)
    
    mlp.grads

In [609]:
def test(self):
    pass

In [610]:
datapath = 'data/mnist.pkl.npy'
modelpath = 'model/mlp.pkl.npy'

mlp = NN(datapath=datapath, modelpath=modelpath)

initialize_weights(mlp)
train(mlp)

IndexError: list assignment index out of range