## Repurposing Multi-Layer Perceptron class to include backpropagation 

#### Changes include saving the activations and derivatives to compute backpropagation. We are now:
1. Implementing backpropagation
2. Implementing gradient descent
3. Implementing a higher-level training method that uses backpropagation and gradient descent.
4. Training the artificial neural net with a dummy dataset
5. Making some predictions

In [2]:
import numpy as np
from random import random

class MLP:
    """Multi-Layer Perceptron: a fully connected network with sigmoid activations.

    The network stores, per layer, the most recent activations and, per weight
    matrix, the most recent derivatives so that backpropagation and gradient
    descent can be run after a forward pass.

    Attributes:
        num_inputs: Number of input neurons.
        num_hidden: List with the number of neurons of each hidden layer.
        num_outputs: Number of output neurons.
        weights: List of 2-D arrays; ``weights[i]`` has shape
            ``(layers[i], layers[i + 1])`` and connects layer ``i`` to ``i + 1``.
        activations: List of arrays; ``activations[i]`` holds the latest
            activations of layer ``i`` (layer 0 is the input itself).
        derivatives: List of 2-D arrays shaped like ``weights``; filled in by
            ``back_propagate``.
    """

    def __init__(self, num_inputs=3, num_hidden=None, num_outputs=2):
        """Build the network and initialise weights, activations and derivatives.

        Args:
            num_inputs: Number of input neurons.
            num_hidden: Sequence (list or tuple) with the size of each hidden
                layer. Defaults to two hidden layers of 3 neurons each.
            num_outputs: Number of output neurons.
        """
        # A None sentinel (instead of a list literal in the signature) keeps the
        # default from being one shared object across all instances.
        if num_hidden is None:
            num_hidden = [3, 3]

        self.num_inputs = num_inputs
        # Copy into a fresh list: accepts tuples and decouples the instance from
        # the caller's object.
        self.num_hidden = list(num_hidden)
        self.num_outputs = num_outputs

        # Neuron count per layer, from the input layer (index 0) to the output.
        layers = [num_inputs] + self.num_hidden + [num_outputs]
        # Consecutive (current layer size, next layer size) pairs: one per weight matrix.
        layer_pairs = list(zip(layers, layers[1:]))

        # Random weights in [0, 1): rows index neurons of the current layer,
        # columns index neurons of the next layer.
        self.weights = [np.random.rand(n_in, n_out) for n_in, n_out in layer_pairs]

        # Placeholder activations: one 1-D array of zeros per layer.
        self.activations = [np.zeros(n) for n in layers]

        # Placeholder derivatives: one zero matrix per weight matrix, since the
        # derivatives are taken with respect to the weights.
        self.derivatives = [np.zeros((n_in, n_out)) for n_in, n_out in layer_pairs]

    def forward_propagate(self, inputs):
        """Compute the network output for ``inputs`` and store every layer's activations.

        Args:
            inputs: Array-like of shape ``(num_inputs,)`` for a single sample or
                ``(batch, num_inputs)`` for several samples at once.

        Returns:
            ndarray with the activations of the output layer.
        """
        # Convert once so that lists work too; back_propagate needs ndarrays.
        activations = np.asarray(inputs)

        # The activations of the first layer are the inputs themselves.
        self.activations[0] = activations

        for i, w in enumerate(self.weights):
            # Net input of layer i+1: h_[i+1] = a_i . W_i
            net_inputs = np.dot(activations, w)

            # a_[i+1] = s(h_[i+1]); stored at i+1 because it belongs to the next layer.
            activations = self._sigmoid(net_inputs)
            self.activations[i + 1] = activations

        return activations

    def back_propagate(self, error, verbose=False):
        """Propagate ``error`` backwards and fill ``self.derivatives``.

        Must be called after ``forward_propagate`` so that the stored
        activations correspond to the sample(s) the error was computed for.

        With ``error = target - output`` the stored matrices are
        ``a_i^T . (error * s'(h_[i+1]))``, i.e. the *negative* gradient of
        ``0.5 * sum((target - output) ** 2)`` with respect to ``W_i``. For a
        batch (2-D activations) the per-sample matrices are summed.

        Args:
            error: Array-like with the error at the output layer; same shape as
                the output of the preceding forward pass.
            verbose: When truthy, print the derivatives of every weight matrix.

        Returns:
            ndarray with the error propagated back to the input layer.
        """
        # s'(h_[i+1]) = s(h_[i+1]) * (1 - s(h_[i+1]))   and   s(h_[i+1]) = a_[i+1]
        error = np.asarray(error)

        # Walk the weight matrices from the output side back to the input side.
        for i in reversed(range(len(self.derivatives))):
            # Activations of the layer on the right-hand side of W_i.
            activations = self.activations[i + 1]
            delta = error * self._sigmoid_derivative(activations)

            # Activations of the layer on the left-hand side of W_i.
            current_activations = self.activations[i]

            # atleast_2d turns a 1-D vector into a single-row matrix, so for one
            # sample this is the outer product (column of a_i) . (row of delta);
            # for a batch it sums that outer product over the samples.
            self.derivatives[i] = np.dot(
                np.atleast_2d(current_activations).T, np.atleast_2d(delta)
            )

            # Error seen by the previous layer, used in the next iteration.
            error = np.dot(delta, self.weights[i].T)

            if verbose:
                print("Derivatives for W{}: {}".format(i, self.derivatives[i]))

        return error

    def gradient_descent(self, learning_rate):
        """Update every weight matrix in place using the stored derivatives.

        The derivatives are added (not subtracted) because ``back_propagate``
        stores the negative gradient when given ``target - output``.

        Args:
            learning_rate: Step size applied to the derivatives.
        """
        for weights, derivatives in zip(self.weights, self.derivatives):
            # In-place update so that self.weights keeps referring to the same arrays.
            weights += derivatives * learning_rate

    def train(self, inputs, targets, epochs, learning_rate):
        """Train the network sample by sample and print the mean error per epoch.

        One epoch is one full pass over all ``(input, target)`` pairs. For each
        pair the method runs a forward pass, backpropagation and one gradient
        descent step.

        Args:
            inputs: Iterable of input samples.
            targets: Iterable of target values, paired with ``inputs`` by position.
            epochs: Number of passes over the dataset.
            learning_rate: Step size passed to ``gradient_descent``.

        Raises:
            ValueError: If an epoch processes no ``(input, target)`` pair.
        """
        for i in range(epochs):
            sum_error = 0
            num_samples = 0

            for sample, target in zip(inputs, targets):
                output = self.forward_propagate(sample)

                error = target - output

                self.back_propagate(error)
                self.gradient_descent(learning_rate)

                # Error is measured on the output computed before the weight update.
                sum_error += self._mse(target, output)
                num_samples += 1

            if num_samples == 0:
                raise ValueError("train() needs at least one (input, target) pair")

            # Average over the pairs actually processed (zip stops at the shorter
            # of inputs/targets).
            print("Error: {} at epoch {}".format(sum_error / num_samples, i))

    def _mse(self, target, output):
        """Return the mean squared error between ``target`` and ``output``."""
        return np.average((target - output) ** 2)

    def _sigmoid(self, x):
        """Sigmoid activation function, applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x):
        """Sigmoid derivative expressed in terms of the sigmoid's output.

        ``x`` must already be a sigmoid activation ``s(h)``, not the net input
        ``h``: the result is ``s(h) * (1 - s(h))``.
        """
        return x * (1.0 - x)

In [3]:
if __name__ == "__main__":
    # Demo: train a small network on a dummy dataset to see whether it
    # "learns" addition without being told the rule, then make one prediction.

    # Multi-layer perceptron with 2 inputs, one hidden layer of 5 neurons and 1 output.
    mlp = MLP(2, [5], 1)

    # 1000 samples of two numbers, each drawn from [0, 0.5) so that their sum
    # stays below 1 (the sigmoid output cannot exceed 1).
    # Array example: ([[0.1, 0.2], [0.3, 0.4]])
    inputs = np.array([[random() / 2 for _ in range(2)] for _ in range(1000)])
    # Each target is the sum of the two numbers of its sample.
    # Array example: ([[0.3], [0.7]])
    targets = np.array([[i[0] + i[1]] for i in inputs])

    # Forward propagation, error computation, backpropagation and gradient
    # descent are all performed inside train(): 50 epochs, learning rate 0.1.
    mlp.train(inputs, targets, 50, 0.1)

    # Use the trained model on one sample; the exact sum would be 0.4.
    # Named `sample` rather than `input` so the builtin input() is not shadowed.
    sample = np.array([0.3, 0.1])

    # Pass the sample through the network to make a prediction.
    output = mlp.forward_propagate(sample)
    print()

    # Print the prediction.
    print("Our network believes that {} + {} = {}".format(sample[0], sample[1], output))
    

Error: 0.04699608109464281 at epoch 0
Error: 0.04001267241563012 at epoch 1
Error: 0.0396049213346582 at epoch 2
Error: 0.039122848677907905 at epoch 3
Error: 0.0385319468204186 at epoch 4
Error: 0.0377935128973421 at epoch 5
Error: 0.03686465107541399 at epoch 6
Error: 0.0356996732195892 at epoch 7
Error: 0.03425378104330062 at epoch 8
Error: 0.03248987346396587 at epoch 9
Error: 0.03038878859104696 at epoch 10
Error: 0.027961703578979452 at epoch 11
Error: 0.025260742501149774 at epoch 12
Error: 0.0223814864062635 at epoch 13
Error: 0.019452034389078204 at epoch 14
Error: 0.016609403453979522 at epoch 15
Error: 0.013972092834168056 at epoch 16
Error: 0.011620460718326689 at epoch 17
Error: 0.009591251408103702 at epoch 18
Error: 0.007884307341589809 at epoch 19
Error: 0.006474938341101697 at epoch 20
Error: 0.005326050353008506 at epoch 21
Error: 0.004397101255625077 at epoch 22
Error: 0.003649472023426722 at epoch 23
Error: 0.0030490447787365595 at epoch 24
Error: 0.0025670048634835