In [3]:
# Neural Networks Demystified [Part 4: Backpropagation]
import numpy as np

In [4]:
# X = (hours sleeping, hours studying)
X = np.array(([3,5], [5,1], [10,2]), dtype=float)
# y = Score on test
y = np.array(([75], [82], [93]), dtype=float)

# Normalize
X = X/np.amax(X, axis=0)
y = y/100                              # Max test score is 100

In [5]:
class Neural_Network(object):
    
    def __init__(self):        
        #Define Hyperparameters
        
        self.inputLayerSize = 2        # n0 - input dimension OR #weights for each 
                                       #      neuron in the first hidden layer OR 
                                       #      #neurons in input layer
        self.hiddenLayerSize = 3       # n1 - #neurons in hidden layer
        self.outputLayerSize = 1       # n2 - #neurons in output layer or output dimension
        
        #Weights (parameters)
        self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
        self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
        
        
    def forward(self, X):
        # Propagate inputs though network
        
        ### Layer 1 (Input Layer)
        # X[m x n0]                    # m - number of samples
        
        
        ### Layer 2 (1st hidden layer)
        self.z2 = np.dot(X, self.W1)           
        # z2[m x n1] = X[m x n0] * W1[n0 x n1]        Eqn(1)
        
        self.a2 = self.sigmoid(self.z2)        
        # a2[m x n1]                                  Eqn(2)
        
        
        ### Layer 3 (Output Layer)
        self.z3 = np.dot(self.a2, self.W2)     
        # z3[m x n2] = a2[m x n1] * W2[n1 x n2]       Eqn(3)
        
        yHat = self.sigmoid(self.z3)           
        # yHat[m x n2]                                Eqn(4)
        
        return yHat
        
        
    def sigmoid(self, z):
        # Apply sigmoid activation function to scalar, vector, or matrix
        return 1 / ( 1 + np.exp(-z) )

    
    def sigmoidPrime(self,z):
        # Gradient of sigmoid
        return np.exp(-z) / ( ( 1 + np.exp(-z) )**2 )
    
    
    def costFunction(self, X, y):
        # Compute cost for given X,y, use weights already stored in class.
        self.yHat = self.forward(X)
        J = 0.5 * sum( ( y - self.yHat )**2 )
        return J
        
        
    def costFunctionPrime(self, X, y):
        # Compute derivative with respect to W and W2 for a given X and y:
        self.yHat = self.forward(X)
        # yHat[m x 1]
        
        delta3 = np.multiply(-(y-self.yHat), self.sigmoidPrime(self.z3))
        # delta3[mx1] = [m x 1] * [m x 1]  element-wise multiplication
        dJdW2 = np.dot(self.a2.T, delta3)
        # dJdW2[n1x1] = [n1 x m] . [mx1]   matrix multiplication  
        
        delta2 = np.dot(delta3, self.W2.T) * self.sigmoidPrime(self.z2)
        dJdW1 = np.dot(X.T, delta2)  
        
        return dJdW1, dJdW2
    
    
    ## Helper Functions for interacting with other classes:
    def getParams(self):
        ## Get W1 and W2 unrolled into vector:
        params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
        return params
    
    
    def setParams(self, params):
        ## Set W1 and W2 using single params vector.
        W1_start = 0
        W1_end = self.hiddenLayerSize * self.inputLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize , self.hiddenLayerSize))
        
        W2_end = W1_end + self.hiddenLayerSize * self.outputLayerSize
        self.W2 = np.reshape(params[W1_end:W2_end], (self.hiddenLayerSize, self.outputLayerSize))
        
        
    def computeGradients(self, X, y):
        dJdW1, dJdW2 = self.costFunctionPrime(X, y)
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))
    


def computeNumericalGradient(N, X, y):
        paramsInitial = N.getParams()
        numgrad = np.zeros(paramsInitial.shape)
        perturb = np.zeros(paramsInitial.shape)
        e = 1e-4

        for p in range(len(paramsInitial)):
            # Set perturbation vector. Changes only a single
            # parameter at a time. 
            perturb[p] = e
            
            N.setParams(paramsInitial + perturb)
            loss2 = N.costFunction(X, y)
            
            N.setParams(paramsInitial - perturb)
            loss1 = N.costFunction(X, y)

            # Compute Numerical Gradient
            numgrad[p] = (loss2 - loss1) / (2*e)

            # Return the value we changed to zero:
            perturb[p] = 0
            
        # Return Params to original value:
        N.setParams(paramsInitial)

        return numgrad 
        

In [6]:
NN = Neural_Network()
cost1 = NN.costFunction(X,y)
dJdW1, dJdW2 = NN.costFunctionPrime(X,y)

In [7]:
## Direction for W1
dJdW1

array([[-0.01551253,  0.01012455,  0.10583073],
       [-0.01129524,  0.007365  ,  0.05792022]])

In [8]:
## Direction for W2
dJdW2

array([[-0.153777  ],
       [-0.18518784],
       [-0.13102407]])

In [14]:
## Going up hill
scalar = 3
NN.W1 = NN.W1 + scalar*dJdW1
NN.W2 = NN.W2 + scalar*dJdW2
cost2 = NN.costFunction(X,y)
print("Initial cost : ", cost1, ". Cost after taking a step up hill : ", cost2)

Initial cost :  [0.45669923] . Cost after taking a step up hill :  [0.70267469]


In [15]:
## Going down hill - This is what gets executed inside the training loop for 'n' epochs
dJdW1, dJdW2 = NN.costFunctionPrime(X,y)
NN.W1 = NN.W1 - scalar*dJdW1
NN.W2 = NN.W2 - scalar*dJdW2
cost3 = NN.costFunction(X, y)
print("Initial cost : ", cost1, ". Cost after taking a step down hill : ", cost3)

Initial cost :  [0.45669923] . Cost after taking a step down hill :  [0.5160258]


In [13]:
## Checking gradient numerically
num_grad = computeNumericalGradient(NN, X, y)
grad = NN.computeGradients(X, y)
print(num_grad)
print(grad)

[-0.00886673  0.01497642  0.10311054 -0.00679799  0.01159261  0.05904691
 -0.14385292 -0.17673359 -0.12768571]
[-0.00886673  0.01497642  0.10311054 -0.00679799  0.01159261  0.05904691
 -0.14385292 -0.17673359 -0.12768571]
