##  Neural  Networks

### 1.1 Load Data

In [1]:
#Is the same data set as exercise 3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
from PIL import Image
%matplotlib inline
data = loadmat('/Users/natalychacon/Documents/Machine Learning/std coursera/machine-learning-ex/ex3/ex3data1.mat') 
X = data["X"]
y = data["y"]
X.shape, y.shape

((5000, 400), (5000, 1))

In [2]:
weight = loadmat('/Users/natalychacon/Documents/Machine Learning/std coursera/machine-learning-ex/ex4/ex4weights.mat')
Theta1 = weight["Theta1"]
Theta2 = weight["Theta2"]
Theta1.shape, Theta2.shape

((25, 401), (10, 26))

### 1.2 Feedforward and cost function

Implement the cost function and gradient for the neural network without regularization.

In [3]:
input_layer_size = 400 # 20*20 Input of Images of Digits
hidden_layer_size = 25 # 25 hidden units
num_labels = 10  # k = 10 (1-10) is the number of possible labels
learning_rate = 0 # lambda or regularization parameter(we set this to 0 here)
# unroll parameters
nn_params = np.concatenate(((Theta1).ravel(),(Theta2).ravel()))
nn_params.shape

(10285,)

In [4]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z)) 

In [5]:
def nncost(nnparams,input_layer_size,hidden_ayer_size,num_labels,X,y,learning_rate):
    m = np.size(y,0)
    X = np.matrix(X)
    y = np.matrix(y)
    
    #reshape nn_params back into the parameters Theta1 and Theta2
    Theta1 = np.reshape(nn_params[0:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size,(input_layer_size+1)))
    Theta1 = np.matrix(Theta1)
    Theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):],((num_labels),(hidden_layer_size+1)))
    Theta2 = np.matrix(Theta2)
    
    #The original labels in y were 1,2...10, for the purpose of the neural network, we need to recode the labels 
    #as vectors
    #containing only values 0 or 1.
    I = np.identity(num_labels)
    Y = np.zeros((m,10),dtype=int)
    for i in range(m):
        Y[i,:]=I[(y[i]-1),:]
        
    #calculate activation for each layer
    a1 = np.hstack((np.ones((m,1)),X))
    a1 = np.matrix(a1)
    z2 = a1 * Theta1.T
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones((m,1)),a2))
    z3 = a2 * Theta2.T
    a3 = sigmoid(z3)
    
    #cost 
    cost = np.sum(np.multiply(-Y,np.log(a3))-np.multiply((1-Y),np.log(1-a3)))
    J= 1/m*cost
    return J

In [6]:
cost1 = nncost(nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,learning_rate)
print('Cost without regularization: \n', cost1)

Cost without regularization: 
 0.2876291651613189


### 1.3 Regularized cost function

In [7]:
def nncost_reg(nnparams,input_layer_size,hidden_ayer_size,num_labels,X,y,learning_rate):
    m = np.size(y,0)
    Theta1 = np.reshape(nn_params[0:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size,(input_layer_size+1)))
    Theta1 = np.matrix(Theta1)
    Theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):],((num_labels),(hidden_layer_size+1)))
    Theta2 = np.matrix(Theta2)
    
    J = nncost(nnparams,input_layer_size,hidden_ayer_size,num_labels,X,y,learning_rate)
    Theta1_nobias = Theta1[:,1:]
    Theta2_nobias = Theta2[:,1:]
    reg = (learning_rate/(2*m)) * (np.sum(np. power(Theta1_nobias,2)) + np.sum(np. power(Theta2_nobias,2)))
    
    return J+reg

In [8]:
learning_rate = 1
costreg1 = nncost_reg(nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,learning_rate)
print('Regularized cost: \n', costreg1)

Regularized cost: 
 0.38376985909092365


## Backpropagation

### 2.1 Sigmoid Gradient

In [9]:
def sigmoidGradient(z):
    sig = sigmoid(z)
    return np.multiply(sig,(1-sig))

In [10]:
sigmoidGradient(0)

0.25

### 2.2 Random Initialitation

When training neural networks it is important to  randomly initialize the parameters for symmetry breaking.

In [11]:
#L_in = layer in
#L_out = layer out
def randInitializeWeights(L_in, L_out):
    epsilon_init = 0.12
    return np.random.random((L_out,1+L_in))*2*epsilon_init - epsilon_init

In [12]:
initial_Tetha1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)

### 2.3 Backpropagation

In [13]:
def backprop(nnparams,input_layer_size,hidden_ayer_size,num_labels,X,y,learning_rate):
    J = nncost_reg(nnparams,input_layer_size,hidden_ayer_size,num_labels,X,y,learning_rate)
    m = np.size(y,0)
    X = np.matrix(X)
    y = np.matrix(y)
   
    Theta1 = np.reshape(nn_params[0:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size,(input_layer_size+1)))
    Theta1 = np.matrix(Theta1)
    Theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):],((num_labels),(hidden_layer_size+1)))
    Theta2 = np.matrix(Theta2)
    
    I = np.identity(num_labels)
    Y = np.zeros((m,10),dtype=int)
    for i in range(m):
        Y[i,:]=I[(y[i]-1),:]
    
    delta1 = 0
    delta2 = 0
    Theta1_nobias = Theta1[:,1:]
    Theta2_nobias = Theta2[:,1:]
    
    for i in range(m):
        a1 = np.vstack((np.array([1]),np.matrix(X[i]).T))
        z2 = Theta1*a1
        a2 = np.vstack(((np.array(1)),sigmoid(z2)))
        z3 = Theta2*a2
        a3 = sigmoid(z3)
        
        
        d3 = a3-np.matrix(Y[0]).T
        d2 = np.multiply(Theta2_nobias.T*d3,sigmoidGradient(z2))
        
        delta1 = delta1 + d2*(a1.T)
        delta2 = delta2 + d3*(a2.T)
        
        reg_grad1 = np.multiply ((learning_rate/m),Theta1)
        reg_grad1[:,0]=0
        Theta1_grad = (1/m)*delta1+reg_grad1
        
        reg_grad2 = np.multiply ((learning_rate/m),Theta2)
        reg_grad2[:,0]=0
        Theta2_grad = (1/m)*delta2+reg_grad2
        
        
        Theta1_grad = Theta1_grad.ravel(order = "F")
        Theta2_grad = Theta2_grad.ravel(order = "F")
        grad = np.vstack((Theta1_grad.T,Theta2_grad.T))
    
    return J,grad
    

In [14]:
learning_rate = 1
nnparams = np.concatenate(((initial_Tetha1).ravel(),(initial_Theta2).ravel()))
J, grad = backprop(nnparams,input_layer_size,hidden_layer_size,num_labels,X,y,learning_rate)
print(J,grad.shape)

0.38376985909092365 (10285, 1)
