# Exercise 4 - Neural Networks Learning

In [22]:
%matplotlib inline
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.optimize import fmin_cg
import scipy.io as si
plt.rcParams['axes.formatter.limits'] = [-3,3]
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Load Data

In [23]:


data = loadmat('ex4data1.mat')
print(data.keys())
X = data['X']
y = data['y']

yy = data['y'].flatten()

# replace 10 -> 0 
# re-cast y
y = np.zeros((len(yy), 10))
for ii in range(y.shape[0]):
    y[ii, yy[ii]-1] = 1

    
    
print('\nX:',X.shape, '(without intercept)')
print('y:',y.shape)


m,n = X.shape
print('\nNumber of features (n): %.0f'%(n))
print('Number of training examples (nm): %.0f'%(m))

weights = loadmat('ex4weights.mat')
weights.keys()

theta1, theta2 = weights['Theta1'], weights['Theta2']
print('\ntheta1 :', theta1.shape)
print('theta2 :', theta2.shape)
params = np.r_[theta1.ravel(), theta2.ravel()]
print('params :', params.shape)



dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

X: (5000, 400) (without intercept)
y: (5000, 10)

Number of features (n): 400
Number of training examples (nm): 5000

theta1 : (25, 401)
theta2 : (10, 26)
params : (10285,)


### Display Sammples

In [20]:
def displayData(X, nrows = 10, ncols = 10):
    
    # set up array
    fig, axarr = plt.subplots(nrows = nrows, ncols = ncols,
                              figsize = (nrows, ncols))
    
    # loop over randomly drawn numbers
    for ii in range(nrows):
        for jj in range(ncols):
            ind = np.random.randint(X.shape[0])
            tmp = X[ind,:].reshape(20,20, order='F')
            axarr[ii,jj].imshow(tmp, cmap='gray_r')
            plt.setp(axarr[ii,jj].get_xticklabels(), visible=False)
            plt.setp(axarr[ii,jj].get_yticklabels(), visible=False)
    
    fig.subplots_adjust(hspace = 0, wspace = 0)

In [21]:


sample = np.random.randint( 0, X.shape[0], 10 )
data_plot = X[sample, 1: ].reshape( -1, 20 ).T # Make a 20X20 array and rotate
print(data_plot.shape)
img = plt.imshow(data_plot, cmap = cm.Greys_r)
plt.axis('off');



ValueError: cannot reshape array of size 3990 into shape (20)

### Neural Networks - Feed Forward and Cost Function

the Neural Network will have 3 layers, a hidden layers adn and ouput layer. The inputs are pixel valuesof digit images. The images are 20 x 20, this gives 400 inputs layers, 401 including the bias unit


Input layer size = 400 (20 x 20 pixels) <br>
Hidden layer size = 25<br>
Number of labels or output layers = 10<br>

the values of each of the activation nodes ias as follows <br>
$$a_1^{(2)} = g(\Theta_{10}^{(1)}x_0 + \Theta_{11}^{(1)}x_1 + \Theta_{12}^{(1)}x_2 + \Theta_{13}^{(1)}x_3) \Rightarrow a_1^{(2)} = g(z_1^{(2)})$$<br>
$$a_2^{(2)} = g(\Theta_{20}^{(1)}x_0 + \Theta_{21}^{(1)}x_1 + \Theta_{22}^{(1)}x_2 + \Theta_{23}^{(1)}x_3) \Rightarrow a_2^{(2)} = g(z_2^{(2)})$$<br>
$$a_3^{(2)} = g(\Theta_{30}^{(1)}x_0 + \Theta_{31}^{(1)}x_1 + \Theta_{32}^{(1)}x_2 + \Theta_{33}^{(1)}x_3) \Rightarrow a_3^{(2)} = g(z_3^{(2)})$$<br>
$$h_{\Theta}(x) = g(\Theta_{10}^{(2)}a_0 + \Theta_{11}^{(2)}a_1 + \Theta_{12}^{(1)}a_2. +\Theta_{13}^{(1)}a_3) = g(z^{(3)}) $$<br>

in vectorisation mode for settings $x = a^{(i)}$ we can write 
$$z^{(j)} = \Theta^{(j-1)}a^{(j-1)}$$

Vectorised implemetations of Forward Propogation:
$$a^{(1)} = x$$
$$z^{(2)} = \Theta^{(1)} a^{(1)}$$
$$a^{2} = g(z^{(2)}).... add... a_0^{(2)} = 1)$$
$$z^{(3)} = \Theta^{(2)}a^{(2)}$$
$$h_{\Theta}(x) = a^{(3)}  = g(z^{(3)})$$


Sigmoid gradient
$$g′(z)=g(z)(1−g(z))$$

where
$$g(z)=\frac{1}{1+e^{-z}}$$


Cost Function 
$$ J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\big[y^{(i)}_{k} log(( h_\theta(x^{(i)}))_k)-(1-y^{(i)}_k)log(1-h_\theta(x^{(i)}))_k)\big]$$

Vectorised Cost Junction
$$ $$

In [5]:
L = 400 # Input layer size = 400 (20x20 pixels)
# Hidden layer size = 25
# Number of labels = 10
# Neural Networks - Feed Forward and Cost Function
input_layer = 400
hidden_layer = 25
output_layer = 10 
n_training_samples = X.shape[0]

def sigmoid(z):
    return(1 / (1 + np.exp(-z)))

def sigmoidGradient(z):
    return(sigmoid(z) * (1 - sigmoid(z)))

#Neural Network Cost Function
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, features, classes, reg):
    
    theta1 = nn_params[0:(hidden_layer_size * (input_layer_size + 1))].reshape(hidden_layer_size,(input_layer_size + 1))
    theta2 = nn_params[(hidden_layer_size * (input_layer_size + 1)):].reshape(num_labels,(hidden_layer_size + 1))

    m = features.shape[0]
    y_matrix = pd.get_dummies(classes.ravel()).as_matrix() 
    
    # Cost
    a1 = features # 5000x401
        
    z2 = theta1.dot(a1.T) # 25x401 * 401x5000 = 25x5000 
    a2 = np.c_[np.ones((features.shape[0],1)),sigmoid(z2.T)] # 5000x26 
    
    z3 = theta2.dot(a2.T) # 10x26 * 26x5000 = 10x5000 
    a3 = sigmoid(z3) # 10x5000
    
    J = -1 * (1/m) * np.sum((np.log(a3.T) * (y_matrix) + np.log(1 - a3).T * (1 - y_matrix))) + \
        (reg/(2 * m)) * (np.sum(np.square(theta1[:,1:])) + np.sum(np.square(theta2[:,1:])))

    # Gradients
    d3 = a3.T - y_matrix # 5000x10
    d2 = theta2[:,1:].T.dot(d3.T) * sigmoidGradient(z2) # 25x10 *10x5000 * 25x5000 = 25x5000
    
    delta1 = d2.dot(a1) # 25x5000 * 5000x401 = 25x401
    delta2 = d3.T.dot(a2) # 10x5000 *5000x26 = 10x26
    
    theta1_ = np.c_[np.ones((theta1.shape[0],1)),theta1[:,1:]]
    theta2_ = np.c_[np.ones((theta2.shape[0],1)),theta2[:,1:]]
    
    theta1_grad = delta1/m + (theta1_ * reg)/m
    theta2_grad = delta2/m + (theta2_ * reg)/m
    
    return(J, theta1_grad, theta2_grad)


### Regularisation


 
Regularized Cost Function
$$ J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\bigg[y^{(i)}_{k} log(( h_\theta(x^{(i)}))_k)+(1-y^{(i)}_k)log(1-h_\theta(x^{(i)}))_k)\bigg] + \frac{\lambda}{2m}\bigg[\sum_{l=1}^{L-1}\sum_{i=1}^{s_l}\sum_{j=1}^{s_{l+1}}(\Theta_{j,k}^{(1)})^2(\Theta_{j,k}^{(2)})^2\bigg]$$


In [6]:
# Regularization parameter = 0
Lambda = 0
nnCostFunction(params, input_layer, 25, 10, X, y, Lambda)[0]

#print('For Lambda = %.0f, \n\tCost  = %.2f \n\t Theta1_grad = %.2f \n\t Theta2_grad = %.2f' %(Lambda, J, theta1_grad, theta2_grad ) )


# Regularization parameter = 1]
Lambda = 1
nnCostFunction(params, input_layer, 25, 10, X, y, Lambda)[0]


[sigmoidGradient(z) for z in [-1, -0.5, 0, 0.5, 1]]


[0.19661193324148185,
 0.23500371220159449,
 0.25,
 0.23500371220159449,
 0.19661193324148185]