# Neural Networks - Learning

In [71]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import pandas as pd
from sklearn.preprocessing import label_binarize

%matplotlib inline

In [6]:
# Load the training data and the pre-trained network weights from the
# MATLAB .mat files; loadmat returns a dict keyed by variable name.
data = loadmat('ex4data1.mat')
weights = loadmat('ex4weights.mat')

# List the variables stored in each file. The parenthesised single-string form
# behaves the same as a Python 2 print statement and as the Python 3 print
# function; list() keeps the keys displayed as plain lists on Python 3 too.
print("%s %s" % (list(data.keys()), list(weights.keys())))

['y', 'X', '__version__', '__header__', '__globals__'] ['Theta2', '__version__', '__header__', 'Theta1', '__globals__']


In [121]:
# Feature matrix exactly as stored in the data file.
X_raw = data['X']
# Same matrix with a leading column of ones, so the first column of each
# weight matrix acts as the bias term.
X = np.c_[np.ones((X_raw.shape[0], 1)), X_raw]
y = data['y']

# Parenthesised single-string form works as a Python 2 print statement and as
# the Python 3 print function, and prints the two shapes separated by a space.
print("%s %s" % (X.shape, y.shape))

# Pre-trained weights for the hidden layer and the output layer.
theta_1 = weights['Theta1']
theta_2 = weights['Theta2']

print("%s %s" % (theta_1.shape, theta_2.shape))

# One indicator column per class label 1..10.
y_binarized = label_binarize(y, classes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

(5000, 401) (5000, 1)
(25, 401) (10, 26)


<b> Feedforward to Compute Cost </b>

In [127]:
from __future__ import division

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)).

    ``z`` may be a scalar or a NumPy array; arrays are processed elementwise
    and the result has the same shape as the input.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

# Implemented for 3 layers.
def feedforward(X, y_binarized, theta_1, theta_2, reg):
    """Forward-propagate X through the 3-layer network and compute the cost.

    Parameters
    ----------
    X : array with one row per example. It is used directly as the input
        activations, so any bias column must already be part of it.
    y_binarized : array with one row per example and one column per output
        unit.
    theta_1, theta_2 : weight matrices of the hidden and output layers. Their
        first column is excluded from the regularization penalty.
    reg : regularization strength.

    Returns
    -------
    tuple ``(cost, a1, z2, a2, z3, a3)``. ``a1`` (which is ``X`` itself) and
    ``a2`` hold one row per example; ``z2``, ``z3`` and ``a3`` hold one column
    per example.
    """
    n_examples = X.shape[0]

    # Hidden layer: pre-activations, then activations with a bias column of
    # ones prepended.
    z2 = theta_1.dot(X.T)
    hidden_activations = sigmoid(z2.T)
    bias_column = np.ones((n_examples, 1))
    a2 = np.c_[bias_column, hidden_activations]

    # Output layer.
    z3 = theta_2.dot(a2.T)
    a3 = sigmoid(z3)

    # Cross-entropy cost averaged over the examples (no regularization yet).
    # The logs are transposed so they line up with y_binarized.
    # NOTE: 1 / n_examples relies on true division.
    log_h = np.log(a3).T
    log_not_h = np.log(1 - a3).T
    cross_entropy = -y_binarized * log_h - (1 - y_binarized) * log_not_h
    cost = (1 / n_examples) * np.sum(cross_entropy)

    # L2 penalty over every weight except the first (bias) column of each matrix.
    squared_weights = np.sum(np.square(theta_1[:, 1:])) + np.sum(np.square(theta_2[:, 1:]))
    penalty = (reg / (2 * n_examples)) * squared_weights

    return cost + penalty, X, z2, a2, z3, a3

# Cost of the pre-trained weights for three regularization strengths.
# Parenthesised form is valid as a Python 2 print statement and as the
# Python 3 print function, and prints the same list in both.
print([feedforward(X, y_binarized, theta_1, theta_2, reg)[0] for reg in [0, 0.5, 1]])

[0.28762916516131892, 0.33569951212612131, 0.38376985909092365]


In [134]:
def gradient_sigmoid(z):
    """Return the derivative of the logistic function at ``z``.

    Computed as ``sigmoid(z) * (1 - sigmoid(z))``, elementwise for arrays.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

# Implement backprop.
def compute_gradients(X, y_binarized, theta_1, theta_2, reg):
    """Backpropagate through the 3-layer network and return its gradients.

    Parameters
    ----------
    X : array with one row per example, passed unchanged to ``feedforward``.
    y_binarized : array with one row per example and one column per output
        unit.
    theta_1, theta_2 : weight matrices of the hidden and output layers.
    reg : regularization strength.

    Returns
    -------
    tuple ``(grad1, grad2)`` with the same shapes as ``theta_1`` and
    ``theta_2``. The regularization term is applied to every column except
    the first (bias) column, matching the penalty used in ``feedforward``,
    which sums only over ``theta[:, 1:]``.
    """
    m = X.shape[0]
    # Only the activations and the hidden pre-activation are needed here;
    # the cost and z3 returned by feedforward are not used.
    _cost, a1, z2, a2, _z3, a3 = feedforward(X, y_binarized, theta_1, theta_2, reg)

    # Output-layer error, one row per example.
    d3 = a3.T - y_binarized
    # Hidden-layer error: push d3 back through theta_2 without its bias
    # column, then scale elementwise by the sigmoid derivative at z2.
    d2 = theta_2[:, 1:].T.dot(d3.T) * gradient_sigmoid(z2)

    # Accumulate the unregularized gradients over all examples.
    delta_1 = d2.dot(a1)
    delta_2 = d3.T.dot(a2)

    # Copies of the weights with the bias column zeroed, so the bias weights
    # receive no regularization gradient (the cost does not penalize them).
    reg_theta_1 = np.c_[np.zeros((theta_1.shape[0], 1)), theta_1[:, 1:]]
    reg_theta_2 = np.c_[np.zeros((theta_2.shape[0], 1)), theta_2[:, 1:]]

    grad1 = (delta_1 / m) + (reg_theta_1 * reg) / m
    grad2 = (delta_2 / m) + (reg_theta_2 * reg) / m

    return grad1, grad2
    
# Sanity check: each gradient should have the same shape as its weight matrix.
# print(...) with a single argument behaves identically on Python 2 and 3.
for item in compute_gradients(X, y_binarized, theta_1, theta_2, 1):
    print(item.shape)

(25, 401)
(10, 26)
