In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.io import loadmat

data = loadmat("machine_learning_andrewng/ex3data1.mat")
weights = loadmat("machine_learning_andrewng/ex3weights.mat")
print(data.keys())
print(weights.keys())

['y', 'X', '__version__', '__header__', '__globals__']
['Theta2', '__version__', '__header__', 'Theta1', '__globals__']


In [2]:
X = data['X']
y = data['y']
theta1_loaded = weights["Theta1"]
theta2_loaded = weights["Theta2"]

In [3]:
def sigmoid(z):
    return(1 / (1 + np.exp(-z)))

def sigmoid_prime(z):
    return(sigmoid(z)*(1-sigmoid(z)))

In [4]:
def forward_pass(X, theta1, theta2, elaborate=False):
    a1 = X.copy()
    a1 = np.c_[np.ones(X.shape[0]), a1]
    z2 = theta1.dot(a1.T) # 25x401 * 401x5000 = 25x5000
    a2 = sigmoid(z2.T) # 5000x25
    a2 = np.c_[np.ones(a1.shape[0]), a2] # 5000x26
    z3 = theta2.dot(a2.T) # 10x26 * 26x5000 = 10x5000
    a3 = sigmoid(z3.T) # 5000x10
    if elaborate:
        return ((X, a1, a2, a3), (z2, z3))
    return a3

In [5]:
def cross_entropy(X, y, theta1, theta2, lambda_=0):
    m = X.shape[0]
    y_true = pd.get_dummies(y.ravel()).values
    y_pred = forward_pass(X, theta1, theta2)
    positive_loss = np.sum(np.multiply(y_true, np.log(y_pred)).flatten())
    negative_loss = np.sum(np.multiply((1 - y_true), np.log(1 - y_pred)).flatten())
    regularization = (lambda_/(2*m)) * (np.sum(theta1.flatten() ** 2) + np.sum(theta2.flatten() ** 2))
    J = - (1/m) * (positive_loss + negative_loss) + regularization
    return J

In [6]:
cross_entropy(X, y, theta1_loaded, theta2_loaded, lambda_=1)

0.38448779624289398

In [7]:
def backward_pass(X, y, theta1, theta2):
    y_true = pd.get_dummies(y.ravel()).values
    ((X, a1, a2, y_pred), (z2, z3)) = forward_pass(X, theta1, theta2, elaborate=True)
    delta3 = np.multiply((y_pred - y_true), sigmoid_prime(z3.T))
    theta2_grad = a2.T.dot(delta3)
    theta2_grad = theta2_grad.T # theta2_grad.shape is now same as theta2.shape
    delta2 = np.multiply(delta3.dot(theta2[:, 1:]), sigmoid_prime(z2.T))
    theta1_grad = a1.T.dot(delta2)
    theta1_grad = theta1_grad.T
    return theta1_grad, theta2_grad
    

In [8]:
theta1 = np.random.rand(25, 401)
theta2 = np.random.rand(10, 26)

In [9]:
((X, a1, a2, y_pred), (z2, z3)) = forward_pass(X, theta1, theta2, elaborate=True)

In [9]:
def train(X, y, theta1, theta2):
    n_epochs = 10
    alpha = 0.1
    for i in range(n_epochs):
        y_pred = forward_pass(X, y, theta1, theta2)
        cost = cross_entropy(X, y, theta1, theta2)
        print("Iteration: {0} Cost: {1}".format(i, cost))
        theta1_grad, theta2_grad = backward_pass(X, y, theta1, theta2)
        theta1 = theta1 - alpha * theta1_grad
        theta2 = theta2 - alpha * theta2_grad
train(X, y, theta1, theta2)
    

ValueError: shapes (5000,1) and (401,5000) not aligned: 1 (dim 1) != 401 (dim 0)

In [36]:
theta1 = theta1 - theta1_grad
theta2 = theta2 - theta2_grad

In [10]:
cross_entropy(X, y, theta1, theta2)

119.62375466185773

In [11]:
y_pred = forward_pass(X, y, theta1, theta2)
cost = cross_entropy(X, y, theta1, theta2)
theta1_grad, theta2_grad = backward_pass(X, y, theta1, theta2)
theta1 = theta1 - alpha * theta1_grad
theta2 = theta2 - alpha * theta2_grad

ValueError: shapes (5000,1) and (401,5000) not aligned: 1 (dim 1) != 401 (dim 0)

In [47]:
delta3.shape

(5000, 10)

In [53]:
z2.T.shape

(5000, 25)