In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.io import loadmat

data = loadmat("machine_learning_andrewng/ex3data1.mat")
weights = loadmat("machine_learning_andrewng/ex3weights.mat")
print(data.keys())
print(weights.keys())

['y', 'X', '__version__', '__header__', '__globals__']
['Theta2', '__version__', '__header__', 'Theta1', '__globals__']


In [2]:
X = data['X']
y = data['y']
y = pd.get_dummies(y.ravel()).values
theta1_loaded = weights["Theta1"]
theta2_loaded = weights["Theta2"]

In [3]:
def sigmoid(z):
    return(1 / (1 + np.exp(-z)))

def sigmoid_prime(z):
    return(sigmoid(z)*(1-sigmoid(z)))

In [4]:
def forward_pass(X, theta1, theta2, elaborate=False):
    a1 = np.c_[np.ones(X.shape[0]), X]
    z2 = theta1.dot(a1.T) # 25x401 * 401x5000 = 25x5000
    a2 = sigmoid(z2.T) # 5000x25
    a2 = np.c_[np.ones(a1.shape[0]), a2] # 5000x26
    z3 = theta2.dot(a2.T) # 10x26 * 26x5000 = 10x5000
    a3 = sigmoid(z3.T) # 5000x10
    if elaborate:
        return ((X, a1, a2, a3), (z2, z3))
    return a3

In [5]:
def sci_forward_pass(thetas, X, y, elaborate=True, *args):
    print("thetas size:", thetas.shape)
    print("X size:", X.shape)
    print("y size:", y_true.shape)
    theta1 = thetas[:10025]
    theta1 = theta1.reshape(25, 401)
    theta2 = thetas[10025:]
    theta2 = theta2.reshape(10, 26)
    a1 = np.c_[np.ones(X.shape[0]), X]
    z2 = theta1.dot(a1.T) # 25x401 * 401x5000 = 25x5000
    a2 = sigmoid(z2.T) # 5000x25
    a2 = np.c_[np.ones(a1.shape[0]), a2] # 5000x26
    z3 = theta2.dot(a2.T) # 10x26 * 26x5000 = 10x5000
    a3 = sigmoid(z3.T) # 5000x10
    if elaborate:
        return ((X, a1, a2, a3), (z2, z3))
    return a3

In [6]:
def cross_entropy(X, y_true, theta1, theta2, lambda_=0):
    m = X.shape[0]
    y_pred = forward_pass(X, theta1, theta2)
    positive_loss = np.sum(np.multiply(y_true, np.log(y_pred)).flatten())
    negative_loss = np.sum(np.multiply((1 - y_true), np.log(1 - y_pred)).flatten())
    regularization = (lambda_/(2*m)) * (np.sum(theta1.flatten() ** 2) + np.sum(theta2.flatten() ** 2))
    J = - (1/m) * (positive_loss + negative_loss) + regularization
    return J

In [7]:
def sci_cross_entropy(thetas, X, y_true):
    lambda_=0
    m = X.shape[0]
    theta1 = thetas[:10025]
    theta1 = theta1.reshape(25, 401)
    theta2 = thetas[10025:]
    theta2 = theta2.reshape(10, 26)
    y_pred = forward_pass(X, theta1, theta2)
    positive_loss = np.sum(np.multiply(y_true, np.log(y_pred)).flatten())
    negative_loss = np.sum(np.multiply((1 - y_true), np.log(1 - y_pred)).flatten())
    regularization = (lambda_/(2*m)) * (np.sum(theta1.flatten() ** 2) + np.sum(theta2.flatten() ** 2))
    J = - (1/m) * (positive_loss + negative_loss) + regularization
    return J

In [8]:
cross_entropy(X, y, theta1_loaded, theta2_loaded, lambda_=1)

0.38448779624289398

In [9]:
# original
def backward_pass(X, y_true, theta1, theta2, *args):
    ((X, a1, a2, y_pred), (z2, z3)) = forward_pass(X, theta1, theta2, elaborate=True)
    delta3 = np.multiply((y_pred - y_true), sigmoid_prime(z3.T))
    theta2_grad = a2.T.dot(delta3)
    theta2_grad = theta2_grad.T # theta2_grad.shape is now same as theta2.shape
    delta2 = np.multiply(delta3.dot(theta2[:, 1:]), sigmoid_prime(z2.T))
    theta1_grad = a1.T.dot(delta2)
    theta1_grad = theta1_grad.T
    return theta1_grad, theta2_grad    

In [10]:
def sci_backward_pass(thetas, X, y_true, *args):
    theta1 = thetas[:10025]
    theta1 = theta1.reshape(25, 401)
    theta2 = thetas[10025:]
    theta2 = theta2.reshape(10, 26)
    ((X, a1, a2, y_pred), (z2, z3)) = forward_pass(X, theta1, theta2, elaborate=True)
    delta3 = np.multiply((y_pred - y_true), sigmoid_prime(z3.T))
    theta2_grad = a2.T.dot(delta3)
    theta2_grad = theta2_grad.T # theta2_grad.shape is now same as theta2.shape
    delta2 = np.multiply(delta3.dot(theta2[:, 1:]), sigmoid_prime(z2.T))
    theta1_grad = a1.T.dot(delta2)
    theta1_grad = theta1_grad.T
    return np.r_[theta1_grad.flatten(), theta2_grad.flatten()]
    

In [37]:
theta1 = np.random.rand(25, 401)
theta2 = np.random.rand(10, 26)

In [9]:
((X, a1, a2, y_pred), (z2, z3)) = forward_pass(X, theta1, theta2, elaborate=True)

In [12]:
def train(X, y, theta1, theta2):
    n_epochs = 100
    alpha = 0.001
    for i in range(1, n_epochs):
        y_pred = forward_pass(X, theta1, theta2)
        cost = cross_entropy(X, y, theta1, theta2)
        print "Iteration: {0} Cost: {1}\r".format(i, cost),
        theta1_grad, theta2_grad = backward_pass(X, y, theta1, theta2)
        theta1 = theta1 - alpha * theta1_grad
        theta2 = theta2 - alpha * theta2_grad
train(X, y, theta1, theta2)
    

Iteration: 99 Cost: 114.125000263

In [15]:
cross_entropy(X, y, theta1, theta2)

113.44899464670516

In [42]:
from scipy.optimize import minimize

thetas = np.r_[theta1.flatten(), theta2.flatten()]
res = minimize(sci_cross_entropy, thetas, jac=sci_backward_pass, options={'maxiter': 10000}, args=(X, y), method='cg')

In [43]:
res

     fun: 113.02232249649792
     jac: array([  1.37156087e-07,   0.00000000e+00,   0.00000000e+00, ...,
         2.24873345e-02,   2.24873169e-02,   2.24872804e-02])
 message: 'Desired error not necessarily achieved due to precision loss.'
    nfev: 48
     nit: 0
    njev: 37
  status: 2
 success: False
       x: array([ 0.1300368 ,  0.64270156,  0.06006174, ...,  0.91566874,
        0.47741502,  0.35878008])