In [1]:
import numpy as np

from scipy.io import loadmat
from scipy import optimize

import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.image import NonUniformImage
from matplotlib import cm

In [2]:
pathToDataFile = 'ex4data1.mat'
data = loadmat(pathToDataFile)
pathToWeightsFile = 'ex4weights.mat'
weights = loadmat(pathToWeightsFile)
print(weights['Theta1'].shape)
print(weights['Theta2'].shape)
print(type(weights))

(25, 401)
(10, 26)
<class 'dict'>


In [3]:
def getData(pathToDataFile):
    data = loadmat(pathToDataFile)
    X = data['X']
    y = data['y']
    return X, y

def flattenX(X):
    flatX = X.flatten()
    return flatX

def reshapeX(flatX, sampleSize):
    reshapedX = flatX.reshape(sampleSize,-1)
    return reshapedX

def generateBeta(layer):
    randomBetaSet = ()
    for i in range(len(layer)-1):
#         recommendation from Andrew Ng window is ±(6/(inLayer + outLayer))**0.5
        low, high = -(6/(layer[i]+layer[i+1]))**0.5, (6/(layer[i]+layer[i+1]))**0.
        randomBetaSet += (np.random.uniform(low,high,(layer[i+1], layer[i]+1)),)
#         betaSet += (np.zeros((outLayer, inLayer+1)),)
    return randomBetaSet

def flattenBeta(betaSet):
    flatBeta = betaSet[0].flatten()
    for beta in betaSet[1:]:
        flatBeta = np.concatenate((flatBeta, beta.flatten()), axis=-1)
    return flatBeta

def reshapeBeta(flatBeta, layer):
    splitIndex = 0
    splitIndices = []
    for i in range(len(layer)-2):
        splitIndex += (layer[i]+1)*layer[i+1]
        splitIndices += [splitIndex]
    splitBeta = np.split(flatBeta, splitIndices)
    reshapedBeta = ()
    for i in range(len(splitBeta)):
        reshapedBeta += (splitBeta[i].reshape(layer[i+1],layer[i]+1),)
    return reshapedBeta
    
def sigmoid(z):
    return 1/(1+np.exp(-z))

def forwardPropagation(flatBeta, layer, flatX, sampleSize):
    """Forward Propagation is the hypothesis function for Neural Networks"""
    betaSet = reshapeBeta(flatBeta, layer)
#     a_0 (5000, 400)
    a = reshapeX(flatX, sampleSize)
#     z_a = ()
    a_byLayer = ()
    for beta in betaSet:
#         z_l (5000, j_l); l is the number of layers [0, ...,L]; j is the number of neurons in a layer l [1,...,J]
        z = np.dot(np.insert(a, 0, 1, axis=1), beta.T)
#         a_l (5000, j_l); l is the number of layers [0, ...,L]; j is the number of neurons in a layer l [1,...,J]
        a = sigmoid(z)
#         z_a += ((z, a),)
        a_byLayer += (a,)
#     a_2 (5000, 10)
    return a_byLayer

def sigmoidGradient(z):
    return sigmoid(z)*(1-sigmoid(z))

"""this function has to be revised;
there should be a way to get cost function using matrix operations without for-loop"""
# def costFunction(beta, X, y, iLambda = 0.):
#     sampleSize, numVariables = X.shape  # X(5000, 401)
#     Y = np.array([np.unique(y)]* y.shape[0]) == y
# #     hypothesis matrix H(5000, 10)
#     H = forwardPropagation(betaSet, X)
# #     cost function matrix J (5000, 5000) = Y.T(5000, 10)*H(10, 5000)
#     J = (- np.dot(Y, np.log(H).T) - np.dot((1-Y), np.log(1-H).T))/sampleSize
# #     regularisation term (R)
#     cummulativeR = 0
#     for beta in betaSet:
#         cummulativeR += np.sum(beta*beta) # element-wise multiplication
#     cummulativeR *= iLambda/(2*sampleSize)
#     return J/sampleSize + cummulativeR

def costFunction(flatBeta, layer, flatX, sampleSize, y, yUnique, iLambda = 0.):
    sampleSize, numVariables = X.shape  # X(1, 401)
    Y = np.array([yUnique]* y.shape[0]) == y
    betaSet = reshapeBeta(flatBeta, layer)
    J = 0
    for n in range(sampleSize):
        x_n = X[n:n+1,:]
        y_n = Y[n:n+1,:]
#         hypothesis vector h_n(1, 10)
        h_n = forwardPropagation(flatBeta, layer, x_n, 1)[len(betaSet)-1]
#         cost function scalar j_n(1, 1) = y_n(1, 10)*h_n.T(10, 1)
        j_n = (- np.dot(y_n, np.log(h_n).T) - np.dot((1-y_n), np.log(1-h_n).T))
        J += j_n
#     regularisation term (R)
    cummulativeR = 0
    for beta in betaSet:
        cummulativeR += np.sum(beta*beta) #element-wise multiplication
    cummulativeR *= iLambda/(2*sampleSize)
    return J[0][0]/sampleSize + cummulativeR

# def gradient(beta, X, y, iLambda=0.):
#     sampleSize, numVariables = X.shape
# #     hypothesis (h)
#     h = hypothesis(beta, X)
# #     error vector e(5000x1) = h(5000x1) - y(5000x1)
#     e = h - y
# #     gradient vector g(401x1) = e.T(1x5000)*X(401x5000)
#     g = np.dot(X.T,e)/(sampleSize)
# #     regularisation term vector (r(400x1)) — derivative of the regularisation term of the cost funtion
#     r = beta[1:, None]*(iLambda/sampleSize)
#     g[1:] = g[1:] + r
#     return g.flatten()

In [4]:
X, y = getData(pathToDataFile)
sampleSize, numVariables = X.shape
flatX = flattenX(X)
layer = 400, 25, 10
yUnique = np.unique(y)
betaSet = (weights['Theta1'], weights['Theta2'])
flatBeta = flattenBeta(betaSet)
betaSet_zero = generateBeta(layer)
flatBeta_zero = flattenBeta(betaSet_zero)
print(X.shape)
print(y.shape)

(5000, 400)
(5000, 1)


In [5]:
Y = np.array([yUnique]* y.shape[0]) == y
Y[0:0+1,:].shape

(1, 10)

In [6]:
forwardPropagation(flatBeta, layer, flatX, sampleSize)[1].shape

(5000, 10)

In [7]:
forwardPropagation(flatBeta, layer, X[0:0+1,:], 1)[1].shape

(1, 10)

In [8]:
X[0][None,:].shape

(1, 400)

In [9]:
print(costFunction(flatBeta, layer, flatX, sampleSize, y, yUnique, iLambda = 0.))
print(costFunction(flatBeta, layer, X[0][None,:], sampleSize, y, yUnique, iLambda = 0.))
print(costFunction(flatBeta, layer, flatX, sampleSize, y, yUnique, iLambda = 1.))
print(costFunction(flatBeta_zero, layer, flatX, sampleSize, y, yUnique, iLambda = 1.))

0.287629165161
0.287629165161
0.384487796243
64.3493922545


In [10]:
sampleSize, numVariables = X.shape
Y = np.array([yUnique]* y.shape[0]) == y
# print (Y.shape)

nLayer = len(layer)-1
deltaSet = ()
inLayer = X.shape[1]
for i in range(len(layer)-1):
    deltaSet += (np.zeros((layer[i+1], layer[i]+1)),)
# print (deltaSet[1].shape)

for n in range(sampleSize):#range(sampleSize)
#         x_n(1, 400)
    x_n = X[n:n+1,:]
#     print(x_n.shape)
#         y_n(1, 10)
    y_n = Y[n:n+1,:]
#     print(y_n.shape)
#         h_n(1, 10)
    h_n = forwardPropagation(flatBeta, layer, x_n, 1)
#     print(h_n.shape)
#         error e_n(1, 10) for a given sample (x_n, y_n)
    e_n = h_n[nLayer-1] - y_n
#     print(e_n.shape)
    for l in reversed(range(nLayer)):
#         print(l)
#         print(h_n[l].shape)
        e_n = np.dot(e_n*sigmoidGradient(h_n[l]), betaSet[l])[:,1:]
        np.dot(h_n[l].T, np.insert(e_n, 0, 1, axis=1))

In [11]:
def backPropagation(flatBeta, layer, flatX, sampleSize, y, yUnique):
    sampleSize, numVariables = X.shape
    Y = np.array([yUnique]* y.shape[0]) == y
    betaSet = reshapeBeta(flatBeta, layer)

    deltaSet = ()
#     hypothesis matrix E(5000, 10)
    H = forwardPropagation(flatBeta, layer, flatX, sampleSize)
#     error matrix E(5000, 10)
    E = H[len(layer)-2] - Y
    for l in reversed(range(len(layer)-1)):
        E = np.dot(E*sigmoidGradient(H[l]), betaSet[l])[:,1:]
        deltaSet = (np.dot(H[l].T, np.insert(E, 0, 1, axis=1)),) + deltaSet
    flatDelta = flattenBeta(deltaSet)
    return flatBeta + flatDelta/sampleSize

In [12]:
randomBetaSet = generateBeta(layer)
flatRandomBetaSet = flattenBeta(randomBetaSet)
a = backPropagation(flatRandomBetaSet, layer, flatX, sampleSize, y, yUnique)

print(np.sum(a))
print(costFunction(a,layer, flatX, sampleSize, y, yUnique, iLambda = 0.))

16554.0188782
202.579344362


In [13]:
def gradientCheck(flatBeta, layer, epsilon):
    for i in np.random.randint(flatBeta.size, size=10):
        epsilonVector = np.zeros(flatBeta.size)
        epsilonVector[i] = epsilon
        betaPlus = betaMinus = flatBeta
        betaPlus = betaPlus + epsilonVector
        costPlus = costFunction(betaPlus,layer, X, y, yUnique, iLambda = 0.)
        betaMinus = betaMinus - epsilonVector
        costMinus = costFunction(betaMinus,layer, X, y, yUnique, iLambda = 0.)
        approximateGradient = (costPlus-costMinus)/(2*epsilon)
#         print (approximateGradient)

# gradientCheck(flatBeta, layer, 0.0001)


http://www.holehouse.org/mlclass/09_Neural_Networks_Learning.html

In [14]:
# randomBetaSet = generateBeta(layer)
# initialBeta = flattenBeta(betaSet)
# iLambda = 0
# a = optimize.fmin_cg(costFunction, x0=initialBeta,
#                        fprime=backPropagation, args=(layer, flatX, sampleSize, y, yUnique),
#                        maxiter=50,disp=True,full_output=True)
# len(a)

In [15]:
def betaOptimisation_1(flatBeta, flatX, sampleSize, y, yUnique, iLambda=0.):

    optimisedBeta = optimize.minimize(costFunction, flatBeta, args=(layer, flatX, sampleSize, y, yUnique),
                                      method=None, jac=backPropagation, options={'maxiter':50})

#     optimisedBeta = optimize.fmin_cg(costFunction, fprime=backPropagation, x0=flatBeta,
#                                      args=(layer, flatX, sampleSize, y, yUnique),
#                                      maxiter=50,disp=True,full_output=True)
    return(optimisedBeta)

In [16]:
def betaOptimisation_2(flatBeta, flatX, sampleSize, y, yUnique, iLambda=0.):

#     optimisedBeta = optimize.minimize(costFunction, flatBeta, args=(layer, flatX, sampleSize, y, yUnique),
#                                       method=None, jac=backPropagation, options={'maxiter':50})

    optimisedBeta = optimize.fmin_cg(costFunction, fprime=backPropagation, x0=flatBeta,
                                     args=(layer, flatX, sampleSize, y, yUnique),
                                     maxiter=50,disp=True,full_output=True)
    return(optimisedBeta)

In [17]:
a = betaOptimisation_1(flatBeta, flatX, sampleSize, y, yUnique, iLambda=0.)

In [18]:
b = betaOptimisation_2(flatBeta, flatX, sampleSize, y, yUnique, iLambda=0.)

         Current function value: 0.287629
         Iterations: 0
         Function evaluations: 110
         Gradient evaluations: 98


In [19]:
def qualityControl(optimisedBeta, layer, flatX, sampleSize, y, yUnique, iLambda = 0.):
    X = reshapeX(flatX, sampleSize)
    yAssignmentVector = []
    misAssignedIndex = []
    for n in range(sampleSize):
        x = X[n]
        yAssignment =  np.argmax(forwardPropagation(optimisedBeta, layer, X[n], 1)[1]) + 1
        if yAssignment == y[n]:
            yAssignmentVector += [True]
        else:
            yAssignmentVector += [False]
            misAssignedIndex += [n]
    return (sum(yAssignmentVector)/sampleSize)

In [20]:
# neuralNetworkClassifier(, flatX, sampleSize, y, yUnique, iLambda=0.)
qualityControl(a['x'], layer, flatX, sampleSize, y, yUnique, iLambda = 0.)

0.9752

In [21]:
qualityControl(b[0], layer, flatX, sampleSize, y, yUnique, iLambda = 0.)

0.9752