In [1]:
from mnist import MNIST
from matplotlib import pyplot as plt
import numpy as np

In [2]:
from keras.datasets import mnist

In [3]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()
train_X = np.reshape(train_X, (60000, 784))
test_X = np.reshape(test_X, (10000, 784))
m, n = train_X.shape
train_X = train_X.T
test_X = test_X.T
print(m)

60000


In [11]:
print(test_X.shape)

(784, 10000)


In [4]:
def init_params():
    W1 = np.random.rand(72, 784) - 0.5
    b1 = np.random.rand(72, 1) - 0.5
    W2 = np.random.rand(36, 72) - 0.5
    b2 = np.random.rand(36, 1) - 0.5
    W3 = np.random.rand(10, 36) - 0.5
    b3 = np.random.rand(10, 1,) - 0.5
    return  W1, b1, W2, b2, W3, b3

def Sigmoid(Z):
    return 1/(1+np.exp(-Z))

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A



def forward_prop(W1, b1, W2, b2, W3, b3, X):
    Z1 = np.dot(W1, X) + b1
    A1 = Sigmoid(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = Sigmoid(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = softmax(Z3)
    return(Z1, A1, Z2, A2, Z3, A3)

def Sigmoid_deriv(Z):
    dsig = (np.exp(-Z))/((np.exp(-Z)+1)**2)
    dsig = np.nan_to_num(dsig)
    return dsig

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y):
    one_hot_Y = one_hot(Y)
    dZ3 = A3 - one_hot_Y
    dW3 = 1/m * dZ3.dot(A2.T)
    db3 = 1/m * np.sum(dZ3)
    dZ2 = W3.T.dot(dZ3) * Sigmoid_deriv(Z2)
    dW2 = 1/m * dZ2.dot(A1.T)
    #print(dW2)
    db2 = 1/m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * Sigmoid_deriv(Z1)
    #print(Sigmoid_deriv(Z1))
    #print('Hi')
    #print(Z1)
    dW1 = 1/m * dZ1.dot(X.T)
    #print(dW1)
    #zprint(dZ1)
    db1 = 1/m * np.sum(dZ1)  
    return(dW1, db1, dW2, db2, dW3, db3)

def update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):
    #print(W1)
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    W3 = W3 - alpha * dW3
    b3 = b3 - alpha * db3
    #print(dW1)
    #print(dW1, dW2, dW3)
    return W1, b1, W2, b2, W3, b3

In [5]:
def get_predictions(A3):
    #print(A3)
    return np.argmax(A3,0)
    

def get_accuracy(predictions, Y):
    #Clean
    print(predictions, Y)
    return np.sum(predictions == Y)/ Y.size

In [6]:
def gradient_decent(X, Y, alpha, iterations):
    W1, b1, W2, b2, W3, b3 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
        dW1, db1, dW2, db2, dW3, db3 = backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y)
        W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A3)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2, W3, b3

In [7]:
W1, b1, W2, b2, W3, b3 = gradient_decent(train_X, train_y, 1, 500)

  return 1/(1+np.exp(-Z))
  dsig = (np.exp(-Z))/((np.exp(-Z)+1)**2)
  dsig = (np.exp(-Z))/((np.exp(-Z)+1)**2)
  dsig = (np.exp(-Z))/((np.exp(-Z)+1)**2)


Iteration:  0
[8 6 6 ... 8 8 6] [5 0 4 ... 5 6 8]
0.08748333333333333
Iteration:  10
[2 0 9 ... 7 6 3] [5 0 4 ... 5 6 8]
0.48588333333333333
Iteration:  20
[3 0 4 ... 7 6 8] [5 0 4 ... 5 6 8]
0.62995
Iteration:  30
[5 0 4 ... 7 6 8] [5 0 4 ... 5 6 8]
0.70315
Iteration:  40
[5 0 4 ... 7 6 8] [5 0 4 ... 5 6 8]
0.7473666666666666
Iteration:  50
[5 0 4 ... 7 6 8] [5 0 4 ... 5 6 8]
0.7766833333333333
Iteration:  60
[3 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
0.7993166666666667
Iteration:  70
[3 0 4 ... 5 6 5] [5 0 4 ... 5 6 8]
0.81655
Iteration:  80
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
0.8301666666666667
Iteration:  90
[5 0 4 ... 5 6 5] [5 0 4 ... 5 6 8]
0.8402666666666667
Iteration:  100
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
0.8484166666666667
Iteration:  110
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
0.8558333333333333
Iteration:  120
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
0.8623333333333333
Iteration:  130
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6 8]
0.8673833333333333
Iteration:  140
[5 0 4 ... 5 6 8] [5 0 4 ... 5 6

In [9]:
def forward_prop(W1, b1, W2, b2, W3, b3, X):
    Z1 = np.dot(W1, X) + b1
    A1 = Sigmoid(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = Sigmoid(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = softmax(Z3)
    return(Z1, A1, Z2, A2, Z3, A3)

In [10]:
Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b2, test_X)
prediction = get_prediction(A3)
print(get_accuracy(get_prediction, test_y))

  return 1/(1+np.exp(-Z))


ValueError: operands could not be broadcast together with shapes (10,10000) (36,1) 