In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [13]:
data = pd.read_csv('train.csv')
data = np.array(data)
m, n = data.shape
np.random.shuffle(data)

data_val = data[0:1000].T
y_val = data_val[0]
x_val = data_val[1:n]
x_val = x_val / 255.

data_train = data[1000:m].T
y_train = data_train[0]
x_train = data_train[1:n]
x_train = x_train / 255.


In [25]:
x_val.shape

(784, 1000)

In [15]:
def reLU(z):
    return np.maximum(z, 0)

def diff_reLU(z):
    return z > 0

def softmax(z):
    return np.exp(z)/sum(np.exp(z))

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def init_params():
    w1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    w2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return w1, b1, w2, b2

def forward_prop(w1, b1, w2, b2, x):
    z1 = w1.dot(x) + b1
    a1 = reLU(z1)
    z2 = w2.dot(a1) + b2
    a2 = softmax(z2)
    return z1, a1, z2, a2

def back_prop(z1, a1, z2, a2, w1, w2, x, y):
    one_hot_y = one_hot(y)
    dz2 = a2 - one_hot_y
    dw2 = 1 / m * dz2.dot(a1.T)
    db2 = 1 / m * np.sum(dz2)
    dz1 = w2.T.dot(dz2) * diff_reLU(z1)
    dw1 = 1 / m * dz1.dot(x.T)
    db1 = 1 / m * np.sum(dz1)
    return dw1, db1, dw2, db2

def update(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    w1 = w1 - alpha*dw1
    b1 = b1 - alpha*db1
    w2 = w2 - alpha*dw2
    b2 = b2 - alpha*db2
    return w1, b1, w2, b2



In [16]:
def get_predictions(a2):
    return np.argmax(a2, 0)

def get_accuracy(predictions, y):
    print(predictions, y)
    return np.sum(predictions == y) / y.size

def gradient_descent(x, y, alpha, iterations):
    w1, b1, w2, b2 = init_params()
    for i in range(iterations):
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, x)
        dw1, db1, dw2, db2 = back_prop(z1, a1, z2, a2, w1, w2, x, y)
        w1, b1, w2, b2 = update(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha)
        if i % 100 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(a2)
            print(get_accuracy(predictions, y))
    return w1, b1, w2, b2


In [19]:
w1, b1, w2, b2 = gradient_descent(x_train, y_train, 0.10, 500)
print(w1, b1, w2, b2)

Iteration:  0
[6 4 8 ... 6 6 4] [5 7 2 ... 1 9 0]
0.09121951219512195
Iteration:  100
[3 7 2 ... 1 8 0] [5 7 2 ... 1 9 0]
0.6271219512195122
Iteration:  200
[3 7 2 ... 1 7 0] [5 7 2 ... 1 9 0]
0.738609756097561
Iteration:  300
[3 7 2 ... 1 7 0] [5 7 2 ... 1 9 0]
0.7892439024390244
Iteration:  400
[3 7 2 ... 1 7 0] [5 7 2 ... 1 9 0]
0.8166097560975609
[[ 1.51495443e-01  2.84630262e-01  4.43857392e-01 ... -2.89720707e-01
  -2.81736105e-01 -4.34746227e-02]
 [-2.68992326e-01  3.70838326e-01  4.60946741e-05 ... -1.73259212e-01
  -3.92729858e-01 -4.78853552e-01]
 [-4.85438579e-01  3.16547547e-01  4.81583603e-01 ... -3.05992282e-01
   3.35745206e-01 -4.61998166e-01]
 ...
 [ 1.16770348e-01  3.08019480e-01  3.56682613e-01 ... -1.80453271e-01
  -4.34181656e-01 -2.22098669e-01]
 [ 4.45008218e-01  3.62864286e-01 -2.11990395e-01 ...  3.48818552e-01
   1.98882969e-01 -3.84915550e-01]
 [ 1.17492602e-01 -3.41039757e-01  1.37319699e-03 ... -3.54008020e-01
   3.11007256e-01 -2.23405165e-01]] [[ 0.700441

In [29]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

dev_predictions = make_predictions(x_val, w1, b1, w2, b2)
get_accuracy(dev_predictions, y_val)


[2 2 8 6 6 6 8 8 5 6 6 2 7 3 1 2 5 9 7 1 5 4 6 9 0 9 1 7 9 0 5 2 3 1 1 8 9
 9 6 4 4 1 1 0 0 6 3 4 0 5 1 7 6 9 8 6 5 9 5 3 1 1 9 7 7 8 2 0 4 4 6 3 4 5
 4 7 4 6 5 9 9 7 3 6 9 4 6 9 1 7 6 3 0 5 3 5 0 1 2 2 8 3 6 5 0 8 8 7 7 9 8
 0 6 1 6 9 3 4 9 7 2 8 9 7 4 7 9 0 6 8 2 3 6 2 4 1 7 6 1 0 4 1 3 4 8 4 9 6
 0 9 9 5 8 1 0 3 4 7 0 3 1 5 0 0 4 6 3 1 8 0 0 0 3 2 2 1 8 7 1 1 3 5 1 9 4
 6 8 1 4 0 3 0 8 0 5 3 5 9 0 4 1 7 9 4 8 5 3 5 2 0 1 0 4 7 9 3 7 2 1 5 6 6
 6 9 7 9 3 7 2 4 7 0 6 1 5 3 1 9 7 6 4 1 7 4 4 1 3 1 5 6 6 7 9 6 0 3 6 9 7
 4 9 6 1 7 3 8 0 1 3 5 9 3 0 3 3 4 3 7 5 7 4 0 6 4 9 0 7 8 0 6 1 0 6 1 2 1
 2 7 7 7 9 7 1 9 6 7 2 8 6 1 4 6 5 7 4 9 9 6 0 6 3 4 1 0 5 1 6 9 4 2 6 7 9
 8 3 8 8 0 9 1 6 0 8 1 6 9 8 4 5 7 8 2 0 6 6 9 0 1 5 3 4 4 6 6 0 6 1 7 3 1
 8 3 3 5 3 3 8 7 1 9 9 1 0 8 7 5 7 4 9 3 7 8 9 1 7 8 2 4 9 3 4 9 0 0 7 1 4
 2 7 5 7 8 6 6 9 9 1 3 9 2 5 9 6 4 9 3 1 2 6 2 7 3 2 8 5 6 2 9 6 3 9 6 8 2
 7 2 1 0 8 9 3 3 6 1 8 8 1 0 9 8 0 6 7 5 5 9 5 4 7 7 9 7 2 9 6 6 7 4 1 7 2
 9 0 7 7 4 6 7 3 2 7 8 1 

0.834

In [20]:
np.save('w1.npy', w1)
np.save('b1.npy', b1)
np.save('w2.npy', w2)
np.save('b2.npy', b2)