In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data = np.array(data)
m,n = data.shape # m test digits(rows), n pixels(columns)
np.random.shuffle(data)

data_dev = data[0:1000].T # transpose data matrix for ease of use, 784xm
Y_dev = data_dev[0] # 1xm matrix with labels, indicates correct answer
X_dev = data_dev[1:n] # 784xm matrix with image data, each column is one digit
X_dev = X_dev / 255

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255

In [5]:
def initWB(n_h1,n_h2):
    W1 = np.random.rand(n_h1, 784) # n_h1x784 matrix with random weights from [0,1) (uniformly chosen)
    b1 = np.random.rand(n_h1, 1) # n_h1x1 vector with random biases from [0,1)
    W2 = np.random.rand(n_h2, n_h1) # n_h2xn_h1
    b2 = np.random.rand(n_h2, 1) # n_h2x1
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def ReLU_deriv(Z):
    return Z>0

# def softmax(Z):
#     A = np.exp(Z) / sum(np.exp(Z))
#     return A

# def sig(x):
#     return 1/(1 + np.exp(-x))

def feedForward(W1,b1,W2,b2,X):
    Z1 = np.matmul(W1,X)+b1 # n_h1xm matrix of weighted sum of previous nodes 
    A1 = ReLU(Z1) # n_h1xm
    Z2 = np.matmul(W2,A1)+b2 # n_h2xm 
    A2 = ReLU(Z2) # n_h2xm
    return Z1,A1,Z2,A2
    
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    for i in range(0,Y.size):
        one_hot_Y[i,Y[i]]=1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y
    
def backProp(Z1, A1, Z2, A2, W1, W2, X, Y):
    m = X.shape[1] # number of test digits
    delta2 = (A2-Y)*ReLU_deriv(Z2) # n_h2xm matrix
    dW2 = np.matmul(delta2,A1.T)/m # n_h2xn_h1 matrix, matrix multiplying inherently involves summing up all test case errors which means division by m is necessary
    db2 = np.sum(delta2,axis=1,keepdims=True)/m # n_h2x1 matrix
    delta1 = np.matmul(W2.T,delta2)*ReLU_deriv(Z1) # n_h1xm matrix
    dW1 = np.matmul(delta1,X.T)/m # n_h1xm matrix
    db1 = np.sum(delta1,axis=1,keepdims=True)/m # n_h1x1 matrix
    return dW1, dW2, db1, db2

def updateWB(W1, W2, b1, b2, dW1, dW2, db1, db2, alpha):
    W1 -= alpha*dW1
    W2 -= alpha*dW2
    b1 -= alpha*db1
    b2 -= alpha*db2
    return W1,W2,b1,b2
    

In [7]:
W1, b1, W2, b2 = initWB(10,10)
Z1,A1,Z2,A2 = feedForward(W1,b1,W2,b2,X_train)
Y = one_hot(Y_train)
dW1, dW2, db1, db2 = backProp(Z1, A1, Z2, A2, W1, W2, X_train, Y)
W1, b1, W2, b2 = updateWB(W1, W2, b1, b2, dW1, dW2, db1, db2, 0.3)
W1

array([[0.76707605, 0.87102312, 0.04397033, ..., 0.83448761, 0.87585582,
        0.17242601],
       [0.72223465, 0.70328492, 0.01067233, ..., 0.44243248, 0.82921569,
        0.24418707],
       [0.46953923, 0.80624413, 0.10348022, ..., 0.20618895, 0.60793705,
        0.08388533],
       ...,
       [0.92230382, 0.09039749, 0.73571999, ..., 0.52396755, 0.48418358,
        0.33053155],
       [0.8204838 , 0.92915304, 0.90789433, ..., 0.80381561, 0.30945254,
        0.94660875],
       [0.29383815, 0.2569123 , 0.24200666, ..., 0.79621298, 0.74981879,
        0.13157956]])