In [29]:
import numpy as np
import pandas as pd

In [30]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df = np.array(df)
m, n = df.shape
np.random.shuffle(df) # shuffle before splitting into dev and training sets

df_dev = df[0:1000].T
Y_dev = df_dev[0]
X_dev = df_dev[1:n]
X_dev = X_dev / 255.

df_train = df[1000:m].T
Y_train = df_train[0]
X_train = df_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape


In [32]:
def init_para():
    W1 = np.random.randn(10,784)* np.sqrt(2/784)
    b1 = np.random.randn(10,1)-0.5
    W2 = np.random.randn(10,10)* np.sqrt(2/10)
    b2 = np.random.randn(10,1)-0.5
    W3 = np.random.randn(10,10)* np.sqrt(2/10)
    b3 = np.random.randn(10,1)-0.5
    return W1, b1, W2, b2,W3,b3


def Leaky_ReLU(Z, alpha=0.01):
    return np.where(Z > 0, Z, alpha*Z)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A


def forward(W1, b1, W2, b2,W3,b3,X):
    A0 = X
    Z1 = W1.dot(A0)+b1
    A1 = Leaky_ReLU(Z1)
    Z2 = W2.dot(A1)+b2
    A2 = Leaky_ReLU(Z2)
    Z3 = W3.dot(A2)+b3
    A3 = Leaky_ReLU(Z3)
    return  Z1,A1,Z2,A2,Z3,A3


def Leaky_ReLU_derivative(Z, alpha=0.01):
    dZ = np.ones_like(Z)
    dZ[Z < 0] = alpha
    return dZ


def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, Z3, A3, W3, W2, X, Y):
    m = len(Y)
    one_hot_Y = one_hot(Y)
    dz3 = 2*(A3 - one_hot_Y)
    dw3 = 1/m * dz3.dot(A2.T) 
    db3 = np.sum(dz3) / m
    dz2 = W3.T.dot(dz3) * Leaky_ReLU_derivative(Z2)
    dw2 = dz2.dot(A1.T) / m
    db2 = np.sum(dz2) / m
    dz1 = W2.T.dot(dz2) * Leaky_ReLU_derivative(Z1)
    dw1 = dz1.dot(X.T) / m
    db1 = np.sum(dz1) / m
    return dw3, db3, dw2, db2, dw1, db1


def update_para(W1, b1, W2, b2,W3,b3,dW1,db1,dW2,db2,dW3,db3,alpha):
    W1 = W1 - alpha*dW1
    W2 = W2 - alpha*dW2
    W3 = W3 - alpha*dW3
    b1 = b1 - alpha*db1
    b2 = b2 - alpha*db2
    b3 = b3 - alpha*db3
    return W1,b1,W2,b2,W3,b3


def get_predictions(A):
    return np.argmax(A, 0)

def get_accuracy(predictions, Y):
    print(predictions,Y)
    return np.mean(predictions == Y) * 100




def gradient_descent_no_bacth(X, Y, alpha):
    W1, b1, W2, b2, W3, b3 = init_para()
    Z1, A1, Z2, A2, Z3, A3 = forward(W1, b1, W2, b2, W3, b3, X)
    dW3, db3, dW2, db2, dW1, db1 = backward_prop(Z1, A1, Z2, A2, Z3, A3, W3, W2, X, Y)
    W1, b1, W2, b2, W3, b3 = update_para(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)
    predictions = get_predictions(A3)
    while(get_accuracy(predictions, Y)<90):
        Z1, A1, Z2, A2, Z3, A3 = forward(W1, b1, W2, b2, W3, b3, X)
        dW3, db3, dW2, db2, dW1, db1 = backward_prop(Z1, A1, Z2, A2, Z3, A3, W3, W2, X, Y)
        W1, b1, W2, b2, W3, b3 = update_para(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)
        predictions = get_predictions(A3)
        
    
    
    print(f"Accuracy: {get_accuracy(predictions, Y)}")
    return W1, b1, W2, b2, W3, b3, A1, A2, A3

In [34]:
W1, b1, W2, b2,W3,b3,A1,A2,A3 = gradient_descent_no_bacth(X_train, Y_train, 0.1)

[7 7 7 ... 7 7 7] [0 3 3 ... 1 1 3]
[3 3 3 ... 3 4 3] [0 3 3 ... 1 1 3]
[4 4 4 ... 4 4 4] [0 3 3 ... 1 1 3]
[4 4 4 ... 4 7 4] [0 3 3 ... 1 1 3]
[4 4 4 ... 4 7 4] [0 3 3 ... 1 1 3]
[3 4 4 ... 3 7 4] [0 3 3 ... 1 1 3]
[3 2 4 ... 2 7 3] [0 3 3 ... 1 1 3]
[2 1 2 ... 1 1 2] [0 3 3 ... 1 1 3]
[3 4 4 ... 2 1 4] [0 3 3 ... 1 1 3]
[8 1 5 ... 1 1 5] [0 3 3 ... 1 1 3]
[4 4 4 ... 4 4 4] [0 3 3 ... 1 1 3]
[7 7 7 ... 4 4 7] [0 3 3 ... 1 1 3]
[4 4 4 ... 4 4 4] [0 3 3 ... 1 1 3]
[4 4 4 ... 4 4 4] [0 3 3 ... 1 1 3]
[3 4 4 ... 4 4 4] [0 3 3 ... 1 1 3]
[3 4 3 ... 3 3 3] [0 3 3 ... 1 1 3]
[3 3 3 ... 3 3 3] [0 3 3 ... 1 1 3]
[5 3 3 ... 3 3 3] [0 3 3 ... 1 1 3]
[5 2 3 ... 3 3 3] [0 3 3 ... 1 1 3]
[5 2 2 ... 2 2 5] [0 3 3 ... 1 1 3]
[8 2 2 ... 2 2 5] [0 3 3 ... 1 1 3]
[8 2 2 ... 2 2 5] [0 3 3 ... 1 1 3]
[8 2 2 ... 2 2 5] [0 3 3 ... 1 1 3]
[8 2 2 ... 2 2 5] [0 3 3 ... 1 1 3]
[8 2 2 ... 6 6 6] [0 3 3 ... 1 1 3]
[8 6 2 ... 6 6 5] [0 3 3 ... 1 1 3]
[8 2 2 ... 6 6 6] [0 3 3 ... 1 1 3]
[8 6 6 ... 6 6 8] [0 3 3 ...