In [177]:
import pandas as pd
import numpy as np
import random

In [121]:
train_path="./mnist_train_small.csv"
test_path="./mnist_test.csv"
train=pd.read_csv(train_path)
test=pd.read_csv(test_path)

In [199]:
train=np.array(train)
m,n=train.shape
np.random.shuffle(train)
X=train[:,1:n].T
X=X/255
Y=train[:,0]

In [171]:
test=np.array(test)
x_test=test[:,1:n].T
x_test=x_test/255
y_test=test[:,0]

In [None]:
def _init_params():
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5                           
    return W1, b1, W2, b2

def ReLu(Z):
    return np.maximum(0, Z)

def softmax(Z):
    A=np.exp(Z)/sum(np.exp(Z))
    return A

def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLu(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def one_hot_encoding(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def deriv_ReLu(Z):
    return (Z > 0).astype(float) 

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    m = Y.size
    one_hot_Y = one_hot_encoding(Y)
    dZ2 = A2 - one_hot_Y  
    dW2 = 1/m * dZ2.dot(A1.T)  
    db2 = 1/m * np.sum(dZ2, axis=1, keepdims=True) 
    dZ1 = W2.T.dot(dZ2) * deriv_ReLu(Z1)  
    dW1 = 1/m * dZ1.dot(X.T)  
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims=True)  
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 -= alpha * dW1
    b1 -= alpha * db1
    W2 -= alpha * dW2
    b2 -= alpha * db2
    return W1, b1, W2, b2

def get_predictions(A2):
    return np.argmax(A2, axis=0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha = 0.1, iterations=1000):
    W1, b1, W2, b2 = _init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 100 == 0:
            predictions = get_predictions(A2)
            accuracy = get_accuracy(predictions, Y)
            print(f"Iteration: {i}, Accuracy: {100 * accuracy:.4f}%")
    return W1, b1, W2, b2


In [203]:
W1,b1,W2,b2=gradient_descent(X,Y,0.1,1000)

Iteration: 0, Accuracy: 11.8106%
Iteration: 100, Accuracy: 69.1335%
Iteration: 200, Accuracy: 79.0390%
Iteration: 300, Accuracy: 82.8691%
Iteration: 400, Accuracy: 84.8592%
Iteration: 500, Accuracy: 86.0543%
Iteration: 600, Accuracy: 86.9793%
Iteration: 700, Accuracy: 87.5644%
Iteration: 800, Accuracy: 88.1044%
Iteration: 900, Accuracy: 88.5894%


In [209]:
_,_,_,A2=forward_prop(W1, b1, W2, b2, x_test)
predictions=get_predictions(A2)
accuracy=get_accuracy(predictions,y_test)
print(accuracy)

0.8858885888588859
