For this example we'll utilize the [MNIST](https://www.wikiwand.com/en/MNIST_database) dataset. A dataset of handwriten digits.

The pixel training image is 28x28 pixels = 784 pixels overall. Each pixel has a value between 0 and 255. 255 being completely white and 0 being completely black.

The pixels can be represented as a matrix where each row will be 784 columns long, because each row will correspond to each pixel in the image. Then the matrix will be transposed (i.e. columns become rows and rows become columns).



In [1]:
# Import the essential libraries for numerical functions, dataset and plots, respectively.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Get the train data
data = pd.read_csv("../src/dataset/train.csv")

In [3]:
# Show the train data
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Convert the data into Numpy Array
data = np.array(data)
rows_amount, features_amount_plus_one = data.shape
np.random.shuffle(data)

# Set the dev data & transpose it
data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:features_amount_plus_one]
X_dev = X_dev / 255.

# Set the train data & transpose it
data_train = data[1000:rows_amount].T
Y_train = data_train[0]
X_train = data_train[1:features_amount_plus_one]
X_train = X_train / 255.
_,m_train = X_train.shape

In [5]:
#X_train[:, 0].shape
Y_train

array([2, 3, 8, ..., 4, 9, 5], dtype=int64)

In [6]:
# Initiate parameters
def initiate_parameters():
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A

# Forward propagation
def forward_propagation(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def derivative_ReLU(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y


# Backward propagation
def backward_propagation(Z1, A1, Z2, A2, W1, W2, X, Y):
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1/rows_amount * dZ2.dot(A1.T)
    db2 = 1/rows_amount * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * derivative_ReLU(Z1)
    dW1 = 1/rows_amount * dZ1.dot(X.T)
    db1 = 1/rows_amount * np.sum(dZ1)
    return dW1, db1, dW2, db2

def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1, b1, W2, b2

In [7]:
#def loss(X, Y, A2):
    #rows_amount = Y.size
    #log_likelihood = - np.log(A2[Y, range(rows_amount)])
    #loss = 1 / rows_amount * np.sum(log_likelihood)
    #return loss

def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = initiate_parameters()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_propagation(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_propagation(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2


In [8]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.1, 500)

Iteration:  0
[6 0 9 ... 9 5 0] [2 3 8 ... 4 9 5]
0.11290243902439025
Iteration:  10
[2 0 8 ... 9 5 0] [2 3 8 ... 4 9 5]
0.16385365853658537
Iteration:  20
[2 0 8 ... 6 4 7] [2 3 8 ... 4 9 5]
0.22541463414634147
Iteration:  30
[2 0 8 ... 6 4 7] [2 3 8 ... 4 9 5]
0.2795609756097561
Iteration:  40
[2 0 8 ... 6 4 7] [2 3 8 ... 4 9 5]
0.3298536585365854
Iteration:  50
[2 0 8 ... 6 8 0] [2 3 8 ... 4 9 5]
0.37939024390243903
Iteration:  60
[2 0 8 ... 6 8 0] [2 3 8 ... 4 9 5]
0.42617073170731706
Iteration:  70
[2 0 8 ... 6 8 0] [2 3 8 ... 4 9 5]
0.47053658536585363
Iteration:  80
[2 0 8 ... 6 8 0] [2 3 8 ... 4 9 5]
0.5092439024390244
Iteration:  90
[2 3 8 ... 6 8 0] [2 3 8 ... 4 9 5]
0.5432926829268293
Iteration:  100
[2 3 8 ... 9 8 0] [2 3 8 ... 4 9 5]
0.5741463414634146
Iteration:  110
[2 3 8 ... 9 8 0] [2 3 8 ... 4 9 5]
0.6011463414634146
Iteration:  120
[2 3 8 ... 9 9 0] [2 3 8 ... 4 9 5]
0.6246829268292683
Iteration:  130
[2 3 8 ... 9 9 0] [2 3 8 ... 4 9 5]
0.6440975609756098
Iteration: 