In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
# Get data in np array
data = pd.read_csv("data/train.csv")

In [12]:
# Change format to np
data = np.array(data)

# Get shape of data and calculate where to split test set
m, n = data.shape
split = (int)(0.2 * m)

In [13]:
m

42000

In [14]:
# Randomize data order
np.random.shuffle(data)

# Get test data
data_test = data[0:split].T
y_test = data_test[0]
X_test = data_test[1:n]
X_test = X_test / 255

In [15]:
# Get train data
data_train = data[split:m].T
y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255

In [16]:
def init_params():
    '''
    Randomizes weights (W) and biases (b) for initialization

    Returns W1, b1, W2, b2
    '''

    # Randomized values from -0.5 to 0.5 in matrices for weights and biases
    # Layer 1
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5

    # Layer 2
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5

    return W1, b1, W2, b2

def ReLU(Z):
    '''
    Applies ReLU function to unactivated layer (Z)

    Returns A
    '''

    # By returning max of current value or 0, makes all negatives 0 aka ReLU function
    A = np.maximum(0, Z)

    return A

def softmax(Z):
    '''
    Applies softmax function to unactivated layer (Z)

    Returns A
    '''

    # Returns e to the power of each value in the current layer divided by the sum of all e to the powers in current layer aka softmax
    A = np.exp(Z) / sum(np.exp(Z))

    return A

def forward_prop(W1: np.ndarray, b1: np.ndarray, W2: np.ndarray, b2: np.ndarray, X: np.ndarray):
    '''
    Creates unactivated layers (Z) from input, weights, and biases
        then activates those layers with activation functions, turning them into A

    Returns Z1, A1, Z2, A2
    '''

    # Creates Z1 layer from W1, X, and b1 and activates with ReLU
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)

    # Creates Z2 layer from W2, A1, and b2 and activates with softmax
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)

    return Z1, A1, Z2, A2

def one_hot(Y: np.ndarray, n_classes: int):
    '''
    Makes an array with a new array of size equal to number of classes with the nth value as 
        one to indicate class and all others as 0 for each n in Y

    Returns one_hot_Y
    '''

    # Creates number of arrays equal to size of Y made up of amount of 0s equal to n_classes
    one_hot_Y = np.zeros((Y.size, n_classes))

    # For each array, make the index, given by current value in 1, 0
    one_hot_Y[np.arange(Y.size), Y] = 1
    
    # Transpose matrix
    one_hot_Y = one_hot_Y.T

    return one_hot_Y

def deriv_ReLU(Z: np.ndarray):
    '''
    Applies the derivative of the ReLU function by replacing value with 1 if greater than 0,
        as slope of ReLU is 1 for 0 and beyond, and 0 if less than 0, as the slope of ReLU
        is 0 for less than 0
    
    Returns g
    '''

    g = Z > 0

    return g

def back_prop(Z1: np.ndarray, A1: np.ndarray, Z2: np.ndarray, A2: np.ndarray, W2: np.ndarray, X: np.ndarray, Y: np.ndarray, n_classes: int):
    '''
    Gets the difference from the predicted and actual values and essentially performs steps in
        reverse to find out how much the original weights were off

    Returns dW1, db1, dW2, db2 
    '''

    # Get one hot encoded Y values
    one_hot_Y = one_hot(Y, n_classes)

    # Calculate difference of Z2 by getting difference between activated layer 2 and actual Ys
    dZ2: np.ndarray = 1 / m * (A2 - one_hot_Y)

    # Calculate difference of W2 by calculating 1/m times the dot-product of the differences of Z2 and the tranpose of A1
    dW2: np.ndarray = dZ2.dot(A1.T)

    # Calculate difference of b2 by calculating 1/m times the sum of the values of difference of Z2
    db2: np.ndarray = np.sum(dZ2)

    # Calculate difference of Z1 by the dot-product of the transpose of W2 and the differences of Z2 times the derivative of ReLU applied to Z1
    dZ1: np.ndarray = W2.T.dot(dZ2) * deriv_ReLU(Z1)

    # Calculate difference of W1 by calculating 1/m times the dot-product of the differences of Z1 and the tranpose of X
    dW1: np.ndarray = dZ1.dot(X.T)

    # Calculate difference of b1 by calculating 1/m times the sum of the values of difference of Z1
    db1: np.ndarray = np.sum(dZ1)

    return dW1, db1, dW2, db2

def update_params(W1: np.ndarray, b1: np.ndarray, W2: np.ndarray, b2: np.ndarray, dW1: np.ndarray, db1: np.ndarray, dW2: np.ndarray, db2: np.ndarray, alpha: float):
    '''
    Updates params by subtracting the calculated difference multiplied by 
        alpha, the learning rate

    Returns W1, b1, W2, b2
    '''
    
    #Update weights and biases
    # Layer 1
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1

    # Layer 2
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2

    return W1, b1, W2, b2



In [17]:
def get_preds(A2: np.ndarray):
    '''
    Gets predictions from last activated layer (A2) by getting the max value
        for each array
    
    Returns preds
    '''

    preds = np.argmax(A2, 0)

    return preds

def get_accuracy(preds: np.ndarray, Y: np.ndarray):
    '''
    Gets accuracy of predictions vs real by summing up all the occurences of
        predictions being equal to real and dividing by the size
    
    Returns accuracy_score
    '''

    accuracy_score = np.sum(preds == Y) / Y.size

    return accuracy_score

def get_accuracy_test(W1: np.ndarray, b1: np.ndarray, W2: np.ndarray, b2: np.ndarray, X: np.ndarray, Y: np.ndarray):
    '''
    Gets accuracy of predictions vs real by summing up all the occurences of
        predictions being equal to real and dividing by the size
    
    Returns accuracy_score
    '''

    # Get last activated layer with forward_prop
    Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)

    # Get predictions from data in last activated layer
    preds = get_preds(A2)

    # Calculate accuracy score by summing up the ones that match the real and divide by size
    accuracy_score = np.sum(preds == Y) / Y.size

    return accuracy_score

def gradient_descent(X: np.ndarray, Y: np.ndarray, iterations: int, alpha: float, n_classes: int, X_test_data: np.ndarray, Y_test_data: np.ndarray):
    '''
    Goes through learning process, printing out accuracy every 
    '''

    # Get initial random weights
    W1, b1, W2, b2 = init_params()

    # Loop for each iteration
    for i in range(iterations + 1):
        
        # Get layers with forward_prop
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)

        # Find error with back_prop
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W2, X, Y, n_classes)

        # Update params based off error and learning rate
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)

        # Every 50 iterations print out stats
        if i % 50 == 0:
            print(f"Iteration: {i}")
            print(f"Train Accuracy: {get_accuracy(get_preds(A2), Y)}")
            print(f"Test Accuracy: {get_accuracy_test(W1, b1, W2, b2, X_test_data, Y_test_data)}")
            print()
    
    # Return final weights and biases of model
    return W1, b1, W2, b2

In [18]:
W1, b1, W2, b2 = gradient_descent(X_train, y_train, 1000, 0.5, 10, X_test, y_test)

Iteration: 0
Train Accuracy: 0.10401785714285715
Test Accuracy: 0.1767857142857143

Iteration: 50
Train Accuracy: 0.6432738095238095
Test Accuracy: 0.6422619047619048

Iteration: 100
Train Accuracy: 0.7748214285714285
Test Accuracy: 0.7783333333333333

Iteration: 150
Train Accuracy: 0.825
Test Accuracy: 0.8228571428571428



KeyboardInterrupt: 