In [1]:
"""import our data"""

import numpy as np
from data import get_data_from_csv, get_labels_and_data_1st_column

# DATA FROM HERE: https://pjreddie.com/projects/mnist-in-csv/
file_test = '../data/MNIST/mnist_test.csv'
file_train = '../data/MNIST/mnist_train.csv'

data_test, m_test, n_test = get_data_from_csv(file_test)
Y_test, X_test = get_labels_and_data_1st_column(data_test)

data_train, m_train, n_train = get_data_from_csv(file_train)
Y_train, X_train = get_labels_and_data_1st_column(data_train)

assert n_test == n_train
n = n_test
m = m_test + m_train



In [2]:
"""making sure that our Y_test/Y_train are actually labels"""

assert max(Y_test) == 9
assert max(Y_train) == 9
assert max(X_test[0]) != 9
assert max(X_train[0]) != 9

# display(Y_test[:100])
# display(Y_train[:100])
# display(X_test[500][:100])
# display(X_train[500][:100])

In [3]:
# normalize pixel data to values between 0 and 1. Max = 255, min = 0, thus divide by 255
X_train = X_train / 255.
X_test = X_test / 255.

In [4]:
"""
FORWARD PASS
Give X
A0 = X :: [784,1]
A1[10,1] = W1[10,784] * X[784,1] + b1[10,1]
Z1[10,1] = RelU(A1[10,1])
A2[10,1] = W2[10,10] * Z1[10,1] + b2[10,1]
Y_hat[10,1] = softmax(A2[10,1])
Receive Y_hat
"""


def initialize_w_b():
    W1 = np.random.randn(10, 784)
    b1 = np.random.randn(10,)
    W2 = np.random.randn(10,10)
    b1 = np.random.randn(10,)
    return W1, b1, W2, b1

def ReLU(n: float) -> float:
    """rectified linear unit activation function"""
    return np.maximum(n, 0)

def ReLU_deriv(n: float) -> int:
    """"derivative of ReLU"""
    return n > 0

def softmax(array: np.array) -> np.array:
    return np.exp(array) / sum(np.exp(array))

def forward_pass(X, W1, b1, W2, b2):
    A0 = X
    Z1 = W1.dot(A0) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return A2, Z2, A1, Z1

def one_hot_encode(Y: np.array, classes = None):
    # first instantiate 0's which should be an array of len(Y) max(Y) 
    if classes == None:
        classes = np.max(Y)+1
    one_hot = np.zeros((Y.size,classes))
    one_hot[np.arange(Y.size), Y] = 1
    return one_hot.T


def backwards_propagation(Y_hat, Y, Z2, A1, Z1, W1, b1, W2, b2, m, X):
    """
    Y_hat = [.1, .1, .7, .1]
    Y = [0,0,1,0]
    dZ2 = [.1, .1, -.3, .1]

    L = Y - Y_hat
    dL/dY_hat = -1 
    dY_hat/dZ2 = 1
    
    dZ2 = dL/dZ2 = dL/dY_hat * dY_hat/dZ2 = (Y_hat - Y)     # * 1 / m
    
    dW2 = dL/dW2 = dL/dZ2 * dZ2/dW2 = dZ2 * 1 / m * A1
    db2 = dL/db2 = dL/dZ2 * dZ2/db2 = dZ2 * 1 / m 
    
    dA1 = dL/dA1 = dL/Z2 * dZ2/dA1 = dZ2 * W2
    
    dZ1 = dL/dZ1 = dL/dA1 * dA1/dZ1 = dA1 * ReLU_deriv(Z1)

    dW1 = dL/dW1 = dL/dZ1 * dZ1/dW1 = dZ1 * 1 / m  * X
    db1 = dL/db1 = dL/dZ1 * dZ1/db1 = dZ1 * 1 / m 


    CALCULATIONS
    dZ2[10,] = Y_hat[10,] - Y[10,]
    dW2[10,10] = 1/m * dZ2[10,1] * A1[10,1].T[1,10]
    dA1[10,1] = W2.T[10,10].dot(dZ2[10,1]) # something weird here
    dZ1[10,1] = dA1[10,1] * ReLU_deriv(Z1[10,1])[10,1]


    """
    m = Y.size
    print(f"{Y_hat.shape=}, {Y.shape=}")
    dZ2 = Y_hat - Y

    print(f"{dZ2.shape=}, {dZ2=}")
    dW2 = 1/m * dZ2.dot(A1.T)
    db2 = 1/m * np.sum(dZ2) 
    dA1 = W2.T.dot(dZ2)
    print(f"{dA1.shape=}")
    dZ1 = dA1 * ReLU_deriv(Z1)

    dW1 = 1/m * dZ1.dot(X.T)
    db1 = 1/m * np.sum(dZ1)

    
    return dW1, db1, dW2, db2




def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    
    return W1, b1, W2, b1


In [5]:
# Y = np.array([0,1,3,4,4,2])
# one_hot = one_hot_encode(Y)
# print(one_hot)

In [6]:
W1, b1, W2, b2 = initialize_w_b()

X = X_train[:,0]
Y = Y_train[0]

A2, Z2, A1, Z1 = forward_pass(X.T, W1, b1, W2, b2)

one_hot_Y = one_hot_encode(Y,classes=10)
print(one_hot_Y)
print(A2)
print(f"{A2.shape=}, {one_hot_Y.shape=}")

print(X)


[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]]
[3.66658505e-16 1.00000000e+00 6.95733620e-41 2.53263020e-21
 4.62930106e-33 7.30932477e-15 1.01901723e-38 2.47827613e-28
 5.26258381e-38 5.40396410e-13]
A2.shape=(10,), one_hot_Y.shape=(10, 1)
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.05098039 0.87843137 0.99607843 0.2

In [166]:
A2[:,None] - one_hot_Y

array([[ 1.36764689e-17],
       [ 2.83178220e-05],
       [ 3.69813936e-20],
       [ 1.49551229e-21],
       [-1.00000000e+00],
       [ 5.34516582e-10],
       [ 1.73603246e-17],
       [ 9.99971682e-01],
       [ 6.59569643e-27],
       [ 2.69530050e-27]])

In [154]:
A2

array([1.36764689e-17, 2.83178220e-05, 3.69813936e-20, 1.49551229e-21,
       4.55864409e-22, 5.34516582e-10, 1.73603246e-17, 9.99971682e-01,
       6.59569643e-27, 2.69530050e-27])

In [8]:
dW1, db1, dW2, db2 = backwards_propagation(A2[:,None], one_hot_Y, Z2, A1, Z1, W1, b1, W2, b2, m, X)

Y_hat.shape=(10, 1), Y.shape=(10, 1)
dZ2.shape=(10, 1), dZ2=array([[ 3.66658505e-16],
       [ 1.00000000e+00],
       [ 6.95733620e-41],
       [ 2.53263020e-21],
       [ 4.62930106e-33],
       [ 7.30932477e-15],
       [-1.00000000e+00],
       [ 2.47827613e-28],
       [ 5.26258381e-38],
       [ 5.40396410e-13]])


ValueError: shapes (10,1) and (10,) not aligned: 1 (dim 1) != 10 (dim 0)

In [156]:
update_params(dW1, db1, dW2, db2)

NameError: name 'dW1' is not defined

In [None]:
"""
a = [0, .5, 2, 1]

sums = e^0 + e^.5 + e^2 + e^1


softmax(a) = [e^0/sums, e^.5/sums, e^2/sums, e^1/sums]

"""