In [8]:
"""import our data"""

import numpy as np
from data import get_data_from_csv, get_labels_and_data_1st_column

# DATA FROM HERE: https://pjreddie.com/projects/mnist-in-csv/
file_test = '../data/MNIST/mnist_test.csv'
file_train = '../data/MNIST/mnist_train.csv'

data_test, m_test, n_test = get_data_from_csv(file_test)
Y_test, X_test = get_labels_and_data_1st_column(data_test)

data_train, m_train, n_train = get_data_from_csv(file_train)
Y_train, X_train = get_labels_and_data_1st_column(data_train)

assert n_test == n_train
n = n_test
m = m_test + m_train



In [9]:
"""making sure that our Y_test/Y_train are actually labels"""

assert Y_test.max() == 9
assert Y_train.max() == 9
assert X_test[0].max() != 9
assert X_train[0].max() != 9

# display(Y_test[:100])
# display(Y_train[:100])
# display(X_test[500][:100])
# display(X_train[500][:100])

In [10]:
# normalize pixel data to values between 0 and 1. Max = 255, min = 0, thus divide by 255
X_train = X_train / 255.
X_test = X_test / 255.

In [11]:
def loss(Y, Y_hat):
    return 1/2 * np.sum((Y - Y_hat)**2)

def softmax(s):
    return np.exp(s) / sum(np.exp(s))

def softmax_deriv(s):
    sm = softmax(s)
    return sm.dot(1-sm)

LR = 1
Y = np.array([0,0,1,0])
A1 = np.array([-.5, 2, -1, 2])
W1 = W2 = np.random.randn(4, 4)
b1 = b2 = np.random.randn(4)
epochs = 100

print()
print("GOING WITH softmax_deriv for real")
for epoch in range(epochs):
    # print()
    # print(f"{epoch = }")
    # print(f"{Y = }")
    Z2 = np.dot(W2,A1) + b2

    Y_hat = softmax(Z2)
    # print(f"{Y_hat = }")
    Loss  =  loss(Y,Y_hat)
    if not epoch%(epochs//2):
        print(f"\n{W2 = }\n{b2 = }\n{Loss = }")

    DY_hat = -Y + Y_hat
    # print(f"{DY_hat = }")
    dY_hatdZ2  =  softmax_deriv(Z2)
    # print(f"{dY_hatdZ2 = }")
    DZ2  =  DY_hat * dY_hatdZ2
    # print(f"{DZ2 = }")
    dZ2dW2 = A1
    dZ2db2 = 1
    DW2 = DZ2 * A1
    Db2 = DZ2

    W2 = W2 - LR*DW2
    b2 = b2 - LR*Db2

print()
print("GOING WITH softmax_deriv is just 1")
for epoch in range(epochs):
    # print()
    # print(f"{epoch = }")
    # print(f"{Y = }")
    Z2 = np.dot(W1,A1) + b1

    Y_hat = softmax(Z2)
    # print(f"{Y_hat = }")
    Loss  =  loss(Y,Y_hat)
    if not epoch%(epochs//2):
        print(f"\n{W1 = }\n{b1 = }\n{Loss = }")

    DY_hat = -Y + Y_hat
    # print(f"{DY_hat = }")
    dY_hatdZ2  =  1
    # print(f"{dY_hatdZ2 = }")
    DZ2  =  DY_hat * dY_hatdZ2
    # print(f"{DZ2 = }")
    dZ2dW1 = A1
    dZ2db1 = 1
    DW1 = DZ2 * A1
    Db1 = DZ2

    W1 = W1 - LR*DW1
    b1 = b1 - LR*Db1


GOING WITH softmax_deriv for real

W2 = array([[-0.99425939, -0.90462065,  0.70144063, -0.17468478],
       [ 0.48365069,  1.24297392,  1.11278452, -0.70666226],
       [ 0.03642154,  0.82449719,  0.89398637, -1.24945377],
       [-0.32258708,  1.13457837,  0.73162431, -0.20000658]])
b2 = array([-0.36804239, -0.31741067,  0.141857  , -0.03714414])
Loss = 0.7940905229516391

W2 = array([[-0.90595812, -2.86148666, -2.95014179, -5.16777855],
       [ 0.57195195, -0.7138921 , -2.53879791, -5.69975603],
       [ 0.12472281, -1.13236882, -2.75759605, -6.24254754],
       [-0.23428581, -0.82228764, -2.91995812, -5.19310035]])
b2 = array([-0.54464493, -1.29584368,  3.79343943, -2.53369102])
Loss = 0.0032520580413038314

GOING WITH softmax_deriv is just 1

W1 = array([[-0.99425939, -0.90462065,  0.70144063, -0.17468478],
       [ 0.48365069,  1.24297392,  1.11278452, -0.70666226],
       [ 0.03642154,  0.82449719,  0.89398637, -1.24945377],
       [-0.32258708,  1.13457837,  0.73162431, -0.200

In [176]:
"""
FORWARD PASS
Give X
A0 = X :: [784,m]
Z1[10,m] = W1[10,784] * X[784,m] + b1[10]
A1[10,m] = RelU(Z1[10,m])
Z2[10,m] = W2[10,10] * A1[10,m] + b2[10]
Y_hat[10,m] = softmax(A2[10,m])
Receive Y_hat
"""


def initialize_w_b():
    W1 = np.random.randn(10, 784)
    b1 = np.random.randn(10,1)
    W2 = np.random.randn(10,10)
    b1 = np.random.randn(10,1)
    return W1, b1, W2, b1

def ReLU(n: float) -> float:
    """rectified linear unit activation function"""
    return np.maximum(n, 0)

def ReLU_deriv(n: float) -> int:
    """"derivative of ReLU"""
    return n > 0

def softmax(array: np.array) -> np.array:
    # collapses 1 dimension of array
    return np.exp(array) / np.sum(np.exp(array), axis=0)

def forward_pass(X, W1, b1, W2, b2):
    Z1 = np.dot(W1, X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    Y_hat = softmax(Z2) 
    return X, Z1, A1, Z2, Y_hat

def one_hot_encode(Y: np.array, classes = 10):
    # first instantiate 0's which should be an array of len(Y) max(Y) 
    one_hot = np.zeros((Y.size,classes))
    one_hot[np.arange(Y.size), Y] = 1
    return one_hot.T


def backwards_propagation(Y_hat, Y, Z2, A1, Z1, W1, b1, W2, b2, m, X):
    """


    
    n = number of possible encodings # in general, n can change through a network,
        but we're assuming that n is used for encodings and also layer size
    m = number of inputs, batch size

    x = number of datapoints per input
    X[x,m] = Input - m examples of x data points
    Y[n,m] = one-hot encoding of results: e.g. [[0,0,0,1,0,0], ... ] (with m encodings)
    W1[n,x] = weights applied to X
    b1[n,1] = biases applied to values going into Z1_n
    Z1[n,m] = pre-activation function values = W1[n,x] dot X[x,m] + b1[n,1]
    A1[n,m] = ReLU(Z1[n,m])
    W2[n,n] = weights applied to A1
    b2[n,1] = biases applied to values going into Z2_n
    Z2[n,m] = pre-softmax function values = W2[n,n] dot A1[n,m] + b2[n,1]
    Y_hat[n,m] = the estimate of Y = softmax(Z2[n,m])
    
    definition: Loss = L[m] = [L_0, L_1, ..., L_m] # possiblly L[1,m]
        = 1/2*np.sum((Y - Y_hat)**2)
        in other words for each row m:
            L_m = 1/2*sum[j from 0 to n] of (Y[j,m] - Y_hat[j,m])**2 
            = sum[j from 0 to n] of 1/2*Y[j,m]**2 - Y[j,m]*Y_hat[j,m] + 1/2*Y_hat[j,m]**2
(1)     L = np.sum(1/2*Y**2 - Y*Y_hat + 1/2*Y_hat**2)
    We set this loss specifically so that the derivative works out nicely

    To minimize L, we want to see how L will change with respect to the variable
    that we can control, namely Y_hat.
    We can calculate how changing one of the values in Y_hat effects Y.
    DY_hat[j,m] = dL/dY_hat[j,m] = -Y[j,m] + Y_hat[j,m] 
    This generalizes because dL/dY_hat should be an array of shape Y.shape, with each
    value j corresponding to how much Y_hat[j] effects L given some specific input (m).
    NOTE: dL/dY_hat will show how to change Y_hat to *increase* L by 1 unit 
    (this will be true for all derivatives we calculate).
    dL/dY_hat = -Y + Y_hat (calculated derivative from (1))
    call this DY_hat for simplicity

    REMINDER: Y_hat[n,m] = softmax(Z2[n,m])
    NOTE: dSoftmax(X)/dX = softmax(X) dot (1-softmax(X)), but we will use 1 instead FOR SOME REASON???
    dY_hat/dZ2 = 1                    ####### not: softmax(Z2)*(1-softmax(Z2))
    thus, by chain rule:
    DZ2[n,m] = dL/dZ2 = dL/dY_hat * dY_hat/dZ2 = DY_hat * 1

    REMINDER: Z2[n,m] = W2[n,n] dot A1[n,m] + b2[n,1]
    dZ2/dW2[m,n] = A1[n,m].T[m,n] / m
    dZ2/db2[m,1] = np.sum(Z2[n,m])[m] / m
    Now, were ready to calculate DW2, Db2
    DW2[n,n] = dL/dW2 = dL/dZ2 * dZ2/dW2 = DZ2[n,m] dot A1.T[m,n] / m
    Db2[n,1] = dL/db2 = dL/dZ2 * dZ2/db2 = DZ2[n,m] dot np.sum(Z2[n,m])[m,1] / m 

    
    dZ2/dA1 = W2[n,n]
    DA1[n,n] = dL/dA1 = dL/dZ2 * dZ2/dA1 = DZ2[n,m] * W2[n,n]
    DZ1[x,x] = dL/dA1 * dA1/dZ1 = DA1[n,n] * ReLU_deriv(Z1)

    REMINDER: Z1[n,m] = W1[n,x] dot X[x,m] + b1[n,1]
    dZ1/dW1[m,n] = X[n,m].T[m,n] / m
    dZ1/db1[m,1] = np.sum(Z1[n,m])[m] / m
    Now, were ready to calculate DW1, Db1
    DW1[x,n] = dL/dW1 = dL/dZ1 * dZ1/dW1 = DZ1[x,x] dot X.T[x,n] / m
    Db1[n,1] = dL/db1 = dL/dZ2 * dZ1/db1 = DZ2[n,m] dot np.sum(Z2[n,m])[m,1] / m 
    
    """
    m = Y.size
    Y = one_hot_encode(Y)
    print(f"{Y.shape}")
    DY_hat = -Y + Y_hat
    DZ2 = DY_hat * 1
    
    DW2 = np.dot(DZ2,A1.T) / m
    Db2 = np.dot(DZ2, np.sum(Z2)) / m

    DA1 = np.dot(W2.T, DZ2)
    DZ1 = DA1 * ReLU_deriv(Z1)

    DW1 = np.dot(DZ1, X.T) / m
    Db1 = np.dot(DZ2, np.sum(Z2)) / m

    return DW1, Db1, DW2, Db2


def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    
    return W1, b1, W2, b1


In [177]:
W1, b1, W2, b2 = initialize_w_b()



batch_size = 8
m = batch_size
# X = X_test[:,:m]
# Y = Y_test[:m]


# Y = one_hot_encode(Y)

Y = Y_train
X = X_train
X, Z1, A1, Z2, Y_hat = forward_pass(X, W1, b1, W2, b2)

DW1, Db1, DW2, Db2 = backwards_propagation(Y_hat, Y, Z2, A1, Z1, W1, b1, W2, b2, m, X)


(10, 59999)


In [178]:
print(f"{DW1.shape=}, {Db1.shape=}, {DW2.shape=}, {Db2.shape=}")

DW1.shape=(10, 784), Db1.shape=(10, 59999), DW2.shape=(10, 10), Db2.shape=(10, 59999)


In [179]:
W1, b1, W2, b1 = update_params(W1, b1, W2, b2, DW1, Db1, DW2, Db2, 0.01)

In [None]:
"""
a = [0, .5, 2, 1]

sums = e^0 + e^.5 + e^2 + e^1


softmax(a) = [e^0/sums, e^.5/sums, e^2/sums, e^1/sums]

"""