# Now (finally) we can implement the whole thing

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data/train.csv")

data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
data = data.sample(frac=1)

data

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
9292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17859,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24717,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35923,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15084,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19445,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32024,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4291,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38547,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Data Preparation

In [25]:
X = data.iloc[:, 1:].values
Y = data.iloc[:, 0].values

# normalize the data
X = X / 255.0

# takes 90% of the data for training
n = int(X.shape[0]*0.9)

X_train = X[:n].T # now each column is a digit
Y_train = Y[:n]

X_test = X[n:].T # also here each column is a digit
Y_test = Y[n:]

# print all the shapes
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)


(784, 37800) (37800,)
(784, 4200) (4200,)


In [26]:
# One hot encoding
Y_train = np.eye(10)[Y_train].T
Y_test = np.eye(10)[Y_test].T

# print the first 10 training and testsamples
print(Y_train[:10])
print(Y_test[:10])

[[1. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 1. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


In [27]:
Y_train.shape

(10, 37800)

# Some functions

In [28]:
def ReLu(Z):
    return np.maximum(0, Z)

def SoftMax(Z):
    return np.exp(Z) / np.sum(np.exp(Z), axis=0)

def dReLu(Z):
    return Z > 0

def dSoftMax(Z):
    return Z - np.eye(10)[Z]

def prediction(Z):
    return np.argmax(Z, axis=0)

def accuracy(Y_pred, Y_true):
    return np.sum(Y_pred == Y_true) / Y_true.shape[0]


In [29]:
def init_params():
    W1 = np.random.randn(10, 784) * 0.01
    b1 = np.zeros((10, 1))
    W2 = np.random.randn(10, 10) * 0.01
    b2 = np.zeros((10, 1))

    return W1, b1, W2, b2

def forward_prop(X, W1, b1, W2, b2):
    Z1 = W1 @ X + b1
    A1 = ReLu(Z1)
    Z2 = W2 @ A1 + b2
    A2 = SoftMax(Z2)

    return Z1, A1, Z2, A2

def backward_prop(X, Y, A1, A2, Z1, Z2, W1, W2, b1, b2):
    m = X.shape[1]
    dZ2 = A2 - Y
    dW2 = 1/m * dZ2 @ A1.T
    db2 = 1/m * np.sum(dZ2)
    dZ1 = W2.T @ dZ2 * dReLu(Z1)
    dW1 = 1/m * dZ1 @ X.T
    db1 = 1/m * np.sum(dZ1)

    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2

    return W1, b1, W2, b2

def gradient_descent(X_train, Y_train, alpha, num_iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(num_iterations):
        Z1, A1, Z2, A2 = forward_prop(X_train, W1, b1, W2, b2)
        dW1, db1, dW2, db2 = backward_prop(X_train, Y_train, A1, A2, Z1, Z2, W1, W2, b1, b2)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        
        if i % 10 == 0:
            print(f"Iteration {i} - Accuracy: {accuracy(prediction(A2), Y_train)}")

    return W1, b1, W2, b2



In [30]:
gradient_descent(X_train, Y_train, 0.01, 1000)

Iteration 0 - Accuracy: 3088.5


ValueError: operands could not be broadcast together with shapes (10,37800) (10,10) 

# Model

**Data**

* $\mathbb{X} \in \R^{m \times n}$ where $ m = 784$ and $n = 41000$, is our dataset where each column is a digit (aka an observation)
* $Y \in \R^n$, the labels


**Forward Propagation**

$$ Z^{[1]} = W^{[1]}\cdot \mathbb{X} + b^{[1]}$$
$$ A^{[1]} = ReLu(Z^{[1]}) $$
$$ Z^{[2]} = W^{[2]}\cdot A^{[1]} + b^{[2]} $$
$$ A^{[2]} = SoftMax(Z^{[2]}) $$

**Backward Propagation**

$$ dZ^{[2]} = A^{[2]} - Y $$
$$ dW^{[2]} = \frac{1}{m} dZ^{[2]} A^{[1]T} $$
$$ db^{[2]} = \frac{1}{m} \Sigma {dZ^{[2]}} $$
$$ dZ^{[1]} = W^{[2]T} dZ^{[2]} .* g^{[1]\prime} (z^{[1]}) $$
$$ dW^{[1]} = \frac{1}{m} dZ^{[1]} A^{[0]T} $$
$$ db^{[1]} = \frac{1}{m} \Sigma {dZ^{[1]}} $$


**Parameter updates**

$$W^{[2]} := W^{[2]} - \alpha dW^{[2]}$$
$$b^{[2]} := b^{[2]} - \alpha db^{[2]}$$
$$W^{[1]} := W^{[1]} - \alpha dW^{[1]}$$
$$b^{[1]} := b^{[1]} - \alpha db^{[1]}$$

**Vars and shapes**

Forward prop

- $A^{[0]} = X$: 784 x m
- $Z^{[1]} \sim A^{[1]}$: 10 x m
- $W^{[1]}$: 10 x 784 (as $W^{[1]} A^{[0]} \sim Z^{[1]}$)
- $B^{[1]}$: 10 x 1
- $Z^{[2]} \sim A^{[2]}$: 10 x m
- $W^{[1]}$: 10 x 10 (as $W^{[2]} A^{[1]} \sim Z^{[2]}$)
- $B^{[2]}$: 10 x 1

Backprop

- $dZ^{[2]}$: 10 x m ($~A^{[2]}$)
- $dW^{[2]}$: 10 x 10
- $dB^{[2]}$: 10 x 1
- $dZ^{[1]}$: 10 x m ($~A^{[1]}$)
- $dW^{[1]}$: 10 x 10
- $dB^{[1]}$: 10 x 1


# Define some stuff