# Import dependencies

In [6]:
import numpy as np
import pandas as pd

# Read Data

In [119]:
data = pd.read_csv('datasets/train.csv')

In [120]:
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
data.shape

(42000, 785)

In [122]:
X = data.drop('label', axis=1)
X.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
y = data['label']
y.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

In [125]:
y = np.array(y)
y[:5]

array([1, 0, 1, 4, 0], dtype=int64)

In [126]:
y.shape

(42000,)

In [127]:
X = X.T
X.shape

(784, 42000)

In [128]:
X = np.array(X)
X.shape

(784, 42000)

In [134]:
X = X/255

In [106]:
m, n = data.shape
print(m)
print(n)

42000
785


# Split training and test set

In [135]:
test_ratio = 0.25
train_ratio = 1 - test_ratio

test_size = int(len(y) * test_ratio)
train_size = len(y) - test_size

X_train = X[:, :train_size]
y_train = y[:train_size]

X_test = X[:, :test_size]
y_test = y[:test_size]

# Neural Network

## Initialize parameters

In [137]:
def init_params():
    w1 = np.random.randn(16, 784)
    b1 = np.random.randn(16, 1)
    w2 = np.random.randn(10, 16)
    b2 = np.random.randn(10, 1)
    return w1, b1, w2, b2

## Feedforward

In [69]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [70]:
def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))

In [71]:
def feed_forward(w1, b1, w2, b2, X):
    z1 = w1.dot(X) + b1
    a1 = sigmoid(z1)
    z2 = w2.dot(a1) + b2
    a2 = sigmoid(z2)
    return z1, a1, z2, a2

## Backpropagation

In [72]:
def one_hot(y): # encoding y
    res = np.zeros((y.size, 10))
    res[np.arange(y.size), y] = 1
    res = res.T
    return res

In [73]:
def backprop(w1, b1, w2, b2, z1, z2, a1, a2, X, y):
    y_one_hot = one_hot(y)
    dz2 = a2 - y_one_hot
    dw2 = 1/m * dz2.dot(a1.T)
    db2 = 1/m * np.sum(dz2)
    dz1 = w2.T.dot(dz2) * sigmoid_prime(z1)
    dw1 = 1/m * dz1.dot(X.T)
    db1 = 1/m * np.sum(dz1)
    return dw1, db1, dw2, db2

In [74]:
def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, eta):
    w1 = w1 - eta*dw1
    b1 = b1 - eta*db1
    w2 = w2 - eta*dw2
    b2 = b2 - eta*db2
    return w1, b1, w2, b2

In [75]:
def predict(a2):
    return np.argmax(a2, 0)

def accuracy(predictions, y):
    print(predictions, y)
    return np.sum(predictions == y) / y.size

In [76]:
def gradient_descent(X, y, iterations, eta):
    w1, b1, w2, b2 = init_params()
    for i in range(iterations):
        z1, a1, z2, a2 = feed_forward(w1, b1, w2, b2, X)
        dw1, db1, dw2, db2 = backprop(w1, b1, w2, b2, z1, z2, a1, a2, X, y)
        w1, b1, w2, b2 = update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, eta)
        if i % 10 == 0:
            print('Iteration: ', i)
            predictions = predict(a2)
            print('Accuracy: ', accuracy(predictions, y))
    return w1, b1, w2, b2

In [141]:
w1, b1, w2, b2 = gradient_descent(X_train, y_train, 500, 1)

Iteration:  0
[8 7 8 ... 8 9 8] [1 0 1 ... 2 9 5]
Accuracy:  0.09123809523809524
Iteration:  10
[9 4 6 ... 8 4 8] [1 0 1 ... 2 9 5]
Accuracy:  0.1782857142857143
Iteration:  20
[9 0 1 ... 8 4 8] [1 0 1 ... 2 9 5]
Accuracy:  0.27276190476190476
Iteration:  30
[1 0 1 ... 8 4 8] [1 0 1 ... 2 9 5]
Accuracy:  0.3520634920634921
Iteration:  40
[1 0 1 ... 8 4 8] [1 0 1 ... 2 9 5]
Accuracy:  0.415015873015873
Iteration:  50
[1 0 1 ... 8 3 8] [1 0 1 ... 2 9 5]
Accuracy:  0.4661904761904762
Iteration:  60
[1 0 1 ... 8 3 8] [1 0 1 ... 2 9 5]
Accuracy:  0.5014603174603175
Iteration:  70
[1 0 1 ... 8 3 8] [1 0 1 ... 2 9 5]
Accuracy:  0.5315873015873016
Iteration:  80
[1 0 1 ... 8 3 8] [1 0 1 ... 2 9 5]
Accuracy:  0.5547619047619048
Iteration:  90
[1 0 1 ... 8 3 8] [1 0 1 ... 2 9 5]
Accuracy:  0.5758730158730159
Iteration:  100
[1 0 1 ... 8 3 2] [1 0 1 ... 2 9 5]
Accuracy:  0.5948253968253968
Iteration:  110
[1 0 1 ... 2 3 2] [1 0 1 ... 2 9 5]
Accuracy:  0.611015873015873
Iteration:  120
[1 0 1 ... 

In [145]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = feed_forward(W1, b1, W2, b2, X)
    predictions = predict(A2)
    return predictions

def test_prediction(index, X, y, W1, b1, W2, b2):
    current_image = X[:, index, None]
    prediction = make_predictions(X[:, index, None], W1, b1, W2, b2)
    label = y[index]
    print("Prediction: ", prediction)
    print("Label: ", label)
    
    current_image = current_image.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(current_image, interpolation='nearest')
    plt.show()

In [161]:
import matplotlib.pyplot as plt

def mae(predictions, y):
    return np.sum(np.abs(predictions - y)) / y.size

def mse(predictions, y):
    return np.sum(np.linalg.norm(predictions - y)**2) / y.size
    
predictions = make_predictions(X_test, w1, b1, w2, b2)
print('Mean absolute error: ')
print(mae(predictions, y_test))

print('Mean square error: ')
print(mse(predictions, y_test))

Mean absolute error: 
0.6435238095238095
Mean square error: 
2.9448571428571424
