In [None]:
import numpy as np
import mnist_loader
from utils import show, barplot, accuracy, misclassified, inference
from matplotlib import pyplot as plt
import IPython
from IPython.display import Image
from random import randint
# Display img/00.png as this cell's output (it is the cell's last expression).
Image('img/00.png')


In [None]:
# load dataset: x_* hold the images (one row per sample), t_* the matching digit labels
x_train, t_train, x_test, t_test = mnist_loader.load()

print('Training set size: [{},{}]'.format(*x_train.shape))
print('Test set size: [{},{}]'.format(*x_test.shape))

# Visualise the train/test split. The wedges are labelled (name + sample count)
# so the figure can be read without the printed sizes above.
split_sizes = [x_train.shape[0], x_test.shape[0]]
split_labels = ['train ({})'.format(split_sizes[0]), 'test ({})'.format(split_sizes[1])]
_ = plt.pie(split_sizes, labels=split_labels)

In [None]:
# Peek at one training example: hand its pixel row to show() (from utils)
# and print the label stored at the same index.
sample = 7000  # row index into x_train / t_train
show(x_train[sample, :])
print('Digit: {}'.format(t_train[sample]))

In [None]:
# Neuron: display the illustration stored in img/01.png as the cell output
Image('img/01.png')

In [None]:
def sigmoid(activation):
    """Logistic sigmoid ``1 / (1 + e^(-x))``, evaluated element-wise.

    This is the plain textbook formula: ``np.exp(-activation)`` overflows
    for strongly negative activations.
    """
    denominator = 1 + np.exp(-activation)
    return 1 / denominator

sigmoid(-0)

In [None]:
# avoiding running into overflow
def sigmoid(a):
    """Numerically stable logistic sigmoid, evaluated element-wise.

    Parameters
    ----------
    a : scalar or array_like
        Activations. The argument is never modified.

    Returns
    -------
    numpy.ndarray
        ``1 / (1 + exp(-a))`` with the shape of ``a`` (0-d for a scalar).
        Floating-point inputs keep their dtype; any other dtype is
        computed in float64.
    """
    a = np.asarray(a)
    if not np.issubdtype(a.dtype, np.floating):
        # integer/bool inputs would truncate the result (or wrap on negation)
        a = a.astype(np.float64)
    # exp(-|a|) always lies in (0, 1], so the exponential can never overflow
    e = np.exp(-np.abs(a))
    # a >= 0:  1 / (1 + exp(-a))        with exp(-a) == e
    # a <  0:  exp(a) / (1 + exp(a))    with exp(a)  == e
    return np.where(a >= 0, 1 / (1 + e), e / (1 + e))
sigmoid(np.float32([-800]))

In [None]:
# Layer: display the illustration stored in img/02.png as the cell output
Image('img/02.png')

In [None]:
# Network: display the illustration stored in img/03.png as the cell output
Image('img/03.png')

In [None]:
class Net:
    """Two-layer perceptron (sigmoid hidden layer, softmax output); forward pass only."""

    def __init__(self, input_size, hidden_size, output_size):
        # input -> hidden parameters
        self.Wih = self.initialize_weights([input_size, hidden_size])
        self.Bih = np.zeros([1, hidden_size])
        # hidden -> output parameters
        self.Who = self.initialize_weights([hidden_size, output_size])
        self.Bho = np.zeros([1, output_size])

        parameters = (self.Wih, self.Bih, self.Who, self.Bho)
        num_param = sum(p.size for p in parameters)
        print('\nNetwork initialized with {} parameters.'.format(num_param))

        # values cached by forward() for a later backprop step
        self.input = None
        self.hidden_state = None

    @staticmethod
    def initialize_weights(shape):
        """Return an array of ``shape`` drawn uniformly from [-1, 1)."""
        unit_samples = np.random.random_sample(shape)
        return unit_samples * 2 - 1

    @staticmethod
    def preprocess(input):
        """Map pixel intensities from [0, 255] onto [-1, 1]."""
        scaled = input / 127.5
        return scaled - 1

    def forward(self, input):
        """Run the network on ``input`` and return the softmax output."""
        pixels = self.preprocess(input)

        hidden_activation = pixels.dot(self.Wih) + self.Bih
        hidden = sigmoid(hidden_activation)
        logits = hidden.dot(self.Who) + self.Bho
        probabilities = softmax(logits)

        # keep the intermediate values around for backprop
        self.hidden_state = hidden
        self.input = pixels
        return probabilities

In [None]:
def softmax(activation):
    """Exponentiate ``activation`` and normalise by the sum over ALL entries."""
    weights = np.exp(activation)
    return weights / weights.sum()

# Example: normalise one row of four activations and hand the result to barplot() (from utils).
activation = np.array([[0.5, 0.1, 1, 0.2]])  # shape (1, 4)
y = softmax(activation)
barplot(y)

In [None]:
# unfortunately we need to complicate it a bit: numerical stability issues and handling batch inputs
def softmax(activation):
    """Numerically stable softmax for a batch of activation rows.

    Parameters
    ----------
    activation : array_like
        Either a batch of shape (batch, classes), normalised along axis 1,
        or a single 1-D vector of activations.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose entries along the class axis are
        positive and sum to 1.
    """
    activation = np.asarray(activation)
    # batches are normalised per row (axis 1); a lone vector over its only axis
    axis = 1 if activation.ndim > 1 else -1
    # subtracting max value for numerical stability (avoiding exponentiating large values);
    # keepdims lets broadcasting do what repeat/expand_dims did by hand
    offset_values = activation - np.max(activation, axis=axis, keepdims=True)
    exponentiated = np.exp(offset_values)
    return exponentiated / np.sum(exponentiated, axis=axis, keepdims=True)


In [None]:
# let's create our network: 784 inputs, 30 hidden units, 10 outputs
# (the constructor prints the resulting parameter count)
net = Net(input_size=784, hidden_size=30, output_size=10)

In [None]:
# try running forward propagation on arbitrary input image
# random.randint includes BOTH endpoints, so the upper bound has to be the
# last valid row index (derived from the data rather than hard-coded)
random_input = randint(0, x_train.shape[0] - 1)
input = x_train[random_input, :]
output = net.forward(input)

# plot input image and output distribution
target = t_train[random_input]
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
show(input, ax1)
barplot(output, ax2)
_ = ax1.set_title('Input: {}'.format(target))
_ = ax2.set_title('Output')

In [None]:
# Let's check how well we do on the test set:
acc = accuracy(x_test, t_test, net)
# '{:02}' only set a minimum width and printed the float's full repr;
# report the percentage with two decimals instead
print('Accuracy: {:.2f}%'.format(acc * 100))

In [None]:
# Learning: display the illustration stored in img/04_grad_desc.png as the cell output
Image('img/04_grad_desc.png')

In [None]:
def cross_entropy_error(target, output):
    """Cross-entropy between target distribution(s) and predicted output(s).

    Parameters
    ----------
    target : array_like
        Target probabilities (e.g. one-hot), shape (classes,) or (batch, classes).
    output : array_like
        Predicted probabilities, same shape as ``target``.

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``-sum(target * log(output))`` over the class axis: a scalar for 1-D
        inputs, one loss per sample (shape (batch,)) for 2-D inputs.
    """
    tiny = 1e-40  # keeps log() finite when a predicted probability is exactly 0
    log_outputs = np.log(np.asarray(output) + tiny)
    # Element-wise product + sum over classes. For 1-D inputs this equals the
    # former target.dot(log_outputs); for batches it yields per-sample losses
    # instead of a classes x classes matrix.
    loss = -np.sum(np.asarray(target) * log_outputs, axis=-1)
    return loss

In [None]:
# play a bit with Cross-Entropy error
# (here the prediction equals the one-hot target, so the loss comes out as zero)
cross_entropy_error(target=np.array([1, 0, 0]), output=np.array([1, 0, 0]))

In [None]:
# Stochastic Gradient Descent works on mini-batches, so the training set is
# cut into num_batches groups of batch_size rows each; rows that do not fill
# a complete batch are left out.

batch_size = 64
num_batches = len(x_train) // batch_size
batches = x_train[:num_batches * batch_size].reshape(num_batches, batch_size, -1)

# the loss needs one-hot targets: row k of the 10x10 identity matrix encodes digit k
identity_matrix = np.eye(10)
onehot_encoding = identity_matrix[t_train]
target_batches = onehot_encoding[:num_batches * batch_size].reshape(num_batches, batch_size, -1)

In [None]:
# let's add backprop to our network
class Net:
    """Two-layer perceptron trained with hand-written backpropagation.

    Architecture: input -> sigmoid hidden layer -> softmax output.
    The network uses its input as given; any pixel scaling is up to the caller.

    Usage per training step: ``forward`` -> ``backprop`` -> ``apply_gradients``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        # initialize parameters: weights uniform in [-1, 1), biases zero
        self.Wih = np.random.random_sample([input_size, hidden_size]) * 2 - 1
        self.Bih = np.zeros([1, hidden_size])
        self.Who = np.random.random_sample([hidden_size, output_size]) * 2 - 1
        self.Bho = np.zeros([1, output_size])

        # values cached by forward() for the following backprop() call
        self.input = None
        self.hidden_state = None
        self.batch_size = None

        # update directions computed by backprop(), consumed by apply_gradients()
        self.d_Bho = None
        self.d_Who = None
        self.d_Bih = None
        self.d_Wih = None

    def forward(self, input):
        """Compute the softmax output for ``input``.

        ``input`` is a batch of shape (batch, input_size) or a single 1-D
        sample; the result always has shape (batch, output_size).
        """
        # Promote a single 1-D sample to a batch of one. Otherwise batch_size
        # would be the feature count and the cached input would be 1-D, which
        # breaks a following backprop().
        input = np.atleast_2d(input)
        self.batch_size = input.shape[0]
        self.input = input

        hidden_activation = input.dot(self.Wih) + self.Bih
        self.hidden_state = sigmoid(hidden_activation)
        logits = self.hidden_state.dot(self.Who) + self.Bho
        output = softmax(logits)

        return output

    def backprop(self, output, target):
        """Compute parameter update directions for the last forward() pass.

        ``output`` is what forward() returned, ``target`` the matching one-hot
        rows. Results are summed over the batch and stored in ``d_*``.
        """
        # target - output is the NEGATIVE of the softmax/cross-entropy gradient
        # w.r.t. the logits, so every d_* below is a descent direction and
        # apply_gradients() adds it.
        d_logits = np.atleast_2d(target - output)

        self.d_Bho = d_logits.sum(axis=0)
        self.d_Who = self.hidden_state.T.dot(d_logits)

        d_hidden_state = d_logits.dot(self.Who.T)

        # chain through the sigmoid: its derivative is h * (1 - h)
        d_k = self.hidden_state * (1 - self.hidden_state) * d_hidden_state

        self.d_Bih = d_k.sum(axis=0)
        self.d_Wih = self.input.T.dot(d_k)

    def apply_gradients(self, learning_rate):
        """Move every parameter along its stored direction, scaled by ``learning_rate``."""
        self.Wih += self.d_Wih * learning_rate
        self.Bih += self.d_Bih * learning_rate
        self.Who += self.d_Who * learning_rate
        self.Bho += self.d_Bho * learning_rate
        

In [None]:
# rebuild the network from the Net class that includes backprop, now with 100 hidden units
net = Net(input_size=784, hidden_size=100, output_size=10)

In [None]:
# TRAIN!
epochs = 10
accuracies = []
for j in range(epochs):
    for i in range(num_batches):

        # one gradient step on mini-batch i (pixel values divided by 255)
        input = batches[i] / 255
        target = target_batches[i]
        output = net.forward(input)
        net.backprop(output, target)
        net.apply_gradients(0.001)

        # evaluation only happens when the batch index is a multiple of 1000
        if i % 1000 != 0:
            continue

        # accuracy() (from utils) on the test arrays, converted to percent
        acc = 100 * accuracy(x_test, t_test, net)
        print('Epoch: {0} \t accuracy: {1:3.2f}%'.format(j, acc))
        accuracies.append(acc)
        

In [None]:
# Accuracy values collected by the training loop, one point per evaluation.
# Axes are labelled so the figure stands on its own; assigning the last call
# to _ keeps the matplotlib repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(accuracies)
ax.set_xlabel('evaluation #')
ax.set_ylabel('accuracy (%)')
_ = ax.set_title('Accuracy during training')

In [None]:
# try running forward propagation on arbitrary input image
# random.randint includes BOTH endpoints, so the upper bound has to be the
# last valid row index (derived from the data rather than hard-coded)
random_input = randint(0, x_test.shape[0] - 1)
input = x_test[random_input, :]
# NOTE(review): the training loop feeds batches[i] / 255, whereas the pixels
# are passed here unscaled — confirm which scaling is intended at inference.
output = net.forward(input)

# plot input image and output distribution
target = t_test[random_input]
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
show(input, ax1)
barplot(output, ax2)
# assign to _ so the Text repr is not echoed as cell output
_ = ax1.set_title('Input: {}'.format(target))
_ = ax2.set_title('Output')

In [None]:
# Test samples the network gets wrong, as returned by misclassified() (from utils).
# Presumably row indices into x_test, since the next cell uses them that way; verify against utils.
missed = misclassified(x_test, t_test, net)

In [None]:
# Look at one misclassified test image next to the network's output.
# NOTE(review): missed[100] raises IndexError when fewer than 101 samples
# were misclassified.
sample = missed[100]
input = x_test[sample, :]
# NOTE(review): the training loop feeds batches[i] / 255, whereas the pixels
# are passed here unscaled — confirm which scaling is intended at inference.
output = net.forward(input)

# plot input image and output distribution
target = t_test[sample]
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,5))
show(input, ax1)
barplot(output, ax2)
# assign to _ so the Text repr is not echoed as cell output
_ = ax1.set_title('Input: {}'.format(target))
_ = ax2.set_title('Output')