In [None]:
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
# Print floats with three decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

# Classifying hand-written digits

### Loading the MNIST data set

In this document, we will use both our own neural network algorithm and the TensorFlow library. To make it easier for people using Google Colab to follow along, we will load the MNIST data set from the TensorFlow library, which comes preinstalled in Colab. Users running these notebooks in their own environments will have to install TensorFlow anyway.

First we import the TensorFlow library as `tf`:

In [None]:
import tensorflow as tf

Then we import the MNIST dataset module as `mnist`:

In [None]:
from tensorflow.keras.datasets import mnist

With the function `load_data`, we load the MNIST dataset into the variable `dataset`:

In [None]:
# Load MNIST through the Keras dataset helper and keep the raw return value as `dataset`.
dataset = mnist.load_data()

We can further unpack our dataset into 4 variables containing training and validation data and labels:

In [None]:
# `dataset` is a pair of (data, labels) pairs; the second pair is treated as the validation split here.
(train_data, train_labels), (validation_data, validation_labels) = dataset

Check the length/shape of our training and validation data and labels:

Since this dataset does not have a testing set, create one from the training set. Take the last 10000 data points (and their labels) from the training set and put them into the variables `test_data` and `test_labels`. Then remove these points from the training set:

Print the shape of the testing set:

### Exploring the MNIST data set

Print the shape of the first training data point:

The following function visualises the matrix of numbers as an image. Visualise some other data points:

In [None]:
# Render the training data point at index 50 as an image using the `Greys` colormap.
plt.imshow(train_data[50], cmap=cm.Greys)
plt.show()

For the selected training data point, print the corresponding label:

Print the labels of the first 100 training data points:

Print the values of the 14th row of the first training data point:

Print the values of the 8th column of the first data point:

### Normalising the values in the MNIST data set

Find the maximum value in our data set:

Normalise the values of all data sets between 0 and 1:

Print the values of the 14th row of the first training data point:

### Reshaping the values in the MNIST data set

Print the shape of the first data point in the training set:

Reshape this data point into a column vector and store it in a variable `dp`:

What is the shape of the vector stored in the variable `dp`:

Reshape all the data as column vectors:

Print the shape of our training data:

## Using our own Neural Network algorithm for classification

### Converting the labels of the MNIST data set

For this purpose we can reuse the function `convert_label` that we defined before:

In [None]:
def convert_label(x, num_classes=10):
    """Convert a class index into a one-hot column vector.

    Parameters
    ----------
    x : int
        Class index; the entry at this row is set to 1.
    num_classes : int, optional
        Length of the vector, i.e. the number of classes. Defaults to 10,
        one entry per digit.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_classes, 1)`` that is 0 everywhere
        except for a 1 at row ``x``.

    Raises
    ------
    IndexError
        If ``x`` is not a valid index for a vector of length ``num_classes``
        (negative indices count from the end, as usual for NumPy).
    """
    vec = np.zeros((num_classes, 1))
    vec[x] = 1
    return vec

Convert the label of the first data point in the training set:

Convert all the label sets into numpy arrays: `train_labels_new`,  `validation_labels_new` and `test_labels_new`:

Print the shape of our new training labels:

### Changes in the algorithm

Apart from two elements, the algorithm we will use here is identical to the algorithms we have been using so far. What differs is:
- When initialising the weights, we divide each weight by the square root of the number of activations in the previous layer. This makes the weight values smaller for the layer that follows the input layer. We do this because the input layer has 784 activations, which would otherwise make the weighted sums in the following layer too large.
- We are using mini-batch training. Online training would be too slow, and batch training is not feasible since we have 50,000 data points. To learn step by step how we made the mini batches, check the notebook named 'Appendix 01 - Mini batch'.

To use mini batches, we will zip together data with labels into a set:

In [None]:
def _pair_samples(data, labels):
    """Return an ``(n, 2)`` object array whose rows are (data point, label) pairs.

    Each data point and its label are arrays of different shapes, so a plain
    ``np.array(list(zip(data, labels)))`` is a ragged construction: recent
    NumPy releases raise ``ValueError`` for it. Filling a pre-allocated object
    array gives the same row layout on every NumPy version. Like ``zip``, the
    result stops at the shorter of the two inputs.
    """
    n = min(len(data), len(labels))
    pairs = np.empty((n, 2), dtype=object)
    for row, (point, label) in enumerate(zip(data, labels)):
        pairs[row, 0] = point
        pairs[row, 1] = label
    return pairs

train_set = _pair_samples(train_data, train_labels_new)
validation_set = _pair_samples(validation_data, validation_labels_new)
testing_set = _pair_samples(test_data, test_labels_new)

In [None]:
def sigmoid(x):
    """Logistic function ``1 / (1 + e^(-x))``; works element-wise on arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def evaluate_accuracy(dset, round_digits):
    """Return the percentage of pairs in ``dset`` that the network classifies correctly.

    Every ``(input, target)`` pair is fed forward through the module-level
    ``weights`` and ``biases`` using ``sigmoid``; the prediction counts as
    correct when the arg-max of the output equals the arg-max of the target.

    Parameters
    ----------
    dset : sequence of (input, target) pairs
        Must be non-empty and support ``len``.
    round_digits : int
        Number of decimals the percentage is rounded to.

    Returns
    -------
    Percentage in the range 0-100, rounded with ``np.round``.
    """
    hits = 0
    for sample, target in dset:
        output = sample
        for layer_w, layer_b in zip(weights, biases):
            output = sigmoid(np.dot(layer_w, output) + layer_b)
        hits += int(np.argmax(output) == np.argmax(target))
    percent = 100 * (hits / len(dset))
    return np.round(percent, round_digits)

In [None]:
# Layer sizes: 784 inputs (one per pixel), 100 hidden units, 10 outputs (one per digit).
sizes = [784, 100, 10]

# Hyper-parameters.
num_epochs = 8
step_size = 3            # learning rate applied to the batch-averaged gradient
mini_batch_length = 10   # number of samples per gradient step
no_layers = len(sizes)
datalen = len(train_set)

# Biases are drawn from a standard normal; weights are drawn from a standard
# normal and divided by sqrt(fan-in) so the layer fed by the 784 inputs does
# not start out with overly large weighted sums.
biases = [np.random.randn(a,1) for a in sizes[1:]]
weights = [np.random.randn(nout, nin)/np.sqrt(nin) for nout, nin in zip(sizes[1:], sizes[:-1])]
# Per-sample gradients; overwritten for every training point.
d_biases = [np.zeros(b.shape) for b in biases]
d_weights = [np.zeros(w.shape) for w in weights]

for epoch in range(num_epochs):
    TC = 0  # total cost accumulated over this epoch
    # Reshuffle so that every epoch is split into differently composed mini-batches.
    np.random.shuffle(train_set)
    mini_batches = [train_set[a:a+mini_batch_length]
                    for a in range(0, datalen, mini_batch_length)]
    for batch in mini_batches:
        # Gradient sums over the current mini-batch.
        dd_biases = [np.zeros(b.shape) for b in biases]
        dd_weights = [np.zeros(w.shape) for w in weights]
        # single point (a,y)
        for a, y in batch:
            # forward pass; every layer's activation is kept for the backward pass
            activations = [a]
            for W, b in zip(weights, biases):
                a = sigmoid(np.dot(W, a) + b)
                activations.append(a)
            # cost (sum of squared errors for this sample)
            C = np.sum((a-y)**2)
            TC += C
            # backward pass: output layer first ...
            dC = 2*(a-y)
            delta = dC * a * (1 - a)   # a*(1-a) is the sigmoid derivative
            d_biases[-1] = delta
            d_weights[-1] = np.dot(delta, activations[-2].T)
            # ... then propagate delta back through the remaining layers
            for i in range(2, no_layers):
                delta = activations[-i]*(1-activations[-i])*np.dot(weights[-i+1].T,delta)
                d_biases[-i] = delta
                d_weights[-i] = np.dot(delta, activations[-i-1].T)
            dd_weights = [dw+ddw for dw, ddw in zip(d_weights, dd_weights)]
            dd_biases = [db+ddb for db, ddb in zip(d_biases, dd_biases)]
        # Gradient-descent step with the gradient averaged over the batch
        # (the last batch may be shorter than mini_batch_length).
        blen = len(batch)
        weights = [d-dw/blen*step_size for d, dw in zip(weights, dd_weights)]
        biases = [d-db/blen*step_size for d, db in zip(biases, dd_biases)]
    # Report progress after every epoch.
    acc_train = evaluate_accuracy(train_set,4)
    acc_validation = evaluate_accuracy(validation_set,4)
    print (f'epoch: {epoch+1} | total cost: {np.round(TC,4)}')
    print (f"Training set prediction accuracy:   {acc_train}%")
    print (f"Validation set prediction accuracy: {acc_validation}%\n")

Let's evaluate the accuracy of the testing set:

In [None]:
# Score the testing set with the current weights and biases; the result is a percentage rounded to 4 decimals.
acc_test = evaluate_accuracy(testing_set, 4)
print (f"Testing set prediction accuracy: {acc_test}%")

### Plotting the wrong predictions

In [None]:
# Show the first ten testing samples that the network classifies incorrectly.
counter = 0
for dp, y in testing_set:
    # forward pass through every layer
    a = dp
    for W, b in zip(weights, biases):
        a = sigmoid(np.dot(W, a) + b)
    prediction, label = np.argmax(a), np.argmax(y)
    if prediction != label:
        counter += 1
        print (f'prediction: {prediction}')
        print (f'label: {label}')
        # the data point is a flat vector; turn it back into a 28x28 image
        plt.imshow(dp.reshape(28,28), cmap=cm.Greys)
        plt.show()
        # the counter only changes in this branch, so it is enough to check it here
        if counter == 10:
            break

## Using Tensorflow Keras for classification

### Recreating our own model in Tensorflow Keras

In [None]:
# Same architecture as the hand-written network: 784 inputs -> 100 sigmoid units -> 10 sigmoid outputs.
# The input shape is declared with an explicit Input object instead of the `input_shape=`
# layer argument, which is the form the Keras Sequential guide recommends.
model = tf.keras.models.Sequential([
  tf.keras.Input(shape=(784,1)),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(100, activation='sigmoid'),
  tf.keras.layers.Dense(10, activation='sigmoid')
])

In [None]:
# Mirror the hand-written training setup: plain SGD with learning rate 3.0, squared-error loss, 8 epochs.
# NOTE(review): fit() is called without batch_size, so Keras' default applies, whereas the
# hand-written loop uses mini_batch_length = 10 — confirm whether the two runs are meant to match.
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate = 3.0),
              loss='mse',
              metrics=['accuracy'])

# One-hot labels (`*_labels_new`) are used because the loss compares them with the 10 outputs.
history = model.fit(train_data, 
                    train_labels_new, 
                    validation_data=(validation_data,validation_labels_new), 
                    epochs=8)

In [None]:
# Evaluate the compiled loss and accuracy metric on the testing set (one-hot labels).
model.evaluate(test_data, test_labels_new, verbose=2)

### Using the recommended Tensorflow Keras parameters 

In [None]:
# 784 inputs -> 128 ReLU units -> dropout (rate 0.2) -> 10 outputs without activation (logits).
# The input shape is declared with an explicit Input object instead of the `input_shape=`
# layer argument, which is the form the Keras Sequential guide recommends.
model = tf.keras.models.Sequential([
  tf.keras.Input(shape=(784,1)),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10)
])

In [None]:
# Adam with its default settings. The loss is configured with from_logits=True because the
# final Dense layer has no activation, and the sparse variant is fed the original integer
# labels (`train_labels`) rather than the one-hot `*_labels_new` arrays.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(train_data, 
                    train_labels, 
                    validation_data=(validation_data, validation_labels),
                    epochs=8)

In [None]:
# Evaluate the compiled loss and accuracy metric on the testing set (integer labels).
model.evaluate(test_data, test_labels, verbose=2)