This notebook uses the code from chapter 18 of the book, "Data Science from Scratch" by Joel Grus, available on [github][1].

[1]: https://github.com/joelgrus/data-science-from-scratch

In [None]:
import math, random
import matplotlib
import matplotlib.pyplot as plt

A few helper functions:

In [None]:
def dot(v, w):
    """Dot product of two vectors: the sum of the element-wise products.

    Elements are paired with ``zip``, so when the lengths differ the
    surplus elements of the longer vector are silently ignored.
    """
    pairwise_products = [a * b for a, b in zip(v, w)]
    return sum(pairwise_products)

def sigmoid(t):
    """Logistic function 1 / (1 + e^-t), mapping any real t into [0, 1].

    For very negative t, ``math.exp(-t)`` exceeds the float range and
    raises OverflowError; the function's limit there is 0, so that value
    is returned instead of letting training crash.
    """
    try:
        return 1 / (1 + math.exp(-t))
    except OverflowError:
        # e^-t is too large to represent, so 1 / (1 + e^-t) is effectively 0
        return 0.0

def argmax(l):
    """Index of the largest element of ``l`` (the first one if there are ties)."""
    largest = max(l)
    return l.index(largest)

Functions for evaluating the network:

In [None]:
def neuron_output(weights, inputs):
    """Activation of a single neuron: sigmoid of the weighted sum of its inputs."""
    weighted_sum = dot(weights, inputs)
    return sigmoid(weighted_sum)

def feed_forward(neural_network, input_vector):
    """Forward-propagate ``input_vector`` through every layer of the network.

    ``neural_network`` is a list of layers, each layer a list of neurons,
    each neuron a list of weights. A constant 1 is appended to the input
    of every layer, so the last weight of each neuron acts as its bias.

    Returns a list with one entry per layer, each entry being the list of
    that layer's neuron outputs (the last entry is the network's output).
    """
    layer_outputs = []
    current_input = input_vector

    for layer in neural_network:
        # append the constant bias input before feeding the layer
        biased_input = current_input + [1]

        activations = []
        for neuron in layer:
            activations.append(neuron_output(neuron, biased_input))
        layer_outputs.append(activations)

        # this layer's activations feed the next layer
        current_input = activations

    return layer_outputs

def predict(network, input):
    """Feed ``input`` through ``network`` and return only the final layer's outputs."""
    all_layer_outputs = feed_forward(network, input)
    return all_layer_outputs[-1]

Define the function for backpropagation that we'll need to train the network:

In [None]:
def backpropagate(network, input_vector, target, rate = 1.0):
    """Run one gradient-descent step on a two-layer network, updating it in place.

    network      -- [hidden_layer, output_layer]; each neuron is a list of
                    weights whose last entry is the bias weight
    input_vector -- list of input values (without the bias input)
    target       -- list of desired outputs, one per output neuron
    rate         -- learning rate that scales every weight update

    Both sets of deltas are computed from the weights that produced the
    forward pass, and only afterwards are any weights changed. Returns None.
    """
    hidden_outputs, outputs = feed_forward(network, input_vector)

    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target[i])
                     for i, output in enumerate(outputs)]

    # back-propagate errors to hidden layer; this must happen BEFORE the
    # output weights are modified, otherwise the hidden deltas would be
    # based on weights that did not produce `outputs`
    hidden_deltas = [hidden_output * (1 - hidden_output) *
                     dot(output_deltas, [n[i] for n in network[-1]]) # (*)
                     for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for output layer (network[-1])
    hidden_with_bias = hidden_outputs + [1]
    for i, output_neuron in enumerate(network[-1]): # loop over neurons in output layer
        for j, hidden_output in enumerate(hidden_with_bias): # loop over outputs of hidden layer (plus bias)
            output_neuron[j] -= output_deltas[i] * hidden_output * rate

    # adjust weights for hidden layer (network[0])
    input_with_bias = input_vector + [1]
    for i, hidden_neuron in enumerate(network[0]): # loop over neurons in hidden layer
        for j, input_value in enumerate(input_with_bias): # loop over the inputs (plus bias)
            hidden_neuron[j] -= hidden_deltas[i] * input_value * rate

To understand equation (*) in the above code, which has not been spelled out in the [short introduction][2] on backpropagation, look again at the [Backpropagation Algorithm][1]:

$$\delta_j^k = g'(a_j^k) \sum_{l=1}^{r^{k+1}} w_{jl}^{k+1}\delta_l^{k+1}$$

Here, $k$ is our (single) hidden layer and $k+1$ is the output layer, i.e. $a_j^k$ are the outputs of the hidden layer.
$g'(x) = x(1-x)$ is again the derivative of the sigmoid, written in terms of the sigmoid's output $x$. $r^{k+1}$ is the number of nodes in the output layer and the sum corresponds to the `dot` product multiplying the `output_deltas` ($\delta_l^{k+1}$) and the weights `network[-1]` of the output layer ($w_{jl}^{k+1}$).

[1]: https://brilliant.org/wiki/backpropagation/
[2]: NN_Activation.ipynb

The stylized figures that will serve as inputs to train on (the training set here contains just one example per label):

In [None]:
# Flat list that alternates a digit label (int) with a 5x5 picture of that
# digit: '1' marks a lit pixel and '.' an empty one, one text row per pixel row.
# The continuation lines of each triple-quoted string start with spaces that
# are part of the string; make_digit() strips them when parsing.
raw_digits = [
   0, """11111
         1...1
         1...1
         1...1
         11111""",

   1, """..1..
         ..1..
         ..1..
         ..1..
         ..1..""",

   2, """11111
         ....1
         11111
         1....
         11111""",

   3, """11111
         ....1
         11111
         ....1
         11111""",

   4, """1...1
         1...1
         11111
         ....1
         ....1""",

   5, """11111
         1....
         11111
         ....1
         11111""",

   6, """11111
         1....
         11111
         1...1
         11111""",

   7, """11111
         ....1
         ....1
         ....1
         ....1""",

   8, """11111
         1...1
         11111
         1...1
         11111""",

   9, """11111
         1...1
         11111
         ....1
         11111"""]

def make_digit(raw_digit):
    """Convert a multi-line picture string into a flat list of 0/1 pixels.

    Each line is stripped of surrounding whitespace; every remaining
    character becomes 1 if it is '1' and 0 otherwise, in reading order.
    """
    pixels = []
    for line in raw_digit.split("\n"):
        for char in line.strip():
            pixels.append(int(char == '1'))
    return pixels

Define the inputs (pixel images) and targets (one-hot labels):

In [None]:
# raw_digits alternates label, picture, label, picture, ...:
# the odd positions hold the pictures, the even positions the labels.
inputs = [make_digit(picture) for picture in raw_digits[1::2]]

# targets[j] is the one-hot vector for digit j: 1 where the label equals j, else 0
targets = [[int(label == digit) for label in raw_digits[0::2]]
           for digit in range(10)]

Define the network structure and initialize:

In [None]:
random.seed(0)    # fixed seed, so every run starts from the same weights

# layer sizes
input_size = 25   # one input per pixel of a 5x5 picture
num_hidden = 4    # neurons in the single hidden layer
output_size = 10  # one output per digit label

# hidden layer: every neuron gets input_size weights plus one bias weight
hidden_layer = [
    [random.random() for _weight in range(input_size + 1)]
    for _neuron in range(num_hidden)
]

# output layer: every neuron gets num_hidden weights plus one bias weight
output_layer = [
    [random.random() for _weight in range(num_hidden + 1)]
    for _neuron in range(output_size)
]

# the untrained network is just the two randomly initialised layers
network = [hidden_layer, output_layer]

Now we run the training using the backpropagation:

In [None]:
# 10,000 iterations seems enough to converge
num_run = 10000
report_every = max(1, num_run // 10)  # report about ten times; never a zero interval
for x in range(num_run):
    for input_vector, target_vector in zip(inputs, targets):
        # measured cost per call: 65 µs (4 hidden), 134 µs (10 hidden), 380 µs (32 hidden)
        backpropagate(network, input_vector, target_vector)
    if x % report_every == 0:
        # inputs[i] is the picture of digit i, so a prediction is correct when its argmax is i.
        # The count is kept in `num_correct` so that it cannot clobber the accuracy()
        # function defined further down when cells are re-run out of order.
        num_correct = sum(argmax(predict(network, digit)) == i
                          for i, digit in enumerate(inputs))
        print("Iterations done: %d, accuracy: %d" % (x, num_correct))


Look at the scores (one sigmoid output between 0 and 1 per label) that the network predicts for each training example:

In [None]:
# Collect the network's output scores for every training picture, one row per true label.
m = []
for i, digit in enumerate(inputs):  # named `digit` so the builtin input() is not rebound for the whole kernel
    outputs = predict(network, digit)
    print(i, [round(p,2) for p in outputs])
    m.append(outputs)

# This is not a confusion matrix: each cell shows a raw output score, not a count.
plt.imshow(m, cmap=plt.cm.Blues);
plt.xlabel("Score for label")
plt.ylabel("True label");

In [None]:
# Score two hand-drawn 5x5 pictures that are not part of the training set;
# each print shows the ten output scores rounded to two decimals.

# first picture (the '@' sketch on the right shows the lit pixels)
print([round(score, 2)
       for score in predict(network,
                            [0,1,1,1,0,      # .@@@.
                             0,0,0,1,1,      # ...@@
                             0,0,1,1,0,      # ..@@.
                             0,0,0,1,1,      # ...@@
                             0,1,1,1,0])])   # .@@@.

# second picture
print([round(score, 2)
       for score in predict(network,
                            [0,1,1,1,0,      # .@@@.
                             1,0,0,1,1,      # @..@@
                             0,1,1,1,0,      # .@@@.
                             1,0,0,1,1,      # @..@@
                             0,1,1,1,0])])   # .@@@.

Show the weights the network has learned for each of the hidden neurons (four in this notebook, see `num_hidden`):

In [None]:
def patch(x, y, hatch, color):
    """Build an unfilled 1x1 matplotlib Rectangle centred on (x, y).

    ``hatch`` selects the crosshatch pattern and ``color`` the colour.
    """
    lower_left = (x - 0.5, y - 0.5)
    width = height = 1
    return matplotlib.patches.Rectangle(lower_left, width, height,
                                        hatch=hatch, fill=False, color=color)


def show_weights(neuron_idx, ax):
    """Draw the input weights of hidden neuron ``neuron_idx`` as a 5x5 image on ``ax``.

    Reads the weights from the global ``network``. The first 25 weights are
    laid out row by row; weight 25 (the bias) is shown as the x-axis label.
    Returns the image object created by ``ax.imshow``.
    """
    weights = network[0][neuron_idx]

    # cut the flat weight list into five rows of five: weights[0:5], ..., weights[20:25]
    grid = []
    for start in range(0, 25, 5):
        grid.append(weights[start:start + 5])

    image = ax.imshow(grid,
                      vmin=-8, vmax=8,         # same colour range in every subplot
                      interpolation='none',    # draw each weight as a solid block
                      cmap=matplotlib.cm.coolwarm)

    # the bias weight has no pixel of its own, so report it in the label
    ax.set_xlabel("bias = %.2f" % weights[25])
    return image

# one subplot per hidden neuron, side by side
fig, ax = plt.subplots(ncols=num_hidden, figsize=(15, 3))
for idx, axis in enumerate(ax):
    pos = show_weights(idx, axis)


(blue = large negative, red = large positive)

In [None]:
# Heat map of the output layer: one row per output neuron (label),
# one column per hidden-neuron weight, with the bias weight in the last column.
plt.xlabel("Weight of hidden neuron (and bias)")
plt.ylabel("Output label")
plt.imshow(output_layer, cmap=matplotlib.cm.coolwarm);

(blue = large negative, red = large positive)

See how it discriminates e.g. 0 and 8 or 5 and 9?

# Now with the digits dataset

In [None]:
import numpy as np
from sklearn.datasets import load_digits
# Load scikit-learn's bundled handwritten-digits dataset and display the
# shape of its image array as the cell's output.
digits = load_digits()
digits["images"].shape

In [None]:
# Flatten every image into a plain list of pixel values and scale it down.
# Dividing by 16 and then again by 8 deliberately leaves a mean value far below 1.
images = [list(picture.flatten() / 16 / 8) for picture in digits.images]

In [None]:
# sanity check: average pixel value over all scaled images (all axes flattened)
np.mean(images, axis=None)

In [None]:
# one-hot encode the labels: numbers[k][d] is 1 when sample k shows digit d, else 0
numbers = [[int(digit == label) for digit in range(10)]
           for label in digits.target]

In [None]:
# Show the first images of the dataset in a 5 x 10 grid of subplots.
fig, axes = plt.subplots(nrows=5, ncols=10, figsize=(20, 8))
idx = 0
for g in axes:          # g is one row of subplots
    for ax in g:
        if idx < len(images):
            # the flat pixel list is turned back into an 8x8 picture
            figure = images[idx]
            ax.imshow(np.array(figure).reshape(8,8))
            idx += 1

In [None]:
random.seed(0)    # fixed seed, so every run starts from the same weights

# layer sizes
input_size = 64   # one input per pixel of an 8x8 image
num_hidden = 10   # neurons in the single hidden layer
output_size = 10  # one output per digit label

# hidden layer: every neuron gets input_size weights plus one bias weight
hidden_layer = [
    [random.random() for _weight in range(input_size + 1)]
    for _neuron in range(num_hidden)
]

# output layer: every neuron gets num_hidden weights plus one bias weight
output_layer = [
    [random.random() for _weight in range(num_hidden + 1)]
    for _neuron in range(output_size)
]

# the untrained network is just the two randomly initialised layers
network = [hidden_layer, output_layer]

In [None]:
# compute accuracy
def accuracy(network, X, y):
    """Fraction of samples whose predicted label matches the target label.

    X -- sequence of input vectors
    y -- sequence of target vectors; y[i] belongs to X[i] and its argmax is
         taken as the true label

    The predicted label is the argmax of the network's output. The result
    is the number of matches divided by len(y), as a float.
    """
    total = float(len(y))
    hits = 0
    for position, sample in enumerate(X):
        if argmax(predict(network, sample)) == argmax(y[position]):
            hits += 1
    return hits / total

In [None]:
rate = 1 # learning rate
frac = 1 # NOTE: only scales the number of rounds below; the per-round sample size is the 10% set in sample_size
num_run = int(50/frac) # number of training rounds

sample_size = int(len(images)*0.1)           # each round trains on a random 10% of the data
training_pairs = list(zip(images, numbers))  # built once instead of once per round
report_every = max(1, num_run // 10)         # report about ten times; never a zero interval

for x in range(num_run):
    for input_vector, target_vector in random.sample(training_pairs, sample_size):
        # measured cost per call: 65 µs (4 hidden), 134 µs (10 hidden), 380 µs (32 hidden)
        backpropagate(network, input_vector, target_vector, rate)
    if x % report_every == 0:
        # accuracy is evaluated on the full dataset, not just on the sampled subset
        print("Iterations done: %d, accuracy: %.3f" % (x, accuracy(network, images, numbers)))


In [None]:
def patch(x, y, hatch, color):
    """Return an unfilled unit-square matplotlib Rectangle whose centre is (x, y).

    The square is drawn with the requested ``hatch`` pattern and ``color``.
    """
    half = 0.5
    corner = (x - half, y - half)  # lower-left corner of the 1x1 square
    return matplotlib.patches.Rectangle(corner, 1, 1,
                                        hatch=hatch, fill=False, color=color)


def show_weights(neuron_idx, ax, side=8):
    """Draw the input weights of hidden neuron ``neuron_idx`` as an image on ``ax``.

    neuron_idx -- index of the neuron in the hidden layer, network[0]
    ax         -- matplotlib Axes to draw on
    side       -- edge length of the square input image (8 for the 8x8 digits)

    The first side*side weights are laid out row by row. The bias is the
    LAST weight of the neuron (the bias input is appended after the pixels),
    so it is read with index -1 and shown as the x-axis label.
    Returns the image object created by ``ax.imshow``.
    """
    weights = network[0][neuron_idx]

    grid = [weights[row:(row+side)]              # turn the weights into a side x side grid
            for row in range(0, side*side, side)] # [weights[0:side], weights[side:2*side], ...]

    pos = ax.imshow(grid,
                    cmap=matplotlib.cm.coolwarm,
                    interpolation='none', # plot blocks as blocks
                    vmin = -8, vmax = 8) # define a unique range for all subplots

    # print bias: the last weight, not a fixed index left over from the 5x5 network
    ax.set_xlabel("bias = %.2f" % weights[-1])
    return pos

# one subplot per hidden neuron, side by side
fig, ax = plt.subplots(ncols=num_hidden, figsize=(15, 3))
for idx, axis in enumerate(ax):
    pos = show_weights(idx, axis)


In [None]:
# Heat map of the output layer: one row per output neuron (label),
# one column per hidden-neuron weight, with the bias weight in the last column.
plt.xlabel("Weight of hidden neuron (and bias)")
plt.ylabel("Output label")
plt.imshow(output_layer, cmap=matplotlib.cm.coolwarm);