# Artificial Neural Networks - Seoul AI Meetup, June 25
Martin Kersner, <m.kersner@gmail.com>

Inspired by http://neuralnetworksanddeeplearning.com/ and https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/src/network.py.

* Perceptron
* Multilayer perceptron
* Backpropagation
* Forward pass
    * Activation functions
* Backward pass
    * Stochastic Gradient Descent
    * Weights update
* MNIST
* Keras

In [None]:
from __future__ import division
import random
import numpy as np
from utils import *
import matplotlib.pyplot as plt

%matplotlib inline

## Perceptron

<img src="https://raw.githubusercontent.com/martinkersner/ann-meetup/master/files/perceptron.png" />

In [None]:
# PERCEPTRON - MATRIX MULTIPLICATION EXAMPLE
# A single neuron computes z = W.X + b for one input sample.

# Weights stored as a 1x4 row vector:
#   w_1 = 1, w_2 = 1.1, w_3 = 0.1, w_4 = 0.2
W = np.matrix([[1.0, 1.1, 0.1, 0.2]])

# Input data stored as a 4x1 column vector:
#   x_1 = 0.5, x_2 = 0.7, x_3 = 3, x_4 = 1.0
X = np.matrix([[0.5],
               [0.7],
               [3],
               [1.0]])

# Scalar bias added to the weighted sum
b = 1

# Forward pass: weighted sum of the inputs plus the bias (a 1x1 matrix)
z = W.dot(X) + b
print(z)

# The activation function would be applied to z next:
# a = activation_function(z)

## Multilayer perceptron

<img src="https://raw.githubusercontent.com/martinkersner/ann-meetup/master/files/multi-layer-perceptron.png" />

In [None]:
# MULTILAYER PERCEPTRON - MATRIX MULTIPLICATION EXAMPLE
# One layer of three neurons: row i of W holds the weights of neuron i.

# Weights (3 neurons x 3 inputs):
#   w_11 = 1.0   w_12 = 1.1   w_13 = 1.3
#   w_21 = 0.1   w_22 = 0.2   w_23 = 0.6
#   w_31 = 0.4   w_32 = 1.2   w_33 = 1.9
W = np.matrix([[1.0, 1.1, 1.3],
               [0.1, 0.2, 0.6],
               [0.4, 1.2, 1.9]])

# Input data as a 3x1 column vector:
#   x_1 = 0.5, x_2 = 0.7, x_3 = 0.6
X = np.matrix([[0.5],
               [0.7],
               [0.6]])

# One bias per neuron, as a 3x1 column vector
b = np.matrix([[1.0],
               [1.0],
               [1.0]])

# Forward pass: every neuron's weighted sum plus its bias (a 3x1 matrix)
z = W.dot(X) + b
print(z)

# The activation function would be applied element-wise to z next:
# a = activation_function(z)

## Bias and weight initialization

http://cs231n.github.io/neural-networks-2/#init

In [None]:
# The biases and weights for the network are initialized randomly, using a Gaussian distribution
# with mean 0, and variance 1.
def init_biases(layers, seed=None):
    """Create one random bias column vector per non-input layer.

    Args:
        layers: sequence of layer sizes, input layer first
            (e.g. [784, 30, 10]).
        seed: optional seed for NumPy's global random generator. Any value
            other than None is applied, including 0.

    Returns:
        A list with len(layers) - 1 arrays; the array for a layer with
        y neurons has shape (y, 1). The input layer gets no biases.
    """
    # Compare against None explicitly: a plain truthiness test would
    # silently ignore the perfectly valid seed 0.
    if seed is not None:
        np.random.seed(seed)

    return [np.random.randn(y, 1) for y in layers[1:]]
    
def init_weights(layers, seed=None):
    """Create one random weight matrix per pair of consecutive layers.

    Values are drawn from the standard normal distribution
    (np.random.randn).

    Args:
        layers: sequence of layer sizes, input layer first
            (e.g. [784, 30, 10]).
        seed: optional seed for NumPy's global random generator. Any value
            other than None is applied, including 0.

    Returns:
        A list with len(layers) - 1 arrays; the matrix connecting a layer
        of x neurons to the next layer of y neurons has shape (y, x).
    """
    # Compare against None explicitly: a plain truthiness test would
    # silently ignore the perfectly valid seed 0.
    if seed is not None:
        np.random.seed(seed)

    return [np.random.randn(y, x) for x, y in zip(layers[:-1], layers[1:])]

# Small 3-layer demo network used to inspect the initialized parameters:
#   input layer  - 10 neurons
#   hidden layer - 15 neurons
#   output layer -  7 neurons
layers_tmp = [10, 15, 7]

# Unseeded initialization, so the values differ between runs.
biases_tmp, weights_tmp = init_biases(layers_tmp), init_weights(layers_tmp)

In [None]:
# Number of biases in every non-input layer (first dimension of each bias array)
for b in biases_tmp:
    print(b.shape[0])

In [None]:
# BIAS VALUES
# Dump the randomly initialized bias arrays of the demo network, layer by layer.
print(biases_tmp)

In [None]:
# Shape of the weight matrix between each pair of consecutive layers
for w in weights_tmp:
    print(np.shape(w))

In [None]:
# WEIGHTS
# Dump the randomly initialized weight matrices of the demo network, layer by layer.
print(weights_tmp)

## Sigmoid activation function

* Introduces non-linearity
* Saturation problem

<img src="https://raw.githubusercontent.com/martinkersner/ann-meetup/master/files/sigmoid.png" />

Other types of activation functions http://cs231n.github.io/neural-networks-1/#actfun:
* Tanh
* ReLU
* Leaky ReLU
* ELU
* Maxout

In [None]:
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + e^(-z)), applied element-wise."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

In [None]:
# Plot the sigmoid curve on 80 evenly spaced points in [-7, 7]
x = np.linspace(-7, 7, 80)
plt.plot(x, sigmoid(x))

# Orange guide lines crossing at the center of the sigmoid, (0, 0.5):
# first the vertical one, then the horizontal one
for guide_x, guide_y in (([0, 0], [0, 1]), ([-7, 7], [0.5, 0.5])):
    plt.plot(guide_x, guide_y, c='orange')

## Forward pass

In [None]:
def feedforward(biases, weights, a):
    """Propagate input `a` through the network and return the predicted class.

    Each layer applies sigmoid(w . activation + b) to the previous layer's
    activation. The return value is the index of the largest activation in
    the final layer (np.argmax), not the activation vector itself.
    """
    activation = a
    for layer_bias, layer_weights in zip(biases, weights):
        weighted_input = np.dot(layer_weights, activation) + layer_bias
        activation = sigmoid(weighted_input)

    return np.argmax(activation)

## Backward pass

* Derivative of sigmoid (http://www.ai.mit.edu/courses/6.892/lecture8-html/sld015.htm)

In [None]:
# Take a single training example, forward it through the network and
# backpropagate the error to obtain the gradient of the cost with respect
# to every bias and weight.
def backprop(biases, weights, x, y):
    """Compute per-layer gradients for one training example.

    Args:
        biases: list of per-layer bias arrays.
        weights: list of per-layer weight arrays, same length as `biases`.
        x: network input (assumed to be a column vector matching the first
            weight matrix - confirm against caller).
        y: desired output, compared directly with the last layer's
            activation.

    Returns:
        A tuple (nabla_b, nabla_w) of lists laid out like `biases` and
        `weights`, holding the gradient for each layer.
    """
    # Derivative of sigmoid; sigmoid(z) is evaluated once and reused
    def sigmoid_prime(z):
        s = sigmoid(z)
        return s*(1-s)

    # Derivative of the cost with respect to the output activations
    # (the form obtained for a quadratic cost)
    def cost_derivative(output_activations, y):
        return (output_activations-y)

    # gradients of the cost function for each layer
    nabla_b = zeros_like(biases)
    nabla_w = zeros_like(weights)

    # FORWARD PASS
    activation = x
    activations = [x] # list to store all the activations, layer by layer
    zs = [] # list to store all the z vectors, layer by layer

    for b, w in zip(biases, weights):
        z = np.dot(w, activation)+b
        activation = sigmoid(z)

        # store all previous z and a vectors
        zs.append(z)
        activations.append(activation)

    # BACKWARD PASS
    # the last layer
    delta = cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())

    # previous layers
    # going backwards layer by layer; `depth` counts layers from the end,
    # so depth=2 is the second-to-last layer.
    # `range` (not `xrange`) keeps this runnable on both Python 2 and 3.
    num_layers = len(weights)+1
    for depth in range(2, num_layers):
        z = zs[-depth]
        sp = sigmoid_prime(z)
        # propagate the error of the following layer back through its weights
        delta = np.dot(weights[-depth+1].transpose(), delta) * sp

        nabla_b[-depth] = delta
        nabla_w[-depth] = np.dot(delta, activations[-depth-1].transpose())

    return (nabla_b, nabla_w)

## Minibatch

A minibatch is a randomly selected, fixed-size subset of the training inputs.

## Stochastic Gradient Descent (aka SGD)
* Stochastic
* Gradient
* Descent

<img src="https://raw.githubusercontent.com/martinkersner/ann-meetup/master/files/gradient-descent.png" />

<img src="https://raw.githubusercontent.com/martinkersner/ann-meetup/master/files/sgd-update.png" />

The **eta** hyperparameter (the learning rate) represents the size of the step taken during gradient descent.

In [None]:
# Apply one gradient descent step computed from a single mini-batch.
def update_mini_batch(biases, weights, mini_batch, eta):
    """Return updated (biases, weights) after one step on `mini_batch`.

    The gradients returned by backprop for every (x, y) pair in the batch
    are summed, and each parameter array v is replaced by
    v - (eta / len(mini_batch)) * summed_gradient.
    New lists are returned; the input lists are not reassigned in place.
    """
    grad_b_total = zeros_like(biases)
    grad_w_total = zeros_like(weights)

    # Accumulate the per-example gradients over the whole mini-batch
    for x, y in mini_batch:
        grad_b, grad_w = backprop(biases, weights, x, y)
        grad_b_total = [total + g for total, g in zip(grad_b_total, grad_b)]
        grad_w_total = [total + g for total, g in zip(grad_w_total, grad_w)]

    batch_len = len(mini_batch)

    # Step against the averaged gradient, scaled by the learning rate
    new_weights = [w - (eta / batch_len) * gw
                   for w, gw in zip(weights, grad_w_total)]
    new_biases = [b - (eta / batch_len) * gb
                  for b, gb in zip(biases, grad_b_total)]

    return new_biases, new_weights

## Training
* **Epoch**: one full pass through the training set. ([reference](http://ufldl.stanford.edu/tutorial/supervised/OptimizationStochasticGradientDescent/))

In [None]:
def SGD(biases, weights, training_data, epochs, mini_batch_size, eta, test_data=None):
    """Train the network with mini-batch stochastic gradient descent.

    Args:
        biases: list of per-layer bias arrays (initial values).
        weights: list of per-layer weight arrays (initial values).
        training_data: list of (x, y) pairs. NOTE: it is shuffled IN PLACE
            at the start of every epoch. For the training accuracy, y is
            reduced to a class index with np.argmax.
        epochs: number of full passes over `training_data`.
        mini_batch_size: number of examples per gradient step.
        eta: step size handed to update_mini_batch.
        test_data: optional collection of (x, label) pairs; `label` is
            compared directly with the class index from feedforward. When
            given (and non-empty), accuracies are measured after every epoch.

    Returns:
        (biases, weights, train_acc_lst, test_acc_lst). The two accuracy
        lists hold one fraction per epoch and stay empty without test_data.
    """
    def evaluate(biases, weights, data, one_hot_labels=False):
        # Count how many examples in `data` are classified correctly.
        if one_hot_labels:
            test_results = [(feedforward(biases, weights, x), np.argmax(y)) for (x, y) in data]
        else:
            test_results = [(feedforward(biases, weights, x), y) for (x, y) in data]

        return sum(int(x == y) for (x, y) in test_results)

    if test_data:
        n_test = len(test_data)

    n = len(training_data)
    test_acc_lst = []
    train_acc_lst = []

    # `range` (not `xrange`) keeps this runnable on both Python 2 and 3.
    for j in range(epochs):
        random.shuffle(training_data)

        mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]

        for mini_batch in mini_batches:
            biases, weights = update_mini_batch(biases, weights, mini_batch, eta)

        if test_data:
            test_acc = evaluate(biases, weights, test_data)
            test_acc_lst.append(test_acc/n_test)

            train_acc = evaluate(biases, weights, training_data, one_hot_labels=True)
            train_acc_lst.append(train_acc/n)

            print("Epoch {0}: {1} / {2}".format(j, test_acc, n_test))
        else:
            print("Epoch {0} complete".format(j))

    return biases, weights, train_acc_lst, test_acc_lst

## MNIST dataset
* http://yann.lecun.com/exdb/mnist/
* 28 by 28 pixels
* grayscale images

In [None]:
import mnist_loader

# Load the three MNIST splits through the project's loader module.
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

# Report how many examples each split contains.
for template, dataset in (
        ("number of training examples: {0:1d}", training_data),
        ("number of validation examples: {0:1d}", validation_data),
        ("number of test examples: {0:1d}", test_data)):
    print(template.format(len(dataset)))

## MNIST Training data

In [None]:
def get_sample(data, idx):
    """Return the (input, label) pair stored at position `idx` of `data`."""
    sample = data[idx]
    return sample[0], sample[1]

def display_digit(data, idx=None):
    """Show one sample of `data` as a 28x28 grayscale image and print its label.

    Args:
        data: indexable collection of (image, label) pairs. The image must
            be reshapeable to (28, 28).
        idx: index of the sample to show. If None, a random sample is shown.

    The label is printed as-is when it is an integer (Python or NumPy
    scalar); otherwise it is treated as an encoded vector and the index of
    its largest entry is printed.
    """
    if idx is None:
        idx = random.randint(0, len(data)-1)

    X_tmp, y_tmp = get_sample(data, idx)

    img = X_tmp.reshape((28, 28))
    plt.figure()
    plt.imshow(img, cmap='gray')

    # np.integer is checked too: NumPy integer scalars are not `int`
    # instances on Python 3, and np.argmax of a scalar is always 0.
    if isinstance(y_tmp, (int, np.integer)):
        print(y_tmp)
    else:
        print(np.argmax(y_tmp))
    
# No idx is given, so a randomly chosen training sample is displayed.
display_digit(training_data)

## MNIST One-hot encoding

In [None]:
def display_random_one_hot_encoding(data):
    """Print and plot the label vector of a randomly chosen sample of `data`.

    The raw label is printed, its transpose is drawn as a grayscale image,
    and finally the index of its largest entry is printed.
    """
    last_idx = len(data) - 1
    _, y_rand = get_sample(data, random.randint(0, last_idx))

    print(y_rand)

    plt.figure()
    plt.imshow(y_rand.T, cmap='gray')

    print(np.argmax(y_rand))
    
# Show the label vector of one randomly chosen training sample.
display_random_one_hot_encoding(training_data)

## MNIST Training

In [None]:
# Network architecture:
#   784 input neurons  (one per pixel of a 28x28 image),
#   30 hidden neurons  (arbitrarily chosen),
#   10 output neurons  (one-hot encoding for 10 different values).
layers = [784, 30, 10]

# Fixed seed, so that every run starts from the same parameters
seed = 5

# Randomly initialize biases and weights
biases = init_biases(layers, seed=seed)
weights = init_weights(layers, seed=seed)

# Training hyperparameters
epochs = 5
mini_batch_size = 10
eta = 3.0

# Train; validation_data is passed as test_data so that accuracy is
# reported after every epoch.
biases_final, weights_final, train_acc, test_acc = SGD(
    biases, weights, training_data,
    epochs, mini_batch_size, eta,
    test_data=validation_data)
# Alternative without per-epoch evaluation:
# biases_final, weights_final, _, _test_acc = SGD(biases, weights, training_data, epochs, mini_batch_size, eta)

In [None]:
# Plot per-epoch accuracy curves. The curve labelled 'Test accuracy' holds
# the values SGD measured on the data passed as its test_data argument.
for acc, color, label in ((test_acc, "r", 'Test accuracy'),
                          (train_acc, "b", 'Train accuracy')):
    plt.plot(range(len(acc)), acc, c=color, label=label)

plt.legend()
plt.xlabel("epochs")
plt.ylabel("accuracy")

## MNIST Evaluation on test dataset

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Collect the true label and the network's prediction for every test sample
y_true = []
y_pred = []

for test_idx, (X_test, y_test) in enumerate(test_data):
    y_true.append(y_test)
    y_pred.append(feedforward(biases_final, weights_final, X_test))

# Confusion matrix drawn as a heat map
plt.imshow(confusion_matrix(y_true, y_pred), cmap="jet")

# Precision, Recall, F1-score, Support
print(classification_report(y_true, y_pred))

## Keras