In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# Intermediate Q&A 1
***

Agenda:
* Tips and tricks for the challenge
    * Layers, optimizers, losses, hyperparameters
    * My solution
* Q&A

# Useful layers
***
`Dense` layers are fully-connected
* $784$ input pixels for $n$ `Dense` neurons give $784n$ weights!
* Each output depends on each input

`Conv2D` layers set up small *kernels* that are applied across the input
* Only $k_w \cdot k_h$ weights per kernel and input channel (plus one bias), independent of the input size!
<img src="https://anhreynolds.com/img/cnn.png" style="height:300px;">

# Useful layers 2
***
`Pooling(w,h)` layers reduce the dimensionality of the input
* `AvgPool2D`: Take the average value of each patch
* `MaxPool2D`: Take the maximum value of each patch
* No weights and no learning, but helps to filter out unnecessary input

In [10]:
# Demo network: two identical Conv2D -> AvgPool2D stages on 28x28 inputs.
input_layer = keras.layers.Input((28, 28))
# Append a single channel axis so the 2D convolutions get (28, 28, 1) inputs.
l = keras.layers.Reshape((28, 28, 1))(input_layer)

# padding="same" keeps the spatial size inside each convolution, the 2x2
# average pooling then halves it: 28 -> 14 -> 7.
for stage_index in range(2):
    l = keras.layers.Conv2D(
        filters=32,
        kernel_size=(3, 3),
        activation="relu",
        padding="same",
    )(l)
    l = keras.layers.AvgPool2D((2, 2))(l)

model = keras.models.Model(input_layer, l)
model.summary(line_length=50)

Model: "functional_7"
__________________________________________________
Layer (type)          Output Shape        Param # 
input_7 (InputLayer)  [(None, 28, 28)]    0       
__________________________________________________
reshape_6 (Reshape)   (None, 28, 28, 1)   0       
__________________________________________________
conv2d_6 (Conv2D)     (None, 28, 28, 32)  320     
__________________________________________________
average_pooling2d_6 ( (None, 14, 14, 32)  0       
__________________________________________________
conv2d_7 (Conv2D)     (None, 14, 14, 32)  9248    
__________________________________________________
average_pooling2d_7 ( (None, 7, 7, 32)    0       
Total params: 9,568
Trainable params: 9,568
Non-trainable params: 0
__________________________________________________


# Useful optimizers
***
All optimizers in `keras` are based on gradient descent
* Idea: The negative gradient points in the direction in which the function decreases fastest
* The `learning_rate` determines how far to move in that direction of steepest descent

`SGD` is the vanilla gradient descent method
* Gets stuck in local optima fast

Others, like `Adam`, `Adamax`, `Adadelta` dynamically set learning rate
* E.g. based on gradient norms, dimensionality
* In practice, they almost always outperform `SGD`

# Which to choose?
***
The effectiveness of an optimizer depends on the network and the training data

Use case:
* Proof of concept, i.e. "Wonder if this would learn anything"
    * Start with standard methods, e.g. `SGD`
* Scientific research
    * Try any *reasonable* methods
* Trying to beat Niklas
    * Try out everything

# Useful losses
***
Our problem involves classification. In almost all cases the one below works very well
* `categorical_crossentropy` is a measure of closeness between discrete distributions
    * I.e. set the parameters such that our model approximates the input distribution

For classification it might also be interesting to play around with
* `MSE`: Usually worse, but has its perks for some optimizers
 

# Hyperparameters
***
Getting the most out of your model usually involves tuning hyperparameters:
* `learning_rate`
    * Large steps might skip over more local minima, but may also not converge
    * Small steps might converge to local minima and never leave them again
        * Decreasing the learning rate during training can help a lot!
* `batch_size`
    * Theoretically, `batch_size=1` has the fastest convergence properties
        * But per epoch, not per second!
    * Start with a `batch_size` that allows for rapid prototype training
        * Depends on your hardware
    * For promising candidates: Lower it until you see diminishing returns

# Try it out
***
All of the design choices available are interconnected:
* Network architecture
    * Type of layers
    * Activation functions
    * Weights per layer
* Optimizer
    * Learning rate, momentum
    * batch size
* Losses

There are no perfect solutions
* It is a lot of trial and error
* Very good solutions for scenario A might be far from good for scenario B

# My current solution
***
Result of tons of trial and error.

In [3]:
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

def evaluate_model(model, x_test, y_test):
    """Report the test accuracy and the parameter count of ``model``.

    Args:
        model: Model that is called directly on ``x_test`` and that provides
            ``count_params()``.
        x_test: Test inputs, passed to the model in a single call.
        y_test: Targets compared against the model output with
            ``keras.metrics.CategoricalAccuracy``.

    Returns:
        Dict with the keys ``"Test accuracy"`` (the metric result converted
        via ``.numpy()``) and ``"Number of parameters"``.
    """
    predictions = model(x_test)

    # A fresh metric object per call, so no state carries over between evaluations.
    accuracy_tracker = keras.metrics.CategoricalAccuracy()
    accuracy_tracker.update_state(y_test, predictions)

    return {
        "Test accuracy": accuracy_tracker.result().numpy(),
        "Number of parameters": model.count_params(),
    }

def get_data(num_classes=10):
    """Load MNIST and one-hot encode the labels.

    Args:
        num_classes: Width of the one-hot label vectors. It is passed
            explicitly to ``to_categorical`` so that the train and test
            labels always get the same number of columns, instead of each
            width being inferred from the largest label in that split.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``. The images are returned
        exactly as delivered by ``keras.datasets.mnist.load_data`` (no
        rescaling is applied here); the labels are one-hot encoded.
    """
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
    y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

    return x_train, y_train, x_test, y_test

# Load the data once at notebook level; the training/evaluation cell further down reads these four names.
x_train, y_train, x_test, y_test = get_data()

In [23]:
# Seed TensorFlow's global RNG before building the network
# (presumably to make the weight initialisation repeatable).
tf.random.set_seed(1)

input_layer = keras.layers.Input((28, 28))
l = keras.layers.Reshape((28, 28, 1))(input_layer)

# Shrink the image before any weights are spent: 28x28 -> 14x14.
l = keras.layers.AvgPool2D(pool_size=(2, 2))(l)

# Two unpadded 3x3 convolutions: 14x14x1 -> 12x12x3 -> 10x10x1 (30 + 28 parameters).
l = keras.layers.Conv2D(filters=3, kernel_size=3, activation="sigmoid", padding="valid")(l)
l = keras.layers.Conv2D(filters=1, kernel_size=3, activation="sigmoid", padding="valid")(l)

# 3x3 pooling leaves a 3x3x1 map, which the final 3x3 convolution turns
# into a 1x1x10 softmax output (100 parameters).
l = keras.layers.AvgPool2D(pool_size=(3, 3))(l)
l = keras.layers.Conv2D(filters=10, kernel_size=3, activation="softmax", padding="valid")(l)

# Flatten (1, 1, 10) to (10,) so the output lines up with the one-hot labels.
l = keras.layers.Reshape((-1,))(l)

model = keras.models.Model(input_layer, l)
model.summary(line_length=50)

Model: "functional_17"
__________________________________________________
Layer (type)          Output Shape        Param # 
input_9 (InputLayer)  [(None, 28, 28)]    0       
__________________________________________________
reshape_16 (Reshape)  (None, 28, 28, 1)   0       
__________________________________________________
average_pooling2d_16  (None, 14, 14, 1)   0       
__________________________________________________
conv2d_24 (Conv2D)    (None, 12, 12, 3)   30      
__________________________________________________
conv2d_25 (Conv2D)    (None, 10, 10, 1)   28      
__________________________________________________
average_pooling2d_17  (None, 3, 3, 1)     0       
__________________________________________________
conv2d_26 (Conv2D)    (None, 1, 1, 10)    100     
__________________________________________________
reshape_17 (Reshape)  (None, 10)          0       
Total params: 158
Trainable params: 158
Non-trainable params: 0
______________________________________________

In [24]:
# Use keyword arguments: the positional parameter order of Model.compile is
# not the same across Keras versions (newer releases place `loss_weights`
# before `metrics`), so a positional metrics list is fragile.
model.compile(optimizer="nadam", loss="categorical_crossentropy", metrics=["accuracy"])

# Both callbacks monitor the training loss:
# - ReduceLROnPlateau halves the learning rate (factor=0.5) with patience=3.
# - EarlyStopping stops with patience=5, counting only changes of at least 1e-6.
callbacks = [
    keras.callbacks.ReduceLROnPlateau(monitor="loss", factor=0.5, patience=3, verbose=1),
    keras.callbacks.EarlyStopping(monitor="loss", patience=5, min_delta=1e-6),
]

# epochs=1000 acts as an upper bound; EarlyStopping may end training sooner.
# The result is bound to `history` rather than `_`, because assigning to `_`
# shadows IPython's "last output" variable for the rest of the session.
history = model.fit(
    x_train,
    y_train,
    epochs=1000,
    batch_size=1024,
    verbose=2,
    callbacks=callbacks,
)
print(evaluate_model(model, x_test, y_test))

Epoch 1/1000
59/59 - 0s - loss: 2.3183 - accuracy: 0.0987
Epoch 2/1000
59/59 - 0s - loss: 2.2952 - accuracy: 0.1422
Epoch 3/1000
59/59 - 0s - loss: 2.2811 - accuracy: 0.1242
Epoch 4/1000
59/59 - 0s - loss: 2.2589 - accuracy: 0.1469
Epoch 5/1000
59/59 - 0s - loss: 2.2239 - accuracy: 0.1992
Epoch 6/1000
59/59 - 0s - loss: 2.1729 - accuracy: 0.2957
Epoch 7/1000
59/59 - 0s - loss: 2.1109 - accuracy: 0.3647
Epoch 8/1000
59/59 - 0s - loss: 2.0447 - accuracy: 0.4726
Epoch 9/1000
59/59 - 0s - loss: 1.9780 - accuracy: 0.5224
Epoch 10/1000
59/59 - 0s - loss: 1.9159 - accuracy: 0.5637
Epoch 11/1000
59/59 - 0s - loss: 1.8564 - accuracy: 0.5857
Epoch 12/1000
59/59 - 0s - loss: 1.7996 - accuracy: 0.6066
Epoch 13/1000
59/59 - 0s - loss: 1.7448 - accuracy: 0.6205
Epoch 14/1000
59/59 - 0s - loss: 1.6912 - accuracy: 0.6356
Epoch 15/1000
59/59 - 0s - loss: 1.6406 - accuracy: 0.6471
Epoch 16/1000
59/59 - 0s - loss: 1.5928 - accuracy: 0.6575
Epoch 17/1000
59/59 - 0s - loss: 1.5476 - accuracy: 0.6672
Epoch 

Epoch 140/1000
59/59 - 0s - loss: 0.4407 - accuracy: 0.8691
Epoch 141/1000
59/59 - 0s - loss: 0.4396 - accuracy: 0.8688
Epoch 142/1000
59/59 - 0s - loss: 0.4374 - accuracy: 0.8698
Epoch 143/1000
59/59 - 0s - loss: 0.4366 - accuracy: 0.8695
Epoch 144/1000
59/59 - 0s - loss: 0.4346 - accuracy: 0.8698
Epoch 145/1000
59/59 - 0s - loss: 0.4333 - accuracy: 0.8702
Epoch 146/1000
59/59 - 0s - loss: 0.4315 - accuracy: 0.8713
Epoch 147/1000
59/59 - 0s - loss: 0.4308 - accuracy: 0.8716
Epoch 148/1000
59/59 - 0s - loss: 0.4295 - accuracy: 0.8713
Epoch 149/1000
59/59 - 0s - loss: 0.4276 - accuracy: 0.8718
Epoch 150/1000
59/59 - 0s - loss: 0.4262 - accuracy: 0.8719
Epoch 151/1000
59/59 - 0s - loss: 0.4257 - accuracy: 0.8718
Epoch 152/1000
59/59 - 0s - loss: 0.4241 - accuracy: 0.8720
Epoch 153/1000
59/59 - 0s - loss: 0.4232 - accuracy: 0.8722
Epoch 154/1000
59/59 - 0s - loss: 0.4223 - accuracy: 0.8729
Epoch 155/1000
59/59 - 0s - loss: 0.4207 - accuracy: 0.8726
Epoch 156/1000
59/59 - 0s - loss: 0.4195

Epoch 277/1000
59/59 - 0s - loss: 0.3475 - accuracy: 0.8920
Epoch 278/1000
59/59 - 0s - loss: 0.3472 - accuracy: 0.8926
Epoch 279/1000
59/59 - 0s - loss: 0.3473 - accuracy: 0.8926
Epoch 280/1000
59/59 - 0s - loss: 0.3465 - accuracy: 0.8928
Epoch 281/1000
59/59 - 0s - loss: 0.3464 - accuracy: 0.8929
Epoch 282/1000
59/59 - 0s - loss: 0.3460 - accuracy: 0.8925
Epoch 283/1000
59/59 - 0s - loss: 0.3454 - accuracy: 0.8937
Epoch 284/1000
59/59 - 0s - loss: 0.3456 - accuracy: 0.8930
Epoch 285/1000
59/59 - 0s - loss: 0.3446 - accuracy: 0.8929
Epoch 286/1000
59/59 - 0s - loss: 0.3447 - accuracy: 0.8931
Epoch 287/1000
59/59 - 0s - loss: 0.3442 - accuracy: 0.8935
Epoch 288/1000
59/59 - 0s - loss: 0.3444 - accuracy: 0.8938
Epoch 289/1000
59/59 - 0s - loss: 0.3437 - accuracy: 0.8936
Epoch 290/1000
59/59 - 0s - loss: 0.3432 - accuracy: 0.8935
Epoch 291/1000
59/59 - 0s - loss: 0.3432 - accuracy: 0.8936
Epoch 292/1000
59/59 - 0s - loss: 0.3430 - accuracy: 0.8942
Epoch 293/1000
59/59 - 0s - loss: 0.3426

{'Test accuracy': 0.9112, 'Number of parameters': 158}


# Question time