# Classification with Flux

In [1]:
# Explanation to go here

In [2]:
# Including the Math

In [3]:
# Explanation of MNIST

In [18]:
using Pkg

In [19]:
Pkg.add("BSON")

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[32m[1m Installed[22m[39m RecursiveArrayTools ─ v2.0.4
[32m[1m Installed[22m[39m BSON ──────────────── v0.2.5
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
 [90m [fbb218c0][39m[92m + BSON v0.2.5[39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
 [90m [fbb218c0][39m[92m + BSON v0.2.5[39m
 [90m [731186ca][39m[93m ↑ RecursiveArrayTools v2.0.2 ⇒ v2.0.4[39m


In [20]:
using Flux
using Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated, partition
using Printf, BSON

┌ Info: Precompiling BSON [fbb218c0-5317-5bc6-957e-2ee96dd4b1f0]
└ @ Base loading.jl:1242


## 1. Classification using a multi-layer-perceptron (MLP)

In [10]:
# Definition of the MLP to go here

In [7]:
# Load dataset
imgs = Flux.Data.MNIST.images()

# Flatten every image into a column vector and stack the columns into one batch matrix.
# `reduce(hcat, ...)` is used instead of splatting (`hcat(xs...)`) so that each image
# is not passed to `hcat` as a separate function argument.
X = reduce(hcat, float.(reshape.(imgs, :)));

In [9]:
# Load labels (no split argument here; the test split is requested with `:test`
# further down in this notebook)
labels = Flux.Data.MNIST.labels()

# One-hot-encode the labels against the class set 0:9
Y = onehotbatch(labels, 0:9);

In [12]:
# Set up the MLP: 28^2 inputs -> 32 hidden units (ReLU) -> 10 outputs -> softmax
m = Chain(Dense(28^2, 32, relu), Dense(32, 10), softmax)

# Cross-entropy between the MLP's output for `x` and the targets `y`
function loss(x, y)
    return crossentropy(m(x), y)
end

# Mean agreement between the decoded MLP predictions for `x` and the decoded targets `y`
function accuracy(x, y)
    predicted = onecold(m(x))
    actual = onecold(y)
    return mean(predicted .== actual)
end;

In [13]:
# Set up the training: the same full batch (X, Y) is presented 200 times
dataset = repeated((X, Y), 200)
opt = ADAM()

# Callback that prints the loss on the full batch
evalcb = function ()
    return @show(loss(X, Y))
end

# The callback is wrapped in `throttle(..., 10)` so it is not run on every step
Flux.train!(loss, params(m), dataset, opt; cb = throttle(evalcb, 10))

loss(X, Y) = 2.3216996f0
loss(X, Y) = 2.0873454f0
loss(X, Y) = 1.8983456f0
loss(X, Y) = 1.7171309f0
loss(X, Y) = 1.5390797f0
loss(X, Y) = 1.3663688f0
loss(X, Y) = 1.2474085f0
loss(X, Y) = 1.1391566f0
loss(X, Y) = 1.0431044f0
loss(X, Y) = 0.9589184f0
loss(X, Y) = 0.88494235f0
loss(X, Y) = 0.80148685f0
loss(X, Y) = 0.73233825f0
loss(X, Y) = 0.6742944f0
loss(X, Y) = 0.6369574f0
loss(X, Y) = 0.60460365f0
loss(X, Y) = 0.57656044f0
loss(X, Y) = 0.55203193f0
loss(X, Y) = 0.5237859f0
loss(X, Y) = 0.4997492f0
loss(X, Y) = 0.47909763f0
loss(X, Y) = 0.46123615f0
loss(X, Y) = 0.44564414f0
loss(X, Y) = 0.4318972f0
loss(X, Y) = 0.41966504f0
loss(X, Y) = 0.40868494f0
loss(X, Y) = 0.39876556f0
loss(X, Y) = 0.38974342f0
loss(X, Y) = 0.38147405f0
loss(X, Y) = 0.37383783f0
loss(X, Y) = 0.366751f0
loss(X, Y) = 0.3601332f0
loss(X, Y) = 0.35392383f0
loss(X, Y) = 0.34807175f0
loss(X, Y) = 0.34254754f0
loss(X, Y) = 0.33732703f0
loss(X, Y) = 0.3323816f0
loss(X, Y) = 0.3276865f0
loss(X, Y) = 0.32321966f0
loss(X

In [14]:
# Assess the accuracy of the trained MLP on the data it was trained on (X, Y)
accuracy(X, Y)

0.9237166666666666

In [15]:
# Compute the test set accuracy
# Flatten each test image into a column and stack the columns into one matrix;
# `reduce(hcat, ...)` avoids passing every image to `hcat` as a separate argument.
tX = reduce(hcat, float.(reshape.(Flux.Data.MNIST.images(:test), :)))
tY = onehotbatch(Flux.Data.MNIST.labels(:test), 0:9)

accuracy(tX, tY)

0.924

## 2. Classification using a convolutional neural network

In [22]:
# Load labels and images (no split argument; these are used below as the training data,
# while the test data is loaded separately with `:test`)
train_labels = Flux.Data.MNIST.labels()
train_imgs = Flux.Data.MNIST.images();

In [44]:
# Construct minibatches
# Assemble one minibatch from the samples selected by `idxs`.
#
# Arguments:
#   X    - indexable collection of images; every selected image must have the same
#          size as `X[1]` (the three-colon indexing below assumes 2-d images)
#   Y    - labels, indexable by `idxs`
#   idxs - indices of the samples that make up this minibatch
#
# Returns a tuple `(X_batch, Y_batch)`: `X_batch` is a `Float32` array of size
# `(size(X[1])..., 1, length(idxs))`, i.e. a singleton dimension is inserted before
# the sample dimension, and `Y_batch` is the one-hot encoding of `Y[idxs]` over `0:9`.
function make_minibatch(X, Y, idxs)
    X_batch = Array{Float32}(undef, size(X[1])..., 1, length(idxs))
    # `enumerate` pairs each sample index with its slot in the batch, so `idxs` is
    # never indexed by position and does not need to be 1-based.
    for (slot, idx) in enumerate(idxs)
        X_batch[:, :, :, slot] = Float32.(X[idx])
    end
    Y_batch = onehotbatch(Y[idxs], 0:9)
    return (X_batch, Y_batch)
end

# Split the training indices into consecutive chunks of `batch_size` and build
# one minibatch per chunk
batch_size = 128
mb_idxs = partition(1:length(train_imgs), batch_size)
train_set = map(idxs -> make_minibatch(train_imgs, train_labels, idxs), mb_idxs)

# The whole test split is packed into a single minibatch
test_imgs = Flux.Data.MNIST.images(:test)
test_labels = Flux.Data.MNIST.labels(:test)
test_set = make_minibatch(test_imgs, test_labels, 1:length(test_imgs));

In [38]:
# CNN classifier: three Conv -> ReLU -> MaxPool stages, followed by a dense layer
# whose output is turned into class probabilities by softmax
model = Chain(
    # Stage 1: 28x28 input, 1 -> 16 channels
    Conv((3, 3), 1 => 16, relu; pad = (1, 1)),
    MaxPool((2, 2)),

    # Stage 2: 14x14 input, 16 -> 32 channels
    Conv((3, 3), 16 => 32, relu; pad = (1, 1)),
    MaxPool((2, 2)),

    # Stage 3: 7x7 input, 32 -> 32 channels
    Conv((3, 3), 32 => 32, relu; pad = (1, 1)),
    MaxPool((2, 2)),

    # Flatten the 4d tensor of shape (3, 3, 32, N) into a 2d matrix of shape (288, N)
    x -> reshape(x, :, size(x, 4)),
    Dense(288, 10),

    # Softmax output layer
    softmax,
);

In [39]:
# If GPU is enabled uncomment the lines below to load the model onto a GPU
#train_set = gpu.(train_set)
#test_set = gpu.(test_set)
#model = gpu(model)

In [40]:
# Precompile the model by running one forward pass on the images of the first
# training minibatch
model(train_set[1][1])

10×128 Array{Float32,2}:
 0.0856447  0.0855172  0.0793483  …  0.080293   0.0805311  0.0820767
 0.0863398  0.088711   0.0882837     0.0896656  0.0868811  0.0903626
 0.112502   0.108703   0.108787      0.114588   0.108995   0.109684 
 0.102752   0.107586   0.102949      0.110352   0.106419   0.108801 
 0.109906   0.111776   0.107399      0.104055   0.108576   0.103853 
 0.0880292  0.0881556  0.101727   …  0.0896815  0.0892337  0.0925875
 0.122918   0.121647   0.106211      0.124256   0.12633    0.115889 
 0.100094   0.0967535  0.10436       0.0969488  0.0951824  0.099685 
 0.0926575  0.0917759  0.101075      0.0935281  0.098061   0.100009 
 0.0991572  0.0993751  0.09986       0.0966319  0.0997908  0.0970526

In [41]:
# Crossentropy loss between prediction and ground truth, add Gaussian noise to make model more robust
function loss(x, y)
    # Gaussian noise with the same element type and size as the input, scaled by 0.1
    noise = 0.1f0 * randn(eltype(x), size(x))

    # Score the model's prediction on the perturbed input against the targets
    return crossentropy(model(x .+ noise), y)
end

# Mean agreement between the decoded model predictions for `x` and the decoded targets `y`
function accuracy(x, y)
    predicted = onecold(model(x))
    actual = onecold(y)
    return mean(predicted .== actual)
end;

In [42]:
# Use ADAM as the optimizer, constructed with 0.001 (the training loop below reads and
# lowers this value through `opt.eta` as the learning rate)
opt = ADAM(0.001)

# Global state shared with the training loop (declared `global` there)
best_acc = 0.0
last_improvement = 0;

In [45]:
# Define the training loop and train for up to 100 epochs.
# After every epoch the test accuracy is logged and compared with the best value seen
# so far. The loop exits early once the target accuracy is reached, divides the
# learning rate by 10 after 5 epochs without improvement, and stops after 10 epochs
# without improvement.
for epoch_idx in 1:100
    global best_acc, last_improvement

    # Train for one epoch
    Flux.train!(loss, params(model), train_set, opt)

    # Calculate the accuracy
    acc = accuracy(test_set...)
    @info(@sprintf("[%d]: Test accuracy: %.4f", epoch_idx, acc))

    # Stop if accuracy is good enough
    if acc >= 0.999
        @info(" -> Early-exiting: We reached our target accuracy of 99.9%")
        break
    end

    # Record a new best accuracy. Without this update `last_improvement` only changes
    # when the learning rate is dropped, so the "no improvement" checks below would
    # fire on a fixed schedule regardless of how training is actually going.
    if acc > best_acc
        best_acc = acc
        last_improvement = epoch_idx
    end

    # Reduce learning rate if there has been no improvement for 5 epochs
    if epoch_idx - last_improvement >= 5 && opt.eta > 1e-6
        opt.eta /= 10.0
        @warn(" -> Haven't improved in a while, dropping learning rate to $(opt.eta)!")

        # Restart the patience window so the lower learning rate gets a few epochs
        last_improvement = epoch_idx
    end

    # Give up once the learning rate can no longer be lowered and nothing has improved
    if epoch_idx - last_improvement >= 10
        @warn(" -> The model has converged.")
        break
    end
end

┌ Info: [1]: Test accuracy: 0.9731
└ @ Main In[45]:10


InterruptException: InterruptException:

## 3. Exercise: Construct a different neural network for classification

In [None]:
# Look up two papers describing more state-of-the-art models and encourage readers to try implementing them
# Paper 1
# Paper 2