In [None]:
using MLDatasets: MNIST
using Flux
using Flux.Data: DataLoader
using Flux: onehotbatch, onehot, onecold
using Flux.Losses: crossentropy, logitcrossentropy
using Flux.Optimise
using Images

using ProgressMeter: @showprogress
using Printf

using LinearAlgebra

In [None]:
using CUDA

# Run on the GPU when a working CUDA setup is detected, otherwise stay on the CPU.
device = CUDA.functional() ? gpu : cpu

---

# MNIST

In [None]:
# Load the MNIST training and test splits, requesting Float32 image data.
X_train, y_train = MNIST.traindata(Float32)
X_test, y_test = MNIST.testdata(Float32)

# Number of labelled samples in each split.
Ntrain = length(y_train)
Ntest = length(y_test)
Ntrain, Ntest

In [None]:
# Array dimensions of the training and test images.
map(size, (X_train, X_test))

In [None]:
# The images are grayscale, so insert a singleton channel dimension:
# 28 × 28 × N becomes 28 × 28 × 1 × N.
X_train, X_test = map(X -> reshape(X, 28, 28, 1, :), (X_train, X_test))

map(size, (X_train, X_test))

In [None]:
# Pick a random training sample, print its index and label, and display the image.
k = rand(1:Ntrain)
@show k y_train[k]
# `1 .- pixels` inverts the intensities before converting to an image.
MNIST.convert2image(1 .- X_train[:, :, 1, k])

---

# One Hot / One Cold

In [None]:
# One-hot encode the label of a random training sample over the classes 0–9.
k = rand(1:Ntrain)
@show k y_train[k]
onehot(y_train[k], 0:9)

In [None]:
# One-hot encode every label: one column per sample, one row per digit class 0–9.
yv_train = onehotbatch(y_train, 0:9)
yv_test  = onehotbatch(y_test, 0:9)

# Round-trip check on sample k: decoding column k must give back y_train[k].
# (Both sides have to refer to the same sample for the comparison to mean anything.)
y_train[k], onecold(yv_train[:, k], 0:9)

In [None]:
# Decode the one-hot column of a random training sample and display its image.
k = rand(1:Ntrain)
@show k onecold(yv_train[:,k], 0:9)
# `1 .- pixels` inverts the intensities before converting to an image.
MNIST.convert2image(1 .- X_train[:, :, 1, k])

---

# CNN (Convolutional Neural Net)

In [None]:
# Minimal CNN: a single 3×3 convolution filter, 2×2 max-pooling, then a linear
# classifier over the 10 digit classes. The network outputs raw logits (no softmax
# layer); the loss used for training is `logitcrossentropy`.
model = device(
    Chain(
        Conv((3, 3), 1 => 1, relu),
        MaxPool((2, 2)),
        flatten,
        # 13 * 13 = 169 features per image reach the dense layer
        # (presumably 28×28 → 26×26 after the conv → 13×13 after pooling, one channel).
        Dense(13 * 13, 10),
    ),
)

# Trainable parameter arrays of the model.
ps = params(model)

# Total number of scalar parameters.
num_params = sum(length(p) for p in ps)

In [None]:
# Mini-batches of 32 samples, reshuffled on every pass, for training.
train_data = DataLoader((X_train, yv_train); batchsize=32, shuffle=true)

# Test batches of 10000 samples.
test_data = DataLoader((X_test, yv_test); batchsize=10000)

# Cross-entropy computed directly from the model's logits.
function loss(ŷ, y)
    return logitcrossentropy(ŷ, y)
end

# Sanity check: push the first training batch through the model and print the
# output size, then stop.
for (xb, yb) in train_data
    xb, yb = device(xb), device(yb)
    ŷ = model(xb)
    @show size(ŷ)
    break
end

In [None]:
"""
    accuracy(data)

Return the classification accuracy, in percent, of the global `model` on `data`.

`data` must iterate `(x, y)` batches where the columns of `y` are one-hot labels.
Inputs are moved to the global `device` for the forward pass; predictions and labels
are compared on the CPU. The result is a `Float64`; it is `NaN` when `data` yields no
samples.
"""
function accuracy(data)
    correct = 0
    total = 0
    for (x, y) in data
        ŷ = model(x |> device) |> cpu
        # Labels are only needed on the host, so they are never copied to the device.
        correct += count(onecold(ŷ) .== onecold(y |> cpu))
        total += size(y, 2)
    end
    # The counters stay integers throughout; convert to a percentage once at the end.
    return 100 * correct / total
end

accuracy(train_data), accuracy(test_data)

accuracy(train_data), accuracy(test_data)

In [None]:
# Optimiser used by `train`: Flux's `Momentum` rule constructed with its default settings.
opt = Momentum()

In [None]:
"""
    train(epochs)

Train the global `model` for `epochs` passes over `train_data`, updating the
parameters `ps` with the optimiser `opt`.

After each pass, prints one table row with the epoch number, the accuracy on
`train_data` and `test_data` (as returned by `accuracy`), and the seconds spent in
the training pass itself (accuracy evaluation is not included in the timing).
"""
function train(epochs)
    @printf("%6s %6s %6s %6s\n", "epoch", "train", "test", "time")
    for epoch in 1:epochs
        elapsed = @elapsed for (x, y) in train_data
            xb, yb = device(x), device(y)
            # Gradient of the batch loss with respect to the parameters in `ps`.
            grads = Flux.gradient(() -> loss(model(xb), yb), ps)
            update!(opt, ps, grads)
        end
        train_acc = accuracy(train_data)
        test_acc = accuracy(test_data)
        @printf("%6d %6.2f %6.2f %6.2f\n", epoch, train_acc, test_acc, elapsed)
    end
end

In [None]:
# Train for 5 epochs; one line of train/test accuracy and timing is printed per epoch.
train(5)

In [None]:
# Predicted digit (0–9) for every test image. The logits are brought back to the CPU
# before decoding, as in `accuracy`, so `onecold` never operates on a device array.
prediction = onecold(model(X_test |> device) |> cpu, 0:9);

In [None]:
# Compare the predicted and true label of a random test sample and display its image.
k = rand(1:Ntest)
@show k prediction[k] y_test[k];
# `1 .- pixels` inverts the intensities before converting to an image.
MNIST.convert2image(1 .- X_test[:, :, 1, k])

In [None]:
# Indices of the test samples whose predicted label differs from the true label.
inds = findall(.!(prediction .== y_test))

# Number of misclassified test samples.
length(inds)

In [None]:
# Show up to 8 misclassified test samples. Clamping `num` keeps `inds[1:num]` in
# bounds when the model makes fewer than 8 errors.
num = min(8, length(inds))
@show prediction[inds[1:num]]
@show y_test[inds[1:num]]
[MNIST.convert2image(1 .- X_test[:,:,1,k]) for k in inds[1:num]]

In [None]:
# Confusion matrix
# Product of the one-hot predictions (10 × N) with the transposed one-hot true
# labels (N × 10): C[i, j] counts the test samples predicted as digit i-1 whose
# true label is digit j-1. Rows index predictions, columns index true labels.

C = onehotbatch(prediction, 0:9)*yv_test'

In [None]:
# List every misclassified test sample that the model labelled as a 2,
# printing the prediction next to the true label.
for k in inds
    prediction[k] == 2 || continue
    @show prediction[k], y_test[k]
end

In [None]:
# Number of correctly classified test samples: the trace (sum of the diagonal)
# of the confusion matrix.
tr(C)

In [None]:
# Sum of all confusion-matrix entries. Each test sample contributes exactly one
# count, so this should equal the number of test samples.
sum(C)