In [None]:
using MLDatasets: MNIST
using Flux
using Flux.Data: DataLoader
using Flux: onehotbatch, onehot, onecold
using Flux.Losses: crossentropy, logitcrossentropy
using Flux.Optimise
#using ImageShow
#using Images
#using Plots

using ProgressMeter: @showprogress
using Printf

In [None]:
using CUDA

# Check whether the CUDA setup is functional in this session.
# NOTE(review): none of the cells visible here move the model or data to the GPU,
# so this check is informational only — confirm whether GPU use was intended.
CUDA.functional()

In [None]:
# Inspect the dimensions of the raw MNIST training tensor.
size(MNIST.traintensor())

In [None]:
# Load both MNIST splits, requesting Float32 data.
(X_train, y_train) = MNIST.traindata(Float32)
(X_test, y_test) = MNIST.testdata(Float32)

# Number of samples in each split, taken from the label vectors.
Ntrain = length(y_train)
Ntest = length(y_test)
(Ntrain, Ntest)

In [None]:
# Shapes of the training and test image arrays, as a tuple of size tuples.
map(size, (X_train, X_test))

In [None]:
# Insert a singleton third dimension so each array is 28 × 28 × 1 × N
# (presumably width × height × channels × batch, as the Conv layers expect).
X_train = reshape(X_train, (28, 28, 1, :))
X_test = reshape(X_test, (28, 28, 1, :))

# Confirm the new shapes.
map(size, (X_train, X_test))

In [None]:
# Pick a random training sample, print its index and label, then convert it to an image.
k = rand(1:Ntrain)
@show k
@show y_train[k]
# Intensities are inverted (1 .- pixel) before conversion — presumably so the
# digit shows dark on a light background.
MNIST.convert2image(1 .- X_train[:,:,1,k])

---

# One Hot / One Cold

In [None]:
# Demonstrate one-hot encoding: pick a random training label and encode it
# over the classes 0:9.
# NOTE: indexes y_train with a single index, so this expects y_train to still
# hold the raw labels (i.e. run it before the cell that rebinds y_train via
# onehotbatch).
k = rand(1:Ntrain)
@show k
@show y_train[k]
onehot(y_train[k], 0:9)

In [None]:
# One-hot encode the label vectors over the classes 0:9, rebinding y_train and
# y_test to the encoded matrices (one column per sample).
#
# The ndims guards make this cell safe to re-run: once a variable has been
# rebound to the 2-D encoded form, encoding it again would feed the Bool
# entries of the one-hot matrix to onehotbatch as if they were labels, which is
# not meaningful (depending on the Flux version it errors or yields garbage
# targets). Only a still-1-D label vector is encoded.
if ndims(y_train) == 1
    y_train = onehotbatch(y_train, 0:9)
end
if ndims(y_test) == 1
    y_test = onehotbatch(y_test, 0:9)
end

# Sanity check: decode the first column back to its class label.
onecold(y_train[:,1], 0:9)

In [None]:
# Pick a random training sample, decode its one-hot column back to a label in
# 0:9, and convert the matching image so the two can be compared.
k = rand(1:Ntrain)
@show k
@show onecold(y_train[:,k], 0:9)
# Intensities are inverted (1 .- pixel) before conversion.
MNIST.convert2image(1 .- X_train[:,:,1,k])

In [None]:
?Conv

In [None]:
?crossentropy

In [None]:
?logitcrossentropy

In [None]:
?onecold

In [None]:
# Small convolutional classifier for 28×28×1 inputs with 10 output classes.
# The shape notes assume Conv's default stride (1) and no padding.
model = Chain(
    Conv((5, 5), 1 => 6, relu),     # 28×28×1 -> 24×24×6
    MaxPool((2, 2)),                # 24×24×6 -> 12×12×6
    Conv((5, 5), 6 => 16, relu),    # 12×12×6 -> 8×8×16
    MaxPool((2, 2)),                # 8×8×16  -> 4×4×16
    flatten,                        # 4×4×16  -> 256 features per sample
    Dense(256, 128, relu),
    Dense(128, 64, relu),
    Dense(64, 10),                  # raw logits: no softmax layer, see `loss`
)

# Trainable parameters of the model, used for gradients and optimiser updates.
ps = params(model)

# Training batches of 64 samples, shuffled.
train_data = DataLoader((X_train, y_train); batchsize=64, shuffle=true)

# Test batches of 10000 samples.
test_data = DataLoader((X_test, y_test); batchsize=10000)

# Cross-entropy evaluated directly on the logits produced by `model`.
function loss(ŷ, y)
    return logitcrossentropy(ŷ, y)
end

# Smoke test on exactly one training batch: print the true labels, the output
# shape, the decoded predictions (with and without softmax) and the loss
# computed three ways so they can be compared by eye.
for (x, y) in Iterators.take(train_data, 1)
    @show onecold(y, 0:9)
    ŷ = model(x)
    @show size(ŷ)
    @show onecold(ŷ, 0:9)
    @show onecold(softmax(ŷ), 0:9)
    @show loss(ŷ, y)
    @show logitcrossentropy(ŷ, y)
    @show crossentropy(softmax(ŷ), y)
end

In [None]:
# Percentage of samples in `X` for which the global `model` picks the same
# class index as the one-hot target matrix `y` (one column per sample).
function accuracy(X, y)
    predicted = onecold(model(X))
    expected = onecold(y)
    hits = count(predicted .== expected)
    return hits / size(y, 2) * 100
end

In [None]:
# Accuracy (in percent) on the full training and test sets with the model's
# current weights.
accuracy(X_train, y_train), accuracy(X_test, y_test)

In [None]:
# Total number of scalar trainable parameters, summed over every array in `ps`.
num_params = sum(length(p) for p in ps)

In [None]:
# ADAM optimiser constructed with no arguments, i.e. its default hyperparameters.
opt = ADAM()

In [None]:
# Train for a fixed number of epochs and print one table row per epoch:
# epoch number, train accuracy (%), test accuracy (%), and time in seconds.
epochs = 4
@printf("%6s %6s %6s %6s\n", "epoch", "train", "test", "time")
for epoch in 1:epochs
    # Only the pass over the training batches is timed; the accuracy
    # evaluation below is not included.
    elapsed = @elapsed begin
        for (x, y) in train_data
            grads = Flux.gradient(() -> loss(model(x), y), ps)
            update!(opt, ps, grads)
        end
    end
    train_acc = accuracy(X_train, y_train)
    test_acc = accuracy(X_test, y_test)
    @printf("%6d %6.2f %6.2f %6.2f\n", epoch, train_acc, test_acc, elapsed)
end

In [None]:
# Predicted class (decoded against the labels 0:9) for every test image;
# the trailing `;` suppresses the cell's output.
prediction = onecold(model(X_test), 0:9);

In [None]:
# Pick a random test sample, print its index and the model's predicted digit,
# then convert the image so the prediction can be checked by eye.
k = rand(1:Ntest)
@show k
@show prediction[k]
# Intensities are inverted (1 .- pixel) before conversion.
MNIST.convert2image(1 .- X_test[:,:,1,k])