In [None]:
using Plots, LaTeXStrings
using LinearAlgebra, Optim

---

# A simple neural network

In [None]:
# Himmelblau function

# Vector form: the point is given by its coordinates x[1], x[2].
function f(x)
    u, v = x[1], x[2]
    first_term = u^2 + v - 11
    second_term = u + v^2 - 7
    return first_term^2 + second_term^2
end

# Two-coordinate form, convenient as a plotting callback; forwards to the vector form.
f(x, y) = f([x, y])

In [None]:
# m = number of input coordinates, N = number of samples
# (k is assigned here but not used by the cells shown).
m, k, N = 2, 1, 1000

# N random Float32 points in [-6, 6)^m, stored one point per column.
X = 12 * rand(Float32, m, N) .- 6

# Target values f(X[:, i]), arranged as a 1×N row.
y = reshape(map(i -> f(X[:, i]), 1:N), 1, N)

In [None]:
# Split the samples by column: the first N_train columns are used for training,
# columns N_train+1 .. N for testing.
N_train, N_test = 800, 200

train_cols = 1:N_train
test_cols = (N_train + 1):N

X_train, y_train = X[:, train_cols], y[:, train_cols]
X_test, y_test = X[:, test_cols], y[:, test_cols]

size(X_train), size(X_test)

In [None]:
# Plot window [ax, bx] × [ay, by]
ax, bx = -6, 6
ay, by = -6, 6

# 200-point grids along each axis, and the levels of f at which contours are drawn
xx = range(ax, bx; length=200)
yy = range(ay, by; length=200)
flevels = [0, 5, 20, 40, 60, 80, 100, 120, 150, 180, 300, 400, 600]

# Labelled contour plot of the target function f
plt1 = plot(;
    xlabel=L"x", ylabel=L"y", aspect_ratio=:equal, colorbar=:none,
    size=(600, 600), xlims=(ax, bx), ylims=(ay, by), legend=:none,
)
contour!(xx, yy, f; levels=flevels, color=1, contour_labels=true)
# Uncomment to overlay the sample points:
#scatter!(X_train[1,:], X_train[2,:], c=2, label="train")
#scatter!(X_test[1,:], X_test[2,:], c=3, label="test")

In [None]:
# Layer widths: N0 inputs, N1 hidden units
N0, N1 = m, 16

# Standard deviation used for all randomly initialised parameters
σ = sqrt(2 / N1)

# Neural network parameters
A1 = σ * randn(N1, N0)
b1 = σ * randn(N1)
A2 = σ * randn(N1)
b2 = σ * randn()

# Elementwise rectifier: max(0, z) applied to every entry of z
ReLU(z) = max.(0, z)

# Hidden layer: affine map A1*v0 + b1 followed by ReLU
function F1(v0)
    return ReLU(A1 * v0 + b1)
end

# Output layer: scalar affine map of the hidden activations
function F2(v1)
    return dot(A2, v1) + b2
end

# Full network, for a point given as a vector or as two separate coordinates
F(v) = F2(F1(v))
F(x, y) = F([x, y])

In [None]:
# Compare the randomly initialised network with the target function at the point (1, 1)
pt_check = [1.0, 1.0]
F(pt_check), f(pt_check)

In [None]:
# Contours of the network F over the plot window, at levels -10:0.1:10
plot(;
    xlabel=L"x", ylabel=L"y", aspect_ratio=:equal, colorbar=:none,
    size=(600, 600), xlims=(ax, bx), ylims=(ay, by),
)
contour!(xx, yy, F; levels=-10:0.1:10, color=:black, contour_labels=true)

In [None]:
# Total number of scalar parameters: (A1, b1) for the hidden layer plus (A2, b2)
n = (N1 * N0 + N1) + (N1 + 1)

# All parameters flattened into one vector, in the order A1 (column-major), b1, A2, b2
x0 = vcat(vec(A1), b1, vec(A2), b2)

length(x0)

In [None]:
# Hidden layer evaluated with parameters taken from the flat vector x:
# the first N1*N0 entries form the N1×N0 weight matrix, the next N1 entries the bias.
function F1(x, v0)
    n_weights = N1 * N0
    weights = reshape(x[1:n_weights], N1, N0)
    bias = x[n_weights+1:(n_weights+N1)]
    return ReLU(weights * v0 + bias)
end

# Output layer: the entries after the hidden-layer block, except the last, are the
# output weights; the last entry of x is the output bias.
function F2(x, v1)
    offset = N1 * N0 + N1
    return dot(x[offset+1:end-1], v1) + x[end]
end

# Full network as a function of the parameter vector x and the input point vi.
# Note: this method takes two untyped arguments, so it replaces the earlier
# two-coordinate convenience method F(x, y).
F(x, vi) = F2(x, F1(x, vi))

In [None]:
# Evaluate the flattened-parameter network at x0 next to the original network
# at the same point, so the two values can be compared.
v_check = [1.0, 1.0]
F(x0, v_check), F(v_check)

In [None]:
# Prediction function: network output F(x, ·) for every column of X
ŷ(x, X) = [F(x, X[:, j]) for j in axes(X, 2)]

# Mean of the squared differences between the predictions and the targets y
function loss(x, X, y)
    residuals = ŷ(x, X) - y[:]
    return 1 / length(y) * sum(residuals .^ 2)
end

loss(x0, X_train, y_train), loss(x0, X_test, y_test)

In [None]:
# Random direction, normalised to unit length
d = randn(length(x0))
d ./= norm(d)

# Train and test loss sampled at 200 step lengths t in [0, 50] along x0 + t*d
tt = range(0, 50; length=200)
ltrain = map(t -> loss(x0 + t * d, X_train, y_train), tt)
ltest = map(t -> loss(x0 + t * d, X_test, y_test), tt)

plot(tt, ltrain; label="train loss")
plot!(tt, ltest; label="test loss")

In [None]:
# Minimise the training loss starting from x0 with L-BFGS; gradients are obtained
# by forward-mode automatic differentiation.
train_objective = x -> loss(x, X_train, y_train)
@time res = optimize(train_objective, x0, LBFGS(); autodiff=:forward)

# Parameter vector found by the optimiser
xmin = Optim.minimizer(res)

loss(xmin, X_train, y_train), loss(xmin, X_test, y_test)

In [None]:
# Overlay the contours of the target f (series colour 1) and of the network with
# parameters xmin (black), both at the levels in flevels.
fitted_net = (s, t) -> F(xmin, [s, t])

plot(;
    xlabel=L"x", ylabel=L"y", aspect_ratio=:equal, colorbar=:none,
    size=(600, 600), xlims=(ax, bx), ylims=(ay, by),
)
contour!(xx, yy, f; levels=flevels, color=1, contour_labels=true)
contour!(xx, yy, fitted_net; levels=flevels, color=:black, contour_labels=true)

In [None]:
# Attributes shared by both panels
panel_kw = (
    aspect_ratio=:equal, colorbar=:none, size=(600, 600),
    xlims=(ax, bx), ylims=(ay, by),
)

# Left panel: target function f
plt1 = plot(; panel_kw..., legend=:none)
contour!(xx, yy, f; levels=flevels, color=1, contour_labels=true)

# Right panel: network evaluated with the parameters xmin
fitted_net = (s, t) -> F(xmin, [s, t])
plt2 = plot(; panel_kw...)
contour!(xx, yy, fitted_net; levels=flevels, color=:black, contour_labels=true)

plot(plt1, plt2; layout=(1, 2), size=(900, 500))

In [None]:
# New random unit direction
d = randn(length(x0))
d ./= norm(d)

# Train and test loss sampled at 200 step lengths t in [0, 1] along xmin + t*d
tt = range(0, 1; length=200)
ltrain = map(t -> loss(xmin + t * d, X_train, y_train), tt)
ltest = map(t -> loss(xmin + t * d, X_test, y_test), tt)

plot(tt, ltrain; label="train loss")
plot!(tt, ltest; label="test loss")

---

# Flux.jl

In [None]:
using Flux
using Flux: params
using Flux.Losses: mse
using Flux.Data: DataLoader
using Flux.Optimise

In [None]:
# Fully connected network 2 → 64 → 32 → 16 → 8 → 1: relu on the four hidden layers,
# no activation on the output layer.
layer_widths = (2, 64, 32, 16, 8)
hidden_layers = [
    Dense(n_in, n_out, relu) for
    (n_in, n_out) in zip(layer_widths[1:end-1], layer_widths[2:end])
]
model = Chain(hidden_layers..., Dense(8, 1))

# Mean squared error of the model output on x against the targets y
loss(x, y) = mse(model(x), y)

# Parameters of the model, as collected by Flux
ps = params(model)

# First (only) model output at the point (x, y); used as the plotting callback
function F(x, y)
    return model([x, y])[1]
end

plot(;
    xlabel=L"x", ylabel=L"y", aspect_ratio=:equal, colorbar=:none,
    size=(600, 600), xlims=(ax, bx), ylims=(ay, by),
)
contour!(xx, yy, F; levels=-1:0.05:1, color=:black, contour_labels=true)

In [None]:
# Mini-batches of 200 samples taken from X with the matching targets from y.
# NOTE(review): this batches the full X, y rather than X_train, y_train — confirm intended.
data = DataLoader(X, y; batchsize=200)

# Optimiser. Alternatives kept for experimentation:
#opt = Descent(1e-6)
#opt = Momentum(1e-6)
#opt = Nesterov(1e-6)
opt = ADAM()

In [None]:
# Train the model: every epoch iterates once over all mini-batches in `data`
# and applies one optimiser update per batch.
@time begin
    epochs = 1000
    # The loop counter has its own name so that it no longer shadows `epochs`.
    for epoch in 1:epochs
        for batch in data
            # Gradient of the batch loss with respect to the parameters in `ps`;
            # the do-block's value is the loss being differentiated.
            gs = gradient(ps) do
                loss(batch...)
            end
            update!(opt, ps, gs)
        end
    end
end
@show loss(X, y)

# Overlay the contours of the target f (series colour 1) and of the model (black)
plot(xlabel=L"x", ylabel=L"y", aspect_ratio=:equal, colorbar=:none, size=(600,600),
    xlims=(ax,bx), ylims=(ay,by))
contour!(xx, yy, f, levels=flevels, color=1, contour_labels=true)
contour!(xx, yy, F, levels=flevels, color=:black, contour_labels=true)

In [None]:
# Attributes shared by both panels
panel_kw = (
    xlabel=L"x", ylabel=L"y", aspect_ratio=:equal, colorbar=:none,
    size=(600, 600), xlims=(ax, bx), ylims=(ay, by),
)

# Left panel: target function f
plt1 = plot(; panel_kw..., legend=:none)
contour!(xx, yy, f; levels=flevels, color=1, contour_labels=true)

# Right panel: the plotting callback F at the same contour levels
plt2 = plot(; panel_kw...)
contour!(xx, yy, F; levels=flevels, color=:black, contour_labels=true)

plot(plt1, plt2; layout=(1, 2), size=(900, 500))

---

# Classification

In [None]:
using Flux.Losses: binarycrossentropy

In [None]:
# Threshold on the target value that separates the two classes
cutoff = 60

# Indices of the training samples with target ≤ cutoff and with target > cutoff
train_targets = vec(y_train)
posinds = findall(v -> v <= cutoff, train_targets)
neginds = findall(v -> v > cutoff, train_targets)

length(posinds), length(neginds)

In [None]:
# Indicator of the region f(x, y) ≤ cutoff, drawn as a filled contour, with the
# training points of the two index sets scattered on top.
below_cutoff = (x, y) -> f(x, y) <= cutoff

plot(;
    xlabel=L"x", ylabel=L"y", aspect_ratio=:equal,
    size=(600, 600), xlims=(ax, bx), ylims=(ay, by),
)
contourf!(xx, yy, below_cutoff; c=:binary)
Plots.scatter!(X_train[1, posinds], X_train[2, posinds]; c=2, label=:none)
Plots.scatter!(X_train[1, neginds], X_train[2, neginds]; c=3, label=:none)

In [None]:
# Binary labels as Float32: 1 where the target is ≤ cutoff, 0 otherwise
yb = Float32.(y .<= cutoff)

# Split the labels by column in the same way as X_train / X_test
yb_train = yb[:, 1:N_train]
yb_test = yb[:, N_train+1:N]

size(X_train), size(yb_train)

In [None]:
# Fully connected network 2 → 64 → 32 → 16 → 8 → 1 with relu hidden layers and a
# sigmoid on the output layer.
model = Chain(
    Dense(2, 64, relu),
    Dense(64, 32, relu),
    Dense(32, 16, relu),
    Dense(16, 8, relu),
    Dense(8, 1, sigmoid))

# Binary cross-entropy between the model output on x and the labels y
loss(x, y) = binarycrossentropy(model(x), y)

# Fraction of samples whose rounded model output equals the label, so 1.0 means
# every sample is classified correctly. Matches are counted here; summing
# |round(output) - label| instead would give the fraction misclassified.
accuracy(x, y) = count(round.(model(x)) .== y) / length(y)

# Parameters of the current model, as collected by Flux
ps = params(model)

# Report the loss and the `accuracy` value of the current model on the training data
@show loss(X_train, yb_train)
@show accuracy(X_train, yb_train)

# Rounded first model output at the point (x, y); used as the plotting callback
function F(x, y)
    return round(model([x, y])[1])
end

plot(;
    xlabel=L"x", ylabel=L"y", aspect_ratio=:equal,
    size=(600, 600), xlims=(ax, bx), ylims=(ay, by),
)
contourf!(xx, yy, F; c=:binary)

In [None]:
# Mini-batches of 100 training samples paired with their binary labels
data = DataLoader(X_train, yb_train; batchsize=100)

# ADAM optimiser constructed with no arguments
opt = ADAM()

In [None]:
# Train the classifier: every epoch iterates once over all mini-batches in `data`
# and applies one optimiser update per batch.
@time begin
    epochs = 100
    # The loop counter has its own name so that it no longer shadows `epochs`.
    for epoch in 1:epochs
        for batch in data
            # Gradient of the batch loss with respect to the parameters in `ps`;
            # the do-block's value is the loss being differentiated.
            gs = gradient(ps) do
                loss(batch...)
            end
            update!(opt, ps, gs)
        end
    end
end
@show loss(X_train, yb_train)
@show accuracy(X_train, yb_train)
@show accuracy(X_test, yb_test)

# Filled contour of the plotting callback F over the plot window
plot(xlabel=L"x", ylabel=L"y", aspect_ratio=:equal, size=(600,600),
    xlims=(ax,bx), ylims=(ay,by))
contourf!(xx, yy, F, c=:binary)

In [None]:
# Attributes shared by both panels
panel_kw = (
    aspect_ratio=:equal, size=(600, 600),
    xlims=(ax, bx), ylims=(ay, by), legend=:none,
)

# Left panel: indicator of the region f(x, y) ≤ cutoff with the training points on top
below_cutoff = (x, y) -> f(x, y) <= cutoff
plt1 = plot(; panel_kw...)
contourf!(xx, yy, below_cutoff; c=:binary)
Plots.scatter!(X_train[1, posinds], X_train[2, posinds]; c=2, label=:none)
Plots.scatter!(X_train[1, neginds], X_train[2, neginds]; c=3, label=:none)

# Right panel: the plotting callback F with the same training points on top
plt2 = plot(; panel_kw...)
contourf!(xx, yy, F; c=:binary)
Plots.scatter!(X_train[1, posinds], X_train[2, posinds]; c=2, label=:none)
Plots.scatter!(X_train[1, neginds], X_train[2, neginds]; c=3, label=:none)

plot(plt1, plt2; layout=(1, 2), size=(900, 500))

---