# Setup project

In [1]:
using Pkg
Pkg.activate(".")
Pkg.instantiate()
# Pkg.add("DataLoaders")
# Pkg.add("Plots")
# Pkg.add("CUDA")
# Pkg.add("Distributions")

[32m[1m  Activating[22m[39m project at `g:\桌面\2022 Fall\cs268\final_proj\Project-Optimizer`


In [2]:
using CSV
using DataLoaders
using Plots
using CUDA
using Distributions 
using LinearAlgebra
using Statistics
using Printf

# Create a custom ML structure to apply custom optimizers.

## Prepare data

The Iris dataset is from UC Irvine Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/iris

### Create a one-hot encoder

In [3]:
function one_hot(y)
    rslt = zeros(3)
    rslt[trunc(Int, y) + 1] = 1.0
    return rslt
end

one_hot (generic function with 1 method)

### Read all lines of data in train and test

Iris dataset

In [4]:
# Normalizing data
features = [[],[],[],[]]
means = zeros(4)
stds = zeros(4)
for line in eachline("iris.txt")
    splitted = collect(split(line, " "))
    push!(features[1], parse(Float64, splitted[4]))
    push!(features[2], parse(Float64, splitted[7]))
    push!(features[3], parse(Float64, splitted[10]))
    push!(features[4], parse(Float64, splitted[13]))
end
for i = 1:4
    means[i] = mean(features[i])
    stds[i] = stdm(features[i], means[i])
end

In [5]:
# train and test
train_x_data = []
train_y_data = []
# 118, 30
test_x_data = []
test_y_data = []
ct = zeros(3)
for line in eachline("iris.txt")
    splitted = collect(split(line, " "))
    y = parse(Float64, splitted[16])
    if ct[trunc(Int, y)+1] < 8 && rand() <= 0.3
        ct[trunc(Int, y)+1] += 1
        push!(test_x_data, 
            [(parse(Float64, splitted[4]) - means[1])/stds[1], 
            (parse(Float64, splitted[7]) - means[2])/stds[2], 
            (parse(Float64, splitted[10]) - means[3])/stds[3], 
            (parse(Float64, splitted[13]) - means[4])/stds[4]])
        push!(test_y_data, one_hot(y))
    else
        push!(train_x_data, 
            [(parse(Float64, splitted[4]) - means[1])/stds[1], 
            (parse(Float64, splitted[7]) - means[2])/stds[2], 
            (parse(Float64, splitted[10]) - means[3])/stds[3], 
            (parse(Float64, splitted[13]) - means[4])/stds[4]])
        push!(train_y_data, one_hot(y))
    end
end

Inspect shape of train and test

In [6]:
print(size(train_x_data))
print(size(train_y_data))
print(size(test_x_data))
print(size(test_y_data))

(124,)(124,)(24,)(24,)

### Create dataloader

In [7]:
batch_size = 16
dl_train = DataLoader((train_x_data, train_y_data), batch_size)
dl_test = DataLoader((test_x_data, test_y_data), 1)

DataLoaders.GetObsParallel{DataLoaders.BatchViewCollated{Tuple{Vector{Any}, Vector{Any}}}}(batchviewcollated() with 24 batches of size 1, false)

## Create whole machine learning pipeline

### Create an activation function: ReLU 

In [8]:
function Relu!(Xs)
    for i = 1 : length(Xs)
        Xs[i] = max(0.0,Xs[i])
    end
end

Relu! (generic function with 1 method)

### Create fully connected layers

In [9]:
# Xs in only 1 set of input in a batch.
function fc(Xs, num_in_channel, num_out_channel, W)
    rslt = zeros(num_out_channel)
    Xs = reshape(Xs, 1, num_in_channel)
    b = rand()
    for i = 1:num_out_channel
        temp = reshape(W[:, i], num_in_channel, 1)
        rslt[i] = (Xs * temp)[1]+b
    end
    return rslt
end

fc (generic function with 1 method)

### Create a simple MLP model

In [10]:
# Xs in only 1 set of input in a batch.
function MLP(Xs, out_channel; num_hidden_layers = 0, hidden_layer_size = 64, Ws = nothing, para_init=1)
    Ws_idx = 1
    hidden_outs = []
    # layer in
    if Ws == nothing
        Wss = []
        W = rand(Uniform(-para_init, para_init), size(Xs)[1], hidden_layer_size)
        push!(Wss, W)
    else
        W = Ws[Ws_idx]
        Ws_idx += 1
    end
    out = fc(Xs, size(Xs)[1], hidden_layer_size, W)
    Relu!(out)
    push!(hidden_outs, deepcopy(out))
    # layer hidden
    for i = 1 : num_hidden_layers
        if Ws == nothing
            W = rand(Uniform(-para_init, para_init), hidden_layer_size, hidden_layer_size)
            push!(Wss, W)
        else
            W = Ws[Ws_idx]
            Ws_idx += 1
        end
        out = fc(out, hidden_layer_size, hidden_layer_size, W)
        Relu!(out)
        push!(hidden_outs, deepcopy(out))
    end
    # layer out
    if Ws == nothing
        W = rand(Uniform(-para_init, para_init), hidden_layer_size, out_channel)
        push!(Wss, W)
    else
        W = Ws[Ws_idx]
        Ws_idx += 1
    end
    out = fc(out, hidden_layer_size, out_channel, W)
    if Ws == nothing
        return out, hidden_outs, Wss
    end
    return out, hidden_outs
end

MLP (generic function with 1 method)

### Create a loss function: soft-max loss
Soft-max loss is a combination of a soft-max activation layer and cross entropy loss

In [11]:
# Y_pred in only 1 set of prediction in a batch. Ys in only 1 set of truth label in a batch.
function softmax_loss(Y_pred, Ys)
    # softmax
    temp_sum = 0.0
    for x in Y_pred
        temp_sum += exp(x)
    end
    l = length(Y_pred)
    scores = zeros(l)
    for i = 1:l
        scores[i] = exp(Y_pred[i])/temp_sum
    end
    # loss
    losses = zeros(l)
    for i = 1:l
        losses[i] = -Ys[i] * log(scores[i])
    end
    total_loss = sum(losses)
    
    return scores, losses, total_loss
end

softmax_loss (generic function with 1 method)

How many threads we have for cpu?

In [12]:
Threads.nthreads()

32

### Create a back-propagation procedure

In [13]:
#Ws[idx of layer][idx of node, idx of weights]
function ∇(Xs, Ys, Ws, scores, outs, hidden_outs)
    num_in_channel = size(Xs)[1]
    num_out_channel = size(Ys)[1]
    hidden_layer_size = size(hidden_outs[1])[1]
    num_hidden_layers = size(hidden_outs)[1]
    # step -2
    ∂loss_∂netout = zeros(num_out_channel)
    for i = 1:num_out_channel
        ∂loss_∂netout[i] = -Ys[i]/scores[i]
    end
    # println("step -2: $∂loss_∂netout")
    # step -1
    outs_exp_sum = 0.0
    for out in outs
        outs_exp_sum += exp(out)
    end
    # println("step -1: $outs_exp_sum")
    for i = 1:num_out_channel
        ∂loss_∂netout[i] *= (exp(outs[i])*(outs_exp_sum - exp(outs[i])))/(outs_exp_sum^2)
    end
    # println("step -1: $∂loss_∂netout")
    # Assume we use same hidden_size of all hidden layers for the sake for brevity
    ∇s = []
    # first layer
    push!(∇s, zeros(num_in_channel, hidden_layer_size))
    # hidden layer
    for i = 1 : num_hidden_layers-1
        push!(∇s, zeros(hidden_layer_size, hidden_layer_size))
    end
    # last layer
    push!(∇s, zeros(hidden_layer_size, num_out_channel))

    # Start back_propagations
    ∂loss_∂prev_nodes = zeros(hidden_layer_size)
    # layer weights from outs to last of hidden layer
    for i = 1 : hidden_layer_size
        ∂loss_∂prev_nodes[i] = back_propagation(hidden_outs[num_hidden_layers][i],
        num_hidden_layers+1, i, ∇s,
        Ws[num_hidden_layers+1][i, :], ∂loss_∂netout)
    end
    # layer weights from i-th hidden layer to i-1th hidden layer
    for k = num_hidden_layers : -1 : 2
        ∂loss_∂curr_nodes = deepcopy(∂loss_∂prev_nodes)
        for i = 1 : hidden_layer_size
            ∂loss_∂prev_nodes[i] = back_propagation(hidden_outs[k-1][i],
            k, i, ∇s,
            Ws[k][i, :], ∂loss_∂curr_nodes)
        end
    end
    # layer weights from 1st hidden layer to input layer
    for i = 1 : num_in_channel
        back_propagation(Xs[i],
        1, i, ∇s,
        Ws[1][i, :], ∂loss_∂prev_nodes)
    end
    return ∇s
end

∇ (generic function with 1 method)

In [14]:
function back_propagation(node_out, layer_idx, i, ∇s, weights, ∂loss_∂nodes)
    ∂loss_∂node_i = (node_out==0) ? 0 : dot(weights, ∂loss_∂nodes) # considering d of ReLU
    ∇s[layer_idx][i, :] = ∂loss_∂nodes*node_out
    return ∂loss_∂node_i
end

back_propagation (generic function with 1 method)

### Create a training procedure

In [461]:
function batch_step(optimizer, Xs_batch, Ys_batch, Ws_wrapped; hidden_layer_size = 64)
    b_s = size(Xs_batch)[2]
    t = typeof(optimizer)
    # b_s = 33
    cache_new_Ws_unwrapped = zeros(b_s, num_Ws)
    total_losses = zeros(b_s)
    Ws_unwrapped = _unwrap(Ws_wrapped)
    optimizer.k = optimizer.k + 1
    for i in 1:b_s
        if t <: LimitedMemoryBFGS
            m = length(optimizer.δs)
            δs, γs, qs = deepcopy(optimizer.δs), deepcopy(optimizer.γs), deepcopy(optimizer.qs)
        elseif t <: ConjugateGD
            d_prev, g_prev = deepcopy(optimizer.d), deepcopy(optimizer.g)
        end
        Xs = Xs_batch[:, i]
        Y_truth = Ys_batch[:, i]
        # println("*********$Y_truth*********")
        #forward
        Y_pred, hidden_outs = MLP(Xs, 3; num_hidden_layers = 2, Ws=Ws_wrapped, hidden_layer_size = hidden_layer_size)
        scores, losses, total_loss = softmax_loss(Y_pred, Y_truth)
        total_losses[i] = total_loss
        #backward
        if  t <: NesterovMomentum_SGD || t <: Nelder_Mead
            Ws_unwrapped = step!(optimizer, Ws_unwrapped, Xs, Y_truth)
        else
            gradients = ∇(Xs, Y_truth, Ws_wrapped, scores, Y_pred, hidden_outs);
            if t <: ConjugateGD
                Ws_unwrapped = 
                step!(optimizer, _unwrap(gradients), Ws_unwrapped, Xs, Y_truth, total_loss,
                d_prev, g_prev)
            elseif t <: LimitedMemoryBFGS
                Ws_unwrapped = 
                step!(optimizer, _unwrap(gradients), Ws_unwrapped, Xs, Y_truth, total_loss, m,
                deepcopy(δs), deepcopy(γs), deepcopy(qs))
            elseif t <: DFP || t <: semi_DFP
                Ws_unwrapped = step!(optimizer, _unwrap(gradients), Ws_unwrapped,
                total_loss, Xs, Y_truth)
            else
                Ws_unwrapped = step!(optimizer, _unwrap(gradients), Ws_unwrapped)
            end
        end
    end    
    # if t <: LimitedMemoryBFGS
    #     m = length(optimizer.δs)
    #     δs, γs, qs = deepcopy(optimizer.δs), deepcopy(optimizer.γs), deepcopy(optimizer.qs)
    # elseif t <: ConjugateGD
    #     d_prev, g_prev = deepcopy(optimizer.d), deepcopy(optimizer.g)
    # end
    # Threads.@threads for i in 1:b_s-1
    #     Xs = Xs_batch[:, i]
    #     Y_truth = Ys_batch[:, i]
    #     # println("*********$Y_truth*********")
    #     #forward
    #     Y_pred, hidden_outs = MLP(Xs, 3; num_hidden_layers = 2, Ws=Ws_wrapped, hidden_layer_size = hidden_layer_size)
    #     scores, losses, total_loss = softmax_loss(Y_pred, Y_truth)
    #     total_losses[i] = total_loss
    #     #backward
    #     if  t <: NesterovMomentum_SGD || t <: Nelder_Mead
    #         cache_new_Ws_unwrapped[i,:] = step_without_update!(optimizer, Ws_unwrapped, Xs, Y_truth)
    #     else
    #         gradients = ∇(Xs, Y_truth, Ws_wrapped, scores, Y_pred, hidden_outs);
    #         if t <: ConjugateGD
    #             cache_new_Ws_unwrapped[i,:] = 
    #             step_without_update!(optimizer, _unwrap(gradients), Ws_unwrapped, Xs, Y_truth, total_loss,
    #             d_prev, g_prev)
    #         elseif t <: LimitedMemoryBFGS
    #             cache_new_Ws_unwrapped[i,:] = 
    #             step_without_update!(optimizer, _unwrap(gradients), Ws_unwrapped, Xs, Y_truth, total_loss, m,
    #             deepcopy(δs), deepcopy(γs), deepcopy(qs))
    #         elseif t <: DFP
    #             cache_new_Ws_unwrapped[i,:] = step_without_update!(optimizer, _unwrap(gradients), Ws_unwrapped,
    #             total_loss, Xs, Y_truth)
    #         else
    #             cache_new_Ws_unwrapped[i,:] = step_without_update!(optimizer, _unwrap(gradients), Ws_unwrapped)
    #         end
    #     end
    #     # if i==1
    #     #     println()
    #     #     println(Y_pred)
    #     #     println(scores)
    #     #     println(Y_truth)
    #     #     println()
    #     # end
    # end
    # i = b_s
    # Xs = Xs_batch[:, i]
    # Y_truth = Ys_batch[:, i]
    # # println("*********$Y_truth*********")
    # #forward
    # Y_pred, hidden_outs = MLP(Xs, 3; num_hidden_layers = 2, Ws=Ws_wrapped, hidden_layer_size = hidden_layer_size)
    # scores, losses, total_loss = softmax_loss(Y_pred, Y_truth)
    # total_losses[i] = total_loss
    # #backward
    # gradients = ∇(Xs, Y_truth, Ws_wrapped, scores, Y_pred, hidden_outs)
    # if  t <: NesterovMomentum_SGD || t <: Nelder_Mead
    #     cache_new_Ws_unwrapped[i,:] = step!(optimizer, Ws_unwrapped, Xs, Y_truth)
    # else
    #     gradients = ∇(Xs, Y_truth, Ws_wrapped, scores, Y_pred, hidden_outs);
    #     if t <: ConjugateGD
    #         cache_new_Ws_unwrapped[i,:] = 
    #         step!(optimizer, _unwrap(gradients), Ws_unwrapped, Xs, Y_truth, total_loss,
    #         d_prev, g_prev)
    #     elseif t <: LimitedMemoryBFGS
    #         cache_new_Ws_unwrapped[i,:] = 
    #         step!(optimizer, _unwrap(gradients), Ws_unwrapped, Xs, Y_truth, total_loss, m,
    #         deepcopy(δs), deepcopy(γs), deepcopy(qs))
    #     elseif t <: DFP
    #         cache_new_Ws_unwrapped[i,:] = step!(optimizer, _unwrap(gradients), Ws_unwrapped,
    #         total_loss, Xs, Y_truth)
    #     else
    #         cache_new_Ws_unwrapped[i,:] = step!(optimizer, _unwrap(gradients), Ws_unwrapped)
    #     end
    # end
    # #update parameters
    # new_Ws_unwrapped = mean(cache_new_Ws_unwrapped, dims=1)
    return _wrap(Ws_unwrapped, params_shape), mean(total_losses)
    # return _wrap(new_Ws_unwrapped, params_shape), mean(total_losses)
end

batch_step (generic function with 1 method)

In [16]:
function epoch_step(optimizer, new_Ws_wrapped; hidden_layer_size = 64, print_info = true)
    ct_batch = 1
    curr_loss = 0
    for (xs, ys) in dl_train
        if print_info
            print("batch# $ct_batch --> ")
            flush(stdout)
        end
        ct_batch += 1
        new_Ws_wrapped, curr_loss = batch_step(optimizer, xs, ys, new_Ws_wrapped; hidden_layer_size = hidden_layer_size)
    end
    if print_info
        println("\n\ttrain loss =  $curr_loss")
    end
    return new_Ws_wrapped, curr_loss
end

epoch_step (generic function with 1 method)

# Create optimizers

#### Helper functions to make inputs compatible with optimizers

In [17]:
function _unwrap(wrapped)
    unwrapped = []
    for layer in wrapped
        unwrapped = vcat(unwrapped, reduce(vcat,layer))
    end
    return unwrapped
end
function _wrap(unwrapped, shape)
    wrapped = []
    curr_idx = 1
    for s in shape
        ct = s[1]*s[2]
        push!(wrapped, Float64.(reshape(unwrapped[curr_idx:curr_idx-1+ct], s)))
        curr_idx += ct
    end
    return wrapped
end

_wrap (generic function with 1 method)

#### A simple and fast line search algorithm by David

In [495]:
function line_search(f, g, x, curr_loss, train_x, train_y; ϵ = 0.01, α=0.05, p=0.5, β=1e-4, loop_limit = 30)
    if curr_loss < ϵ
        return x, nothing, nothing, nothing, nothing # Vanishing gradient
    end
    # d = -g
    d = -g/norm(g)
    new_x = x + α*g
    loss_new, hidden_outs, Y_pred, scores = f(train_x, train_y, new_x) 
    ct = 0
    # println("\tloss = $(curr_loss)")
    # println("\tnew loss = $loss_new, $(curr_loss + β*α*(g⋅d))")
    while loss_new >= ϵ && loss_new > curr_loss + β*α*(g⋅d) && ct <= loop_limit
        α *= p
        new_x = x + α*d
        loss_new, hidden_outs, Y_pred, scores = f(train_x, train_y, new_x)
        ct += 1
        # println("\tnew loss = $loss_new, $(curr_loss + β*α*(g⋅d))")
    end
    # println("\t here α = $α")
    # println()
    return new_x, scores, Y_pred, hidden_outs, loss_new
end
    

line_search (generic function with 1 method)

In [19]:
# function line_serach(F, d, x, train_x, train_y; r=0.5,c=1e-4,nmax=20)
    
#     # params
#     # F: function to be optimized
#     # x: variable
#     # d: direction
#     # r: factor by which to reduce step size at each iteration
#     # c: parameter [0,1]
#     # nmax: max iteration

#     # return
#     # α step size
#     # fk1: function value at new x
#     # gkk: gradient at new x

#     #https://en.wikipedia.org/wiki/Backtracking_line_search
#     α=1

#     fk, hidden_outs, Y_pred, scores = F(train_x, train_y, x)
#     # println("(train_x,train_y,x,scores,Y_pred,hidden_outs)=>
#     # [$(size(train_x))],[$(size(train_y))],[$(size(x))],[$(size(scores))],[$(size(Y_pred))],[$(size(hidden_outs))]")
#     gk = _unwrap(∇(train_x, train_y, _wrap(x, params_shape), scores, Y_pred, hidden_outs))
#     # fk,gk=F(x)

#     xx=x
#     x=x+α*d

#     fk1, hidden_outs, Y_pred, scores = F(train_x, train_y, x) 
#     gk1 = _unwrap(∇(train_x, train_y, _wrap(x, params_shape), scores, Y_pred, hidden_outs))
#     # fk1,gk1=F(x)
#     n=1
    
#     while fk1 > fk+c*α*(gk'*d) && n < nmax
#         n = n+1
#         α = α*0.5
#         x = xx+α*d

#         fk1, hidden_outs, Y_pred, scores = F(train_x, train_y, x) 
#         gk1 = _unwrap(∇(train_x, train_y, _wrap(x, params_shape), scores, Y_pred, hidden_outs))
#         # fk1,gk1=F(x)
#     end
#     if α == 1
#         α = -1
#         while fk1>fk+c*α*(gk'*d) && n < nmax
#             n=n+1
#             α=α*0.5
#             x=xx+α*d
    
#             fk1, hidden_outs, Y_pred, scores = F(train_x, train_y, x) 
#             gk1 = _unwrap(∇(train_x, train_y, _wrap(x, params_shape), scores, Y_pred, hidden_outs))
#             # fk1,gk1=F(x)
#         end
#     end
#     return α, fk1, gk1
# end

#### Steepest gradient descent
Credit: Wenbo

In [20]:
abstract type DescentMethod end
# Adam from K&W page 81
mutable struct SteepestGD <: DescentMethod
    α # learning rate
    k # dumb variable
end
function init!(M::SteepestGD)
    M.k = 0
end
function step!(M::SteepestGD, ∇f, x)
    return x - M.α*∇f
end
function step_without_update!(M::SteepestGD, ∇f, x)
    return x - M.α*∇f
end

step_without_update! (generic function with 1 method)

#### Adagrad accelerated gradient descent 
Credit: Jinghua

In [21]:
# K&W page 78
mutable struct Adagrad <: DescentMethod
    α # learning rate
    ϵ # small value
    s # sum of squared gradient
    k # dumb var
end
function init!(M::Adagrad, x_length)
    M.s = zeros(x_length)
    M.k = 0
end
function step!(M::Adagrad, g, x)
    α, ϵ = M.α, M.ϵ
    M.s += g.*g
    return x - α*g ./ (sqrt.(M.s) .+ ϵ)
end
function step_without_update!(M::Adagrad, g, x)
    α, ϵ = M.α, M.ϵ
    ss = M.s + g.*g
    return x - α*g ./ (sqrt.(ss) .+ ϵ)
end

step_without_update! (generic function with 2 methods)

#### Adam gradient descent
Credit: Wenbo

In [22]:
abstract type DescentMethod end
# Adam from K&W page 81
mutable struct Adam <: DescentMethod
    α # learning rate
    γv # decay
    γs # decay
    ϵ # small value
    k # step counter
    v # 1st moment estimate
    s # 2nd moment estimate
end
function init!(M::Adam, x_length)
    M.k = 0
    M.v = zeros(x_length)
    M.s = zeros(x_length)
    return M
end
function step!(M::Adam, ∇f, x)
    α, γv, γs, ϵ, k = M.α, M.γv, M.γs, M.ϵ, M.k
    s, v, g = M.s, M.v, ∇f
    M.v = γv*v + (1-γv)*g
    M.s = γs*s + (1-γs)*g.*g
    # M.k = k += 1
    v_hat = M.v ./ (1 - γv^k)
    s_hat = M.s ./ (1 - γs^k)
    return x - α*v_hat ./ (sqrt.(s_hat) .+ ϵ)
end
function step_without_update!(M::Adam, ∇f, x)
    α, γv, γs, ϵ, k = M.α, M.γv, M.γs, M.ϵ, M.k
    vv = γv*M.v + (1-γv)*∇f
    ss = γs*M.s + (1-γs)*∇f.*∇f
    # M.k = k += 1
    v_hat = vv ./ (1 - γv^k)
    s_hat = ss ./ (1 - γs^k)
    return x - α*v_hat ./ (sqrt.(s_hat) .+ ϵ)
end

step_without_update! (generic function with 3 methods)

#### Conjugate gradient descent
Credit: Yashuo

In [680]:
mutable struct ConjugateGD <: DescentMethod
    α
    g
    f
    d
    k # dumb variable
end
function init!(M::ConjugateGD)
    M.k = 0
    M.d = -M.g
end
# function step!(M::ConjugateGD, g, x, train_x, train_y)
function step!(M::ConjugateGD, g, x, train_x, train_y, curr_loss, d_prev, g_prev)
    # d_prev, g_prev = M.d, M.g
    β = dot(g, g-g_prev)/(g_prev ⋅ g_prev)
    # β = dot(g, g-g_prev)/dot(g_prev, g_prev)
    β = max(0, β)
    d = -g + β*d_prev
    # d = d/sqrt(sum(d.^2))
    x_next, dumb1, dumb2, dumb3, loss_new = line_search(M.f, -d, x, curr_loss, train_x, train_y; α=M.α)
    # M.α, dumb1, dumb2 = line_serach(M.f, g, x, train_x, train_y)
    # x_next = x + M.α*d
    if dumb1 != nothing
        M.d, M.g = d, g
    end
    return x_next
end
# function step_without_update!(M::ConjugateGD, g, x, train_x, train_y)
function step_without_update!(M::ConjugateGD, g, x, train_x, train_y, curr_loss, d_prev, g_prev)
    # d_prev, g_prev = M.d, M.g
    β = dot(g, g-g_prev)/(g_prev ⋅ g_prev)
    # β = dot(g, g-g_prev)/dot(g_prev, g_prev)
    β = max(0, β)
    d = -g + β*d_prev
    # d = d/sqrt(sum(d.^2))
    x_next, dumb1, dumb2, dumb3, loss_new = line_search(M.f, -d, x, curr_loss, train_x, train_y; α=M.α)
    # α, dumb1, dumb2 = line_serach(M.f, g, x, train_x, train_y)
    # x_next = x + α*d
    if dumb1 != nothing
        M.d, M.g = d, g
    end
    return x_next
end

step_without_update! (generic function with 10 methods)

#### Nesterov Momentum (SGD)
Credit: Dylan

In [24]:
# From K&W 76
mutable struct NesterovMomentum_SGD <: DescentMethod
    α # learning rate
    β # momentum decay
    f
    v # momentum
    k # dumb variable
end
function init!(M::NesterovMomentum_SGD, x_length)
    M.k = 0
    M.v = zeros(x_length)
end
function step!(M::NesterovMomentum_SGD, x, train_x, train_y)
    α, β, v = M.α, M.β, M.v
    x_new = x + β*v
    f_x_βv, hidden_outs, Y_pred, scores = M.f(train_x, train_y, x_new) 
    ∇f_x_βv = ∇(train_x, train_y, _wrap(x_new, params_shape), scores, Y_pred, hidden_outs)
    M.v = β*v - α*_unwrap(∇f_x_βv)
    return x + M.v
end
function step_without_update!(M::NesterovMomentum_SGD, x, train_x, train_y)
    α, β, v = M.α, M.β, M.v
    x_new = x + β*v
    f_x_βv, hidden_outs, Y_pred, scores = M.f(train_x, train_y, x_new) 
    ∇f_x_βv = ∇(train_x, train_y, _wrap(x_new, params_shape), scores, Y_pred, hidden_outs)
    vv = β*v - α*_unwrap(∇f_x_βv)
    return x + vv
end

step_without_update! (generic function with 5 methods)

#### Adadelta gradient descent
Credit: Dylan

In [25]:
# source: K&W page 80
mutable struct Adadelta <: DescentMethod
    γs # gradient decay
    γx # update decay
    ϵ # small value
    s # sum of squared gradients
    u # sum of squared updates
    k # dumb variable
end
function init!(M::Adadelta, x_length)
    M.k = 0
    M.s = zeros(x_length)
    M.u = zeros(x_length)
    return M
end
function step!(M::Adadelta, g, x)
    γs, γx, ϵ, s, u = M.γs, M.γx, M.ϵ, M.s, M.u
    M.s = γs*s + (1-γs)*g.*g
    Δx = - (sqrt.(u) .+ ϵ) ./ (sqrt.(M.s) .+ ϵ) .* g
    M.u = γx*u + (1-γx)*Δx.*Δx
    return x + Δx
end
function step_without_update!(M::Adadelta, g, x)
    γs, γx, ϵ, s, u = M.γs, M.γx, M.ϵ, M.s, M.u
    ss = γs*s + (1-γs)*g.*g
    Δx = - (sqrt.(u) .+ ϵ) ./ (sqrt.(ss) .+ ϵ) .* g
    # u[:] = γx*u + (1-γx)*Δx.*Δx
    return x + Δx
end
    

step_without_update! (generic function with 6 methods)

#### Nelder-Mead gradient descent
Credit: Yashuo

In [26]:
mutable struct Nelder_Mead <: DescentMethod
    f
    k # dumb variable
end
function init!(M::Nelder_Mead)
    M.k = 0
end
function step!(M::Nelder_Mead, x, train_x, train_y)
    return nmsmax(M.f, x, train_x, train_y)
end
function step_without_update!(M::Nelder_Mead, x, train_x, train_y)
    return nmsmax(M.f, x, train_x, train_y)
end
    

step_without_update! (generic function with 7 methods)

In [27]:
function nmsmax(fun, x, train_x, train_y; trace = true, initial_simplex = 0, target_f = Inf, max_its = Inf, max_evals = Inf, tol = 1e-3 )
    x0 = x[:];  # Work with column vector internally.
    n = length(x0);

   #  V = [zeros(n,1) eye(n)];
    V = [zeros(n,1) Matrix(1.0I, n, n)];
    f = zeros(n+1,1);
    V[:,1] = x0; 
    
    f[1], hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
   #  f[1] = fun(x);

    
    fmax_old = f[1];
    fmax     = -Inf; # Some initial value

   #  if trace
   #      @printf "f(x0) = %9.4e\n" f[1]
   #  end

    k = 0; m = 0;

    # Set up initial simplex.
    scale = max(norm(x0,Inf),1);
    if initial_simplex == 0
       # Regular simplex - all edges have same length.
       # Generated from construction given in reference [18, pp. 80-81] of [1].
       alpha = scale / (n*sqrt(2)) * [ sqrt(n+1)-1+n  sqrt(n+1)-1 ];
       V[:,2:n+1] = (x0 + alpha[2]*ones(n,1)) * ones(1,n);
       for j=2:n+1
           V[j-1,j] = x0[j-1] + alpha[1];
           x[:] = V[:,j]; 

           f[j], hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
         #   f[j] = fun(x);
       end
    else
       # Right-angled simplex based on co-ordinate axes.
       alpha = scale*ones(n+1,1);
       for j=2:n+1
           V[:,j] = x0 + alpha[j]*V[:,j];
           x[:] = V[:,j]; 
           f[j], hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
         #   f[j] = fun(x);
       end
    end
    nf = n+1;
    how = "initial  ";

    j = sortperm(f[:]);
    temp = f[j];
    j = j[n+1:-1:1];
    f = f[j]; V = V[:,j];

    alpha = 1;  beta = 1/2;  gamma = 2;

    msg = ""

    while true    ###### Outer (and only) loop.
    k = k+1;

        fmax = f[1];
      #   if fmax > fmax_old
      #       if trace
      #          @printf "Iter. %2.0f," k
      #          print(string("  how = ", how, " "));
      #          @printf "nf = %3.0f,  f = %9.4e  (%2.1f%%)\n" nf fmax 100*(fmax-fmax_old)/(abs(fmax_old)+eps(fmax_old));
      #       end
      #   end
        fmax_old = fmax;

        ### Three stopping tests from MDSMAX.M

        # Stopping Test 1 - f reached target value?
        if fmax >= target_f
           msg = "Exceeded target...quitting\n";
           break  # Quit.
        end

        # Stopping Test 2 - too many f-evals?
        if nf >= max_evals
           msg = "Max no. of function evaluations exceeded...quitting\n";
           break  # Quit.
        end

        # Stopping Test 3 - too many iterations?
        if k > max_its
           msg = "Max no. of iterations exceeded...quitting\n";
           break  # Quit.
        end

        # Stopping Test 4 - converged?   This is test (4.3) in [1].
        v1 = V[:,1];
        size_simplex = norm(V[:,2:n+1]-v1[:,ones(Int,n)],1) / max(1, norm(v1,1));
        if size_simplex <= tol
         #   msg = @sprintf("Simplex size %9.4e <= %9.4e...quitting\n", size_simplex, tol)
           break  # Quit.
        end

        #  One step of the Nelder-Mead simplex algorithm
        #  NJH: Altered function calls and changed CNT to NF.
        #       Changed each `fr < f[1]' type test to `>' for maximization
        #       and re-ordered function values after sort.

        temp = sum(V[:,1:n]'; dims = 1)
        vbar = (temp/n)';  # Mean value
        vr = (1 + alpha)*vbar - alpha*V[:,n+1]; x[:] = vr; 
        
        fr, hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
      #   fr = fun(x);

        nf = nf + 1;
        vk = vr;  fk = fr; how = "reflect, ";
        if fr > f[n]
                if fr > f[1]
                   ve = gamma*vr + (1-gamma)*vbar; x[:] = ve; 

                   fe, hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
                  #  fe = fun(x);

                   nf = nf + 1;
                   if fe > f[1]
                      vk = ve; fk = fe;
                      how = "expand,  ";
                   end
                end
        else
                vt = V[:,n+1]; ft = f[n+1];
                if fr > ft
                   vt = vr;  ft = fr;
                end
                vc = beta*vt + (1-beta)*vbar; x[:] = vc; 
                
                fc, hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
               #  fc = fun(x);

                nf = nf + 1;
                if fc > f[n]
                   vk = vc; fk = fc;
                   how = "contract,";
                else
                   for j = 2:n
                       V[:,j] = (V[:,1] + V[:,j])/2;
                       x[:] = V[:,j]; 
                       
                       f[j], hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
                     #   f[j] = fun(x);
                   end
                   nf = nf + n-1;
                   vk = (V[:,1] + V[:,n+1])/2; x[:] = vk; 

                   fk, hidden_outs, Y_pred, scores = fun(train_x, train_y, x) 
                  #  fk = fun(x);

                   nf = nf + 1;
                   how = "shrink,  ";
                end
        end
        V[:,n+1] = vk;
        f[n+1] = fk;
        j = sortperm(f[:]);
        temp = f[j];
        j = j[n+1:-1:1];
        f = f[j]; V = V[:,j];

    end   ###### End of outer (and only) loop.

    # Finished.
   #  if trace
   #      print(msg)
   #  end
    x[:] = V[:,1];

   #  return x, fmax, nf, k-1
    return x
end

nmsmax (generic function with 1 method)

#### Davidon-Fletcher-Powel gradient descent(DFP)
Credit: Jinghua

In [450]:
# K&W 93
mutable struct DFP <: DescentMethod
    Q
    f
    k # dumb var
end
function init!(M::DFP, x_length)
    M.k = 0
    M.Q = Matrix(1.0I, x_length, x_length)
end
function step!(M::DFP, g, x, curr_loss, train_x, train_y)
    Q = M.Q
    x′, scores, Y_pred, hidden_outs, loss_new = line_search(M.f, M.Q*g, x, curr_loss, train_x, train_y; α=0.2)
    # x′ = line_search(f, x, -Q*g)
    if scores == nothing
        return x
    end
    g′ = _unwrap(∇(train_x, train_y, _wrap(x′, params_shape), scores, Y_pred, hidden_outs))
    # g′ = ∇f(x′)
    δ = x′ - x
    γ = g′ - g
    M.Q = Q - Q*γ*γ'*Q/(γ'*Q*γ) + δ*δ'/(δ'*γ)
    return x′
end
function step_without_update!(M::DFP, g, x, curr_loss, train_x, train_y)
    x′, scores, Y_pred, hidden_outs, loss_new = line_search(M.f, M.Q*g, x, curr_loss, train_x, train_y; α=0.2)
    if scores == nothing
        return x
    end
    return x′
end

step_without_update! (generic function with 10 methods)

#### Semi Davidon-Fletcher-Powel gradient descent(semi-DFP)
Credit: Jinghua

This is a DFP optimizer but with fixed step size

In [457]:
# K&W 93
mutable struct semi_DFP <: DescentMethod
    α # learning rate
    Q
    f
    k # dumb var
end
function init!(M::semi_DFP, x_length)
    M.k = 0
    M.Q = Matrix(1.0I, x_length, x_length)
end
function step!(M::semi_DFP, g, x, curr_loss, train_x, train_y)
    Q = M.Q
    # x′, scores, Y_pred, hidden_outs, loss_new = line_search(M.f, -M.Q*g, x, curr_loss, train_x, train_y; α=0.2)
    # x′ = line_search(f, x, -Q*g)
    x′ = x - M.α * (M.Q*g)
    # if scores == nothing
    #     return x
    # end
    g′ = _unwrap(∇(train_x, train_y, _wrap(x′, params_shape), scores, Y_pred, hidden_outs))
    # g′ = ∇f(x′)
    δ = x′ - x
    γ = g′ - g
    M.Q = Q - Q*γ*γ'*Q/(γ'*Q*γ) + δ*δ'/(δ'*γ)
    return x′
end
function step_without_update!(M::semi_DFP, g, x, curr_loss, train_x, train_y)
    # x′, scores, Y_pred, hidden_outs, loss_new = line_search(M.f, -M.Q*g, x, curr_loss, train_x, train_y; α=0.2)
    # if scores == nothing
    #     return x
    x′ = x - M.α * (M.Q*g)
    return x′
end

step_without_update! (generic function with 10 methods)

#### Limited memory Broyden-Fletcher-Goldfarb-Shanno gradient descent(LBFGS)
Credit: David

In [556]:
mutable struct LimitedMemoryBFGS <: DescentMethod
    m
    f
    δs
    γs
    qs
    k # dumb var
end
function init!(M::LimitedMemoryBFGS)
    M.k = 0
    M.δs = []
    M.γs = []
    M.qs = []
end
function step_without_update!(M::LimitedMemoryBFGS, g, x, train_x, train_y, curr_loss, m, δs, γs, qs)    
    # sleep(0.02)
    # while l > M.m+1
    #     popfirst!(M.δs); popfirst!(M.γs); popfirst!(M.qs)
    #     l -= 1
    # end
    # x_new = step_without_update!(M, g, x, train_x, train_y, curr_loss, l, δs, γs, qs)
    # return x_new
end
function step!(M::LimitedMemoryBFGS, g, x, train_x, train_y, curr_loss, m, δs, γs, qs)

    m = min(M.m, m)
    # println(m)
    g′ = g
    # m = length(δs)
    if m > 0
        q = g
        for i in m : -1 : 1
            qs[i] = q
            q -= (δs[i]⋅q)/(γs[i]⋅δs[i])*γs[i]
        end
        z = (γs[m] .* δs[m] .* q) / (γs[m]⋅γs[m])
        for i in 1 : m
            z += δs[i]*(δs[i]⋅qs[i] - γs[i]⋅z)/(γs[i]⋅δs[i])
        end
        x_neww, scores, Y_pred, hidden_outs, loss_new = line_search(M.f, z, x, curr_loss, train_x, train_y; α=0.05)
        if scores == nothing
            push!(M.qs, zeros(length(x)))
            push!(M.δs, x - x)
            push!(M.γs, g′ - g)

            while m > M.m+1
                popfirst!(M.δs); popfirst!(M.γs); popfirst!(M.qs)
                m -= 1
            end
            return x
        end
        if loss_new < curr_loss
            x_new = x_neww
        else
            x_new, scores, Y_pred, hidden_outs, loss_new = line_search(M.f, g, x, curr_loss, train_x, train_y; α=0.0001)
            if scores == nothing
                push!(M.qs, zeros(length(x)))
                push!(M.δs, x - x)
                push!(M.γs, g′ - g)

                while m > M.m+1
                    popfirst!(M.δs); popfirst!(M.γs); popfirst!(M.qs)
                    m -= 1
                end
                return x
            end
        end
        g′ = _unwrap(∇(train_x, train_y, _wrap(x_new, params_shape), scores, Y_pred, hidden_outs))
        # x_new = line_search(f, x, -z)
    else
        x_new, scores, Y_pred, hidden_outs, loss_new = line_search(M.f, g, x, curr_loss, train_x, train_y; α=0.0001)
        if scores == nothing
            push!(M.qs, zeros(length(x)))
            push!(M.δs, x - x)
            push!(M.γs, g′ - g)

            while m > M.m+1
                popfirst!(M.δs); popfirst!(M.γs); popfirst!(M.qs)
                m -= 1
            end
            return x
        end
        g′ = _unwrap(∇(train_x, train_y, _wrap(x_new, params_shape), scores, Y_pred, hidden_outs))
        # x_new = line_search(f, x, -g)
    end
    # dumb1, hidden_outs, Y_pred, scores = M.f(train_x, train_y, x_new) 
    # g′ = ∇f(x_new)

    push!(M.qs, zeros(length(x)))
    push!(M.δs, x_new - x)
    push!(M.γs, g′ - g)

    while m > M.m+1
        popfirst!(M.δs); popfirst!(M.γs); popfirst!(M.qs)
        m -= 1
    end
    return x_new
end 

step! (generic function with 10 methods)

In [30]:
# mutable struct LBFGS <: DescentMethod
# 	n # number of variables
# 	f

#     m # Memory length, was ∈ [2, 54] in paper
# 	prev_g # gradient at previous timestep 
# 	prev_x # x at previous timestep 
# 	Sm # previous m x's
# 	Ym # previous m gradients
# 	k # Internal iteration index
# end

# function init!(M::LBFGS)
#     M.m = 20 
#     M.prev_g = zeros(M.n)
#     M.prev_x = zeros(M.n)
#     M.Sm = zeros(M.n,M.m)
#     M.Ym = zeros(M.n,M.m)
#     M.k = 0 
# end
# function step!(o::LBFGS, ∇, x, train_x, train_y)
#     # o.k += 1
# 	m = o.m
	
# 	gnorm = norm(∇)
	
# 	# if gnorm < τgrad # tolerance for the norm of the slope 
# 	# 	return; 
# 	# end
	
# 	s0 = x-o.prev_x
# 	y0 = ∇-o.prev_g
	
# 	# println("y0=$y0")
# 	H0 = s0'*y0/(y0'*y0) # hessian diagonal satisfying secant condition
    
#     k = o.k
# 	# update Sm and Ym
# 	if k <= m
# 		o.Sm[:,k].=s0
# 		o.Ym[:,k].=y0
# 		p=-approxInvHess(∇,o.Sm[:,1:k],o.Ym[:,1:k],H0) 
# 	# only keep m entries in Sm and Ym so purge the old ones
		
# 	else
# 		o.Sm[:,1:(m-1)].=o.Sm[:,2:m]
# 		o.Ym[:,1:(m-1)].=o.Sm[:,2:m]
# 		o.Sm[:,m].=s0
# 		o.Ym[:,m].=y0
# 		p.=-approxInvHess(∇,o.Sm,o.Ym,H0)
# 	end
	
# 	# new direction=p, find new step size
#     # α = 0.01
# 	α, fs, gs=line_serach(o.f, p, x, train_x, train_y)
	
# 	# update for next iteration
# 	o.prev_x = x
# 	o.prev_g = ∇
# 	x .= x + α.*p
#     return x
# 	# f1=fs
# 	# g1=gs
# 	# k=k+1
	
# 	# if verbose == 1 
# 	# 	println("Iteration: $k -- x = $x1")
# 	# end
# end
# function step_without_update!(o::LBFGS, ∇, x, train_x, train_y)
#     Smm = deepcopy(o.Sm)
#     Ymm = deepcopy(o.Ym)

#     # o.k += 1
# 	m = o.m
	
# 	gnorm = norm(∇)
	
# 	# if gnorm < τgrad # tolerance for the norm of the slope 
# 	# 	return; 
# 	# end
	
# 	s0 = x-o.prev_x
# 	y0 = ∇-o.prev_g
	
# 	# println("y0=$y0")
# 	H0 = s0'*y0/(y0'*y0) # hessian diagonal satisfying secant condition

#     k = o.k
# 	# update Sm and Ym
# 	if k <= m
# 		Smm[:,k].=s0
# 		Ymm[:,k].=y0
# 		p=-approxInvHess(∇,Smm[:,1:k],Ymm[:,1:k],H0) 
# 	# only keep m entries in Sm and Ym so purge the old ones
		
# 	else
# 		Smm[:,1:(m-1)].=Smm[:,2:m]
# 		Ymm[:,1:(m-1)].=Smm[:,2:m]
# 		Smm[:,m].=s0
# 		Ymm[:,m].=y0
# 		p.=-approxInvHess(∇,Smm,Ymm,H0)
# 	end
	
# 	# new direction=p, find new step size
#     # α = 0.01
# 	α, fs, gs=line_serach(o.f, p, x, train_x, train_y)
	
# 	# update for next iteration
# 	# o.prev_x = x
# 	# o.prev_g = ∇
# 	x .= x + α.*p
#     return x
# 	# f1=fs
# 	# g1=gs
# 	# k=k+1
	
# 	# if verbose == 1 
# 	# 	println("Iteration: $k -- x = $x1")
# 	# end
# end

# function approxInvHess(g,S,Y,H0)
#     #INPUT

#     #g: gradient nx1 vector
#     #S: nxk matrixs storing S[i]=x[i+1]-x[i]
#     #Y: nxk matrixs storing Y[i]=g[i+1]-g[i]
#     #H0: initial hessian diagnol scalar

#     #OUTPUT
#     # p:  the approximate inverse hessian multiplied by the gradient g
#     #     which is the new direction
#     #notation follows:
#     #https://en.wikipedia.org/wiki/Limited-memory_BFGS

#     n,k=size(S)
#     rho=zeros(k)
#     for i=1:k
#         rho[i] = 1 /(Y[:,i]'*S[:,i])
#         if rho[i]<0
#             rho[i]=-rho[i]
#         end
#     end


#     q=zeros(n,k+1)
#     r=zeros(n,1)
#     α=zeros(k,1)
#     β=zeros(k,1)

#     q[:,k+1]=g

#     for i=k:-1:1
#         α[i] =rho[i]*S[:,i]'*q[:,i+1]
#         q[:,i].=q[:,i+1]-α[i]*Y[:,i]
#     end

#     z=zeros(size(q[:,1])[1])
#     # println(size(H0))
#     # println()
#     # println(size(H0*q[:,1]))
#     z.= H0*q[:,1]


#     for i=1:k
#         β[i] = rho[i]*Y[:,i]'*z
#         z.=z+S[:,i]*(α[i]-β[i])
#     end

#     p=copy(z)

#     return p
# end

# Test training

### Control variable for a scientifically correct comparison: **Give every optimizers a same NN to train**.

In [190]:
loss_ε = 0.001 # stop when loss <= loss_ε
# Initializing 
Y_pred, hidden_outs, Ws_copy = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
# println(Y_pred)
scores, losses, total_loss = softmax_loss(Y_pred, train_y_data[1])
∇_copy = ∇(train_x_data[1], train_y_data[1], Ws_copy, scores, Y_pred, hidden_outs);
println("New model created!\nInital loss = $total_loss")
params_shape = []
num_Ws = 0
for layer in Ws_copy
    # println(size(layer))
    s1, s2 = size(layer)
    num_Ws += s1 * s2
    push!(params_shape, size(layer))
end
function f(train_x, train_y, w_unwrapped)
    Y_pred, hidden_outs = MLP(train_x, 3; num_hidden_layers = 2, Ws=_wrap(w_unwrapped, params_shape), hidden_layer_size = 16)
    scores, losses, total_loss = softmax_loss(Y_pred, train_y)
    return total_loss, hidden_outs, Y_pred, scores
end

New model created!
Inital loss = 10.782513134966402


f (generic function with 1 method)

## Optimizer = Steepest gradient descent done

In [206]:
num_epoch_Steepest_GD = 20;

In [213]:
# Initializing 
optimizer_SteepestGD = SteepestGD(0.0001, nothing)
init!(optimizer_SteepestGD)
Ws = deepcopy(Ws_copy);

In [214]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Train
for i = 1:num_epoch_Steepest_GD
    print("Epoch# $i\n\t")
    Ws, curr_loss = epoch_step(optimizer_SteepestGD, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_loss <= loss_ε
        break
    end
end
println("Finished")
println(epoch_vs_losses)
println(train_scores(Ws))
println(test_scores(Ws))

Epoch# 1
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  2.1696429587637915
Epoch# 2
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.336624933594325
Epoch# 3
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.1421590519778645
Epoch# 4
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.090387115997092
Epoch# 5
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.12222686592406969
Epoch# 6
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.127023486182307
Epoch# 7
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 

## Optimizer = Adagrad done

In [175]:
num_epoch_Adagrad = 20;

In [199]:
# Initializing 
optimizer_Adagrad = Adagrad(0.005, 1e-5, nothing, nothing)
init!(optimizer_Adagrad, num_Ws);
Ws = deepcopy(Ws_copy);

In [200]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Train
for i = 1:num_epoch_Adagrad
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_Adagrad, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_train_loss <= loss_ε
        break
    end
end
println("Finished")
println(epoch_vs_losses)
println(train_scores(Ws))
println(test_scores(Ws))

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.48756383497199424
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.31177849601546637
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.11525098481730005
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.11918896341985073
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.16629175788976489
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.24484459056963964
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch#

## Optimizer = Adam done

In [137]:
num_epoch_Adam = 20;

In [201]:
# Initializing 
optimizer_Adam = Adam(0.00057, 0.9, 0.999, 1e-4, nothing, nothing, nothing)
init!(optimizer_Adam, num_Ws);
Ws = deepcopy(Ws_copy);

In [202]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Train
for i = 1:num_epoch_Adam
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_Adam, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_loss)
    end
    if curr_train_loss <= loss_ε
        break
    end
end
println("Finished")

println(epoch_vs_losses)
println(train_scores(Ws))
println(test_scores(Ws))

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  3.341747667855796
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  1.4688462291258866
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.39858866531169923
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.3576027657651817
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.3250543725844321
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.5415656750595957
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 -->

## Optimizer = ConjugateGD done

In [32]:
num_epoch_ConjugateGD = 20;

In [219]:
# Initializing 
optimizer_ConjugateGD = ConjugateGD(0.003, _unwrap(deepcopy(∇_copy)), f, nothing, nothing)
init!(optimizer_ConjugateGD);
Ws = deepcopy(Ws_copy);

In [220]:
# Train
epoch_vs_losses = []
curr_train_loss = 0.0
for i = 1:num_epoch_ConjugateGD
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_ConjugateGD, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_train_loss <= loss_ε || isnan(curr_loss)
        break
    end
end
println("Finished")
println(epoch_vs_losses)
println(train_scores(Ws))
println(test_scores(Ws))

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  5.018214868242809
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  3.5503155972683604
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  2.525316751172077
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  1.6602582838413988
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  1.5459365044621194
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.8732302528341198
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 


## Optimizer = NesterovMomentum_SGD done

In [223]:
num_epoch_NesterovMomentum_SGD = 20;

In [250]:
# Initializing 
step_size = 0.000002
optimizer_NesterovMomentum_SGD = NesterovMomentum_SGD(step_size, 0.99, f, nothing, nothing)
init!(optimizer_NesterovMomentum_SGD, num_Ws)
Ws = deepcopy(Ws_copy);

In [251]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Train
for i = 1:num_epoch_NesterovMomentum_SGD
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_NesterovMomentum_SGD, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_train_loss <= loss_ε || isnan(curr_loss)
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  5.3273580341476094
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.21834284280729735
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.0539453401972226
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.10407222096563036
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.2651539916825724
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.2444879667076695
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 -

In [266]:
test_scores(Ws)[2]

0.9166666666666666

In [346]:
print("For **1.1.NesterovMomentum_SGD**, the **epoch_vs_losses** = ")
println(epoch_vs_losses)
println("This is with a inital α = $step_size")
print("And after 20 epochs, **loss on whole train set** = ")
println(train_scores(Ws))
print("And **loss on whole test set** = ")
score = test_scores(Ws)
println(score[1])
print("**accuracy ** = ")
println(score[2])
print("which is $(round(score[2]*100, digits=2))%")

For **1.1.NesterovMomentum_SGD**, the **epoch_vs_losses** = Any[3.2478182569668763, 1.5073105799524276, 1.1706560160314312, 0.8512706581860503, 0.7439251224030555, 0.6479145176736252, 0.6090152965735263, 0.5616231290102505, 0.5312585800962715, 0.4430113753393696, 0.4424356424796815, 0.42992856965864484, 0.39109535865259565, 0.3730720431825076, 0.3607623115236548, 0.32141748996470104, 0.37369389963113286, 0.36159723729262316, 0.36128107225339506, 0.4006233324577647]
This is with a inital α = 2.0e-6
And after 20 epochs, **loss on whole train set** = 0.4088383224058831
And **loss on whole test set** = 0.3757932035120766
**accuracy ** = 0.875
which is 87.5%

## Optimizer = Adadelta done

In [347]:
num_epoch_Adadelta = 20;

In [357]:
# Initializing 
optimizer_Adadelta = Adadelta(0.95, 0.95, 0.00005, nothing, nothing, nothing)
init!(optimizer_Adadelta, num_Ws)
Ws = deepcopy(Ws_copy);

In [358]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Train
for i = 1:num_epoch_Adadelta
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_Adadelta, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_train_loss <= loss_ε || isnan(curr_loss)
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  1.5828262243268345
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.5274823730036112
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.125848705860908
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.07696882303822083
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.13812042667223623
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.12980819503446928
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 -

In [373]:
print("For **1.1.Adadelta**, the **epoch_vs_losses** = ")
println(epoch_vs_losses)
# println("This is with a inital α = $step_size")
print("And after 20 epochs, **loss on whole train set** = ")
println(train_scores(Ws))
print("And **loss on whole test set** = ")
score = test_scores(Ws)
println(score[1])
print("**accuracy ** = ")
println(score[2])
print("which is $(round(score[2]*100, digits=2))%")

For **1.1.Adadelta**, the **epoch_vs_losses** = Any[2.553577984207015, 1.6125522425364232, 1.0939591541606564, 0.8819035625498827, 0.7525594753819873, 0.6999908940606767, 0.6268413108439863, 0.5796714961470538, 0.5622541402105504, 0.5690850832789728, 0.5777792444993618, 0.6186225185544469, 0.6250890981400434, 0.6194836064239608, 0.5761945914478452, 0.5679918389765839, 0.5576121705503273, 0.5527786184529979, 0.5435231922891999, 0.496742623298741]
And after 20 epochs, **loss on whole train set** = 0.4324020506240093
And **loss on whole test set** = 0.49458979226324246
**accuracy ** = 0.8333333333333334
which is 83.33%

## Optimizer = Nelder-Mead
\*\*\*\*\*\*\*\*\*\***NOT WORKING**\*\*\*\*\*\*\*\*\*\*

In [None]:
num_epoch_Nelder_Mead = 20;

In [None]:
# Initializing 
optimizer_Nelder_Mead = Nelder_Mead(f, nothing)
init!(optimizer_Nelder_Mead)
Ws = deepcopy(Ws_copy);

In [None]:
# # Train
# for i = 1:num_epoch_Nelder_Mead
#     println("Epoch# $i")
#     Ws, curr_loss = epoch_step(optimizer_Nelder_Mead, Ws; hidden_layer_size = 16)
#     if curr_loss <= loss_ε || isnan(curr_loss)
#         break
#     end
# end
# println("Finished")

## Optimizer = DFP done

In [392]:
num_epoch_DFP = 20;

In [451]:
# Initializing 
optimizer_DFP = DFP(nothing, f, nothing)
init!(optimizer_DFP, num_Ws)
Ws = deepcopy(Ws_copy);

In [452]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Train
for i = 1:num_epoch_DFP
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_DFP, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_train_loss <= loss_ε || isnan(curr_loss)
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.0005077023275417916
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  7.56460182822734e-5
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.0005920229500069326
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.00023028369988820017
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.4970199827674456
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  2.6606923472969646
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> b

In [453]:
print("For **1.1.DFP**, the **epoch_vs_losses** = ")
println(epoch_vs_losses)
# println("This is with a inital α = $step_size")
print("And after 20 epochs, **loss on whole train set** = ")
println(train_scores(Ws))
print("And **loss on whole test set** = ")
score = test_scores(Ws)
println(score[1])
print("**accuracy ** = ")
println(score[2])
print("which is $(round(score[2]*100, digits=2))%")

For **1.1.DFP**, the **epoch_vs_losses** = Any[1.1255128843134643, 1.8139494227858344, 1.3526529496901767, 3.380623149147914, 2.1870365732369335, 2.1962281909787174, 2.327573888441127, 1.5937501503171827, 1.3883283540062044, 1.4241786800965988, 1.404759306207787, 1.512829740434173, 1.405419025345664, 1.6003276016516415, 1.4654341430002447, 1.5957913522910432, 1.5978547616096979, 1.4357445937524835, 1.582453606571444, 1.6458025409788217]
And after 20 epochs, **loss on whole train set** = 1.66058091381958
And **loss on whole test set** = 0.8083977219731779
**accuracy ** = 0.5416666666666666
which is 54.17%

## Optimizer = semi-DFP done

In [458]:
num_epoch_semi_DFP = 20;

In [462]:
# Initializing 
optimizer_semi_DFP = semi_DFP(0.003, nothing, f, nothing)
init!(optimizer_semi_DFP, num_Ws)
Ws = deepcopy(Ws_copy);

In [463]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Train
for i = 1:num_epoch_semi_DFP
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_semi_DFP, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_train_loss <= loss_ε || isnan(curr_loss)
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.07302259271332447
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.09745239667577277
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.08715939925414744
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.08651049905763371
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.06214930733800068
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.07059796724594071
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch#

In [464]:
print("For **1.1.smei-DFP**, the **epoch_vs_losses** = ")
println(epoch_vs_losses)
# println("This is with a inital α = $step_size")
print("And after 20 epochs, **loss on whole train set** = ")
println(train_scores(Ws))
print("And **loss on whole test set** = ")
score = test_scores(Ws)
println(score[1])
print("**accuracy ** = ")
println(score[2])
print("which is $(round(score[2]*100, digits=2))%")

For **1.1.smei-DFP**, the **epoch_vs_losses** = Any[0.8970499335079404, 0.9319714761618514, 0.8973227504729017, 0.9392187105184473, 0.9042583386748708, 0.9193498702244056, 0.8921557179160868, 0.8768993753400174, 0.9392332085728637, 0.9131188643217615, 0.8886630170312965, 0.8970369292550615, 0.9462036079626132, 0.8885676765179372, 0.95813977168375, 0.8786778355827368, 0.9176907252470387, 0.9279673608297146, 0.9722710553827264, 0.9243282103059455]
And after 20 epochs, **loss on whole train set** = 0.8703657101593534
And **loss on whole test set** = 0.8128235528431832
**accuracy ** = 0.6666666666666666
which is 66.67%

## Optimizer = LBFGS done

In [526]:
num_epoch_LimitedMemoryBFGS = 20;

In [560]:
epoch_vs_losses = []
curr_train_loss = 0.0
# Initializing 
optimizer_LimitedMemoryBFGS = LimitedMemoryBFGS(20, f, nothing, nothing, nothing, nothing)
init!(optimizer_LimitedMemoryBFGS)
Ws = deepcopy(Ws_copy);

In [561]:
# Train
for i = 1:num_epoch_LimitedMemoryBFGS
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_LimitedMemoryBFGS, Ws; hidden_layer_size = 16)
    if !isnan(curr_loss)
        curr_train_loss = train_scores(Ws)
        push!(epoch_vs_losses, curr_train_loss)
    end
    if curr_train_loss <= loss_ε || isnan(curr_loss)
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  5.232887644419244
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  5.889627086106204
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  4.042893749520418
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  2.416337190510231
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  2.112620454521046
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  1.7874131305179433
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	tr

In [562]:
print("For **1.1.LimitedMemoryBFGS**, the **epoch_vs_losses** = ")
println(epoch_vs_losses)
# println("This is with a inital α = $step_size")
print("And after 20 epochs, **loss on whole train set** = ")
println(train_scores(Ws))
print("And **loss on whole test set** = ")
score = test_scores(Ws)
println(score[1])
print("**accuracy ** = ")
println(score[2])
print("which is $(round(score[2]*100, digits=2))%")

For **1.1.LimitedMemoryBFGS**, the **epoch_vs_losses** = Any[2.883047610020867, 2.0065541170126546, 1.620689946502641, 1.3931216509495734, 1.2552017494677035, 1.2270935159300604, 1.126629882134411, 1.1207909985479536, 1.1092457214265974, 1.1224675442328351, 1.1693821413625354, 1.2321269154014602, 1.252417392764308, 1.245077746298742, 1.289158411883179, 1.3057483584215588, 1.3224399015383812, 1.2449846035521757, 1.4237011937155306, 1.3463238796991803]
And after 20 epochs, **loss on whole train set** = 1.3145815760814066
And **loss on whole test set** = 1.4831769358410105
**accuracy ** = 0.625
which is 62.5%

# Benchmarking

### Helper functions to produces scores, etc for benchmarking

In [339]:
function test_scores(Ws)
    ct_correct = 0
    ct_total = 0
    loss = []
    for i = 1 : size(test_x_data)[1]
        x = test_x_data[i]
        y = test_y_data[i]
        total_loss, hidden_outs, Y_pred, scores = f(x, y, _unwrap(Ws))
        # println("y_truth = $y, scores = $scores")
        push!(loss, total_loss)
        ct_total += 1
        # println("$scores -> $(argmax(scores)), $y -> $(argmax(y)[1])")
        if argmax(scores) == argmax(y)[1]
            ct_correct += 1
        end
    end
    # println(loss)
    return mean(loss), ct_correct/ct_total
end
function train_scores(Ws)
    loss = []
    for i = 1 : size(train_x_data)[1]
        x = train_x_data[i]
        y = train_y_data[i]
        total_loss, hidden_outs, Y_pred, scores = f(x, y, _unwrap(Ws))
        # println("y_truth = $y, scores = $scores")
        push!(loss, total_loss)
    end
    return mean(loss)
end
function filter_scores(test_losses, accs)
    ct_all = length(test_losses)
    ct_nan = 0
    # new_train_losses = []
    new_test_losses = []
    new_accs = []
    for i = 1 : ct_all
        if isnan(test_losses[i])
            ct_nan += 1
        else
            # push!(new_train_losses, train_losses[i])
            push!(new_test_losses, test_losses[i])
            push!(new_accs, accs[i])
        end
    end
    return new_test_losses, new_accs, ct_all-ct_nan
end

filter_scores (generic function with 1 method)

## 1: Performance after a same number of epochs = 20 epochs 

In [127]:
num_epoch_1 = 20;

### 1.1 : Train 100 times on a same inital NN

#### Create a NN

In [765]:
loss_ε = 0.1
# Initializing 
Y_pred, hidden_outs, Ws_copy = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
# println(Y_pred)
scores, losses, total_loss = softmax_loss(Y_pred, train_y_data[1])
∇_copy = ∇(train_x_data[1], train_y_data[1], Ws_copy, scores, Y_pred, hidden_outs)
println("New model created!\nInital loss = $total_loss")

New model created!
Inital loss = 4.8123363545039295


#### Optimizer = Steepest gradient descent done

In [782]:
# Initializing 
# optimizer_SteepestGD = SteepestGD(0.0025, nothing)
optimizer_SteepestGD = SteepestGD(0.0003, nothing)
# Start training
train_losses_20Epochs_SGD = []
test_losses_20Epochs_SGD = []
accuracies_20Epochs_SGD = []
ct_epochs = 0
for i = 1 : 100
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    init!(optimizer_SteepestGD)
    # Train
    curr_loss = 0.0
    curr_train_loss = 0.0
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_SteepestGD, Ws; hidden_layer_size = 16, print_info = false)
        # println(mean(_unwrap(Ws)))
        if !isnan(curr_loss)
            curr_train_loss = train_scores(Ws)
            # println(curr_train_loss)
        end
        if curr_train_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_SGD, curr_train_loss)
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_SGD, loss_curr)
    push!(accuracies_20Epochs_SGD, acc_curr)
end
test_losses_20Epochs_SGD, accuracies_20Epochs_SGD, ct_valid_loss = filter_scores(test_losses_20Epochs_SGD, accuracies_20Epochs_SGD)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [783]:
println(ct_epochs)

1986


In [769]:
println("SteepestGD after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_SGD); digits = 4)) ± $(round(std(train_losses_20Epochs_SGD); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_SGD); digits = 4)) ± $(round(std(test_losses_20Epochs_SGD); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_SGD)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_SGD); digits = 2))%")

SteepestGD after 100 individual trains on the same inital NN:
	3 runs converged.
	Train loss = NaN ± NaN
	Test loss = 4.2783 ± 0.2652
	Accuracy = 59.72% ± 0.05%


#### Optimizer = Adagrad done

With inital step size = 0.005

In [784]:
# Initializing 
optimizer_Adagrad = Adagrad(0.01, 1e-5, nothing, nothing)
# Start training
train_losses_20Epochs_Adagrad = []
losses_20Epochs_Adagrad = []
accuracies_20Epochs_Adagrad = []
ct_epochs = 0
for i = 1 : 100
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    init!(optimizer_Adagrad, num_Ws)
    # Train
    curr_loss = 0.0
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_Adagrad, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_Adagrad, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(losses_20Epochs_Adagrad, loss_curr)
    push!(accuracies_20Epochs_Adagrad, acc_curr)
end
losses_20Epochs_Adagrad, accuracies_20Epochs_Adagrad, ct_valid_loss = 
filter_scores(losses_20Epochs_Adagrad, accuracies_20Epochs_Adagrad)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [785]:
println(ct_epochs)

114


In [771]:
println("Adagrad after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_Adagrad); digits = 4)) ± $(round(std(train_losses_20Epochs_Adagrad); digits = 4))")
println("\tTest loss = $(round(mean(losses_20Epochs_Adagrad); digits = 4)) ± $(round(std(losses_20Epochs_Adagrad); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_Adagrad)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_Adagrad); digits = 2))%")

Adagrad after 100 individual trains on the same inital NN:
	100 runs converged.
	Train loss = 1.2115 ± 0.1189
	Test loss = 1.2235 ± 0.15
	Accuracy = 56.12% ± 0.06%


With inital step size = 0.016

In [776]:
# Initializing 
optimizer_Adagrad = Adagrad(0.016, 1e-5, nothing, nothing)
# Start training
train_losses_20Epochs_Adagrad = []
losses_20Epochs_Adagrad = []
accuracies_20Epochs_Adagrad = []
ct_epochs = 0
for i = 1 : 100
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    init!(optimizer_Adagrad, num_Ws)
    # Train
    curr_loss = 0.0
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_Adagrad, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_Adagrad, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(losses_20Epochs_Adagrad, loss_curr)
    push!(accuracies_20Epochs_Adagrad, acc_curr)
end
losses_20Epochs_Adagrad, accuracies_20Epochs_Adagrad, ct_valid_loss = 
filter_scores(losses_20Epochs_Adagrad, accuracies_20Epochs_Adagrad)
println("Finished")
println(ct_epochs)

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [None]:
println("Adagrad after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_Adagrad); digits = 4)) ± $(round(std(train_losses_20Epochs_Adagrad); digits = 4))")
println("\tTest loss = $(round(mean(losses_20Epochs_Adagrad); digits = 4)) ± $(round(std(losses_20Epochs_Adagrad); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_Adagrad)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_Adagrad); digits = 2))%")

#### Optimizer = Adam done

In [774]:
# Initializing 
optimizer_Adam = Adam(0.003, 0.9, 0.999, 1e-4, nothing, nothing, nothing)
# Start training
train_losses_20Epochs_Adam = []
test_losses_20Epochs_Adam = []
accuracies_20Epochs_Adam = []
ct_epochs = 0
for i = 1 : 100
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    init!(optimizer_Adam, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_Adam, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_Adam, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_Adam, loss_curr)
    push!(accuracies_20Epochs_Adam, acc_curr)
end
test_losses_20Epochs_Adam, accuracies_20Epochs_Adam, ct_valid_loss = 
filter_scores(test_losses_20Epochs_Adam, accuracies_20Epochs_Adam)
println("Finished")
println(ct_epochs)

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [469]:
println("Adam after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_Adam); digits = 4)) ± $(round(std(train_losses_20Epochs_Adam); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_Adam); digits = 4)) ± $(round(std(test_losses_20Epochs_Adam); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_Adam)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_Adam); digits = 2))%")

Adam after 100 individual trains on the same inital NN:
	100 runs converged.
	Train loss = 0.7258 ± 0.2388
	Test loss = 0.8585 ± 0.297
	Accuracy = 74.54% ± 0.05%


#### Optimizer = ConjugateGD done

In [777]:
# Initializing 
# Start training
train_losses_20Epochs_ConjugateGD = []
test_losses_20Epochs_ConjugateGD = []
accuracies_20Epochs_ConjugateGD = []
ct_epochs = 0
for i = 1 : 100
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    optimizer_ConjugateGD = ConjugateGD(0.00875, _unwrap(deepcopy(∇_copy)), f, nothing, nothing)
    init!(optimizer_ConjugateGD)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_ConjugateGD, Ws; hidden_layer_size = 16, print_info = true)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_ConjugateGD, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_ConjugateGD, loss_curr)
    push!(accuracies_20Epochs_ConjugateGD, acc_curr)
end
test_losses_20Epochs_ConjugateGD, accuracies_20Epochs_ConjugateGD, ct_valid_loss = 
filter_scores(test_losses_20Epochs_ConjugateGD, accuracies_20Epochs_ConjugateGD)
println("Finished")
println(ct_epochs)

run# 1 --> batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  Inf
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  NaN
run# 2 --> batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.033495011704952456
run# 3 --> batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.007608786974418685
run# 4 --> batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.023770864086609483
run# 5 --> batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.052053119409005956
run# 6 --> batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> batch# 6 --> batch# 7 --> batch# 8 --> 
	train loss =  0.

In [134]:
println("ConjugateGD after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_ConjugateGD); digits = 4)) ± $(round(std(train_losses_20Epochs_ConjugateGD); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_ConjugateGD); digits = 4)) ± $(round(std(test_losses_20Epochs_ConjugateGD); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_ConjugateGD)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_ConjugateGD); digits = 2))%")

ConjugateGD after 100 individual trains on the same inital NN:
	100 runs converged.
	Train loss = 2.988 ± 0.1345
	Test loss = 2.795 ± 0.1813
	Accuracy = 66.46% ± 0.01%


#### Optimizer = NesterovMomentum_SGD done

In [778]:
# Initializing 
# Start training
train_losses_20Epochs_NesterovMomentum_SGD= []
test_losses_20Epochs_NesterovMomentum_SGD = []
accuracies_20Epochs_NesterovMomentum_SGD = []
optimizer_NesterovMomentum_SGD = NesterovMomentum_SGD(0.000002, 0.99, f, nothing, nothing)
ct_epochs = 0
for i = 1 : 100
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
init!(optimizer_NesterovMomentum_SGD, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_NesterovMomentum_SGD, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_NesterovMomentum_SGD, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_NesterovMomentum_SGD, loss_curr)
    push!(accuracies_20Epochs_NesterovMomentum_SGD, acc_curr)
end
test_losses_20Epochs_NesterovMomentum_SGD, accuracies_20Epochs_NesterovMomentum_SGD, ct_valid_loss = 
filter_scores(test_losses_20Epochs_NesterovMomentum_SGD, accuracies_20Epochs_NesterovMomentum_SGD)
println("Finished")
println(ct_epochs)

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [388]:
println("NesterovMomentum_SGD after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_NesterovMomentum_SGD); digits = 4)) ± $(round(std(train_losses_20Epochs_NesterovMomentum_SGD); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_NesterovMomentum_SGD); digits = 4)) ± $(round(std(test_losses_20Epochs_NesterovMomentum_SGD); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_NesterovMomentum_SGD)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_NesterovMomentum_SGD); digits = 2))%")

NesterovMomentum_SGD after 100 individual trains on the same inital NN:
	100 runs converged.
	Train loss = 0.3786 ± 0.0393
	Test loss = 0.3888 ± 0.0554
	Accuracy = 84.5% ± 0.05%


#### Optimizer = Adadelta done

In [787]:
# Initializing 
# Start training
train_losses_20Epochs_Adadelta= []
test_losses_20Epochs_Adadelta = []
accuracies_20Epochs_Adadelta = []
optimizer_Adadelta = Adadelta(0.95, 0.95, 0.00005, nothing, nothing, nothing)
ct_epochs = 0
for i = 1 : 100
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    init!(optimizer_Adadelta, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_Adadelta, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_Adadelta, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_Adadelta, loss_curr)
    push!(accuracies_20Epochs_Adadelta, acc_curr)
end
test_losses_20Epochs_Adadelta, accuracies_20Epochs_Adadelta, ct_valid_loss = 
filter_scores(test_losses_20Epochs_Adadelta, accuracies_20Epochs_Adadelta)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [788]:
println(ct_epochs)

313


In [391]:
println("Adadelta after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_Adadelta); digits = 4)) ± $(round(std(train_losses_20Epochs_Adadelta); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_Adadelta); digits = 4)) ± $(round(std(test_losses_20Epochs_Adadelta); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_Adadelta)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_Adadelta); digits = 2))%")

Adadelta after 100 individual trains on the same inital NN:
	100 runs converged.
	Train loss = 0.439 ± 0.0569
	Test loss = 0.5172 ± 0.0629
	Accuracy = 81.42% ± 0.03%


#### Optimizer = DFP done

In [780]:
# Initializing 
# Start training
train_losses_20Epochs_DFP= []
test_losses_20Epochs_DFP = []
accuracies_20Epochs_DFP = []
optimizer_DFP = DFP(nothing, f, nothing)
ct_epochs = 0
for i = 1 : 5
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    init!(optimizer_DFP, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_DFP, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_DFP, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_DFP, loss_curr)
    push!(accuracies_20Epochs_DFP, acc_curr)
end
test_losses_20Epochs_DFP, accuracies_20Epochs_DFP, ct_valid_loss = 
filter_scores(test_losses_20Epochs_DFP, accuracies_20Epochs_DFP)
println("Finished")
println(ct_epochs)

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> Finished
5


In [564]:
println("DFP after 100 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_DFP); digits = 4)) ± $(round(std(train_losses_20Epochs_DFP); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_DFP); digits = 4)) ± $(round(std(test_losses_20Epochs_DFP); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_DFP)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_DFP); digits = 2))%")

DFP after 100 individual trains on the same inital NN:
	5 runs converged.
	Train loss = 1.3765 ± 0.6059
	Test loss = 1.2537 ± 0.4109
	Accuracy = 68.33% ± 0.2%


#### Optimizer = LBFGS done

In [781]:
# Initializing 
# Start training
train_losses_20Epochs_LBFGS= []
test_losses_20Epochs_LBFGS = []
accuracies_20Epochs_LBFGS = []
optimizer_LimitedMemoryBFGS = LimitedMemoryBFGS(20, f, nothing, nothing, nothing, nothing)
ct_epochs = 0
for i = 1 : 5
    print("run# $i --> ")
    Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    init!(optimizer_LimitedMemoryBFGS)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        ct_epochs += 1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_LimitedMemoryBFGS, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_LBFGS, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_LBFGS, loss_curr)
    push!(accuracies_20Epochs_LBFGS, acc_curr)
end
test_losses_20Epochs_LBFGS, accuracies_20Epochs_LBFGS, ct_valid_loss = 
filter_scores(test_losses_20Epochs_LBFGS, accuracies_20Epochs_LBFGS)
println("Finished")
println(ct_epochs)

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> Finished
27


In [567]:
println("LimitedMemoryBFGS after 5 individual trains on the same inital NN:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_LBFGS); digits = 4)) ± $(round(std(train_losses_20Epochs_LBFGS); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_LBFGS); digits = 4)) ± $(round(std(test_losses_20Epochs_LBFGS); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_LBFGS)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_LBFGS); digits = 2))%")

LimitedMemoryBFGS after 5 individual trains on the same inital NN:
	5 runs converged.
	Train loss = 1.22 ± 0.53
	Test loss = 1.45 ± 0.21
	Accuracy = 66.67% ± 0.73%


### 1.2 : Train 100 times on randomly initialized NNs

#### Optimizer = Steepest gradient descent done

In [579]:
# Initializing 
# Start training
train_losses_20Epochs_SGD = []
test_losses_20Epochs_SGD = []
accuracies_20Epochs_SGD = []
for i = 1 : 100
    print("run# $i --> ")
    dumb1, dumb2, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    # Ws = deepcopy(Ws_copy)
    optimizer_SteepestGD = SteepestGD(0.0002, nothing)
    init!(optimizer_SteepestGD)
    # Train
    curr_loss = 0.0
    curr_train_loss = 0.0
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_SteepestGD, Ws; hidden_layer_size = 16, print_info = false)
        # println(mean(_unwrap(Ws)))
        if !isnan(curr_loss)
            curr_train_loss = train_scores(Ws)
            # println(curr_train_loss)
        end
        if curr_train_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_SGD, curr_train_loss)
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_SGD, loss_curr)
    push!(accuracies_20Epochs_SGD, acc_curr)
end
test_losses_20Epochs_SGD, accuracies_20Epochs_SGD, ct_valid_loss = filter_scores(test_losses_20Epochs_SGD, accuracies_20Epochs_SGD)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> 

In [581]:
println("SteepestGD after 100 individual trains on 100 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_SGD); digits = 4)) ± $(round(std(train_losses_20Epochs_SGD); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_SGD); digits = 4)) ± $(round(std(test_losses_20Epochs_SGD); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_SGD)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_SGD); digits = 2))%")

SteepestGD after 100 individual trains on 100 randomly initialized NNs:
	73 runs converged.
	Train loss = 0.8863 ± 0.6263
	Test loss = 1.0504 ± 0.689
	Accuracy = 70.95% ± 0.13%


#### Optimizer = Adagrad done

In [593]:
# Initializing 
# Start training
train_losses_20Epochs_Adagrad = []
losses_20Epochs_Adagrad = []
accuracies_20Epochs_Adagrad = []
for i = 1 : 100
    print("run# $i --> ")
    dumb1, dumb2, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    # Ws = deepcopy(Ws_copy)
    optimizer_Adagrad = Adagrad(0.00675, 1e-5, nothing, nothing)
    init!(optimizer_Adagrad, num_Ws)
    # Train
    curr_loss = 0.0
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_Adagrad, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_Adagrad, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(losses_20Epochs_Adagrad, loss_curr)
    push!(accuracies_20Epochs_Adagrad, acc_curr)
end
losses_20Epochs_Adagrad, accuracies_20Epochs_Adagrad, ct_valid_loss = 
filter_scores(losses_20Epochs_Adagrad, accuracies_20Epochs_Adagrad)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [595]:
println("Adagrad after 100 individual trains on 100 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_Adagrad); digits = 4)) ± $(round(std(train_losses_20Epochs_Adagrad); digits = 4))")
println("\tTest loss = $(round(mean(losses_20Epochs_Adagrad); digits = 4)) ± $(round(std(losses_20Epochs_Adagrad); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_Adagrad)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_Adagrad); digits = 2))%")

Adagrad after 100 individual trains on 100 randomly initialized NNs:
	75 runs converged.
	Train loss = 1.1799 ± 2.1373
	Test loss = 1.3938 ± 2.2428
	Accuracy = 74.22% ± 0.15%


#### Optimizer = Adam done

In [613]:
# Initializing 
# Start training
train_losses_20Epochs_Adam = []
test_losses_20Epochs_Adam = []
accuracies_20Epochs_Adam = []
for i = 1 : 100
    print("run# $i --> ")
    dumb1, dumb2, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    # Ws = deepcopy(Ws_copy)
    optimizer_Adam = Adam(0.0005, 0.9, 0.999, 1e-4, nothing, nothing, nothing)
    init!(optimizer_Adam, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_Adam, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_Adam, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_Adam, loss_curr)
    push!(accuracies_20Epochs_Adam, acc_curr)
end
test_losses_20Epochs_Adam, accuracies_20Epochs_Adam, ct_valid_loss = 
filter_scores(test_losses_20Epochs_Adam, accuracies_20Epochs_Adam)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [614]:
println("Adam after 100 individual trains on 100 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_Adam); digits = 4)) ± $(round(std(train_losses_20Epochs_Adam); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_Adam); digits = 4)) ± $(round(std(test_losses_20Epochs_Adam); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_Adam)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_Adam); digits = 2))%")

Adam after 100 individual trains on 100 randomly initialized NNs:
	99 runs converged.
	Train loss = 1.586 ± 2.7111
	Test loss = 1.8236 ± 2.879
	Accuracy = 70.71% ± 0.15%


#### Optimizer = ConjugateGD done

In [715]:
# Initializing 
# Start training
train_losses_20Epochs_ConjugateGD = []
test_losses_20Epochs_ConjugateGD = []
accuracies_20Epochs_ConjugateGD = []
for i = 1 : 100
    print("run# $i --> ")
    Y_pred, hidden_outs, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    scores, losses, total_loss = softmax_loss(Y_pred, train_y_data[1])
    ∇_copy = ∇(train_x_data[1], train_y_data[1], Ws, scores, Y_pred, hidden_outs)
    # Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    optimizer_ConjugateGD = ConjugateGD(mean(_unwrap(Ws))/500, _unwrap(deepcopy(∇_copy)), f, nothing, nothing)
    init!(optimizer_ConjugateGD)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_ConjugateGD, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_ConjugateGD, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_ConjugateGD, loss_curr)
    push!(accuracies_20Epochs_ConjugateGD, acc_curr)
end
test_losses_20Epochs_ConjugateGD, accuracies_20Epochs_ConjugateGD, ct_valid_loss = 
filter_scores(test_losses_20Epochs_ConjugateGD, accuracies_20Epochs_ConjugateGD)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [717]:
println("ConjugateGD after 100 individual trains on 100 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_ConjugateGD); digits = 4)) ± $(round(std(train_losses_20Epochs_ConjugateGD); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_ConjugateGD); digits = 4)) ± $(round(std(test_losses_20Epochs_ConjugateGD); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_ConjugateGD)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_ConjugateGD); digits = 2))%")

ConjugateGD after 100 individual trains on 100 randomly initialized NNs:
	94 runs converged.
	Train loss = 2.79 ± 0.62
	Test loss = 3.58 ± 4.29
	Accuracy = 66.67% ± 0.19%


#### Optimizer = NesterovMomentum_SGD done

In [746]:
# Initializing 
# Start training
train_losses_20Epochs_NesterovMomentum_SGD= []
test_losses_20Epochs_NesterovMomentum_SGD = []
accuracies_20Epochs_NesterovMomentum_SGD = []
for i = 1 : 100
    print("run# $i --> ")
    Y_pred, hidden_outs, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    # Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    optimizer_NesterovMomentum_SGD = NesterovMomentum_SGD(0.000002, 0.99, f, nothing, nothing)
    init!(optimizer_NesterovMomentum_SGD, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_NesterovMomentum_SGD, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_NesterovMomentum_SGD, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_NesterovMomentum_SGD, loss_curr)
    push!(accuracies_20Epochs_NesterovMomentum_SGD, acc_curr)
end
test_losses_20Epochs_NesterovMomentum_SGD, accuracies_20Epochs_NesterovMomentum_SGD, ct_valid_loss = 
filter_scores(test_losses_20Epochs_NesterovMomentum_SGD, accuracies_20Epochs_NesterovMomentum_SGD)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [747]:
println("NesterovMomentum_SGD after 100 individual trains on 100 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_NesterovMomentum_SGD); digits = 4)) ± $(round(std(train_losses_20Epochs_NesterovMomentum_SGD); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_NesterovMomentum_SGD); digits = 4)) ± $(round(std(test_losses_20Epochs_NesterovMomentum_SGD); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_NesterovMomentum_SGD)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_NesterovMomentum_SGD); digits = 2))%")

NesterovMomentum_SGD after 100 individual trains on 100 randomly initialized NNs:
	79 runs converged.
	Train loss = 1.2243 ± 2.1855
	Test loss = 1.3638 ± 2.147
	Accuracy = 73.36% ± 0.15%


#### Optimizer = Adadelta done

In [750]:
# Initializing 
# Start training
train_losses_20Epochs_Adadelta= []
test_losses_20Epochs_Adadelta = []
accuracies_20Epochs_Adadelta = []
for i = 1 : 100
    print("run# $i --> ")
    Y_pred, hidden_outs, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    # Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    optimizer_Adadelta = Adadelta(0.95, 0.95, 0.00005, nothing, nothing, nothing)
    init!(optimizer_Adadelta, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_Adadelta, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_Adadelta, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_Adadelta, loss_curr)
    push!(accuracies_20Epochs_Adadelta, acc_curr)
end
test_losses_20Epochs_Adadelta, accuracies_20Epochs_Adadelta, ct_valid_loss = 
filter_scores(test_losses_20Epochs_Adadelta, accuracies_20Epochs_Adadelta)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> run# 11 --> run# 12 --> run# 13 --> run# 14 --> run# 15 --> run# 16 --> run# 17 --> run# 18 --> run# 19 --> run# 20 --> run# 21 --> run# 22 --> run# 23 --> run# 24 --> run# 25 --> run# 26 --> run# 27 --> run# 28 --> run# 29 --> run# 30 --> run# 31 --> run# 32 --> run# 33 --> run# 34 --> run# 35 --> run# 36 --> run# 37 --> run# 38 --> run# 39 --> run# 40 --> run# 41 --> run# 42 --> run# 43 --> run# 44 --> run# 45 --> run# 46 --> run# 47 --> run# 48 --> run# 49 --> run# 50 --> run# 51 --> run# 52 --> run# 53 --> run# 54 --> run# 55 --> run# 56 --> run# 57 --> run# 58 --> run# 59 --> run# 60 --> run# 61 --> run# 62 --> run# 63 --> run# 64 --> run# 65 --> run# 66 --> run# 67 --> run# 68 --> run# 69 --> run# 70 --> run# 71 --> run# 72 --> run# 73 --> run# 74 --> run# 75 --> run# 76 --> run# 77 --> run# 78 --> run# 79 --> run# 80 --> run# 81 --> run# 82 --> run# 83 --> run# 84 --> r

In [751]:
println("Adadelta after 100 individual trains on 100 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_Adadelta); digits = 4)) ± $(round(std(train_losses_20Epochs_Adadelta); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_Adadelta); digits = 4)) ± $(round(std(test_losses_20Epochs_Adadelta); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_Adadelta)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_Adadelta); digits = 2))%")

Adadelta after 100 individual trains on 100 randomly initialized NNs:
	62 runs converged.
	Train loss = 1.0104 ± 1.7167
	Test loss = 1.304 ± 1.8591
	Accuracy = 75.4% ± 0.13%


#### Optimizer = DFP done

In [756]:
# Initializing 
# Start training
train_losses_20Epochs_DFP= []
test_losses_20Epochs_DFP = []
accuracies_20Epochs_DFP = []
for i = 1 : 10
    print("run# $i --> ")
    Y_pred, hidden_outs, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    # Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    optimizer_DFP = DFP(nothing, f, nothing)
    init!(optimizer_DFP, num_Ws)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_DFP, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_DFP, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_DFP, loss_curr)
    push!(accuracies_20Epochs_DFP, acc_curr)
end
test_losses_20Epochs_DFP, accuracies_20Epochs_DFP, ct_valid_loss = 
filter_scores(test_losses_20Epochs_DFP, accuracies_20Epochs_DFP)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> run# 6 --> run# 7 --> run# 8 --> run# 9 --> run# 10 --> Finished


In [757]:
println("DFP after 10 individual trains on 10 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_DFP); digits = 4)) ± $(round(std(train_losses_20Epochs_DFP); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_DFP); digits = 4)) ± $(round(std(test_losses_20Epochs_DFP); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_DFP)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_DFP); digits = 2))%")

DFP after 10 individual trains on 10 randomly initialized NNs:
	1 runs converged.
	Train loss = 21.5844 ± NaN
	Test loss = 19.7231 ± NaN
	Accuracy = 66.67% ± NaN%


#### Optimizer = LBFGS done

In [758]:
# Initializing 
# Start training
train_losses_20Epochs_LBFGS= []
test_losses_20Epochs_LBFGS = []
accuracies_20Epochs_LBFGS = []
for i = 1 : 5
    print("run# $i --> ")
    Y_pred, hidden_outs, Ws = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
    # Ws = deepcopy(Ws_copy)
    # Reset params for optimizer
    optimizer_LimitedMemoryBFGS = LimitedMemoryBFGS(20, f, nothing, nothing, nothing, nothing)
    init!(optimizer_LimitedMemoryBFGS)
    curr_loss = 0.0
    # Train
    for i = 1:num_epoch_1
        # print("Epoch# $i\n\t")
        Ws, curr_loss = epoch_step(optimizer_LimitedMemoryBFGS, Ws; hidden_layer_size = 16, print_info = false)
        if curr_loss <= loss_ε || isnan(curr_loss)
            break
        end
    end
    if !isnan(curr_loss)
        push!(train_losses_20Epochs_LBFGS, train_scores(Ws))
    end
    flush(stdout)
    # println("Finished")
    loss_curr, acc_curr = test_scores(Ws)
    push!(test_losses_20Epochs_LBFGS, loss_curr)
    push!(accuracies_20Epochs_LBFGS, acc_curr)
end
test_losses_20Epochs_LBFGS, accuracies_20Epochs_LBFGS, ct_valid_loss = 
filter_scores(test_losses_20Epochs_LBFGS, accuracies_20Epochs_LBFGS)
println("Finished")

run# 1 --> run# 2 --> run# 3 --> run# 4 --> run# 5 --> Finished


In [761]:
println("LimitedMemoryBFGS after 5 individual trains on 5 randomly initialized NNs:")
println("\t$ct_valid_loss runs converged.")
println("\tTrain loss = $(round(mean(train_losses_20Epochs_LBFGS); digits = 4)) ± $(round(std(train_losses_20Epochs_LBFGS); digits = 4))")
println("\tTest loss = $(round(mean(test_losses_20Epochs_LBFGS); digits = 4)) ± $(round(std(test_losses_20Epochs_LBFGS); digits = 4))")
println("\tAccuracy = $(round(mean(accuracies_20Epochs_LBFGS)*100; digits = 2))% ± $(round(std(accuracies_20Epochs_LBFGS); digits = 2))%")

LimitedMemoryBFGS after 5 individual trains on 5 randomly initialized NNs:
	5 runs converged.
	Train loss = 1.28 ± 3.71
	Test loss = 2.93 ± 4.72
	Accuracy = 66.67% ± 3.79%


## 2: Number of epochs and runtime needed to reach a same loss threshold = 0.001

In [762]:
loss_ε = 0.7

0.7

#### Optimizer = Steepest gradient descent

#### Optimizer = Adagrad

#### Optimizer = Adam

#### Optimizer = ConjugateGD

#### Optimizer = NesterovMomentum_SGD

#### Optimizer = Adadelta

#### Optimizer = DFP

#### Optimizer = LBFGS