# Setup project

In [1]:
using Pkg
Pkg.activate(".")
Pkg.instantiate()
# Pkg.add("DataLoaders")
# Pkg.add("Plots")
# Pkg.add("CUDA")
# Pkg.add("Distributions")

[32m[1m  Activating[22m[39m project at `g:\桌面\2022 Fall\cs268\final_proj`


In [2]:
using CSV
using DataLoaders
using Plots
using CUDA
using Distributions 
using LinearAlgebra
using Statistics

# Create a custom ML structure to apply custom optimizers.

## Prepare data

The Iris dataset is from UC Irvine Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/iris

### Create a one-hot encoder

In [3]:
function one_hot(y)
    rslt = zeros(3)
    rslt[trunc(Int, y) + 1] = 1.0
    return rslt
end

one_hot (generic function with 1 method)

### Read all lines of data in train and test

Iris dataset

In [4]:
# train and test
train_x_data = []
train_y_data = []
# 8:2
# test_x_data = []
# test_y_data = []
for line in eachline("iris.txt")
    splitted = collect(split(line, " "))
    push!(train_x_data, [parse(Float64, splitted[4]), parse(Float64, splitted[7]), parse(Float64, splitted[10]), parse(Float64, splitted[13])])
    push!(train_y_data, one_hot(parse(Float64, splitted[16])))
end

Inspect shape of train and test

In [5]:
print(size(train_x_data))
print(size(train_y_data))

(148,)(148,)

### Create dataloader

In [6]:
batch_size = 33
dl_train = DataLoader((train_x_data, train_y_data), batch_size)
# dl_test = DataLoader((test_x, test_y), batch_size)

DataLoaders.GetObsParallel{DataLoaders.BatchViewCollated{Tuple{Vector{Any}, Vector{Any}}}}(batchviewcollated() with 5 batches of size 33, false)

## Create whole machine learning pipeline

### Create an activation function: ReLU 

In [7]:
function Relu!(Xs)
    for i = 1 : length(Xs)
        Xs[i] = max(0.0,Xs[i])
    end
end

Relu! (generic function with 1 method)

### Create fully connected layers

In [8]:
# Xs in only 1 set of input in a batch.
function fc(Xs, num_in_channel, num_out_channel, W)
    rslt = zeros(num_out_channel)
    Xs = reshape(Xs, 1, num_in_channel)
    b = rand()
    for i = 1:num_out_channel
        temp = reshape(W[:, i], num_in_channel, 1)
        rslt[i] = (Xs * temp)[1]+b
    end
    return rslt
end

fc (generic function with 1 method)

### Create a simple MLP model

In [9]:
# Xs in only 1 set of input in a batch.
function MLP(Xs, out_channel; num_hidden_layers = 0, hidden_layer_size = 64, Ws = nothing)
    Ws_idx = 1
    hidden_outs = []
    # layer in
    if Ws == nothing
        Wss = []
        W = rand(Uniform(-0.1, 0.1), size(Xs)[1], hidden_layer_size)
        push!(Wss, W)
    else
        W = Ws[Ws_idx]
        Ws_idx += 1
    end
    out = fc(Xs, size(Xs)[1], hidden_layer_size, W)
    Relu!(out)
    push!(hidden_outs, deepcopy(out))
    # layer hidden
    for i = 1 : num_hidden_layers
        if Ws == nothing
            W = rand(Uniform(-0.1, 0.1), hidden_layer_size, hidden_layer_size)
            push!(Wss, W)
        else
            W = Ws[Ws_idx]
            Ws_idx += 1
        end
        out = fc(out, hidden_layer_size, hidden_layer_size, W)
        Relu!(out)
        push!(hidden_outs, deepcopy(out))
    end
    # layer out
    if Ws == nothing
        W = rand(Uniform(-0.1, 0.1), hidden_layer_size, out_channel)
        push!(Wss, W)
    else
        W = Ws[Ws_idx]
        Ws_idx += 1
    end
    out = fc(out, hidden_layer_size, out_channel, W)
    if Ws == nothing
        return out, hidden_outs, Wss
    end
    return out, hidden_outs
end

MLP (generic function with 1 method)

### Create a loss function: soft-max loss
Soft-max loss is a combination of a soft-max activation layer and cross entropy loss

In [10]:
# Y_pred in only 1 set of prediction in a batch. Ys in only 1 set of truth label in a batch.
function softmax_loss(Y_pred, Ys)
    # softmax
    temp_sum = 0.0
    for x in Y_pred
        temp_sum += exp(x)
    end
    l = length(Y_pred)
    scores = zeros(l)
    for i = 1:l
        scores[i] = exp(Y_pred[i])/temp_sum
    end
    # loss
    losses = zeros(l)
    for i = 1:l
        losses[i] = -Ys[i] * log(scores[i])
    end
    total_loss = sum(losses)
    
    return scores, losses, total_loss
end

softmax_loss (generic function with 1 method)

How many threads we have for cpu?

In [11]:
Threads.nthreads()

32

### Create a back-propagation procedure

In [12]:
#Ws[idx of layer][idx of node, idx of weights]
function ∇(Xs, Ys, Ws, scores, outs, hidden_outs)
    num_in_channel = size(Xs)[1]
    num_out_channel = size(Ys)[1]
    hidden_layer_size = size(hidden_outs[1])[1]
    num_hidden_layers = size(hidden_outs)[1]
    # step -2
    ∂loss_∂netout = zeros(num_out_channel)
    for i = 1:num_out_channel
        ∂loss_∂netout[i] = -Ys[i]/scores[i]
    end
    # println("step -2: $∂loss_∂netout")
    # step -1
    outs_exp_sum = 0.0
    for out in outs
        outs_exp_sum += exp(out)
    end
    # println("step -1: $outs_exp_sum")
    for i = 1:num_out_channel
        ∂loss_∂netout[i] *= (exp(outs[i])*(outs_exp_sum - exp(outs[i])))/(outs_exp_sum^2)
    end
    # println("step -1: $∂loss_∂netout")
    # Assume we use same hidden_size of all hidden layers for the sake for brevity
    ∇s = []
    # first layer
    push!(∇s, zeros(num_in_channel, hidden_layer_size))
    # hidden layer
    for i = 1 : num_hidden_layers-1
        push!(∇s, zeros(hidden_layer_size, hidden_layer_size))
    end
    # last layer
    push!(∇s, zeros(hidden_layer_size, num_out_channel))

    # Start back_propagations
    ∂loss_∂prev_nodes = zeros(hidden_layer_size)
    # layer weights from outs to last of hidden layer
    for i = 1 : hidden_layer_size
        ∂loss_∂prev_nodes[i] = back_propagation(hidden_outs[num_hidden_layers][i],
        num_hidden_layers+1, i, ∇s,
        Ws[num_hidden_layers+1][i, :], ∂loss_∂netout)
    end
    # layer weights from i-th hidden layer to i-1th hidden layer
    for k = num_hidden_layers : -1 : 2
        ∂loss_∂curr_nodes = deepcopy(∂loss_∂prev_nodes)
        for i = 1 : hidden_layer_size
            ∂loss_∂prev_nodes[i] = back_propagation(hidden_outs[k-1][i],
            k, i, ∇s,
            Ws[k][i, :], ∂loss_∂curr_nodes)
        end
    end
    # layer weights from 1st hidden layer to input layer
    for i = 1 : num_in_channel
        back_propagation(Xs[i],
        1, i, ∇s,
        Ws[1][i, :], ∂loss_∂prev_nodes)
    end
    return ∇s
end

∇ (generic function with 1 method)

In [13]:
function back_propagation(node_out, layer_idx, i, ∇s, weights, ∂loss_∂nodes)
    ∂loss_∂node_i = (node_out==0) ? 0 : dot(weights, ∂loss_∂nodes) # considering d of ReLU
    ∇s[layer_idx][i, :] = ∂loss_∂nodes*node_out
    return ∂loss_∂node_i
end

back_propagation (generic function with 1 method)

### Create a training procedure

In [14]:
function batch_step(optimizer, Xs_batch, Ys_batch, Ws_wrapped; hidden_layer_size = 64)
    b_s = size(Xs_batch)[2]
    # b_s = 33
    cache_new_Ws_unwrapped = zeros(b_s, num_Ws)
    total_losses = zeros(b_s)
    Ws_unwrapped = _unwrap(Ws_wrapped)
    optimizer.k = optimizer.k + 1
    Threads.@threads for i in 1:b_s-1
        Xs = Xs_batch[:, i]
        Y_truth = one_hot(Ys_batch[i])
        #forward
        Y_pred, hidden_outs = MLP(Xs, 3; num_hidden_layers = 2, Ws=Ws_wrapped, hidden_layer_size = hidden_layer_size)
        scores, losses, total_loss = softmax_loss(Y_pred, Y_truth)
        total_losses[i] = total_loss
        #backward
        if typeof(optimizer) <: NesterovMomentum_SGD 
            cache_new_Ws_unwrapped[i,:] = step_without_update!(optimizer, Ws_unwrapped, Xs, Y_truth)
        else
            gradients = ∇(Xs, Y_truth, Ws_wrapped, scores, Y_pred, hidden_outs);
            cache_new_Ws_unwrapped[i,:] = step_without_update!(optimizer, _unwrap(gradients), Ws_unwrapped)
        end
        # if i==1
        #     println()
        #     println(Y_pred)
        #     println(scores)
        #     println(Y_truth)
        #     println()
        # end
    end
    i = b_s
    Xs = Xs_batch[:, i]
    Y_truth = one_hot(Ys_batch[i])
    #forward
    Y_pred, hidden_outs = MLP(Xs, 3; num_hidden_layers = 2, Ws=Ws_wrapped, hidden_layer_size = hidden_layer_size)
    scores, losses, total_loss = softmax_loss(Y_pred, Y_truth)
    total_losses[i] = total_loss
    #backward
    gradients = ∇(Xs, Y_truth, Ws_wrapped, scores, Y_pred, hidden_outs);
    if typeof(optimizer) <: NesterovMomentum_SGD 
        cache_new_Ws_unwrapped[i,:] = step!(optimizer, Ws_unwrapped, Xs, Y_truth)
    else
        gradients = ∇(Xs, Y_truth, Ws_wrapped, scores, Y_pred, hidden_outs);
        cache_new_Ws_unwrapped[i,:] = step!(optimizer, _unwrap(gradients), Ws_unwrapped)
    end

    #update parameters
    # println(cache_new_Ws_unwrapped[:,1])
    new_Ws_unwrapped = mean(cache_new_Ws_unwrapped, dims=1)
    # println(new_Ws_unwrapped[1])
    # println(" loss = $(mean(total_losses))")
    # println(" mean new para = $(mean(new_Ws_unwrapped))")
    return _wrap(new_Ws_unwrapped, params_shape), mean(total_losses)
end

batch_step (generic function with 1 method)

In [15]:
function epoch_step(optimizer, new_Ws_wrapped; hidden_layer_size = 64)
    ct_batch = 1
    curr_loss = 0
    for (xs, ys) in dl_train
        print("batch# $ct_batch --> ")
        ct_batch += 1
        new_Ws_wrapped, curr_loss = batch_step(optimizer, xs, ys, new_Ws_wrapped; hidden_layer_size = hidden_layer_size)
        flush(stdout)
    end
    println("\n\ttrain loss =  $curr_loss")
    return new_Ws_wrapped, curr_loss
end

epoch_step (generic function with 1 method)

## Create optimizers

#### Helper functions to make inputs compatible with optimizers

In [16]:
function _unwrap(wrapped)
    unwrapped = []
    for layer in wrapped
        unwrapped = vcat(unwrapped, reduce(vcat,layer))
    end
    return unwrapped
end
function _wrap(unwrapped, shape)
    wrapped = []
    curr_idx = 1
    for s in shape
        ct = s[1]*s[2]
        push!(wrapped, Float64.(reshape(unwrapped[curr_idx:curr_idx-1+ct], s)))
        curr_idx += ct
    end
    return wrapped
end

_wrap (generic function with 1 method)

#### Steepest gradient descent
Credit: Wenbo

In [17]:
abstract type DescentMethod end
# Adam from K&W page 81
mutable struct SteepestGD <: DescentMethod
    α # learning rate
    k # dumb variable
end
function init!(M::SteepestGD)
    M.k = 0
end
function step!(M::SteepestGD, ∇f, x)
    return x - M.α*∇f
end
function step_without_update!(M::SteepestGD, ∇f, x)
    return x - M.α*∇f
end

step_without_update! (generic function with 1 method)

#### Adam
Credit: Wenbo

In [18]:
abstract type DescentMethod end
# Adam from K&W page 81
mutable struct Adam <: DescentMethod
    α # learning rate
    γv # decay
    γs # decay
    ϵ # small value
    k # step counter
    v # 1st moment estimate
    s # 2nd moment estimate
end
function init!(M::Adam, x_length)
    M.k = 0
    M.v = zeros(x_length)
    M.s = zeros(x_length)
    return M
end
function step!(M::Adam, ∇f, x)
    α, γv, γs, ϵ, k = M.α, M.γv, M.γs, M.ϵ, M.k
    s, v, g = M.s, M.v, ∇f
    M.v = γv*v + (1-γv)*g
    M.s = γs*s + (1-γs)*g.*g
    # M.k = k += 1
    v_hat = M.v ./ (1 - γv^k)
    s_hat = M.s ./ (1 - γs^k)
    return x - α*v_hat ./ (sqrt.(s_hat) .+ ϵ)
end
function step_without_update!(M::Adam, ∇f, x)
    α, γv, γs, ϵ, k = M.α, M.γv, M.γs, M.ϵ, M.k
    vv = γv*M.v + (1-γv)*∇f
    ss = γs*M.s + (1-γs)*∇f.*∇f
    # M.k = k += 1
    v_hat = vv ./ (1 - γv^k)
    s_hat = ss ./ (1 - γs^k)
    return x - α*v_hat ./ (sqrt.(s_hat) .+ ϵ)
end

step_without_update! (generic function with 2 methods)

#### Conjugate gradient descent
Credit: Wenbo

In [19]:
mutable struct ConjugateGD <: DescentMethod
    α
    g
    f
    d
    k # dumb variable
end
function init!(M::ConjugateGD)
    M.k = 0
    M.d = -M.g/sqrt(sum(M.g.^2))
end
# function step!(M::ConjugateGD, g, x, train_x, train_y)
function step!(M::ConjugateGD, g, x)
    d_prev, g_prev = M.d, M.g
    β = dot(transpose(g), g-g_prev)/dot(transpose(g_prev), g_prev)
    # β = dot(g, g-g_prev)/dot(g_prev, g_prev)
    β = max(0, β)
    d = -g + β*d_prev
    d = d/sqrt(sum(d.^2))
    # M.α = strong_backtracking_line_search(train_x, train_y, M.f, x, d, g; α = M.α, σ=0.9)
    x_next = x + M.α*d
    M.d, M.g = d, g
    return x_next
end
# function step_without_update!(M::ConjugateGD, g, x, train_x, train_y)
function step_without_update!(M::ConjugateGD, g, x)
    d_prev, g_prev = M.d, M.g
    β = dot(transpose(g), g-g_prev)/dot(transpose(g_prev), g_prev)
    # β = dot(g, g-g_prev)/dot(g_prev, g_prev)
    β = max(0, β)
    d = -g + β*d_prev
    d = d/sqrt(sum(d.^2))
    # M.α = strong_backtracking_line_search(train_x, train_y, M.f, x, d, g; α = M.α, σ=0.9)
    x_next = x + M.α*d
    # M.d, M.g = d, g
    return x_next
end
# # implementing strong backtracking line search
# function strong_backtracking_line_search(train_x, train_y, f, x, d, g; α=1, β=1e-4, σ=0.1, p=2)
#     if g == zeros(length(g))
#         # println("******************* g = 0 *******************")
#         return 0.0
#     end
#     # println("\tStart bracketing phase")
#     # bracketing phase
#     prev_α = 0
#     f_x, dumb1, dumb2, dumb3 = f(train_x, train_y, x)
#     x_αd = x+α*d
#     ∇f_x = transpose(g)*d
#     f_x_αd, hidden_outs, Y_pred, scores = f(train_x, train_y, x_αd) 
#     ∇_f_x_αd = ∇(train_x, train_y, _wrap(x_αd, params_shape), scores, Y_pred, hidden_outs)
#     ∇f_x_αd = transpose(_unwrap(∇_f_x_αd))*d
#     condition_a = f_x_αd >= f_x
#     condition_b = f_x_αd > f_x + β*α*∇f_x
#     condition_c = ∇f_x_αd >= 0
#     while !(condition_a || condition_b || condition_c)
#         prev_α, α = α, α*p
#         # println("\t\tα = $α")
#         x_αd = x+α*d
#         f_x_αd, hidden_outs, Y_pred, scores = f(train_x, train_y, x_αd) 
#         ∇_f_x_αd = ∇(train_x, train_y, _wrap(x_αd, params_shape), scores, Y_pred, hidden_outs)
#         ∇f_x_αd = transpose(_unwrap(∇_f_x_αd))*d
#         condition_a = f_x_αd >= f_x
#         condition_b = f_x_αd > f_x + β*α*∇f_x
#         condition_c = ∇f_x_αd >= 0
#     end
#     # println("\t bracket = [$prev_α, $α]")
#     # println("\tFinish bracketing phase")
#     # println("\tStart bracketing phase")
#     # zoom phase
#     # println("\t\tStart sufficient decrease condition")
#     # sufficient decrease condition
#     sufficient_decrease_condition = !condition_b
#     while !sufficient_decrease_condition
#         α = (prev_α + α)/2 
#         if α == 0.0
#             # println("******************* α = 0 *******************")
#             return 0.0
#         end
#         # println("\t\t\tα = $α")
#         x_αd = x+α*d
#         f_x_αd, hidden_outs, Y_pred, scores = f(train_x, train_y, x_αd) 
#         sufficient_decrease_condition = f_x_αd <= f_x + β*α*∇f_x
#     end
#     # println("\t\tFinish sufficient decrease condition")
#     # strong curvature condition 
#     # println("\t\tStart strong curvature condition")
#     ∇_f_x_αd = ∇(train_x, train_y, _wrap(x_αd, params_shape), scores, Y_pred, hidden_outs)
#     ∇f_x_αd = transpose(_unwrap(∇_f_x_αd))*d
#     strong_curvature_condition = abs(∇f_x_αd) <= -σ*∇f_x
#     while !strong_curvature_condition
#         α = (prev_α + α)/2
#         if α == 0.0
#             # println("******************* α = 0 *******************")
#             return 0.0
#         end
#         # println("\t\t\tα = $α")
#         x_αd = x+α*d
#         f_x_αd, hidden_outs, Y_pred, scores = f(train_x, train_y, x_αd) 
#         ∇_f_x_αd = ∇(train_x, train_y, _wrap(x_αd, params_shape), scores, Y_pred, hidden_outs)
#         ∇f_x_αd = transpose(_unwrap(∇_f_x_αd))*d
#         strong_curvature_condition = abs(∇f_x_αd) <= -σ*∇f_x
#     end
#     # println("\t\tFinish strong curvature condition")
#     # println("\tFinish bracketing phase")
#     return α
# end

step_without_update! (generic function with 3 methods)

#### Nesterov Momentum (SGD)
Credit: Dylan

In [34]:
# From K&W 76
mutable struct NesterovMomentum_SGD <: DescentMethod
    α # learning rate
    β # momentum decay
    f
    v # momentum
    k # dumb variable
end
function init!(M::NesterovMomentum_SGD, x_length)
    M.k = 0
    M.v = zeros(x_length)
end
function step!(M::NesterovMomentum_SGD, x, train_x, train_y)
    α, β, v = M.α, M.β, M.v
    x_new = x + β*v
    f_x_βv, hidden_outs, Y_pred, scores = M.f(train_x, train_y, x_new) 
    ∇f_x_βv = ∇(train_x, train_y, _wrap(x_new, params_shape), scores, Y_pred, hidden_outs)
    M.v = β*v - α*_unwrap(∇f_x_βv)
    return x + M.v
end
function step_without_update!(M::NesterovMomentum_SGD, x, train_x, train_y)
    α, β, v = M.α, M.β, M.v
    x_new = x + β*v
    f_x_βv, hidden_outs, Y_pred, scores = M.f(train_x, train_y, x_new) 
    ∇f_x_βv = ∇(train_x, train_y, _wrap(x_new, params_shape), scores, Y_pred, hidden_outs)
    vv = β*v - α*_unwrap(∇f_x_βv)
    return x + vv
end

step_without_update! (generic function with 4 methods)

# Start training

### Control variable for a scientifically correct comparison: **Give every optimizers a same NN to train**.

In [21]:
loss_ε = 0.01 # stop when loss <= loss_ε
# Initializing 
Y_pred, hidden_outs, Ws_copy = MLP(train_x_data[1], 3; hidden_layer_size = 16, num_hidden_layers = 2)
# println(Y_pred)
scores, losses, total_loss = softmax_loss(Y_pred, train_y_data[1])
∇_copy = ∇(train_x_data[1], train_y_data[1], Ws_copy, scores, Y_pred, hidden_outs);
println("New model created!\nInital loss = $total_loss")
params_shape = []
num_Ws = 0
for layer in Ws_copy
    # println(size(layer))
    s1, s2 = size(layer)
    num_Ws += s1 * s2
    push!(params_shape, size(layer))
end

New model created!
Inital loss = 1.0654244064213234


## Optimizer = Steepest gradient descent

In [22]:
num_epoch_Steepest_GD = 20;

In [23]:
# Initializing 
optimizer_SteepestGD = SteepestGD(0.01, nothing)
init!(optimizer_SteepestGD)
Ws = deepcopy(Ws_copy);

In [24]:
# Train
for i = 1:num_epoch_Steepest_GD
    print("Epoch# $i\n\t")
    Ws, curr_loss = epoch_step(optimizer_SteepestGD, Ws; hidden_layer_size = 16)
    if curr_loss <= loss_ε
        break
    end
end
println("Finished")

Epoch# 1
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  1.0574794132351686
Epoch# 2
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  1.0090353614456307
Epoch# 3
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9771846559311114
Epoch# 4
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9437258493200867
Epoch# 5
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9343944950249846
Epoch# 6
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.8752723694546143
Epoch# 7
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.8561123833847518
Epoch# 8
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.813260383669708
Epoch# 9
	batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.8404007268218463
Epoch# 10
	

## Optimizer = Adam

In [25]:
num_epoch_Adam = 20;

In [26]:
# Initializing 
optimizer_Adam = Adam(0.01, 0.9, 0.999, 1e-7, nothing, nothing, nothing)
init!(optimizer_Adam, num_Ws);
Ws = deepcopy(Ws_copy);

In [27]:
# Train
for i = 1:num_epoch_Adam
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_Adam, Ws; hidden_layer_size = 16)
    if curr_loss <= loss_ε
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9657101169461695
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.6840494596566998
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.22822763346511754
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.007713104168938901
Finished


## Optimizer = ConjugateGD

In [28]:
num_epoch_ConjugateGD = 20;
# function f(train_x, train_y, w_unwrapped)
#     Y_pred, hidden_outs = MLP(train_x, 3; num_hidden_layers = 2, Ws=_wrap(w_unwrapped, params_shape), hidden_layer_size = 16)
#     scores, losses, total_loss = softmax_loss(Y_pred, train_y)
#     return total_loss, hidden_outs, Y_pred, scores
# end

20

In [29]:
# Initializing 
# optimizer_ConjugateGD = ConjugateGD(0.01, _unwrap(deepcopy(∇_copy)), f, nothing, nothing)
optimizer_ConjugateGD = ConjugateGD(0.01, _unwrap(deepcopy(∇_copy)), nothing, nothing, nothing)
init!(optimizer_ConjugateGD);
Ws = deepcopy(Ws_copy);

In [30]:
# Train
for i = 1:num_epoch_ConjugateGD
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_ConjugateGD, Ws; hidden_layer_size = 16)
    if curr_loss <= loss_ε
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  1.0472862669053193
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  1.0301267086428338
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9837931614922919
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9637532240414706
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9160669277648521
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.8966267650069976
Epoch# 7
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.849643614354938
Epoch# 8
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.8160657133511946
Epoch# 9
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.8113718573244258
Epoch# 10
batch# 1 -

## Optimizer = NesterovMomentum_SGD

In [31]:
num_epoch_NesterovMomentum_SGD = 20
function f(train_x, train_y, w_unwrapped)
    Y_pred, hidden_outs = MLP(train_x, 3; num_hidden_layers = 2, Ws=_wrap(w_unwrapped, params_shape), hidden_layer_size = 16)
    scores, losses, total_loss = softmax_loss(Y_pred, train_y)
    return total_loss, hidden_outs, Y_pred, scores
end

f (generic function with 1 method)

In [36]:
# Initializing 
optimizer_NesterovMomentum_SGD = NesterovMomentum_SGD(0.01, 0.99, f, nothing, nothing)
init!(optimizer_NesterovMomentum_SGD, num_Ws)
Ws = deepcopy(Ws_copy);

In [37]:
# Train
for i = 1:num_epoch_NesterovMomentum_SGD
    println("Epoch# $i")
    Ws, curr_loss = epoch_step(optimizer_NesterovMomentum_SGD, Ws; hidden_layer_size = 16)
    if curr_loss <= loss_ε
        break
    end
end
println("Finished")

Epoch# 1
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.9328953591232785
Epoch# 2
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.6094325110678358
Epoch# 3
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.2540328688773901
Epoch# 4
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.2172282368491196
Epoch# 5
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.10283362300724119
Epoch# 6
batch# 1 --> batch# 2 --> batch# 3 --> batch# 4 --> batch# 5 --> 
	train loss =  0.0014416007397873934
Finished
