# Logistic Regression Neural Network

In [1]:
using LinearAlgebra

In [2]:
""" Parameters """
# α = 0.001; ITERS = 250; λ = 0.005;
# range = -0.01:0.001:0.01
# HIDDEN_SIZE = 2
# W1 = rand(range, 2, HIDDEN_SIZE )
# b1 = zeros((1, HIDDEN_SIZE));
# W2 = rand(range, HIDDEN_SIZE, 1 )
# b2 = 0.0;

In [3]:
"""
    train_LogRegNN_GD(W1, b1, W2, b2, ITERS, α, λ, Y; max_error=0)

Train the two-layer network with plain gradient descent for at most `ITERS` iterations.

Every iteration runs `forward`, stores the data loss plus the regularization loss,
and then replaces the parameters with the values returned by `gradient_descent`.
Training stops early as soon as the stored loss is below `max_error`.

The input matrix is NOT a parameter: the function reads the global `X`.

# Arguments
- `W1`, `b1`, `W2`, `b2`: initial parameters, passed straight to `forward`.
- `ITERS`: maximum number of iterations.
- `α`: step size forwarded to `gradient_descent`.
- `λ`: regularization strength forwarded to `regularization_loss` and `gradient_descent`.
- `Y`: targets compared against the network output.

# Keywords
- `max_error`: loss threshold for the early stop.

# Returns
`(loss_acc, pred_class, W1, b1, W2, b2, W2_acc, iterations)` where `loss_acc` holds one
loss per iteration (entries after an early stop stay zero), `pred_class` is the last
network output rounded to integers, `W2_acc` has one column per iteration holding `W2`
as it was at the start of that iteration, and `iterations` is the index of the last
iteration that ran (`ITERS` when no early stop happened).
"""
function train_LogRegNN_GD( W1, b1, W2, b2, ITERS, α, λ, Y; max_error=0 )

    loss_acc = zeros(ITERS) # Loss per iteration
    # History of W2. The row count follows W2 itself instead of a fixed 2, so the
    # function also works when the hidden layer has a different size.
    W2_acc = zeros(length(W2), ITERS)
    Ŷ = 0 # Final prediction

    for i in 1:ITERS
        W2_acc[:, i] = vec(W2)

        A1, A2 = forward( X, W1, b1, W2, b2 )
        Ŷ = A2

        data_loss = loss_function( Y, Ŷ )
        reg_loss = regularization_loss( W1, W2, λ )
        loss_acc[i] = data_loss + reg_loss

        # Early stop: parameters are returned as they were used for this forward pass.
        if loss_acc[i] < max_error
            return loss_acc, Int.(round.(Ŷ)), W1, b1, W2, b2, W2_acc, i
        end

        W1, b1, W2, b2 = gradient_descent(
            A1, A2, W1, b1, W2, b2, α, λ, Y
        )
    end

    pred_class = Int.(round.(Ŷ))
    return loss_acc, pred_class, W1, b1, W2, b2, W2_acc, ITERS
end;

In [4]:
"""
    dot(X, W, b)

Return the affine combination `X * W` with `b` added element-wise (broadcast).

NOTE(review): this file also does `using LinearAlgebra`, which exports a function
named `dot`. This three-argument definition presumably shadows it in this scope;
confirm nothing here relies on `LinearAlgebra.dot`.
"""
function dot( X, W, b )
    product = X * W
    return product .+ b
end;

In [5]:
"""
    σ(z)

Logistic sigmoid of the scalar `z`: `1 / (1 + exp(-z))`.
Apply it to arrays with broadcasting (`σ.(Z)`).
"""
function σ( z )
    denominator = 1 + exp(-z)
    return 1 / denominator
end;

In [6]:
"""
    forward(X, W1, b1, W2, b2; scale=1000)

Run the forward pass of the two-layer sigmoid network.

Both pre-activations are multiplied by `scale` before the sigmoid is applied.

# Arguments
- `X`: input passed to the first affine layer (`dot(X, W1, b1)`).
- `W1`, `b1`: parameters of the first layer.
- `W2`, `b2`: parameters of the second layer.

# Keywords
- `scale`: multiplier applied to both pre-activations. The default `1000` is the value
  that used to be hard-coded (the original comment called it scaling for the Iris
  dataset); pass `scale=1` to disable it.

# Returns
`(A1, A2)`: activations of the first and of the second layer.
"""
function forward( X, W1, b1, W2, b2; scale=1000 )
    Z1 = dot( X, W1, b1 ) .* scale
    A1 = σ.( Z1 ) # ReLU can be used here instead
    Z2 = dot( A1, W2, b2 ) .* scale
    A2 = σ.( Z2 )
    return A1, A2
end;

In [7]:
"""
    loss_function(Y, Ŷ)

Average binary cross-entropy between targets `Y` and predictions `Ŷ`.

`Y` and `Ŷ` are walked in lockstep; the sum of the per-sample terms is divided by
`length(Ŷ)` and negated.

Predictions are clamped into the open interval (0, 1) before the logarithm is taken.
Without the clamp a saturated prediction (exactly `0` or `1`) yields `log(0) = -Inf`,
and `0 * -Inf` turns the whole loss into `NaN`. Predictions strictly inside the
interval are not affected.
"""
function loss_function( Y, Ŷ )
    losses = [
        begin
            # Smallest step away from the saturated values for this float type.
            ϵ = eps(typeof(float(ŷ)))
            p = clamp(ŷ, ϵ, 1 - ϵ)
            y * log(p) + (1 - y) * log(1 - p)
        end
        for (y, ŷ) in zip(Y, Ŷ)
    ]
    M = length(Ŷ)
    avg_loss = -(sum(losses) / M)
    return avg_loss
end;

In [8]:
"""
    regularization_loss(W1, W2, λ)

L2 penalty on both weight arrays: `0.5 * λ * sum(W .^ 2)` evaluated for `W1` and for
`W2`, then added together.
"""
function regularization_loss( W1, W2, λ )
    # Penalty contributed by a single weight array.
    penalty(W) = 0.5 * λ * sum(W .^ 2)
    return penalty(W1) + penalty(W2)
end;

In [9]:
"""
    gradient_descent(A1, A2, W1, b1, W2, b2, α, λ, Y)

Perform one gradient-descent step and return the updated parameters.

The gradients come from `backprop(A1, A2, W2, Y)`. `λ * W` is added to the two weight
gradients (the biases are not regularized), and every parameter is then moved against
its gradient by the step size `α`.

# Returns
`(new_W1, new_b1, new_W2, new_b2)`.
"""
function gradient_descent( A1, A2, W1, b1, W2, b2, α, λ, Y )

    grad_W1, grad_b1, grad_W2, grad_b2 = backprop( A1, A2, W2, Y )

    # Regularization term, accumulated in place into the weight gradients.
    grad_W1 .+= λ * W1
    grad_W2 .+= λ * W2

    return (
        W1 - α * grad_W1,
        b1 .- α * grad_b1,
        W2 - α * grad_W2,
        b2 - α * grad_b2,
    )
end;

In [10]:
"""
    backprop(A1, A2, W2, Y)

Compute the gradients of the loss with respect to `W1`, `b1`, `W2` and `b2`.

The input matrix is NOT a parameter: the function reads the global `X`.

The output-layer delta is `A2 .- Y`. The hidden-layer delta is obtained by propagating
that delta back through `W2` and multiplying by the sigmoid derivative
`A1 .* (1 .- A1)`. Previously the back-propagated term `∂A1` was computed but never
used, and the hidden delta was taken as `A1 .- Y`, which ignores the chain rule.

NOTE(review): `forward` multiplies both pre-activations by a constant (1000 by
default). As before, that constant factor is not included in any of the gradients
returned here — confirm whether this is intended.

# Returns
`(∂W1, ∂b1, ∂W2, ∂b2)`, every sum divided by `M = length(Y)`.
"""
function backprop( A1, A2, W2, Y )

    M = length(Y)

    # Output layer.
    ∂Z2 = A2 .- Y
    ∂W2 = (transpose(A1) * ∂Z2) ./ M
    ∂b2 = sum(∂Z2) ./ M

    # Hidden layer. ∂A1 and ∂Z1 are laid out as the transpose of A1, which is why the
    # bias gradient below sums over dims=2 and both results are transposed back.
    ∂A1 = W2 * transpose(∂Z2)
    ∂Z1 = ∂A1 .* transpose(A1 .* (1 .- A1))

    ∂W1 = transpose((∂Z1 * X) ./ M)
    ∂b1 = transpose(sum(∂Z1, dims=2) ./ M)

    return ∂W1, ∂b1, ∂W2, ∂b2

end;

In [11]:
"""
    cost_for_W2(Θ1, Θ2, Y)

Data loss of the network when the second-layer weights are `[Θ1, Θ2]`.

All other inputs of the forward pass (`X`, `W1`, `b1`, `b2`) are read from globals.
No regularization term is added.
"""
function cost_for_W2( Θ1, Θ2, Y )
    candidate_W2 = [Θ1, Θ2]
    _, prediction = forward( X, W1, b1, candidate_W2, b2 )
    return loss_function( Y, prediction )
end;

### Functions for BFGS

In [12]:
"""
    forward_loss(flat)

Objective to minimize. Takes the flat parameter vector, unpacks it with `roughen`,
runs `forward` on the global `X` and returns the loss against the global `Y`
as a single number.
"""
function forward_loss( flat )
    W2, W1, b2, b1 = roughen( flat )
    _, prediction = forward( X, W1, b1, W2, b2 )
    return loss_function( Y, prediction )
end;

In [13]:
"""
    round_weights(flat)

Return a new collection with every element of `flat` rounded to 8 decimal digits.
"""
function round_weights( flat )
    rounded = (round(value, digits=8) for value in flat)
    return collect(rounded)
end;

In [14]:
"""
    flatten(weights_list)

Concatenate every entry of `weights_list` into one 1-D array and round the result with
`round_weights`.

Each entry is splatted twice, so an entry may be a scalar, an array of scalars, or a
tuple/array of such arrays. Arrays are unrolled in their iteration order.
"""
function flatten( weights_list ) # list of weights to 1D Array
    collected = foldl( weights_list; init=[] ) do acc, v
        [ acc...; (v...)... ]
    end
    return round_weights( collected )
end;

In [15]:
"""
    roughen(flat)

Hard-coded counterpart of `flatten`: rebuild the parameters from a flat vector of at
least 9 entries.

Layout of `flat`: entries 1-2 become the 2×1 `W2`, entries 3-6 fill the 2×2 `W1`
column by column, entry 7 is `b2`, and entries 8-9 become the 1×2 `b1`.

# Returns
`(W2, W1, b2, b1)` — note the order.
"""
function roughen( flat ) # hard-coded counterpart of flatten
    W2 = reshape( vcat(flat[1], flat[2]), 2, 1 )
    # Column-major fill: first column is (flat[3], flat[4]), second is (flat[5], flat[6]).
    W1 = reshape( vcat(flat[3], flat[4], flat[5], flat[6]), 2, 2 )
    b2 = flat[7]
    b1 = hcat( flat[8], flat[9] )
    return W2, W1, b2, b1
end;

In [16]:
"""
    forward_loss_gradient(Θ)

Gradient of the loss with respect to the flat parameter vector `Θ`.

`Θ` is unpacked with `roughen`, a forward pass is run on the global `X`, and the
gradients from `backprop` (against the global `Y`) are flattened back into a vector.

The gradient is flattened in the order `(W2, W1, b2, b1)`, the same layout that
`roughen` expects for `Θ`. `backprop` returns its results in a different order
(`∂W1, ∂b1, ∂W2, ∂b2`); flattening that tuple directly put each gradient component
next to the wrong parameter.
"""
function forward_loss_gradient( Θ )
    W2, W1, b2, b1 = roughen( Θ )
    A1, A2 = forward( X, W1, b1, W2, b2 )
    ∂W1, ∂b1, ∂W2, ∂b2 = backprop( A1, A2, W2, Y )
    # Reorder so that entry k of the gradient belongs to entry k of Θ.
    return flatten( [∂W2, ∂W1, ∂b2, ∂b1] )
end;

In [None]:
# function limit_weights( new_Θ, Θ )
#     min_weight_value = 10e-4
#     for i in 1:length(Θ)
#         if (new_Θ[i] < min_weight_value) new_Θ[i] = Θ[i] end
#     end
#     return round_weights( new_Θ )
# end

In [17]:
"""
    train_LogRegNN_BFGS(weights, ITERS; max_error=0, v=true)

Train the network by calling `BFGS` once per iteration, for at most `ITERS` iterations.

Reads the globals `X`, `Y` and `HIDDEN_SIZE`. `BFGS` is not defined in this part of the
file; it is called as `BFGS(forward_loss, forward_loss_gradient, Θ, Q)` and is expected
to return the new flat parameter vector and the updated matrix `Q`.

# Arguments
- `weights`: mapping with the keys `"W1"`, `"W2"`, `"b1"`, `"b2"`, each holding the
  history of that parameter. The last entry is the current value, and the new value
  is pushed onto it after every iteration (the mapping is mutated).
- `ITERS`: maximum number of iterations.

# Keywords
- `max_error`: training stops as soon as the loss of an iteration is below this value.
- `v`: when `true`, progress, `Q` and the new weights are printed.

# Returns
`(losses, iterations)`: a `1 × ITERS` matrix of losses (entries after an early stop
stay zero) and the index of the last iteration that ran.
"""
function train_LogRegNN_BFGS( weights, ITERS; max_error=0, v=true )

    # DIM = 9
    DIM = HIDDEN_SIZE * 2 + HIDDEN_SIZE + HIDDEN_SIZE + 1
    Q = Matrix(1.0I, DIM, DIM)
    losses = zeros(1, ITERS)
    verbose = (v == true)

    # `roughen` returns (W2, W1, b2, b1). Pairing its output with `keys(weights)` would
    # depend on the iteration order of the container, which is not tied to that
    # order, so the keys are spelled out explicitly.
    param_keys = ("W2", "W1", "b2", "b1")

    for i in 1:ITERS
        if verbose println("\n==== Iteration ", i, " ====") end

        # Forward pass
        W1, W2 = weights["W1"][end], weights["W2"][end]
        b1, b2 = weights["b1"][end], weights["b2"][end]
        A1, A2 = forward( X, W1, b1, W2, b2 )
        Ŷ = A2
        losses[i] = loss_function( Y, Ŷ )

        if losses[i] < max_error
            return losses, i
        end

        # Flattening
        Θ = flatten( [W2, W1, b2, b1] )
        ∇f = forward_loss_gradient

        # Optimizing using BFGS
        new_flat, Q = BFGS( forward_loss, ∇f, Θ, Q )
        if verbose println(Q) end
        new_weights = roughen( round_weights(new_flat) )

        if verbose println( "Iteration ", i, " finished. Weights:" ) end
        for (key, new_value) in zip( param_keys, new_weights )
            if verbose println( key, " ", new_value ) end
            push!( weights[ key ], new_value )
        end

    end

    return losses, ITERS

end;

In [None]:
"""
    train_LogRegNN_LBFGS(weights, ITERS, m; max_error=0, v=true)

Train the network by calling `LBFGS` once per iteration, for at most `ITERS` iterations.

Reads the globals `X` and `Y`. `LBFGS` is not defined in this part of the file; it is
called as `LBFGS(forward_loss, forward_loss_gradient, Θ, m, δs, γs, qs)` and is expected
to return the new flat parameter vector together with the updated `δs`, `γs` and `qs`,
which start out as empty arrays and are carried from one iteration to the next.

# Arguments
- `weights`: mapping with the keys `"W1"`, `"W2"`, `"b1"`, `"b2"`, each holding the
  history of that parameter. The last entry is the current value, and the new value
  is pushed onto it after every iteration (the mapping is mutated).
- `ITERS`: maximum number of iterations.
- `m`: forwarded unchanged to `LBFGS` (presumably the history length — confirm against
  the `LBFGS` definition).

# Keywords
- `max_error`: training stops as soon as the loss of an iteration is below this value.
- `v`: when `true`, progress and the new weights are printed.

# Returns
`(losses, iterations)`: a `1 × ITERS` matrix of losses (entries after an early stop
stay zero) and the index of the last iteration that ran.
"""
function train_LogRegNN_LBFGS( weights, ITERS, m; max_error=0, v=true )

    losses = zeros(1, ITERS)
    δs, γs, qs = [], [], []
    verbose = (v == true)

    # `roughen` returns (W2, W1, b2, b1). Pairing its output with `keys(weights)` would
    # depend on the iteration order of the container, which is not tied to that
    # order, so the keys are spelled out explicitly.
    param_keys = ("W2", "W1", "b2", "b1")

    for i in 1:ITERS
        if verbose println("\n==== Iteration ", i, " ====") end

        # Forward pass
        W1, W2 = weights["W1"][end], weights["W2"][end]
        b1, b2 = weights["b1"][end], weights["b2"][end]
        A1, A2 = forward( X, W1, b1, W2, b2 )
        Ŷ = A2
        losses[i] = loss_function( Y, Ŷ )

        if losses[i] < max_error return losses, i end

        # Flattening
        Θ = flatten( [W2, W1, b2, b1] )
        ∇f = forward_loss_gradient

        # Optimizing using L-BFGS
        new_flat, δs, γs, qs = LBFGS( forward_loss, ∇f, Θ, m, δs, γs, qs )

        new_weights = roughen( round_weights(new_flat) )

        if verbose println( "Iteration ", i, " finished. Weights:" ) end
        for (key, new_value) in zip( param_keys, new_weights )
            if verbose println( key, " ", new_value ) end
            push!( weights[ key ], new_value )
        end
    end

    return losses, ITERS

end;