In [1]:
abstract type Layer end

The most basic type of layer in artificial neural networks is the densely connected layer, in which every neuron in one layer is connected to every neuron in the next layer. Since this means we'll be taking the dot product of every input vector with every weight vector, this operation is indistinguishable from a matrix multiplication of the input as a row matrix and the weights as a column matrix.

In [2]:
mutable struct DenseLayer <: Layer
    neurons::Array
    bias::Array
    
    ϕ::Function
    ∇ϕ::Function
    
    input::Array
    net::Array
    output::Array
      
    function DenseLayer(input_dim::Int, output_dim::Int, ϕ::Function)
        neurons = randn(input_dim, output_dim)
        bias = randn(1, output_dim)
        return new(neurons, bias, ϕ, gradient(ϕ))
    end
    
    function DenseLayer(neurons::Array, bias::Vector, ϕ::Function)
        return new(neurons, bias, ϕ, gradient(ϕ))
    end
end

In [3]:
mutable struct DenseOutputLayer <: Layer
    neurons::Array
    bias::Array
    
    ϕ::Function
    ∇ϕ::Function
    
    input::Array
    net::Array
    output::Array
      
    function DenseOutputLayer(input_dim::Int, output_dim::Int, ϕ::Function)
        neurons = randn(input_dim, output_dim)
        bias = zeros(1, output_dim)
        return new(neurons, bias, ϕ, gradient(ϕ))
    end
end

In [4]:
abstract type NeuralNetwork end

In [5]:
mutable struct FeedForwardNeuralNetwork <: NeuralNetwork
    layers::Array{Layer}
    η::Float64
    
    loss::Function
    ∇loss::Function
    
    function FeedForwardNeuralNetwork(input_dim::Int, hidden_dims::Vector, output_dim::Int, ϕ::Vector{Function}, loss::Function, η=0.01)
        layers = []
        
        push!(layers, DenseLayer(input_dim, hidden_dims[1], ϕ[1]))
        
        for i in 1:length(hidden_dims)-1
            push!(layers, DenseLayer(hidden_dims[i], hidden_dims[i+1], ϕ[i+1]))
        end
        
        push!(layers, DenseOutputLayer(hidden_dims[end], output_dim, ϕ[end]))
        
        return new(layers, η, loss, gradient(loss))
    end
end;

In [6]:
function predict(model::FeedForwardNeuralNetwork, data)
    for layer in model.layers
        data = data * layer.neurons .+ layer.bias
        data = layer.ϕ.(data)
    end
    return data[:,1]
end;

In [7]:
function train(model::FeedForwardNeuralNetwork, data, target, epochs=1, clear=true)
    for i in 1:epochs
        forwardpass(model, data)
        backprop(model, target)
    end
    if clear
        for i in 1:length(model.layers)
            model.layers[i].input = []
            model.layers[i].net = []
            model.layers[i].output = []
        end
    end
end;

In [8]:
function forwardpass(model::FeedForwardNeuralNetwork, data)
    for layer in model.layers 
        layer.input = data
        layer.net = data * layer.neurons .+ layer.bias
        layer.output = layer.ϕ.(layer.net)
        data = layer.output
    end
    return data
end;        

In [9]:
function backprop(model::FeedForwardNeuralNetwork, target)    
    # Backpropagate error by iteratively updating error delta terms δ
    # Work backwards from output layer
    layer = model.layers[end]
    # w:   weights
    # o:   output
    # net: output before activation
    # E:   error
    # Calculate partial derivative of error with respect to each weight
    # ∂E/∂wᵢⱼ = ∂E/∂oⱼ * ∂oⱼ/∂netⱼ * ∂netⱼ/∂wᵢⱼ
    # Partial derivative of loss
    ∂E_∂o = model.∇loss.(layer.output, target)
    # Partial derivative of activated output
    ∂o_∂net = layer.∇ϕ.(layer.net)
    # δ=∂E/∂net
    # Error with respect to net -- the error terms
    δ = ∂E_∂o .* ∂o_∂net
    # ∂net/∂w is equal to the previous layer's output (https://bit.ly/backproperror)
    ∂net_∂w = layer.input'
    # Calculate delta terms for the neurons and adjust by the learning rate
    η = model.η
    Δw = -η * ∂net_∂w * δ
    # Update the weights of the output layer
    layer.neurons += Δw
    # Output layer has no bias, so no need to update it
    # Now do the rest of the layers in reverse order
    for L in length(model.layers)-1:-1:1
        layer = model.layers[L]
        # Need to calculate weight adjustment, Δwᴸ
        # Δwᴸ = -η * (oᴸ⁻¹)ᵀ * δᴸ
        # Make sure to save error terms δᴸ for backprop
        # δᴸ = δᴸ⁺¹ * (wᴸ⁺¹)ᵀ * ∇ϕᴸ(oᴸ⁻¹wᴸ)
        # Term oᴸ⁻¹wᴸ is layer L's unactivated output and stored as netᴸ
        # All together
        # Δwᴸ = -η * (oᴸ⁻¹)ᵀ * δᴸ⁺¹ * (wᴸ⁺¹)ᵀ * ∇ϕᴸ(oᴸ⁻¹wᴸ)
        ∂E_∂o = δ * model.layers[L+1].neurons'
        ∂o_∂net = layer.∇ϕ.(layer.net)
        δ = ∂E_∂o .* ∂o_∂net
        ∂net_∂w = layer.input' 
        Δw = -η * ∂net_∂w * δ
        # Update the neurons
        layer.neurons += Δw
        # Update the bias by adding scaled error terms
        layer.bias = layer.bias .+ (-η * δ)
    end   
end;

In [10]:
function fit(model::FeedForwardNeuralNetwork, data::Array{T}, target::Vector{T}, epochs::Int, verbose=false) where T<:Real    
    if verbose
        prediction = predict(model, data)
        @show loss(prediction, target)
        print("Training for ", epochs, " epochs.")
        @time train(model, data, target, epochs)
        prediction = predict(model, data)
        @show loss(prediction, target)
    else
        train(model, data, target, epochs)
    end
end;

In [11]:
# Activations (ϕ)
function ReLU(x::T)::T where T<:Real 
    return max(x, 0)
end

function LeakyReLU(x::T)::T where T<:Real 
    return max(x, 0.01x)
end

function sigmoid(x::T)::T where T<:Real 
    return 1.0 / (1 + exp(-x))
end

# Derivatives
function ∇ReLU(x::T)::T where T<:Real 
    return x > 0
end

function ∇LeakyReLU(x::T)::T where T<:Real 
    return x < 0 ? 0.01 : 1.0
end

function ∇sigmoid(x::T)::T where T<:Real
    y = sigmoid(x)
    return y * (1 - y)
end;

In [12]:
# Error Calculations

# Mean Squared Error
function mse(x::T, target::T) where T<:Real
    return .5(target-x)^2
end

function mse(xs::Vector{T}, target::T) where T<:Real 
    err(x) = target - x
    return sum(err.(xs).^2)/2*length(xs)
end

function mse(xs::Vector{T}, target::Vector{T}) where T<:Real
    sum((xs .- target).^2)/2*length(xs)
end

# Derivatives
function ∇mse(x::T, target::T) where T<:Real
    return x - target
end

function ∇mse(xs::Vector{T}, target::T) where T<:Real
    return xs .- target
end

function ∇mse(xs::Vector{T}, target::Vector{T}) where T<:Real
    return xs .- target
end;

In [13]:
function gradient(f::Function)
    if f == ReLU
        ∇f = ∇ReLU
    elseif f == LeakyReLU
        ∇f = ∇LeakyReLU
    elseif f == sigmoid
        ∇f = ∇sigmoid
    elseif f == mse
        ∇f = ∇mse
    end
    
    return ∇f
end;

In [14]:
# Can we overfit a disporportionately large model on a random matrix?
inputsize = 8
hidden_layers = [128,128,64,64,32,32,16,16,8,4,2]
output_size = 1
activations = vcat([LeakyReLU],repeat([sigmoid], length(hidden_layers)+1))
loss=mse;

m = FeedForwardNeuralNetwork(inputsize, hidden_layers, output_size, activations, loss);

samples = 8
v = randn(samples, inputsize)
t = rand([0.,1.], samples)

fit(m, v, t, 10000, true);

loss(prediction, target) = 12.261282604474928
Training for 10000 epochs.  6.079542 seconds (4.78 M allocations: 7.936 GiB, 13.18% gc time)
loss(prediction, target) = 0.13156896802248919
