# Neural Network from Scratch
No libraries. 😎

WIP.

## Dense Layers
The most basic type of layer in artificial neural networks is the fully connected, or dense, layer in which every neuron in one layer is connected to every neuron in the next layer. Since this means we'll be taking the dot product of every input vector with every weight vector, this operation is indistinguishable from a matrix multiplication of the input as a row matrix and the weights as a column matrix.

In [1]:
abstract type Layer end

In [2]:
mutable struct DenseLayer <: Layer
    # In a dense layer, the neurons and biases are treated as column matrices
    neurons::Array
    bias::Array
    
    # Activation function for the layer and its first derivative
    ϕ::Function
    ∇ϕ::Function
    
    # Batch states for backprop
    input::Array   # Output of the previous layer
    net::Array     # input * neurons + bias
    output::Array  # activation(net)
      
    function DenseLayer(input_dim::Int, output_dim::Int, ϕ::Function) where T<:Real
        neurons = randn(input_dim, output_dim)
        bias = randn(1, output_dim)
        return new(neurons, bias, ϕ, gradient(ϕ))
    end
    
    function DenseLayer(neurons::Array{T}, bias::Array{T}, ϕ::Function) where T<:Real
        return new(neurons, bias, ϕ, gradient(ϕ))
    end

end;

### Dense Output Layers
The only real difference between a dense hidden layer and a dense output layer is that the output layer's neurons have a zero bias. I might consolidate `DenseHiddenLayer` and `DenseOutputLayer` in the future.

In [210]:
mutable struct DenseOutputLayer <: DenseLayer
    # In a dense layer, the neurons and biases are treated as column matrices
    neurons::Array
    # Even though the output layer has no bias, I'm including it to simplify forwardpass!()
    bias::Array
    
    # Activation function for the layer and its first derivative
    ϕ::Function
    ∇ϕ::Function
    
    # Batch states for backprop
    input::Array   # Output of the previous layer
    net::Array     # input * neurons (+ bias, but bias is 0)
    output::Array  # activation(net)
      
    function DenseOutputLayer(input_dim::Int, output_dim::Int, ϕ::Function) where T<:Real
        neurons = randn(input_dim, output_dim)
        bias = zeros(1, output_dim)
        return new(neurons, bias, ϕ, gradient(ϕ))
    end
    
    function DenseOutputLayer(neurons::Array{T}, bias::Array{T}, ϕ::Function) where T<:Real
        return new(neurons, bias, ϕ, gradient(ϕ))
    end

end;

In [1]:
function batchnormalize(x::Array{T,N}, γ::T=1., β::T=0.)::Array{T,N} where T<:Real where N
    # μ = mean of x
    μ = sum(x)/length(x)
    # σ² = variance of x
    σ² = sum((x .- μ).^2)
    # Normalize x
    x̂ = (x .- μ) ./ sqrt(σ²)
    # Scale and shift
    y = x̂ .* γ .+ β
    return y
end;

In [6]:
function batchnormalize!(layer::T, γ=1., β=0.) where T<:DenseLayer
    layer.output = batchnormalize(layer.output, γ, β)
end;

In [7]:
abstract type NeuralNetwork end
abstract type FeedForwardNeuralNetwork <: NeuralNetwork end

In [216]:
mutable struct FullyConnectedNeuralNetwork <: FeedForwardNeuralNetwork
    layers::Array{Layer}
    η::Float64
    
    loss::Function
    ∇loss::Function
    
    function FullyConnectedNeuralNetwork(input_dim::Int, hidden_dims::Vector{Int}, output_dim::Int, ϕ::Vector{Function}, loss::Function, η=0.01)
        layers = []
        
        push!(layers, DenseHiddenLayer(input_dim, hidden_dims[1], ϕ[1]))
        
        for i in 1:length(hidden_dims)-1
            push!(layers, DenseHiddenLayer(hidden_dims[i], hidden_dims[i+1], ϕ[i+1]))
        end
        
        push!(layers, DenseOutputLayer(hidden_dims[end], output_dim, ϕ[end]))
        
        return new(layers, η, loss, gradient(loss))
    end
    
    function FullyConnectedNeuralNetwork(layers::Vector{T}, loss::Function, η=0.01) where T<:Layer
        return new(layers, η, loss, gradient(loss))
    end
end;

In [9]:
function predict(model::FullyConnectedNeuralNetwork, data)
    for layer in model.layers
        data = data * layer.neurons .+ layer.bias
        data = layer.ϕ.(data)
    end
    return data[:,1]
end;

In [10]:
function train!(model::FullyConnectedNeuralNetwork, data, target, epochs=1, clear=true)
    for i in 1:epochs
        forwardpass!(model, data)
        backprop!(model, target)
    end
    if clear
        for i in 1:length(model.layers)
            model.layers[i].input = []
            model.layers[i].net = []
            model.layers[i].output = []
        end
    end
end;

In [486]:
function forwardpass!(model::FullyConnectedNeuralNetwork, data::Array{T}) where T<:Real
    for layer in model.layers 
        layer.input = data
        layer.net = data * layer.neurons .+ layer.bias
        layer.output = layer.ϕ.(layer.net)
        data = layer.output
    end
end;        

In [12]:
function backprop!(model::FullyConnectedNeuralNetwork, target::Array{T}) where T<:Real   
    # Backpropagate error by iteratively updating error delta terms δ
    # Work backwards from output layer
    layer = model.layers[end]
    # w:   weights
    # o:   output
    # net: output before activation
    # E:   error
    # Calculate partial derivative of error with respect to each weight
    # ∂E╱∂wᵢⱼ = ∂E╱∂oⱼ * ∂oⱼ╱∂netⱼ * ∂netⱼ╱∂wᵢⱼ
    # Partial derivative of loss
    𝜕𝐸╱𝜕𝑜 = model.∇loss.(layer.output, target)
    # Partial derivative of activated output
    𝜕𝑜╱𝜕𝑛𝑒𝑡 = layer.∇ϕ.(layer.net)
    # δ=∂E╱∂net
    # Error with respect to net -- the error terms
    𝛿 = 𝜕𝐸╱𝜕𝑜 .* 𝜕𝑜╱𝜕𝑛𝑒𝑡
    # ∂net╱∂w is equal to the transpose of the previous layer's output (https://bit.ly/backproperror)
    𝜕𝑛𝑒𝑡╱𝜕𝑤 = layer.input'
    # Calculate delta terms for the neurons and adjust by the learning rate
    𝜂 = model.η
    𝛥𝑤 = -𝜂 * 𝜕𝑛𝑒𝑡╱𝜕𝑤 * 𝛿
    # Update the weights of the output layer
    layer.neurons += 𝛥𝑤
    # Output layer has no bias, so no need to update it
    # Now do the rest of the layers in reverse order
    for L in length(model.layers)-1:-1:1
        layer = model.layers[L]
        # Need to calculate weight adjustment, Δwᴸ
        # Δwᴸ = -η * (oᴸ⁻¹)ᵀ * δᴸ
        # Make sure to save error terms δᴸ for backprop
        # δᴸ = δᴸ⁺¹ * (wᴸ⁺¹)ᵀ * ∇ϕᴸ(oᴸ⁻¹wᴸ)
        # Term oᴸ⁻¹wᴸ is layer L's unactivated output and stored as netᴸ
        # All together
        # Δwᴸ = -η * (oᴸ⁻¹)ᵀ * δᴸ⁺¹ * (wᴸ⁺¹)ᵀ * ∇ϕᴸ(oᴸ⁻¹wᴸ)
        𝜕𝐸╱𝜕𝑜 = 𝛿 * model.layers[L+1].neurons'
        𝜕𝑜╱𝜕𝑛𝑒𝑡 = layer.∇ϕ.(layer.net)
        𝛿 = 𝜕𝐸╱𝜕𝑜 .* 𝜕𝑜╱𝜕𝑛𝑒𝑡
        𝜕𝑛𝑒𝑡╱𝜕𝑤 = layer.input' 
        𝛥𝑤 = -𝜂 * 𝜕𝑛𝑒𝑡╱𝜕𝑤 * 𝛿
        # Update the neurons
        layer.neurons += 𝛥𝑤
        # Update the bias by adding scaled error terms
        layer.bias = layer.bias .+ (-𝜂 * 𝛿)
    end   
end; 

In [13]:
function fit!(model::FullyConnectedNeuralNetwork, data::Array{T}, target::Vector{T}, epochs::Int, verbose=false) where T<:Real    
    if verbose
        prediction = predict(model, data)
        @show loss(prediction, target)
        print("Training for ", epochs, " epochs.")
        @time train!(model, data, target, epochs)
        prediction = predict(model, data)
        @show loss(prediction, target)
    else
        train!(model, data, target, epochs)
    end
end;

In [None]:
abstract type ActivationFunction <: Function end

In [14]:
# Activations (ϕ)
function ReLU(x::T)::T where T<:Real 
    return max(x, 0)
end

function LeakyReLU(x::T)::T where T<:Real 
    return max(x, 0.01x)
end

function sigmoid(x::T)::T where T<:Real 
    return 1.0 / (1 + exp(-x))
end

# Derivatives
function ∇ReLU(x::T)::T where T<:Real 
    return x > 0
end

function ∇LeakyReLU(x::T)::T where T<:Real 
    return x < 0 ? 0.01 : 1.0
end

function ∇sigmoid(x::T)::T where T<:Real
    y = sigmoid(x)
    return y * (1 - y)
end

∇sigmoid (generic function with 1 method)

In [487]:
# Error Calculations

# Mean Squared Error
function mse(x::T, target::T)::T where T<:Real
    return .5(target-x)^2
end

function mse(xs::Vector{T}, target::T)::Vector{T} where T<:Real 
    err(x) = target - x
    return sum(err.(xs).^2)/2*length(xs)
end

function mse(xs::Vector{T}, target::Vector{T})::Vector{T} where T<:Real
    sum((xs .- target).^2)/2*length(xs)
end

# Derivatives
function ∇mse(x::T, target::T)::T where T<:Real
    return x - target
end

function ∇mse(xs::Vector{T}, target::T)::Vector{T} where T<:Real
    return xs .- target
end

function ∇mse(xs::Vector{T}, target::Vector{T})::Vector{T} where T<:Real
    return xs .- target
end;

In [16]:
function gradient(f::Function)
    if f == ReLU
        ∇f = ∇ReLU
    elseif f == LeakyReLU
        ∇f = ∇LeakyReLU
    elseif f == sigmoid
        ∇f = ∇sigmoid
    elseif f == mse
        ∇f = ∇mse
    elseif f == sin
        ∇f = cos
    end
    
    return ∇f
end;

In [367]:
input_size = 8

Lᵢ = DenseHiddenLayer(input_size, 8, LeakyReLU)
L₂ = DenseHiddenLayer(8, 4, LeakyReLU)
L₃ = DenseHiddenLayer(4, 4, LeakyReLU)
Lₒ = DenseOutputLayer(4, 1, LeakyReLU)

Layers = [Lᵢ, L₂, L₃, Lₒ]
m = FullyConnectedNeuralNetwork(Layers, mse);

In [368]:
samples=100
data = randn(samples,input_size)
target = randn(samples)

100-element Array{Float64,1}:
 -0.8818400733415471
 -1.0115394091213126
 -2.8013374582850488
 -1.1338479624557778
 -1.5543542394313208
 -0.5008845891822699
  0.621397536432255
 -0.40015056111015096
 -0.08519886785153656
 -0.8197982845182499
 -1.2818157254048503
 -1.4137913712385337
  0.15935307526706213
  ⋮
  0.8088045246236898
 -0.36022918508522883
  1.8467087091264482
  0.621071543330813
 -0.8669394102684208
 -1.0241410996668574
  4.323153859818787
 -0.0365663396605443
 -0.3465276766904928
 -0.10806592803819598
  0.3490531461385517
  1.9548198201993763

In [483]:
fit!(m, data, target, 1000, true)

loss(prediction, target) = 4.008682854974479e125
Training for 1000 epochs.  0.052626 seconds (96.01 k allocations: 127.961 MiB, 13.65% gc time)
loss(prediction, target) = NaN


NaN

In [209]:
# Can we overfit a disporportionately large model on a random matrix?
inputsize = 8
hidden_layers = [4096,4096]
output_size = 1
activations = Function[sigmoid, sigmoid, LeakyReLU, LeakyReLU]
loss=mse;

m = FullyConnectedNeuralNetwork(inputsize, hidden_layers, output_size, activations, loss);

samples = 8
v = randn(samples, inputsize)
t = rand(samples)

fit!(m, v, t, 100, true);

loss(prediction, target) = 9212.429768515163
Training for 100 epochs. 15.483893 seconds (9.51 k allocations: 25.495 GiB, 14.38% gc time)
loss(prediction, target) = 178.1208186822642


In [198]:
randn(5)

5-element Array{Float64,1}:
 3.3787401227584963
 0.29291124116189465
 0.2788540073004317
 1.9853819220928786
 0.793288264236646