## Neural network based on reverse-mode (backward) automatic differentiation

### Iris dataset

Iris species and their one-hot target encodings:

    * setosa     = [1.0  0.0  0.0]
    * versicolor = [0.0  1.0  0.0]
    * virginica  = [0.0  0.0  1.0]

#### Import packages

In [1]:
include("data_preparing.jl");

#### Data preparation

In [2]:
# Build the Iris dataset with `prepare_iris` (not defined in this notebook;
# presumably provided by data_preparing.jl — TODO confirm).
iris_dataset = prepare_iris();
# Split it into two parts. The 0.8 argument is presumably the training fraction
# — TODO confirm against `split_dataset`. `train_set` is read as a global by the
# learn_* functions below.
train_set, test_set = split_dataset(iris_dataset, 0.8);

### Neural network

#### Import packages

In [3]:
using LinearAlgebra
include("automatic_differentiation.jl");

#### Design architecture

In [4]:
"""
    dense(w, b, x, activation)
    dense(w, x, activation)
    dense(w, x)

Fully connected layer. Multiply the input `x` by the weights `w`; the four-argument
form also adds the bias `b`, and the forms that take `activation` broadcast it over
the result.
"""
dense(w, b, x, activation) = activation.(w * x .+ b)
dense(w, x, activation) = activation.(w * x)
dense(w, x) = w * x

# Loss comparing the expected output `y` with the prediction `ŷ`: the difference is
# squared, reduced with a broadcast `sum`, and scaled by 1/3.
# NOTE(review): the 1/3 factor presumably matches the three iris classes — confirm
# before reusing this with a different number of outputs.
# `Constant` is not defined in this notebook (presumably it comes from
# automatic_differentiation.jl), so the dotted operators are assumed to build graph
# nodes rather than plain arrays — TODO confirm.
function mean_squared_loss(y, ŷ)
    return Constant(1.0/3.0) .* sum.((y .- ŷ) .^ Constant(2))
end

"""
    net(x, Wh, Wo, y)

Assemble the network for input `x`, hidden weights `Wh`, output weights `Wo` and
expected output `y`, and return `topological_sort` applied to the loss node.

The hidden layer is `dense(Wh, x, σ)`, the output layer is `dense(Wo, hidden, softmax)`
and the loss is `mean_squared_loss(y, prediction)`. The three intermediate nodes get
the names `x̂`, `ŷ` and `total_loss`.

`dnet` and `predict_result` index the returned sequence by fixed positions, so the
order in which the nodes are created here must not change.
"""
function net(x, Wh, Wo, y)
    hidden = dense(Wh, x, σ)
    hidden.name = "x̂"
    prediction = dense(Wo, hidden, softmax)
    prediction.name = "ŷ"
    loss = mean_squared_loss(y, prediction)
    loss.name = "total_loss"
    return topological_sort(loss)
end

net (generic function with 1 method)

In [5]:
"""
    dnet(E)

Run `forward!` and then `backward!` on `E` and return the tuple
`(E[4].gradient, E[3].gradient, E[14].output)`.

The learn_* functions unpack this as (hidden-weight gradient, output-weight gradient,
loss). The positions 4, 3 and 14 are hard-coded and therefore tied to the ordering
that `net` returns.
"""
function dnet(E)
    forward!(E)
    backward!(E)
    hidden_grad = E[4].gradient
    output_grad = E[3].gradient
    loss_value = E[14].output
    return (hidden_grad, output_grad, loss_value)
end

dnet (generic function with 1 method)

In [6]:
"""
    optimize!(Wh, Wo, dWh, dWo, α)

Take one gradient-descent step: replace `Wh.output` with `Wh.output - α * dWh` and
`Wo.output` with `Wo.output - α * dWo`. Returns the new value computed for `Wo.output`.
"""
function optimize!(
    Wh::Variable, Wo::Variable, dWh::Matrix, dWo::Matrix, α::Float64
)
    Wh.output = Wh.output - α * dWh
    stepped_Wo = Wo.output - α * dWo
    Wo.output = stepped_Wo
    return stepped_Wo
end

optimize! (generic function with 1 method)

### Training the neural network

#### Batch Gradient Descent

In [7]:
"""
    learn_BGD(x, Wh, Wo, y, epochs, α)

Train the network built by `net(x, Wh, Wo, y)` with batch gradient descent.

In each of the `epochs` passes, every row returned by `adjust_dataset(train_set)` is
fed through `dnet`, the per-sample gradients are averaged over all rows, and a single
`optimize!` step with learning rate `α` is applied. The mean loss of each epoch is
appended to the loss history.

Reads the global `train_set`. Returns the tuple `(graph, losses)`.
"""
function learn_BGD(
    x::Variable, Wh::Variable, Wo::Variable, y::Variable, epochs::Int64, α::Float64
)
    graph = net(x, Wh, Wo, y)
    history = Any[]
    for _ in 1:epochs
        samples, targets = adjust_dataset(train_set)
        n_samples = size(samples, 1)
        grad_hidden = zeros(size(Wh.output, 1), size(Wh.output, 2))
        grad_output = zeros(size(Wo.output, 1), size(Wo.output, 2))
        loss_sum = 0.0
        for row in 1:n_samples
            # One sample per row: load it into the graph's input and target nodes.
            x.output = samples[row, :]
            y.output = targets[row, :]
            g_hidden, g_output, sample_loss = dnet(graph)
            grad_hidden = grad_hidden + g_hidden
            grad_output = grad_output + g_output
            loss_sum = loss_sum + sample_loss
        end
        # Average over the whole set before the single update of this epoch.
        grad_hidden ./= n_samples
        grad_output ./= n_samples
        append!(history, loss_sum / n_samples)
        optimize!(Wh, Wo, grad_hidden, grad_output, α)
    end
    return (graph, history)
end

learn_BGD (generic function with 1 method)

#### Mini-batch Gradient Descent

In [8]:
"""
    learn_MBGD(x, Wh, Wo, y, epochs, batch_size, α)

Train the network built by `net(x, Wh, Wo, y)` with mini-batch gradient descent.

In each of the `epochs` passes, the rows returned by `adjust_dataset(train_set)` are
processed in order. Gradients from `dnet` are accumulated; after every `batch_size`
rows they are averaged and applied with `optimize!` (learning rate `α`), and the mean
batch loss is appended to the loss history. A trailing partial batch, if any, is
averaged over its own size and applied the same way.

Reads the global `train_set`. Returns the tuple `(E, total_losses)`.

# Throws
- `ArgumentError` if `batch_size` is not positive.
"""
function learn_MBGD(
    x::Variable,
    Wh::Variable,
    Wo::Variable,
    y::Variable,
    epochs::Int64,
    batch_size::Int64,
    α::Float64,
)
    # A zero batch size would fail later with a bare DivideError, and a negative one
    # would silently flip the sign of every update, so reject both up front.
    batch_size > 0 ||
        throw(ArgumentError("batch_size must be positive, got $batch_size"))
    E = net(x, Wh, Wo, y)
    total_losses = Vector()
    for _ in 1:epochs
        input_data, expected_values = adjust_dataset(train_set)
        n_samples = size(input_data, 1)
        # Size of the trailing partial batch, taken from the data actually iterated
        # (0 when the sample count is a multiple of batch_size).
        last_batch = n_samples % batch_size
        dWh = zeros(size(Wh.output, 1), size(Wh.output, 2))
        dWo = zeros(size(Wo.output, 1), size(Wo.output, 2))
        batch_loss = 0.0
        for j in 1:n_samples
            x.output = input_data[j, :]
            y.output = expected_values[j, :]
            tmp_dWh, tmp_dWo, tmp_loss = dnet(E)
            dWh += tmp_dWh
            dWo += tmp_dWo
            batch_loss += tmp_loss
            if j % batch_size == 0
                # A full batch is complete: record its mean loss, apply the averaged
                # gradient and start accumulating from zero again.
                append!(total_losses, batch_loss / batch_size)
                batch_loss = 0.0
                dWh ./= batch_size
                dWo ./= batch_size
                optimize!(Wh, Wo, dWh, dWo, α)
                dWh = zeros(size(Wh.output, 1), size(Wh.output, 2))
                dWo = zeros(size(Wo.output, 1), size(Wo.output, 2))
            end
        end
        if last_batch != 0
            # Only step when samples are left over; with nothing left the
            # accumulators are all zeros and the update would be a wasted no-op.
            append!(total_losses, batch_loss / last_batch)
            dWh ./= last_batch
            dWo ./= last_batch
            optimize!(Wh, Wo, dWh, dWo, α)
        end
    end
    return (E, total_losses)
end

learn_MBGD (generic function with 1 method)

#### Stochastic Gradient Descent

In [9]:
"""
    learn_SGD(x, Wh, Wo, y, epochs, α)

Train the network built by `net(x, Wh, Wo, y)` with stochastic gradient descent.

In each of the `epochs` passes, every row returned by `adjust_dataset(train_set)` is
fed through `dnet` and the resulting gradients are applied immediately with
`optimize!` (learning rate `α`). The mean loss of each epoch is appended to the loss
history.

Reads the global `train_set`. Returns the tuple `(graph, losses)`.
"""
function learn_SGD(
    x::Variable, Wh::Variable, Wo::Variable, y::Variable, epochs::Int64, α::Float64
)
    graph = net(x, Wh, Wo, y)
    history = Any[]
    for _ in 1:epochs
        samples, targets = adjust_dataset(train_set)
        n_samples = size(samples, 1)
        loss_sum = 0.0
        for row in 1:n_samples
            x.output = samples[row, :]
            y.output = targets[row, :]
            grad_hidden, grad_output, sample_loss = dnet(graph)
            loss_sum = loss_sum + sample_loss
            # Update straight away: one optimizer step per sample.
            optimize!(Wh, Wo, grad_hidden, grad_output, α)
        end
        append!(history, loss_sum / n_samples)
    end
    return (graph, history)
end

learn_SGD (generic function with 1 method)

### Prediction

In [10]:
"""
    predict_result(E, input)

Store `input` in `E[5].output`, run `forward!(E)` and return `E[9].output`.

NOTE(review): positions 5 and 9 are presumably the input node and the prediction node
in the ordering returned by `net` — confirm if the network structure changes.
"""
function predict_result(E, input)
    source_node = E[5]
    source_node.output = input
    forward!(E)
    return E[9].output
end

predict_result (generic function with 1 method)

### Model verification

In [11]:
"""
    count_accuracy(E, input_data, expected_values)

Return the percentage (0.0 to 100.0) of rows of `input_data` for which the
highest-scoring entry of `predict_result(E, row)` sits at the same position as the
highest entry of the matching row of `expected_values`.

If the two datasets have different row counts, or there are no rows, the string
`"Incorrect size of dataset!"` is returned instead of a number; this is kept so that
existing callers that check for it continue to work.
"""
function count_accuracy(E, input_data, expected_values)
    n_samples = size(input_data, 1)
    if n_samples != size(expected_values, 1) || n_samples < 1
        return "Incorrect size of dataset!"
    end
    positive = 0
    for i in 1:n_samples
        p = predict_result(E, input_data[i, :])
        # Compare each prediction as soon as it is produced instead of stacking all
        # of them into a matrix first. `vec` makes argmax return a linear index even
        # when the prediction is a single-column matrix, and no class count is fixed.
        if argmax(vec(p)) == argmax(expected_values[i, :])
            positive += 1
        end
    end
    return 100.0 * positive / n_samples
end

count_accuracy (generic function with 1 method)