# Neural Nets from Scratch in Julia

## Lesson 16: Softmax Activation / Crossentropy Loss

* In this video we'll implement the softmax activation function and crossentropy loss function, combined into one operation.
* [Documentation site here](https://mikesaint-antoine.github.io/SimpleGrad.jl)
* [Github repo here](https://github.com/mikesaint-antoine/SimpleGrad.jl)

In [1]:
## code so far


# Record of how a node in the computation graph was produced:
#   op   -- the function that was applied (e.g. +, *, tanh)
#   args -- the operands it was applied to, stored as a tuple
# Both fields are type parameters, so backprop! can dispatch on the function type.
struct Operation{F, A}
    op::F
    args::A
end


####################################################################################
###### Values

# Scalar node of the computation graph.
#   data -- the numeric value
#   grad -- accumulated derivative of the final output with respect to this node
#   op   -- the Operation that produced this node, or `nothing` for a leaf
mutable struct Value{Op} <: Number
    data::Float64
    grad::Float64
    op::Op
end

# Leaf constructor: wrap any plain number, with zero gradient and no producing operation.
function Value(x::Number)
    return Value(Float64(x), 0.0, nothing)
end


import Base.show
# Compact display: only the stored number is shown, e.g. `Value(2.0)`.
function show(io::IO, value::Value)
    print(io, "Value(")
    print(io, value.data)
    print(io, ")")
end


import Base.==
# Two Values are equal only when they are the very same object (identity, not numeric equality).
==(a::Value, b::Value) = a === b


import Base.+
# Forward pass of addition: the result remembers both operands so backprop! can reach them.
function +(a::Value, b::Value)
    total = a.data + b.data
    return Value(total, 0.0, Operation(+, (a, b))) # Value(data, grad, op)
end



# Leaf Values were not produced by an operation, so there is nothing to propagate.
function backprop!(val::Value{Nothing})
    return nothing
end


# Backward step for val = a + b.
# d(a+b)/da = d(a+b)/db = 1, so each operand simply receives val.grad.
function backprop!(val::Value{Operation{F, A}}) where {F<:typeof(+), A}
    lhs, rhs = val.op.args
    lhs.grad += val.grad
    rhs.grad += val.grad
end




# Run a full backward pass starting from `a`.
#
# Builds a topological ordering of every Value reachable from `a` through the
# recorded operations, sets `a.grad` to 1 (da/da = 1), then calls `backprop!`
# on each node from the output back towards the leaves.
# Gradients are accumulated with `+=`, so existing `grad` values are not reset here.
# Returns `nothing`.
function backward(a::Value)

    topo = Value[]
    # Identity-keyed set of nodes already seen; membership tests are O(1)
    # instead of scanning a vector for every node.
    visited = IdDict{Value, Nothing}()

    # Depth-first post-order walk: a node is appended only after all of its operands.
    function visit!(v::Value)
        haskey(visited, v) && return nothing
        visited[v] = nothing

        if v.op !== nothing
            for operand in v.op.args
                # operands that are not Values are not part of the graph
                if operand isa Value
                    visit!(operand)
                end
            end
        end

        push!(topo, v)
        return nothing
    end

    visit!(a)

    # da/da = 1
    a.grad = 1

    for node in reverse(topo)
        backprop!(node)
    end

    return nothing
end


# Mixing a Value with any other Number promotes the pair to Value,
# which lets expressions such as `a * -1` or `2 + a` reach the Value methods.
function Base.promote_rule(::Type{<:Value}, ::Type{T}) where {T<:Number}
    return Value
end




import Base.*
# Forward pass of multiplication: the result remembers both operands for backprop!.
function *(a::Value, b::Value)
    product = a.data * b.data
    return Value(product, 0.0, Operation(*, (a, b))) # Value(data, grad, op)
end



# Backward step for val = a * b.
# d(a*b)/da = b and d(a*b)/db = a, each scaled by the incoming val.grad.
function backprop!(val::Value{Operation{F, A}}) where {F<:typeof(*), A}
    lhs, rhs = val.op.args
    lhs.grad += rhs.data * val.grad
    rhs.grad += lhs.data * val.grad
end



import Base.-

# negation
# Unary minus, expressed as multiplication by -1 so it reuses the `*` backward rule.
-(a::Value) = a * (-1)

# subtraction
# Binary minus, expressed as a + (-b) so it reuses the `+` and `*` backward rules.
function -(a::Value, b::Value)
    negated = -b
    return a + negated
end


import Base.inv
# Forward pass of the reciprocal 1/a; the result remembers `a` for backprop!.
function inv(a::Value)
    reciprocal = 1.0 / a.data
    return Value(reciprocal, 0.0, Operation(inv, (a,))) # Value(data, grad, op)
end


# Backward step for val = inv(a).
# d(1/a)/da = -1/a^2, hence the subtraction from a.grad.
function backprop!(val::Value{Operation{F, A}}) where {F<:typeof(inv), A}
    operand = val.op.args[1]
    operand.grad -= (1.0 / operand.data^2) * val.grad
end


import Base./
# Division, expressed as a * b^(-1) so it reuses the `*` and `inv` backward rules.
function /(a::Value, b::Value)
    reciprocal = inv(b)
    return a * reciprocal
end


import Base.tanh
# Forward pass of the hyperbolic tangent; the result remembers `a` for backprop!.
#
# The value is computed with Base's Float64 `tanh` rather than the explicit
# (exp(2x) - 1) / (exp(2x) + 1) formula: for large positive x, exp(2x) overflows
# to Inf and that formula evaluates to Inf / Inf = NaN, whereas tanh(x) saturates at 1.0.
function tanh(a::Value)

    out = tanh(a.data)
    result = Value(out, 0.0, Operation(tanh, (a,))) # Value(data, grad, op)
    return result # this should be a Value

end




# Backward step for val = tanh(a).
# d(tanh a)/da = 1 - tanh(a)^2, and tanh(a) is already stored in val.data.
function backprop!(val::Value{Operation{F, A}}) where {F<:typeof(tanh), A}
    operand = val.op.args[1]
    operand.grad += (1 - val.data^2) * val.grad
end

####################################################################################
###### Tensors

# Matrix-valued node of the computation graph.
#   data -- the 2D array of values
#   grad -- 2D array of accumulated gradients
#   op   -- the Operation that produced this node, or `nothing` for a leaf
mutable struct Tensor{Op} <: AbstractArray{Float64, 2}
    data::Array{Float64,2}
    grad::Array{Float64,2}
    op::Op
end

# 2D leaf constructor: wrap a matrix with an all-zero gradient of the same size.
function Tensor(x::Array{Float64,2})
    return Tensor(x, zeros(Float64, size(x)), nothing) # Tensor(data, grad, op)
end

# 1D constructor
# 1D leaf constructor: turn a vector into a 2D Tensor.
#   column_vector=false (default) -> row vector, size (1, N)
#   column_vector=true            -> column vector, size (N, 1)
function Tensor(x::Array{Float64, 1}; column_vector::Bool=false)
    n = length(x)
    shape = column_vector ? (n, 1) : (1, n)
    data_2D = reshape(x, shape)
    return Tensor(data_2D, zeros(Float64, shape), nothing) # Tensor(data, grad, op)
end


import Base.show
# Compact display: only the data matrix is shown, wrapped in `Tensor(...)`.
function show(io::IO, tensor::Tensor)
    print(io, "Tensor(")
    print(io, tensor.data)
    print(io, ")")
end

# Leaf Tensors were not produced by an operation, so there is nothing to propagate.
function backprop!(tensor::Tensor{Nothing})
    return nothing
end


import Base.==
# Two Tensors are equal only when they are the very same object (identity, not element-wise equality).
==(a::Tensor, b::Tensor) = a === b


# Array interface: size and element access are forwarded to the wrapped data matrix.
function Base.size(x::Tensor)
    return size(x.data)
end

function Base.getindex(x::Tensor, i...)
    return getindex(x.data, i...)
end

function Base.setindex!(x::Tensor, v, i...)
    return setindex!(x.data, v, i...)
end


import Base.*
# Forward pass of matrix multiplication; the result remembers both operands for backprop!.
function *(a::Tensor, b::Tensor)
    product = a.data * b.data
    return Tensor(product, zeros(Float64, size(product)), Operation(*, (a, b))) # Tensor(data, grad, op)
end



# Backward step for tensor = a * b (matrix product).
#   a.grad receives tensor.grad * b'
#   b.grad receives a' * tensor.grad
function backprop!(tensor::Tensor{Operation{F, A}}) where {F<:typeof(*), A}
    lhs, rhs = tensor.op.args
    lhs.grad += tensor.grad * transpose(rhs.data)
    rhs.grad += transpose(lhs.data) * tensor.grad
end


# Run a full backward pass starting from `a`.
#
# Builds a topological ordering of every Tensor reachable from `a` through the
# recorded operations, fills `a.grad` with ones (da/da = 1), then calls
# `backprop!` on each node from the output back towards the leaves.
# Gradients are accumulated with `+=`, so existing `grad` values are not reset here.
# Returns `nothing`.
function backward(a::Tensor)

    topo = Tensor[]
    # Identity-keyed set of nodes already seen; membership tests are O(1)
    # instead of scanning a vector for every node.
    visited = IdDict{Tensor, Nothing}()

    # Depth-first post-order walk: a node is appended only after all of its operands.
    function visit!(v::Tensor)
        haskey(visited, v) && return nothing
        visited[v] = nothing

        if v.op !== nothing
            for operand in v.op.args
                # operands that are not Tensors are not part of the graph
                if operand isa Tensor
                    visit!(operand)
                end
            end
        end

        push!(topo, v)
        return nothing
    end

    visit!(a)

    # da/da = 1
    a.grad .= 1

    for node in reverse(topo)
        backprop!(node)
    end

    return nothing
end


import Base.+
# Forward pass of addition. The dotted `.+` broadcasts, so a row vector (e.g. a bias)
# can be added to every row of a matrix.
function +(a::Tensor, b::Tensor)
    total = a.data .+ b.data
    return Tensor(total, zeros(Float64, size(total)), Operation(+, (a, b))) # Tensor(data, grad, op)
end



# Undo broadcasting on a gradient: sum `grad` over every dimension in which the
# operand had length 1 but the result did not, so the output has size `target`.
function _unbroadcast_grad(grad::AbstractMatrix, target::Dims{2})
    reduced = grad
    for d in 1:2
        if target[d] == 1 && size(reduced, d) != 1
            reduced = sum(reduced; dims=d)
        end
    end
    return reduced
end

# Backward step for tensor = a + b.
#
# Each operand receives tensor.grad. When an operand was broadcast in the forward
# pass (row vector, column vector or 1x1), the gradient is first summed over the
# broadcast dimension(s) so that it matches the operand's shape.
function backprop!(tensor::Tensor{Operation{FunType, ArgTypes}}) where {FunType<:typeof(+), ArgTypes}

    for operand in tensor.op.args
        operand.grad += _unbroadcast_grad(tensor.grad, size(operand.grad))
    end

    return nothing
end


# Forward pass of ReLU: element-wise max(0, x); the result remembers `a` for backprop!.
function relu(a::Tensor)
    activated = max.(0, a.data)
    return Tensor(activated, zeros(Float64, size(activated)), Operation(relu, (a,))) # Tensor(data, grad, op)
end



# Backward step for tensor = relu(a).
# The gradient passes through where the input was strictly positive and is zero elsewhere.
function backprop!(tensor::Tensor{Operation{F, A}}) where {F<:typeof(relu), A}
    operand = tensor.op.args[1]
    operand.grad += (operand.data .> 0) .* tensor.grad
end


backprop! (generic function with 9 methods)

In [23]:
# Demo cell: forward pass of a two-layer network on random data, followed by a
# softmax + cross-entropy loss computed by hand on the raw arrays (no Tensor ops,
# so nothing here is recorded for backprop).
inputs = Tensor(rand(2, 3)) # Matrix with shape (2,3) -- 2 samples, 3 input features per sample

# first layer
weights1 = Tensor(rand(3, 4)) # Matrix with shape (3,4) -- takes 3 inputs, has 4 neurons
biases1 = Tensor([1.0, 1.0, 1.0, 1.0]) # Bias vector for first layer neurons

# second layer
weights2 = Tensor(rand( 4, 5)) # Matrix with shape (4,5) -- takes 4 inputs, has 5 neurons
biases2 = Tensor([1.0, 1.0, 1.0, 1.0, 1.0]) # Bias vector for second layer neurons


# hidden layer: linear transform followed by ReLU
layer1_out = relu(inputs * weights1 + biases1)

# output layer: linear transform only; softmax is applied below
layer2_out = layer1_out * weights2 + biases2


## softmax activation
# subtract each row's maximum before exponentiating, so every exponent is <= 0 and exp cannot overflow
exp_values = exp.(layer2_out.data .- maximum(layer2_out.data, dims=2))
# normalize each row so it sums to 1
probs = exp_values ./ sum(exp_values, dims=2)
# keep probabilities inside [1e-7, 1-1e-7] so log never receives exactly 0
probs_clipped = clamp.(probs, 1e-7, 1-1e-7)


## crossentropy loss, comparing final output to one-hot encoded true labels

# one row per sample, with a 1 in the column of the correct class
y_true = [0 1 0 0 0;
          0 0 0 1 0]


# the one-hot mask leaves only the probability assigned to the correct class of each sample
correct_confidences = sum(probs_clipped .* y_true, dims=2)

# negative log likelihood per sample
sample_losses = -log.(correct_confidences)

println(sample_losses)

# mean loss over the samples, kept as a 1-element vector
out = [sum(sample_losses) / length(sample_losses)]

println(out)


# println(layer2_out)
# println(size(layer2_out))


[3.6901885797569145; 1.007464451033138;;]
[2.348826515395026]


In [None]:
# Softmax activation and cross-entropy loss combined into one operation.
#
# Arguments
#   a      -- Tensor of raw scores, one row per sample and one column per class
#   y_true -- one-hot encoded true labels with the same shape as `a`; any real
#             element type is accepted (Int, Float64, Bool, ...)
#   grad   -- reserved for the gradient computation; currently unused
#
# Returns a (1,1) Tensor holding the mean loss over all samples, with `a`
# recorded as the operand of the operation.
#
# Throws DimensionMismatch when `y_true` does not have the same shape as `a`.
function softmax_crossentropy(a::Tensor, y_true::AbstractMatrix{<:Real}; grad::Bool=true)

    # a mis-shaped y_true would otherwise broadcast silently and give a meaningless loss
    if size(y_true) != size(a.data)
        throw(DimensionMismatch(
            "y_true has size $(size(y_true)) but the input tensor has size $(size(a.data))"))
    end

    # softmax activation
    # subtract each row's maximum first, so every exponent is <= 0 and exp cannot overflow
    exp_values = exp.(a.data .- maximum(a.data, dims=2))
    probs = exp_values ./ sum(exp_values, dims=2)

    # deal with 0s and 1s, so log never receives exactly 0
    probs_clipped = clamp.(probs, 1e-7, 1 - 1e-7)

    # basically just returns an array with the probability of the correct answer for each sample
    correct_confidences = sum(probs_clipped .* y_true, dims=2)

    # negative log likelihood
    sample_losses = -log.(correct_confidences)

    # loss mean
    mean_loss = sum(sample_losses) / length(sample_losses)

    # TODO: when `grad` is true, calculate and update a.grad (not implemented yet)

    # the loss is stored as a (1,1) matrix because Tensor data is always 2D
    out = fill(Float64(mean_loss), 1, 1)

    result = Tensor(out, zeros(Float64, size(out)), Operation(softmax_crossentropy, (a,))) # Tensor(data, grad, op)

    return result

end