# Packages

In [1]:
using Pkg
Pkg.add("MLDatasets")
Pkg.add("Flux")

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`
[32m[1mPrecompiling[22m[39m project...
[32m  ✓ [39m[90mLaTeXStrings[39m
[32m  ✓ [39m[90mGlob[39m
[32m  ✓ [39m[90mAbstractFFTs[39m
[32m  ✓ [39m[90mStructTypes[39m
[32m  ✓ [39m[90mIteratorInterfaceExtensions[39m
[32m  ✓ [39m[90mWorkerUtilities[39m
[32m  ✓ [39m[90mSentinelArrays[39m
[32m  ✓ [39m[90mTensorCore[39m
[32m  ✓ [39m[90mStatsAPI[39m
[32m  ✓ [39m[90mContour[39m
[32m  ✓ [39m[90mCEnum[39m
[32m  ✓ [39m[90mConcurrentUtilities[39m
[32m  ✓ [39m[90mInitialValues[39m
[32m  ✓ [39m[90mUnsafeAtomics[39m
[32m  ✓ [39m[90mPipe[39m
[32m  ✓ [39m[90mMeasures[39m
[32m  ✓ [39m[90mOffsetArrays[39m
[32m  ✓ [39m[90mFormat[39m
[32m  ✓ [39m[90m

# Dataset

In [2]:
using MLDatasets
train_data = MLDatasets.MNIST(split=:train)
test_data  = MLDatasets.MNIST(split=:test)

dataset MNIST:
  metadata  =>    Dict{String, Any} with 3 entries
  split     =>    :test
  features  =>    28×28×10000 Array{Float32, 3}
  targets   =>    10000-element Vector{Int64}

In [3]:
using Flux
function loader(data; batchsize::Int=1)
    x1dim = reshape(data.features, 28 * 28, :) # reshape 28×28 pixels into a vector of pixels
    yhot  = Flux.onehotbatch(data.targets, 0:9) # make a 10×60000 OneHotMatrix
    Flux.DataLoader((x1dim, yhot); batchsize, shuffle=true)
end

loader (generic function with 1 method)

In [4]:
train_loader = loader(train_data)
test_loader = loader(test_data)

10000-element DataLoader(::Tuple{Matrix{Float32}, OneHotArrays.OneHotMatrix{UInt32, Vector{UInt32}}}, shuffle=true)
  with first element:
  (784×1 Matrix{Float32}, 10×1 OneHotMatrix(::Vector{UInt32}) with eltype Bool,)

# Computation graph

In [5]:
struct ComputeNode
    forward::Function
    back::Function
end

## Multiplication

In [6]:
using LinearAlgebra: dot

function mult_fwd(W, x)
    return W * x
end

function mult_back(W, x, dz)
    dW = dz * x
    dx = W * dz
    return dW, dx
end

multiply_node = ComputeNode(mult_fwd, mult_back)

ComputeNode(mult_fwd, mult_back)

## Addition

In [7]:
function add_fwd(x1, x2)
    return x1 + x2
end

function add_back(x1, x2, dz)
    dx1 = dz * ones(size(x1))
    dx2 = dz * ones(size(x2))
    return dx1, dx2
end

addition_node = ComputeNode(add_fwd, add_back)

ComputeNode(add_fwd, add_back)

## Tanh

In [8]:
function tanh_fwd(x)
    return tanh.(x)
end

function tanh_back(x, d)
    return (1 - (tanh_fwd(x)^2)) * d
end

tanh_node = ComputeNode(tanh_fwd, tanh_back)

ComputeNode(tanh_fwd, tanh_back)

# Net

In [9]:
struct ForwardCache
    input_weighted
    hidden_weighted
    output_weighted
    self_activated
    self_raw
end

struct BackwardCache
    d_previous_hidden
    d_input_weights
    d_hidden_weights
    d_output_weights
end

# struct LayerFunc
#     forward::Function
#     backward::Function
# end

function fwd(input, previous_hidden, input_weights, hidden_weights, output_weights)
    weighted_input = multiply_node.forward(input_weights, input)
    weighted_hidden = multiply_node.forward(hidden_weights, previous_hidden)
    input_plus_hidden = addition_node.forward(weighted_input, weighted_hidden)
    activated_self = tanh_node.forward(input_plus_hidden)
    weighted_output = multiply_node.forward(output_weights, activated_self)
    return ForwardCache(weighted_input, weighted_hidden, weighted_output, activated_self, input_plus_hidden)
end

function bwd(input, previous_hidden, input_weights, hidden_weights, output_weights, diff_s, dmulv)
    fc = fwd(input, previous_hidden, input_weights, hidden_weights, output_weights)
    dV, dsv = multiply_node.back(output_weights, fc.self_activated, dmulv)
    ds = dsv + diff_s
    dadd = tanh_node.back(fc.self_raw, ds)
    dmulw, dmulu = addition_node.back(fc.hidden_weighted, fc.input_weighted, dadd)
    dW, dprev_s = multiply_node.back(hidden_weights, previous_hidden, dmulw)
    dU, dx = multiply_node.back(input_weights, input, dmulu)
    return BackwardCache(dprev_s, dU, dW, dV)
end

bwd (generic function with 1 method)

In [None]:
function backprop(

# Model

In [10]:
INPUT_SIZE = 196
HIDDEN_SIZE = 64
OUTPUT_SIZE = 10

STEP_COUNT = 4

4

In [11]:
weight_init_bound = 1/sqrt(HIDDEN_SIZE)  # read somewhere that this is a good way to init weights for tanh

Wi = weight_init_bound .* rand(HIDDEN_SIZE, INPUT_SIZE)
Wh = weight_init_bound .* rand(HIDDEN_SIZE, HIDDEN_SIZE)
Wo = rand(OUTPUT_SIZE, HIDDEN_SIZE)

10×64 Matrix{Float64}:
 0.601467   0.933733   0.710758  0.0800763   …  0.0608397  0.987254  0.679301
 0.121852   0.418534   0.502441  0.957308       0.277042   0.742181  0.42624
 0.251268   0.820714   0.837463  0.00838899     0.498122   0.993626  0.254478
 0.522565   0.484568   0.363897  0.198416       0.340878   0.127546  0.799206
 0.9227     0.908119   0.307355  0.880323       0.744356   0.755274  0.806056
 0.749594   0.145532   0.97444   0.914749    …  0.989747   0.300952  0.579607
 0.0840681  0.0143101  0.915113  0.52299        0.95016    0.455783  0.30552
 0.604289   0.142114   0.760171  0.922051       0.278485   0.939087  0.405199
 0.675881   0.55506    0.485433  0.325951       0.457805   0.305685  0.787313
 0.331135   0.620586   0.460797  0.691649       0.785094   0.951689  0.291679

In [26]:
function forward_prop(sample)
    input = [sample[1:196], sample[197:392], sample[393:588], sample[589:784]]
    
    prev_hidden = zeros(HIDDEN_SIZE)

    forward_layers = []
    for t in range(start=1, stop=STEP_COUNT)
        layer_forward = fwd(input[t], prev_hidden, Wi, Wh, Wo)
        prev_hidden = layer_forward.self_activated
        push!(forward_layers, layer_forward)
    end
    return forward_layers
end

forward_prop (generic function with 2 methods)

In [27]:
function predict(x)
    exp_scores = exp.(x)
    return exp_scores ./ sum(exp_scores)
end

function loss(x, y)
    probs = predict(x)
    return -1 * log.(probs)
end

function diff(x, y)
    probs = predict(x)
    (-1) .+ probs
    return probs
end

function classify(x)
    layers = forward_prop(x)
    return predict(layers[end].output_weighted)
end

classify (generic function with 1 method)

In [29]:
function calculate_loss(x, y)
    layers = forward_prop(x)
    output = layers[end].output_weighted
    return loss(output, y)
end

calculate_loss (generic function with 1 method)

In [35]:
function calculate_aggregate_loss(set_loader)
    aggregate_L = 0
    total_processed = 0
    for (sample, label) in set_loader
        aggregate_L += calculate_loss(sample, label)
        total_processed += 1
    end
    return loss ./ total_processed
end

calculate_aggregate_loss (generic function with 1 method)

In [41]:
function back_prop(x, y)
    layers = forward_prop(x)
    dU = zeros(size(Wi))
    dV = zeros(size(Wo))
    dW = zeros(size(Wh))

    prev_s_t = zeros(HIDDEN_SIZE)
    diff_s = zeros(HIDDEN_SIZE)

    segments = [y[1:196], y[197:392], y[393:588], y[589:784]]

    for t in range(1, STEP_COUNT)
        dmulv = diff(layers[t].weighted_output, segments[t])
        input = zeros(INPUT_SIZE)
        dprev_s, dU_t, dW_t, dV_t = layers[t].backward(input, prev_s_t, self.U, self.W, self.V, diff_s, dmulv)
    end
end

back_prop (generic function with 1 method)

In [42]:
for (sample, label) in test_loader
    # layers = forward_prop(sample)
    l = calculate_loss(sample, label)
    back_prop(sample, label)
end

LoadError: BoundsError: attempt to access 10×1 OneHotMatrix(::Vector{UInt32}) with eltype Bool at index [1:196]