In [280]:
using Flux

In [281]:
"""
y = sum(x .* w)   (sum of the element-wise products of x and w)
"""
function linearActivation(x, w)
    # Element-wise products first, then reduce them to a single scalar.
    weighted = x .* w
    return sum(weighted)
end;

In [282]:
# Input vector for the experiments: 10 random numbers (no seed is set in the visible cells, so values differ between runs).
x = rand(10);

In [283]:
# Weight vector: 10 random numbers, the same length as the input created with rand(10).
w = rand(10);

In [284]:
# Evaluate the linear unit on the random x and w.
y = linearActivation(x, w)

2.3887650739454425

In [285]:
# gradient returns one entry per differentiated argument: gs[1] is the gradient w.r.t. x, gs[2] w.r.t. w.
gs = gradient(linearActivation, x, w)

([0.12015509719068695, 0.021709486940606815, 0.8571674042378091, 0.31787498341878184, 0.06825305017802497, 0.33642313010076086, 0.2780656543082781, 0.8434471331906673, 0.9748458416522874, 0.4238941308519185], [0.5529009286866227, 0.5704720737045613, 0.8837895422499722, 0.9116807531729936, 0.6792797000078808, 0.9437912683369667, 0.1402304017970426, 0.3041863351001902, 0.4821845765550753, 0.3139943374663644])

In [286]:
"""
y = sum(x .* w)
dy/dw[i] = x[i], so the gradient with respect to w is x
"""
linearActivationDerivative(x, w) = x;  # returns x itself (no copy); w is accepted but not used

In [287]:
# The hand-written dy/dw must match the automatic gradient with respect to w (second entry of gs).
linearActivationDerivative(x, w) == gs[2]

true

In [288]:
"""
y = sum(tanh.(x .* w))   (tanh applied element-wise to x .* w, then summed)
"""
function tanhActivation(x, w)
    # tanh is applied to every product separately; the sum comes last.
    activations = tanh.(x .* w)
    return sum(activations)
end;

In [289]:
# Rebinds y to the output of the tanh unit for the same x and w.
y = tanhActivation(x, w)

2.2145103738726015

In [290]:
# Rebinds gs to the gradients of the tanh unit: gs[1] w.r.t. x, gs[2] w.r.t. w.
gs = gradient(tanhActivation, x, w)

([0.11962635407819829, 0.021706157486354738, 0.5064721041451233, 0.29260504383644353, 0.06810654889512859, 0.30466255447801904, 0.2776432908477512, 0.7902752627410458, 0.787618867316204, 0.4164723569782784], [0.5504678853554047, 0.570384583812392, 0.5222022522926155, 0.8392053500969647, 0.6778216648396788, 0.8546911106244995, 0.14001740102957486, 0.28501008117031146, 0.3895771554812794, 0.3084967502136339])

In [291]:
"""
y = sum(tanh.(x .* w))
dy/dw[i] = dtanh(u)/du * du/dw[i] with u = x[i]*w[i], i.e. dy/dw[i] = sech(x[i]*w[i])^2 * x[i]
"""
function tanhActivationDerivative(x, w)
    # Element-wise throughout: .^ binds tighter than .*, so sech is squared before scaling by x.
    return x .* sech.(x .* w) .^ 2
end;

In [292]:
# Exact equality is too strict for this check: in the recorded run the analytic and automatic results differ in the last digit of some entries.
tanhActivationDerivative(x, w) == gs[2]

false

In [293]:
# ≈ (isapprox) compares within a floating-point tolerance, the appropriate check for two differently computed Float64 results.
tanhActivationDerivative(x, w) ≈ gs[2]

true

In [294]:
# Display the analytic derivative so its digits can be compared entry by entry with gs[2].
tanhActivationDerivative(x, w)

10-element Vector{Float64}:
 0.5504678853554048
 0.570384583812392
 0.5222022522926155
 0.8392053500969648
 0.6778216648396785
 0.8546911106244997
 0.14001740102957488
 0.2850100811703115
 0.3895771554812794
 0.3084967502136339

In [295]:
# Display the automatic gradient with respect to w so its digits can be compared with the analytic derivative.
gs[2]

10-element Vector{Float64}:
 0.5504678853554047
 0.570384583812392
 0.5222022522926155
 0.8392053500969647
 0.6778216648396788
 0.8546911106244995
 0.14001740102957486
 0.28501008117031146
 0.3895771554812794
 0.3084967502136339

In [349]:
# One dense layer from 10 inputs to 1 output, without bias; no activation function is passed.
model = Chain(Dense(10, 1, bias=false))

Chain(
  Dense(10 => 1; bias=false),           # 10 parameters
) 

In [350]:
# Inspect the layer's weight; the recorded output shows a 1×10 Float32 matrix (a row), not a vector.
model.layers[1].weight

1×10 Matrix{Float32}:
 0.277495  0.708802  -0.330047  0.416559  …  -0.541632  0.186023  -0.452852

In [351]:
"""
y = model(x)[1]   (first element of the model's output)
"""
function FluxModelOutput(x, model)
    # The model returns a collection; only its first entry is handed back.
    prediction = model(x)
    return prediction[1]
end;

In [352]:
"""
y = sum(x .* w), where w is the weight of the model's first layer reshaped to size(x)
"""
function linearModelOutput(x, model)
    layer = model.layers[1]
    # Bring the stored weight into the shape of x so the product is element-wise.
    weights = reshape(layer.weight, size(x))
    return sum(x .* weights)
end;

In [353]:
# The manual weighted sum should reproduce the output of the Flux layer.
FluxModelOutput(x, model) == linearModelOutput(x, model)

true

In [354]:
# Both formulations should also yield the same gradients with respect to x and to the model.
gradient(FluxModelOutput, x, model) == gradient(linearModelOutput, x, model)

true

In [355]:
# [2] picks the gradient with respect to the model; the chained [1][1][1] presumably descends
# layers -> first layer -> weight (TODO confirm against the printed gradient structure).
# The adjoint ' presumably turns the row-shaped weight gradient into a column for comparison with x.
# linearActivationDerivative returns its first argument unchanged, so the global w has no effect here.
gradient(FluxModelOutput, x, model)[2][1][1][1]' ≈ linearActivationDerivative(x, w)

true

In [356]:
# Rebind model to a layer of the same shape (10 inputs, 1 output, no bias), now with tanh as activation.
model = Chain(Dense(10, 1, tanh, bias=false,))

Chain(
  Dense(10 => 1, tanh; bias=false),     # 10 parameters
) 

In [357]:
"""
y = tanh(sum(x .* w)), where w is the weight of the model's first layer reshaped to size(x)
"""
function tanhModelOutput(x, model)
    w = reshape(model.layers[1].weight, size(x))
    # Unlike an element-wise tanh, the activation is applied once, to the summed products.
    preactivation = sum(x .* w)
    return tanh(preactivation)
end;

In [363]:
# FluxModelOutput is reused with the tanh model; ≈ is used because the two sides are computed differently.
FluxModelOutput(x, model) ≈ tanhModelOutput(x, model)

true

In [364]:
"""
y = tanh(z), with z = sum(x .* w) and w the weight of the model's first layer reshaped to size(x)
dy/dw[i] = dtanh(z)/dz * dz/dw[i] = sech(z)^2 * x[i]
"""
function tanhModelOutputDerivative(x, model)
    w = reshape(model.layers[1].weight, size(x))
    z = sum(x .* w)
    # sech(z)^2 is a single scalar shared by all entries; only x varies per weight.
    scale = sech(z)^2
    return x .* scale
end;

In [365]:
# [2] picks the gradient with respect to the model; the chained [1][1][1] presumably descends
# layers -> first layer -> weight (TODO confirm against the printed gradient structure).
# The adjoint ' presumably turns the row-shaped weight gradient into a column for comparison
# with the analytic derivative, which has the shape of x.
gradient(FluxModelOutput, x, model)[2][1][1][1]' ≈ tanhModelOutputDerivative(x, model)

true