In [None]:
using LinearAlgebra
using Plots, LaTeXStrings

---

# AD using operator overloading

https://julialang.zulipchat.com/#narrow/stream/225542-helpdesk/topic/Comparing.20julia.20and.20numpy/near/209135246

This is similar to the approach used by `ForwardDiff.jl`.

In [None]:
# Dual numbers carry a value and a derivative together:
#
# f(p + dε) := f(p) + d f'(p) ε
#
# f(Dual(p, d)) = Dual(f(p), d*f'(p))
#
# f(Dual(p, 1)) = Dual(f(p), f'(p))

"""
    Dual(p, d)

Dual number `p + d*ε`, where `p` is the value part and `d` is the derivative
part. Both components share one real type `T`.
"""
struct Dual{T<:Real} <: Number
    p::T
    d::T
end

# Mixed real types are promoted to a common type before construction.
# Restricting to `Real` makes non-numeric arguments fail with a MethodError.
Dual(p::Real, d::Real) = Dual(promote(p, d)...)

In [None]:
# `promote` converts its arguments to a common type and returns them as a tuple
promote(1, 2.0)

In [None]:
# Arguments of different types (Int and Rational) go through the promoting constructor
Dual(1, 1//2)

In [None]:
# List the direct subtypes of the abstract type Number
subtypes(Number)

In [None]:
# List the direct subtypes of the abstract type Real
subtypes(Real)

In [None]:
# `<:` tests the subtype relation: is Real a subtype of Number?
Real <: Number

In [None]:
# `<:` tests the subtype relation: is String a subtype of Number?
String <: Number

In [None]:
# Dual number with value part 3 and derivative part 1
z = Dual(3, 1)

In [None]:
import Base: promote_rule, convert, show

# Mixing a Dual with a plain real number gives a Dual whose component type can
# hold both operands (e.g. Dual{Int} together with Float64 gives Dual{Float64}).
promote_rule(::Type{Dual{T}}, ::Type{S}) where {T<:Real,S<:Real} = Dual{promote_type(T, S)}

# A real number is a constant, so its derivative part is zero. Constructing
# Dual{T} explicitly guarantees the result has exactly the requested type.
convert(::Type{Dual{T}}, x::Real) where {T<:Real} = Dual{T}(x, zero(T))

# Converting between component types; the first method is the no-op case and
# also keeps dispatch unambiguous with Base's generic `convert(::Type{T}, x::T)`.
convert(::Type{Dual{T}}, x::Dual{T}) where {T<:Real} = x
convert(::Type{Dual{T}}, x::Dual) where {T<:Real} = Dual{T}(x.p, x.d)

# Display a dual number as "p + d*ε"
show(io::IO, d::Dual) = print(io, "$(d.p) + $(d.d)*ε")

# The dual unit: value part 0, derivative part 1
ε = Dual(0, 1)

In [None]:
# Same dual number again; it is now displayed through the custom `show` method
z = Dual(3, 1)

In [None]:
import Base: +, -, *, /, inv

# Sum rule: (u + v)' = u' + v'
function +(x::Dual, y::Dual)
    return Dual(x.p + y.p, x.d + y.d)
end

# Negation and difference rule: (u - v)' = u' - v'
function -(y::Dual)
    return Dual(-y.p, -y.d)
end

function -(x::Dual, y::Dual)
    return x + (-y)
end

# Product rule: (u v)' = u' v + u v'
function *(x::Dual, y::Dual)
    return Dual(x.p*y.p, x.d*y.p + x.p*y.d)
end

# Reciprocal rule: (1/v)' = -v'/v^2
function inv(y::Dual)
    return Dual(1/y.p, -y.d/y.p^2)
end

# Quotient rule, expressed as a product with the reciprocal
function /(x::Dual, y::Dual)
    return x*inv(y)
end

In [None]:
# Rational test function; it only uses +, *, / and integer powers
function f(x)
    numerator = x^4 + x^3
    denominator = x^2 + x + 1
    return numerator/denominator
end

# Seeding the derivative part with 1 gives f(1) and f'(1) in a single evaluation
z = f(Dual(1.0, 1))

In [None]:
# Derivative of f at x: the derivative part of f evaluated at the dual number x + ε
function fp(x)
    seeded = Dual(x, 1)
    return f(seeded).d
end

In [None]:
# Evaluate the dual-number derivative of f at x = 1
fp(1.0)

In [None]:
# Polynomial test function
function g(x)
    return x^4 + x^3
end

# Value and derivative of g at x = 1 in one evaluation
g(Dual(1.0, 1))

In [None]:
# Plot f and its dual-number derivative on 100 equally spaced points in [-1, 1]
xx = range(-1, 1; length=100)
plot(; legend=:topleft, xlabel=L"x", ylabel=L"y")
plot!(xx, map(f, xx); label=L"y=f(x)")
plot!(xx, map(fp, xx); label=L"y=f'(x)")

---

In [None]:
# Example from the notes: a scalar function built as a composition of three
# stages, each reading the first two components of its input.

function F1(x)
    return [x[1] - x[2], x[1]*x[2]]
end

function F2(x)
    return [x[1]^2, x[1] - x[2]]
end

function F3(x)
    return (x[1] - 8)^2 + (x[2] - 3)^2
end

# F applies F1 first, then F2, then F3
F = F3 ∘ F2 ∘ F1

ε = Dual(0, 1)

# Two forward passes are required to compute the gradient of F: each pass
# perturbs one input coordinate by ε and yields the matching partial derivative.
map(F, ([2 + ε, -1], [2, -1 + ε]))

---

# Complex Taylor series trick

Using Taylor series, we have:

$$
f(x + ih) \approx f(x) + ihf'(x)
$$

for all $h > 0$ small enough.

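Taking the imaginary part of both sides (for real $x$ and a function $f$ that is real-valued on the real line, both $f(x)$ and $f'(x)$ are real, so only the term $hf'(x)$ remains), we get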

$$
\Im(f(x + ih)) \approx hf'(x),
$$

which implies that

$$
f'(x) \approx \frac{\Im(f(x + ih))}{h}
$$

for all $h > 0$ small enough.

In [None]:
# Compare two estimates of d/dx sin(x) at x = 1 against the exact value cos(x)
x, h = 1.0, 1e-8
# Error of the forward difference quotient
err1 = abs((sin(x + h) - sin(x))/h - cos(x))
# Error of the complex-step estimate
err2 = abs(imag(sin(complex(x, h)))/h - cos(x))
(err1, err2)

In [None]:
h = 1e-4
# Complex-step estimate of both partial derivatives of F at (2, -1):
# perturb one coordinate at a time by im*h and divide the imaginary part by h
map(p -> imag(F(p))/h, ([2 + im*h, -1], [2, -1 + im*h]))

---

# Zygote.jl

The `Zygote.jl` package provides **source-to-source** automatic differentiation.

In [None]:
using Zygote

In [None]:
# Polynomial to differentiate (this replaces the earlier definition of f)
function f(x)
    return x^4 + x^3
end

# With Zygote loaded, f' is the derivative function of f
f'(1.0)

In [None]:
# Hand-written derivative of x^4 + x^3, for comparison with the AD result
function fp(x)
    return 4*x^3 + 3*x^2
end

fp(1.0)

In [None]:
# Show the LLVM IR compiled for the hand-written derivative
@code_llvm fp(1.0)

In [None]:
# Show the LLVM IR compiled for the Zygote derivative, for comparison with fp
@code_llvm f'(1.0)

In [None]:
# Print the structure of the quoted expression x^4 + x^3
dump(:(x^4 + x^3))

---

# AD of neural net using Zygote

In [None]:
using Zygote

In [None]:
# Rectified linear unit: the larger of 0 and γ
function ReLU(γ)
    return max(0, γ)
end

In [None]:
# Network parameters: matrix A and vectors b and c
A = [1 0
     -2 1]
b = [1, 0]
c = [1, 1]

# Bundle the parameters in a named tuple so they can be passed around together
x = NamedTuple{(:A, :b, :c)}((A, b, c))

In [None]:
# Named-tuple fields are accessed by name
x.A

In [None]:
# Neural net: affine map x.A*v + x.b, elementwise ReLU, then inner product with x.c
function predict(x, v)
    activations = ReLU.(x.A*v + x.b)
    return x.c' * activations
end

# Loss function: squared difference between prediction w and target w̄
function loss(w, w̄)
    residual = w - w̄
    return residual^2
end

In [None]:
# Data: the input vector fed to the network
v = [2.0, -1.0]
# Alternative: a random integer input with entries drawn from -3:3
#v = rand(-3:3, 2)

In [None]:
# Forward pass: network output for parameters x and input v
w = predict(x, v)

In [None]:
# Target value the prediction is compared against
w̄ = 1

# Squared error of the prediction w
loss(w, w̄)

In [None]:
# Loss as a function of the parameters only; the data v and the target w̄ are
# read from the enclosing (global) scope. This replaces the earlier f.
function f(x)
    prediction = predict(x, v)
    return loss(prediction, w̄)
end

f(x)

In [None]:
# Derivative of the loss with respect to the parameters x, via Zygote's f' syntax
f'(x)

In [None]:
# Show the LLVM IR compiled for the derivative of the loss
@code_llvm f'(x)

---