In [1]:
import LinearAlgebra: I, ⋅
import Base.MathConstants: φ
abstract type DescentMethod end

# Metody optymalizacji

## Metoda najszybszego spadku z bezwładnością

In [2]:
mutable struct Momentum <: DescentMethod
  α # learning rate
  β # momentum decay
  v # momentum
end
Momentum(α, β, n::Integer) = Momentum(α, β, zeros(n))

function step!(M::Momentum, f, ∇f, x) 
  α, β, v, g = M.α, M.β, M.v, ∇f(x)
  v[:] = β*v .- α*g
  return x + v
end

step! (generic function with 1 method)

## BFGS

In [3]:
mutable struct BFGS <: DescentMethod
  Q
end
BFGS(n::Integer) = BFGS(Matrix(1.0I, n, n))

function strong_backtracking(f, ∇, x, d; α=1, β=1e-4, σ=0.1)
  y0, g0, y_prev, α_prev = f(x), ∇(x)⋅d, NaN, 0
  αlo, αhi = NaN, NaN
  # bracket phase
  while true
    y = f(x + α*d)
    if y > y0 + β*α*g0 || (!isnan(y_prev) && y ≥ y_prev)
      αlo, αhi = α_prev, α
      break
    end
    g = ∇(x + α*d)⋅d
    if abs(g) ≤ -σ*g0
      return α
    elseif g ≥ 0
      αlo, αhi = α, α_prev
      break
    end
    y_prev, α_prev, α = y, α, 2α
  end
  # zoom phase
  ylo = f(x + αlo*d)
  while true
    α = (αlo + αhi)/2
    y = f(x + α*d)
    if y > y0 + β*α*g0 || y ≥ ylo
      αhi = α
    else
      g = ∇(x + α*d)⋅d
      if abs(g) ≤ -σ*g0
        return α
      elseif g*(αhi - αlo) ≥ 0
        αhi = αlo
      end
      αlo = α
    end
  end
end

function step!(M::BFGS, f, ∇f, x)
  if f(x) ≈ 0.0
    return x
  end

  Q, g = M.Q, ∇f(x)
  α = strong_backtracking(f, ∇f, x, -Q*g)
  x′ = x + α*(-Q*g)
  g′ = ∇f(x′)
  δ = x′ - x
  γ = g′ - g
  Q[:] = Q - (δ*γ'*Q + Q*γ*δ')/(δ'*γ) + (1 + (γ'*Q*γ)/(δ'*γ))[1]*(δ*δ')/(δ'*γ)
  return x′
end

step! (generic function with 2 methods)

## L-BFGS

In [6]:
using LinearAlgebra

mutable struct LBFGS
  m
  δs
  γs
  qs
  LBFGS() = new()
end

function init!(M::LBFGS, m) 
  M.m = m
  M.δs = [] 
  M.γs = [] 
  M.qs = []
  return M
end

function step!(M::LBFGS, f, ∇f, θ) 
    δs, γs, qs = M.δs, M.γs, M.qs 
    m, g = length(δs), ∇f(θ)
    d = -g
    if m > 0 
        q = g
        for i in m:-1:1
            qs[i] = copy(q)
            q -= (δs[i]⋅q) / (γs[i]⋅δs[i]) * γs[i]
        end
        z = (γs[m] .* δs[m] .* q) / (γs[m]⋅γs[m]) 
        for i in 1:+1:m
            z += δs[i]*(δs[i]⋅qs[i]-γs[i]⋅z)/(γs[i]⋅δs[i]) 
        end
        d = -z;
    end
    φ =α-> f(θ+α*d); φ′=α->∇f(θ+α*d)⋅d 
    α = line_search(φ, φ′, d)
    θ′ = θ + α*d; g′ = ∇f(θ′)
    δ =θ′-θ;γ =g′-g
    push!(δs, δ); push!(γs, γ); push!(qs, zero(θ)) 
    while length(δs) > M.m
        popfirst!(δs); popfirst!(γs); popfirst!(qs) 
    end
    return θ′ 
end

step! (generic function with 3 methods)

In [9]:
function zoom(φ, φ′, αlo, αhi, c1=1e-4, c2=0.1, jmax=1000)
    φ′0 = φ′(0.0) 
    for j=1:jmax
        αj = 0.5(αlo + αhi) # bisection 
        φαj = φ(αj)
        if φαj > φ(0.0) + c1*αj*φ′0 || φαj ≥ φ(αlo)
            αhi = αj 
        else
            φ′αj = φ′(αj)
            if abs(φ′αj) ≤ -c2*φ′0
                return αj
            end
            if φ′αj*(αhi - αlo) ≥ 0.0 
                αhi = αlo
            end
            αlo = αj 
        end
    end
    return 0.5(αlo + αhi) 
end

function line_search(φ, φ′, d, c1=1e-4, c2=0.1, ρ=0.1, αmax=100., jmax=1000)
    αi, αj = 0.0, 1.0
    φαi, φ0, φ′0 = φ(αi), φ(0.0), φ′(0.0) 
    for j=1:jmax
        φαj = φ(αj)
        if φαj > φ0 + c1*αj*φ′0 || (φαj ≥ φαi && j > 1)
            return zoom(φ, φ′, αi, αj)
        end
        φ′αj = φ′(αj)
        if abs(φ′αj) ≤ -c2*φ′0
            return αj 
        end
        if φ′αj ≥ 0.0
            return zoom(φ, φ′, αj, αi)
        end
        αi, αj = αj, ρ*αj + (1.0 - ρ)*αmax
        φαi = φαj 
    end
    return αj 
end

line_search (generic function with 6 methods)

# Test

In [13]:
function main()
    f(x)  = 100*(x[2] - x[1]^2)^2 + (1-x[1])^2 # funkcja
    ∇f(x) = [400x[1]^3 - 400x[1]*x[2] + 2x[1] - 2,
        200x[2] - 200x[1]^2] # pochodne
    
    x₀  = [1.1, 2.0] # wektor początkowy x
    pts = [x₀] # tutaj zbieramy kolejne wektory x
    val = Float64[] # tutaj zbieramy wartości f. straty
    # opt = BFGS(2) # optymalizator
    # opt = Momentum(0.001, 0.00001, 2)
    opt = LBFGS()
    init!(opt, 7)
    for i=1:25
        push!(val, f(pts[end])) # odłóż wynik funkcji dla najnowszych x (chcemy żeby 0)
        push!(pts, step!(opt, f, ∇f, pts[end])) # odłóż nowe x - nowe x to wynik step
    end

    pts, val
end

pts, val = main()
val

25-element Vector{Float64}:
  62.419999999999966
   0.3732411295430105
   0.1369415961360804
   0.1369096273023483
   0.13592323378462406
   0.12719491283882386
   0.07408494998556142
   0.04240542587137532
   0.02190717330656185
   0.007288189390724729
   0.0027823174414058254
   0.0003942493679093454
   3.1689619508268084e-5
   4.926848029693371e-7
   5.68382215313125e-10
   2.2067479397409333e-12
   1.5991571708620067e-19
   1.3064843141334228e-24
   4.9747540835500057e-29
   4.35845650134609e-29
   3.988677952023741e-29
   3.865418435582958e-29
   3.865418435582958e-29
 NaN
 NaN

In [11]:
pts

26-element Vector{Vector{Float64}}:
 [1.1, 2.0]
 [1.354443359375, 1.88427734375]
 [1.3696997557884254, 1.8777012643061441]
 [1.3698555407211217, 1.8775835832864833]
 [1.3685749057239072, 1.8719549761515042]
 [1.3444778266452033, 1.7983500776632715]
 [1.344477826645203, 1.798350077663271]
 [1.3442677972680435, 1.8069572892929144]
 [1.2135694301902444, 1.4732301990300947]
 [1.1706245106449156, 1.3630899428031158]
 [1.1706245106449156, 1.363089942803116]
 [1.141384081851863, 1.3063844275313499]
 [1.064831823739329, 1.130496653869105]
 [1.035528843354047, 1.067444652229225]
 [1.035528843354047, 1.067444652229225]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]
 [NaN, NaN]