In [1]:
import LinearAlgebra: I, ⋅
import Base.MathConstants: φ, pi
import Base: cos ,Iterators
import Statistics: mean
import FiniteDifferences
abstract type DescentMethod end

In [2]:
using PyPlot
using BenchmarkTools
using Zygote

# Metody optymalizacji

## Metoda najszybszego spadku z bezwładnością

In [3]:
mutable struct Momentum <: DescentMethod
  α # learning rate
  β # momentum decay
  v # momentum
end

Momentum(α, β, n::Integer) = Momentum(α, β, zeros(n))

function step!(M::Momentum, f, ∇f, x, degub=false) 
  α, β, v, g = M.α, M.β, M.v, ∇f(x)
  if debug
    println("Gradient: ", g)
  v[:] = β*v .- α*g
  return x + v
end

step! (generic function with 1 method)

## BFGS

In [4]:
mutable struct BFGS <: DescentMethod
  Q
end
BFGS(n::Integer) = BFGS(Matrix(1.0I, n, n))

function strong_backtracking(f, ∇, x, d; α=1, β=1e-4, σ=0.1)
  y0, g0, y_prev, α_prev = f(x), ∇(x)⋅d, NaN, 0
  αlo, αhi = NaN, NaN
  # bracket phase
  while true
    y = f(x + α*d)
    if y > y0 + β*α*g0 || (!isnan(y_prev) && y ≥ y_prev)
      αlo, αhi = α_prev, α
      break
    end
    g = ∇(x + α*d)⋅d
    if abs(g) ≤ -σ*g0
      return α
    elseif g ≥ 0
      αlo, αhi = α, α_prev
      break
    end
    y_prev, α_prev, α = y, α, 2α
  end
  # zoom phase
  ylo = f(x + αlo*d)
  while true
    α = (αlo + αhi)/2
    y = f(x + α*d)
    if y > y0 + β*α*g0 || y ≥ ylo
      αhi = α
    else
      g = ∇(x + α*d)⋅d
      if abs(g) ≤ -σ*g0
        return α
      elseif g*(αhi - αlo) ≥ 0
        αhi = αlo
      end
      αlo = α
    end
  end
end

function step!(M::BFGS, f, ∇f, x)
  if f(x) ≈ 0.0
    return x
  end

  Q, g = M.Q, ∇f(x)
  α = strong_backtracking(f, ∇f, x, -Q*g)
  x′ = x + α*(-Q*g)
  g′ = ∇f(x′)
  δ = x′ - x
  γ = g′ - g
  Q[:] = Q - (δ*γ'*Q + Q*γ*δ')/(δ'*γ) + (1 + (γ'*Q*γ)/(δ'*γ))[1]*(δ*δ')/(δ'*γ)
  return x′
end

step! (generic function with 2 methods)

## L-BFGS

In [5]:
using LinearAlgebra

mutable struct LBFGS
  m
  δs
  γs
  qs
  LBFGS() = new()
end

function init!(M::LBFGS, m) 
  M.m = m
  M.δs = [] 
  M.γs = [] 
  M.qs = []
  return M
end

function step!(M::LBFGS, f, ∇f, θ) 
    δs, γs, qs = M.δs, M.γs, M.qs 
    m, g = length(δs), ∇f(θ)
    d = -g # kierunek
    if m > 0 
        q = g
        for i in m:-1:1
            qs[i] = copy(q)
            q -= (δs[i]⋅q) / (γs[i]⋅δs[i]) * γs[i]
        end
        z = (γs[m] .* δs[m] .* q) / (γs[m]⋅γs[m]) 
        for i in 1:+1:m
            z += δs[i]*(δs[i]⋅qs[i]-γs[i]⋅z)/(γs[i]⋅δs[i]) 
        end
        d = -z; # rekonstrukcja kierunku
    end
    φ =α-> f(θ+α*d); φ′=α->∇f(θ+α*d)⋅d 
    α = line_search(φ, φ′, d)
    θ′ = θ + α*d; g′ = ∇f(θ′) # nowy wektor
    δ =θ′-θ;γ =g′-g
    push!(δs, δ); push!(γs, γ); push!(qs, zero(θ)) 
    while length(δs) > M.m
        popfirst!(δs); popfirst!(γs); popfirst!(qs) 
    end
    return θ′ 
end

step! (generic function with 3 methods)

In [6]:
function zoom(φ, φ′, αlo, αhi, c1=1e-4, c2=0.1, jmax=1000)
    φ′0 = φ′(0.0) 
    for j=1:jmax
        αj = 0.5(αlo + αhi) # bisection 
        φαj = φ(αj)
        if φαj > φ(0.0) + c1*αj*φ′0 || φαj ≥ φ(αlo)
            αhi = αj 
        else
            φ′αj = φ′(αj)
            if abs(φ′αj) ≤ -c2*φ′0
                return αj
            end
            if φ′αj*(αhi - αlo) ≥ 0.0 
                αhi = αlo
            end
            αlo = αj 
        end
    end
    return 0.5(αlo + αhi) 
end

function line_search(φ, φ′, d, c1=1e-4, c2=0.1, ρ=0.1, αmax=100., jmax=1000)
    αi, αj = 0.0, 1.0
    φαi, φ0, φ′0 = φ(αi), φ(0.0), φ′(0.0) 
    for j=1:jmax
        φαj = φ(αj)
        if φαj > φ0 + c1*αj*φ′0 || (φαj ≥ φαi && j > 1)
            return zoom(φ, φ′, αi, αj)
        end
        φ′αj = φ′(αj)
        if abs(φ′αj) ≤ -c2*φ′0
            return αj 
        end
        if φ′αj ≥ 0.0
            return zoom(φ, φ′, αj, αi)
        end
        αi, αj = αj, ρ*αj + (1.0 - ρ)*αmax
        φαi = φαj 
    end
    return αj 
end

line_search (generic function with 6 methods)

# Test Set up

## Functions

In [7]:
f_beale(x) = (1.5 - x[1] + x[1]*x[2])^2 + (2.25 - x[1] + x[1]*x[2]^2)^2 + (2.625 - x[1] + x[1]*x[2]^3)^2
∇f_beale(x) = [
    2*x[1]*(x[2]^6 + x[2]^4 - 2*x[2]^3 - x[2]^2 - 2*x[2] + 3) + 5.25*x[2]^3 + 4.5*x[2]^2 + 3*x[2] - 12.75, 
    6*x[1]*(x[1]*(x[2]^5 + 0.666667*x[2]^3 - x[2]^2 - 0.333333*x[2] - 0.333333) + 2.625*x[2]^2 + 1.5x[2] + 0.5)
]

f_rosenbrock(x)  = 100*(x[2] - x[1]^2)^2 + (1-x[1])^2
∇f_rosenbrock(x) = [
    400x[1]^3 - 400x[1]*x[2] + 2x[1] - 2,
    200x[2] - 200x[1]^2
]
# a = 20, b = 0.2 and c = 2π.
f_ackley(x, a=20,b=0.2,c = 2 * pi) = -a * exp(-b * sqrt(mean(x.^2))) - exp( mean( cos.(c .* x))) + a + exp(1)
∇f_ackley(x) = f_ackley'(x)

x = [3.0, 2.4]

2-element Array{Float64,1}:
 3.0
 2.4

# Test

In [8]:
function optimalize(f, ∇f, x₀, opt, e, i, debug=false)
    pts = [x₀] # kolejne wektory x
    err = Float64[] # kolejne wartości f. straty
    p = 0
    while true
        push!(err, f(pts[end])) # odłóż wynik funkcji dla najnowszego wektora x (miara błędu)
        if debug 
            println("Iteracja p=", p)
            println("Wektor x: ", pts[end])
            println("Error: ", err[end])
            println()
        end
        if f(pts[end]) < e || p > i
            break
        end
        push!(pts, step!(opt, f, ∇f, pts[end]))
        p += 1
    end
    
    pts, err, p
end

optimalize (generic function with 2 methods)

In [9]:
bfgs = BFGS(2)

pts, err, i = optimalize(f_rosenbrock, ∇f_rosenbrock, x, bfgs, 0.0001, 100)
println("Liczba iteracji: ", i)
println("Wektor x wynikowy: ", pts[end])
println("Błąd: ", err[end])
# println(info)

Ilość iteracji: 23
Wektor x wynikowy: [0.9940387473259737, 0.9880492779277037]
Błąd: 3.594298123016037e-5


In [13]:
lbfgs = LBFGS(); init!(lbfgs, 7)

pts, err, i = optimalize(f_rosenbrock, ∇f_rosenbrock, x, lbfgs, 0.0001, 100)
println("Liczba iteracji: ", i)
println("Wektor x wynikowy: ", pts[end])
println("Błąd: ", err[end])

Ilość iteracji: 27
Wektor x wynikowy: [0.9983197763017503, 0.9959822516110969]
Błąd: 4.639954023588186e-5


In [14]:
# weird stuff happens here
momentum = Momentum(0.00000000000001, 0.01, 2)

pts, err, i = optimalize(f_rosenbrock, ∇f_rosenbrock, x, momentum, 0.01, 100)

println("Liczba iteracji: ", i)
println("Wektor x wynikowy: ", pts[end])
println("Błąd: ", err[end])

Gradient: [7924.0, -1320.0]
Gradient: [7923.999999204279, -1319.9999999022718]
Gradient: [7923.999998400601, -1319.9999998035664]
Gradient: [7923.999997596844, -1319.9999997048515]
Gradient: [7923.999996793085, -1319.999999606136]
Gradient: [7923.99999598933, -1319.9999995074213]
Gradient: [7923.99999518557, -1319.999999408706]
Gradient: [7923.999994381812, -1319.999999309991]
Gradient: [7923.999993578054, -1319.9999992112753]
Gradient: [7923.9999927742965, -1319.9999991125603]
Gradient: [7923.999991970539, -1319.9999990138451]
Gradient: [7923.999991166782, -1319.9999989151302]
Gradient: [7923.999990363023, -1319.9999988164147]
Gradient: [7923.999989559267, -1319.9999987176998]
Gradient: [7923.999988755508, -1319.9999986189846]
Gradient: [7923.99998795175, -1319.9999985202694]
Gradient: [7923.999987147992, -1319.9999984215542]
Gradient: [7923.9999863442345, -1319.999998322839]
Gradient: [7923.999985540477, -1319.9999982241238]
Gradient: [7923.999984736718, -1319.9999981254089]
Gradient

Excessive output truncated after 524291 bytes.

-1319.9999956565325]
Gradient: [7923.9999638309, -1319.9999955578178]
Gradient: [7923.999963027141, -1319.9999954591021]
Gradient: [7923.999962223385, -1319.9999953603872]
Gradient: [7923.999961419625, -1319.9999952616718]
Gradient: [7923.99996061587, -1319.999995162957]
Gradient: [7923.999959812109, -1319.9999950642416]
Gradient: [7923.999959008353, -1319.9999949655266]
Gradient: [7923.999958204594, -1319.999994866811]
Gradient: [7923.999957400836, -1319.9999947680963]
Gradient: [7923.999956597078, -1319.9999946693808]
Gradient: [7923.9999557933215, -1319.9999945706659]
Gradient: [7923.999954989563, -1319.9999944719505]
Gradient: [7923.999954185808, -1319.9999943732355]
Gradient: [7923.999953382047, -1319.9999942745203]
Gradient: [7923.9999525782905, -1319.9999941758053]
Gradient: [7923.999951774532, -1319.99999407709]
Gradient: [7923.999950970774, -1319.9999939783747]
Gradient: [7923.999950167016, -1319.9999938796595]
Gradient: [7923.999949363259, -1319.9999937809446]
Gradient: [7923

## Ackley test

### BFGS

In [12]:
dimensions = 1:5

x = [3.0, 2.4]

dim = 1
f = f_ackley
∇f = ∇f_ackley
lbfgs = LBFGS(); init!(lbfgs, 1)
pts, err, i = optimalize(f, ∇f, x[1:dim], lbfgs, 0.0001, 100)
println("Liczba iteracji: ", i)
println("Wektor x wynikowy: ", pts[end])
println("Błąd: ", err[end])

Liczba iteracji: 101
Wektor x wynikowy: [NaN]
Błąd: NaN
