In [10]:
import LinearAlgebra: I, ⋅
import Base.MathConstants: φ, pi
import Base: cos 
import Statistics: mean
# Common supertype of the optimizers declared in this file as `<: DescentMethod`
# (`Momentum` and `BFGS`). Each optimizer provides a `step!(M, f, ∇f, x)` method
# that returns the next iterate.
abstract type DescentMethod end

# Metody optymalizacji

## Metoda najszybszego spadku z bezwładnością

In [3]:
"""
    Momentum(α, β, v)
    Momentum(α, β, n::Integer)

State of gradient descent with momentum. The three-argument form taking an
integer `n` starts with a zero momentum vector of length `n`.
"""
mutable struct Momentum <: DescentMethod
    α  # learning rate
    β  # momentum decay
    v  # momentum vector; overwritten in place on every step
end

function Momentum(α, β, n::Integer)
    return Momentum(α, β, zeros(n))
end

"""
    step!(M::Momentum, f, ∇f, x)

Perform one momentum update and return the new point `x + v`.

The gradient `∇f(x)` is printed, then the stored momentum is overwritten in
place with `β*v - α*∇f(x)`. `f` is accepted for interface uniformity and is
not evaluated here.
"""
function step!(M::Momentum, f, ∇f, x)
    grad = ∇f(x)
    println("Gradient: ", grad)
    velocity = M.v
    # Write into the existing array so the optimizer keeps its state.
    velocity[:] = M.β*velocity .- M.α*grad
    return x + velocity
end

step! (generic function with 1 method)

## BFGS

In [4]:
"""
    BFGS(Q)
    BFGS(n::Integer)

State of the BFGS method. `Q` is the matrix that the step multiplies the
gradient by; `BFGS(n)` starts from the `n × n` identity.
"""
mutable struct BFGS <: DescentMethod
    Q  # updated in place after every step
end

function BFGS(n::Integer)
    return BFGS(Matrix{Float64}(I, n, n))
end

"""
    strong_backtracking(f, ∇, x, d; α=1, β=1e-4, σ=0.1, max_iter=1000)

Line search along direction `d` from `x` that looks for a step size satisfying
the sufficient-decrease test `f(x + α d) ≤ f(x) + β α ∇(x)⋅d` and the curvature
test `|∇(x + α d)⋅d| ≤ -σ ∇(x)⋅d`.

# Keywords
- `α`: initial trial step, doubled while bracketing.
- `β`: sufficient-decrease constant.
- `σ`: curvature constant.
- `max_iter`: cap on the iterations of each phase (bracketing and zoom).

# Returns
A step size meeting both tests when one is found. If the bracketing phase runs
out of iterations, the last trial step that passed the sufficient-decrease test
is returned (`0` if none did). If the zoom phase runs out of iterations, the
current lower end of the bracket is returned.
"""
function strong_backtracking(f, ∇, x, d; α=1, β=1e-4, σ=0.1, max_iter=1000)
    y0, g0, y_prev, α_prev = f(x), ∇(x)⋅d, NaN, 0
    αlo, αhi = NaN, NaN
    bracketed = false

    # Bracket phase: grow α until an interval containing an acceptable step is found.
    for _ in 1:max_iter
        y = f(x + α*d)
        if y > y0 + β*α*g0 || (!isnan(y_prev) && y ≥ y_prev)
            αlo, αhi = α_prev, α
            bracketed = true
            break
        end
        g = ∇(x + α*d)⋅d
        if abs(g) ≤ -σ*g0
            return α
        elseif g ≥ 0
            αlo, αhi = α, α_prev
            bracketed = true
            break
        end
        y_prev, α_prev, α = y, α, 2α
    end
    # Without this guard an objective that keeps decreasing along `d` would
    # make the bracketing loop run forever.
    bracketed || return α_prev

    # Zoom phase: bisect the bracket until both tests hold.
    ylo = f(x + αlo*d)
    for _ in 1:max_iter
        α = (αlo + αhi)/2
        y = f(x + α*d)
        if y > y0 + β*α*g0 || y ≥ ylo
            αhi = α
        else
            g = ∇(x + α*d)⋅d
            if abs(g) ≤ -σ*g0
                return α
            elseif g*(αhi - αlo) ≥ 0
                αhi = αlo
            end
            αlo = α
        end
    end
    # Bisection budget exhausted (e.g. the bracket collapsed to one float).
    return αlo
end

"""
    step!(M::BFGS, f, ∇f, x)

Perform one BFGS step from `x` and return the new point.

The search direction is `-M.Q * ∇f(x)`, the step length comes from
`strong_backtracking`, and `M.Q` is then updated in place from the step
`δ = x′ - x` and the gradient change `γ = ∇f(x′) - ∇f(x)`.

The update is skipped when `δ'γ` is not a finite positive number (for example
when the step is zero at a stationary point); dividing by such a value would
fill `M.Q` with `NaN`/`Inf` and break every later step.
"""
function step!(M::BFGS, f, ∇f, x)
    # NOTE: `≈` against 0.0 has no absolute tolerance by default, so this
    # early exit only fires when f(x) is exactly zero.
    if f(x) ≈ 0.0
        return x
    end

    Q, g = M.Q, ∇f(x)
    d = -Q*g                      # search direction, computed once
    α = strong_backtracking(f, ∇f, x, d)
    x′ = x + α*d
    g′ = ∇f(x′)
    δ = x′ - x
    γ = g′ - g
    δγ = δ'*γ
    if isfinite(δγ) && δγ > 0     # curvature condition; otherwise keep Q as is
        Q[:] = Q - (δ*γ'*Q + Q*γ*δ')/δγ + (1 + (γ'*Q*γ)/δγ)*(δ*δ')/δγ
    end
    return x′
end

step! (generic function with 2 methods)

## L-BFGS

In [5]:
using LinearAlgebra

"""
    LBFGS()

State of the limited-memory BFGS method. The constructor leaves every field
unset; call `init!` before using the object.
"""
mutable struct LBFGS
    m   # maximum number of stored history entries
    δs  # history of steps
    γs  # history of gradient differences
    qs  # scratch vectors, one per history entry
    function LBFGS()
        return new()
    end
end

"""
    init!(M::LBFGS, m)

Set the history length of `M` to `m`, reset its three history buffers to
fresh empty arrays, and return `M`.
"""
function init!(M::LBFGS, m)
    M.m = m
    M.δs, M.γs, M.qs = [], [], []
    return M
end

"""
    step!(M::LBFGS, f, ∇f, θ)

Perform one L-BFGS step from `θ` and return the new point.

With an empty history the direction is the negative gradient. Otherwise it is
rebuilt from the stored `(δ, γ)` pairs with a backward and a forward pass over
the history. The step length comes from `line_search`. The new pair is appended
afterwards and the oldest entries are dropped until at most `M.m` remain.
"""
function step!(M::LBFGS, f, ∇f, θ)
    δs, γs, qs = M.δs, M.γs, M.qs
    m, g = length(δs), ∇f(θ)
    d = -g  # direction used while the history is empty
    if m > 0
        # Backward pass: newest to oldest, remembering q before each update.
        q = g
        for i in reverse(1:m)
            δi, γi = δs[i], γs[i]
            qs[i] = copy(q)
            q = q - (δi⋅q) / (γi⋅δi) * γi
        end
        # Forward pass: oldest to newest, starting from the scaled q.
        z = (γs[m] .* δs[m] .* q) / (γs[m]⋅γs[m])
        for i in 1:m
            δi, γi = δs[i], γs[i]
            z = z + δi*(δi⋅qs[i] - γi⋅z)/(γi⋅δi)
        end
        d = -z  # reconstructed direction
    end

    # One-dimensional objective and its slope along d.
    obj = α -> f(θ + α*d)
    slope = α -> ∇f(θ + α*d)⋅d
    α = line_search(obj, slope, d)

    θ′ = θ + α*d  # new point
    g′ = ∇f(θ′)
    push!(δs, θ′ - θ)
    push!(γs, g′ - g)
    push!(qs, zero(θ))
    while length(δs) > M.m
        popfirst!(δs)
        popfirst!(γs)
        popfirst!(qs)
    end
    return θ′
end

step! (generic function with 4 methods)

In [6]:
"""
    zoom(φ, φ′, αlo, αhi, c1=1e-4, c2=0.1, jmax=1000)

Bisect the bracket `(αlo, αhi)` until a step passes both the
sufficient-decrease test (constant `c1`) and the curvature test
`|φ′(α)| ≤ -c2*φ′(0)`. `φ` is the one-dimensional objective and `φ′` its
derivative. After `jmax` bisections the midpoint of the current bracket is
returned.
"""
function zoom(φ, φ′, αlo, αhi, c1=1e-4, c2=0.1, jmax=1000)
    slope0 = φ′(0.0)
    for _ in 1:jmax
        mid = 0.5(αlo + αhi)  # bisection
        val = φ(mid)
        too_high = val > φ(0.0) + c1*mid*slope0
        if too_high || val ≥ φ(αlo)
            # Midpoint is not good enough: shrink the bracket from above.
            αhi = mid
            continue
        end
        slope = φ′(mid)
        abs(slope) ≤ -c2*slope0 && return mid
        if slope*(αhi - αlo) ≥ 0.0
            αhi = αlo
        end
        αlo = mid
    end
    return 0.5(αlo + αhi)
end

"""
    line_search(φ, φ′, d, c1=1e-4, c2=0.1, ρ=0.1, αmax=100., jmax=1000)

Search for a step size along a one-dimensional objective `φ` with derivative
`φ′`, starting from the trial step `1.0`.

A trial step is returned directly when it passes the sufficient-decrease test
(constant `c1`) and the curvature test `|φ′(α)| ≤ -c2*φ′(0)`. When a bracket is
detected the search is handed to `zoom`, now with the same `c1` and `c2`, so
both phases apply the same acceptance tests. Otherwise the trial step is moved
towards `αmax` using the interpolation weight `ρ`. After `jmax` trials the
current trial step is returned.

`d` is accepted for interface compatibility and is not used.
"""
function line_search(φ, φ′, d, c1=1e-4, c2=0.1, ρ=0.1, αmax=100., jmax=1000)
    αi, αj = 0.0, 1.0
    φ0, φ′0 = φ(0.0), φ′(0.0)
    φαi = φ0  # αi starts at 0.0, so no second evaluation is needed
    for j in 1:jmax
        φαj = φ(αj)
        if φαj > φ0 + c1*αj*φ′0 || (φαj ≥ φαi && j > 1)
            return zoom(φ, φ′, αi, αj, c1, c2)
        end
        φ′αj = φ′(αj)
        if abs(φ′αj) ≤ -c2*φ′0
            return αj
        end
        if φ′αj ≥ 0.0
            return zoom(φ, φ′, αj, αi, c1, c2)
        end
        αi, αj = αj, ρ*αj + (1.0 - ρ)*αmax
        φαi = φαj
    end
    return αj
end

line_search (generic function with 6 methods)

# Test

In [11]:
"""
    f_beale(x)

Beale test function of the two components `x[1]` and `x[2]`: the sum of three
squared residuals.
"""
function f_beale(x)
    a, b = x[1], x[2]
    r1 = 1.5 - a + a*b
    r2 = 2.25 - a + a*b^2
    r3 = 2.625 - a + a*b^3
    return r1^2 + r2^2 + r3^2
end
"""
    ∇f_beale(x)

Gradient of the Beale function at `x`, returned as a two-element vector.

The coefficients `2/3` and `1/3` are written as exact fractions. Six-digit
decimal approximations of them leave an error of order `1e-5` in the second
component, which is visible at the minimiser `[3, 0.5]`, where the gradient
must vanish.
"""
function ∇f_beale(x)
    a, b = x[1], x[2]
    return [
        2*a*(b^6 + b^4 - 2*b^3 - b^2 - 2*b + 3) + 5.25*b^3 + 4.5*b^2 + 3*b - 12.75,
        6*a*(a*(b^5 + (2/3)*b^3 - b^2 - (1/3)*b - 1/3) + 2.625*b^2 + 1.5*b + 0.5)
    ]
end

"""
    f_rosenbrock(x)

Rosenbrock test function of the two components `x[1]` and `x[2]`.
"""
function f_rosenbrock(x)
    a, b = x[1], x[2]
    valley = b - a^2
    return 100*valley^2 + (1 - a)^2
end
"""
    ∇f_rosenbrock(x)

Gradient of the Rosenbrock function at `x`, returned as a two-element vector.
"""
function ∇f_rosenbrock(x)
    a, b = x[1], x[2]
    da = 400*a^3 - (400*a)*b + 2*a - 2
    db = 200*b - 200*a^2
    return [da, db]
end
"""
    f_ackley(x, a=20, b=0.2, c=2 * pi)

Ackley test function evaluated on the elements of `x`, with the default
constants `a = 20`, `b = 0.2` and `c = 2π`.
"""
function f_ackley(x, a=20, b=0.2, c=2 * pi)
    rms = sqrt(mean(x.^2))
    avgcos = mean(cos.(c .* x))
    return -a * exp(-b * rms) - exp(avgcos) + a + exp(1)
end


# Global starting point passed to the `optimalize` runs further down.
x = [3.0, 2.4]

2-element Array{Float64,1}:
 3.0
 2.4

In [12]:
# Sanity check: the Ackley function at the origin should be (numerically) zero.
# NOTE: this rebinds the global `x` to a 1×2 integer matrix. The optimizer runs
# below start from [3.0, 2.4], so the cell defining that value has to be
# re-run before them.
x = [0 0]

f_ackley(x)

4.440892098500626e-16

In [67]:
"""
    optimalize(f, ∇f, x₀, opt, e, i)

Run optimizer `opt` on `f` (gradient `∇f`) starting from `x₀`, logging every
iterate to standard output.

Iteration stops as soon as the loss drops below `e` or `i` steps have been
taken, so at most `i` calls to `step!` are made.

# Returns
A tuple `(pts, err, p)`: the visited points, the loss at each of them, and the
number of steps performed.
"""
function optimalize(f, ∇f, x₀, opt, e, i)
    pts = [x₀]       # successive x vectors
    err = Float64[]  # successive loss values
    p = 0
    while true
        println("Iteracja p=", p)
        println("Wektor x: ", pts[end])
        push!(err, f(pts[end]))  # record the loss at the newest point
        println("Error: ", err[end])
        println()
        # Reuse the recorded loss instead of evaluating f a second time, and
        # stop once exactly `i` steps were taken (`p > i` allowed one extra).
        if err[end] < e || p ≥ i
            break
        end
        push!(pts, step!(opt, f, ∇f, pts[end]))
        p += 1
    end

    return pts, err, p
end

optimalize (generic function with 1 method)

In [70]:
# BFGS on the Rosenbrock function: two variables, Q starts as the 2×2 identity.
bfgs = BFGS(2)

# Stop when the loss is below 1e-4 or the iteration limit of 100 is reached.
pts, err, i = optimalize(f_rosenbrock, ∇f_rosenbrock, x, bfgs, 0.0001, 100)
println("Ilość iteracji: ", i)
println("Wektor x wynikowy: ", pts[end])
println("Błąd: ", err[end])

Iteracja p=0
Wektor x: [3.0, 2.4]
Error: 4359.999999999999

Iteracja p=1
Wektor x: [-0.869140625, 3.04453125]
Error: 527.5033904747688

Iteracja p=2
Wektor x: [-1.2478796921229813, 1.5289880043793513]
Error: 5.132575804989364

Iteracja p=3
Wektor x: [-1.2417043536613335, 1.5434135844467658]
Error: 5.025489277615894

Iteracja p=4
Wektor x: [-1.2361480883445939, 1.5402612271752405]
Error: 5.015240152374404

Iteracja p=5
Wektor x: [-1.070603146480687, 1.1046813267665858]
Error: 4.459703494810388

Iteracja p=6
Wektor x: [-0.9425278888097915, 0.8234893291504641]
Error: 4.194219698475874

Iteracja p=7
Wektor x: [-0.45412622360138266, 0.17149732511561044]
Error: 2.2351232998836066

Iteracja p=8
Wektor x: [-0.46618302882906454, 0.2091867792523266]
Error: 2.15631836885373

Iteracja p=9
Wektor x: [-0.2656160104123663, 0.03285005089592283]
Error: 1.7439265643908124

Iteracja p=10
Wektor x: [-0.034693098392947735, -0.03766729334590771]
Error: 1.2216845289205052

Iteracja p=11
Wektor x: [-0.0288833

In [53]:
# L-BFGS on the Rosenbrock function, keeping at most 7 history entries.
lbfgs = LBFGS(); init!(lbfgs, 7)

# Same stopping rule as the BFGS run: loss below 1e-4 or iteration limit 100.
pts, err, i = optimalize(f_rosenbrock, ∇f_rosenbrock, x, lbfgs, 0.0001, 100)
println("Ilość iteracji: ", i)
println("Wektor x wynikowy: ", pts[end])
println("Błąd: ", err[end])

Ilość iteracji: 12
Wektor x wynikowy: [1.0068046044609678, 1.013810396194621]
Błąd: 4.870156675650007e-5


In [82]:
# Momentum on the Rosenbrock function with learning rate 1e-14 and decay 0.01.
# With a gradient of magnitude ~8e3 at the start, each update moves x by only
# about 1e-10 (see the printed iterates), so the loss barely changes within
# the iteration limit.
# NOTE(review): the original comment here read "weird stuff happens here" —
# presumably about behaviour at larger learning rates; confirm with the author.
momentum = Momentum(0.00000000000001, 0.01, 2)

pts, err, i = optimalize(f_rosenbrock, ∇f_rosenbrock, x, momentum, 0.01, 100)
println("Ilość iteracji: ", i)
println("Wektor x wynikowy: ", pts[end])
println("Błąd: ", err[end])

Iteracja p=0
Wektor x: [3.0, 2.4]
Error: 4359.999999999999

Gradient: [7924.0, -1320.0]
Iteracja p=1
Wektor x: [2.99999999992076, 2.4000000000132]
Error: 4359.999999354677

Gradient: [7923.999999204279, -1319.9999999022718]
Iteracja p=2
Wektor x: [2.9999999998407274, 2.400000000026532]
Error: 4359.999998702902

Gradient: [7923.999998400601, -1319.9999998035664]
Iteracja p=3
Wektor x: [2.999999999760687, 2.4000000000398654]
Error: 4359.999998051062

Gradient: [7923.999997596844, -1319.9999997048515]
Iteracja p=4
Wektor x: [2.9999999996806466, 2.4000000000531987]
Error: 4359.999997399221

Gradient: [7923.999996793085, -1319.999999606136]
Iteracja p=5
Wektor x: [2.999999999600606, 2.400000000066532]
Error: 4359.999996747381

Gradient: [7923.99999598933, -1319.9999995074213]
Iteracja p=6
Wektor x: [2.9999999995205657, 2.4000000000798654]
Error: 4359.99999609554

Gradient: [7923.99999518557, -1319.999999408706]
Iteracja p=7
Wektor x: [2.9999999994405253, 2.4000000000931987]
Error: 4359.9999

Wektor x: [2.999999992877211, 2.400000001186532]
Error: 4359.999941992799

Gradient: [7923.999928473673, -1319.9999912153469]
Iteracja p=90
Wektor x: [2.9999999927971706, 2.4000000011998655]
Error: 4359.999941340958

Gradient: [7923.999927669914, -1319.9999911166315]
Iteracja p=91
Wektor x: [2.99999999271713, 2.400000001213199]
Error: 4359.999940689118

Gradient: [7923.999926866157, -1319.9999910179165]
Iteracja p=92
Wektor x: [2.9999999926370897, 2.400000001226532]
Error: 4359.999940037277

Gradient: [7923.999926062398, -1319.9999909192013]
Iteracja p=93
Wektor x: [2.9999999925570493, 2.4000000012398655]
Error: 4359.999939385438

Gradient: [7923.999925258641, -1319.9999908204863]
Iteracja p=94
Wektor x: [2.999999992477009, 2.400000001253199]
Error: 4359.9999387335965

Gradient: [7923.999924454884, -1319.999990721771]
Iteracja p=95
Wektor x: [2.9999999923969685, 2.400000001266532]
Error: 4359.999938081757

Gradient: [7923.999923651127, -1319.999990623056]
Iteracja p=96
Wektor x: [2.999