### Bring in all the functions needed from Optim:

In [1]:

using Optim

import Optim.OptimizationTrace
import Optim.MultivariateOptimizationResults

# Largest element-wise absolute difference between `x` and `y`.
#
# Visits the indices 1:length(x) and keeps the largest `abs(x[i] - y[i])` seen,
# starting from 0.0, so an empty `x` gives 0.0. A difference only replaces the
# running value when it compares strictly greater, which means a NaN difference
# is never adopted.
function maxdiff(x::Array, y::Array)
    largest = 0.0
    for idx in 1:length(x)
        gap = abs(x[idx] - y[idx])
        # Skip anything that does not strictly exceed the current maximum.
        gap > largest || continue
        largest = gap
    end
    return largest
end

# Evaluate the three stopping criteria for one optimisation step.
#
# Returns the tuple `(x_converged, f_converged, gr_converged, converged)`:
#   * x_converged  - `maxdiff(x, x_previous)` is below `xtol`;
#   * f_converged  - the relative change `|f_x - f_x_previous| / (|f_x| + ftol)`
#                    is below `ftol`, OR `nextfloat(f_x) >= f_x_previous`, i.e.
#                    the objective did not decrease by more than one
#                    floating-point step (so any increase also counts);
#   * gr_converged - the infinity norm of the gradient is below `grtol`;
#   * converged    - true when any one of the three flags is true.
function assess_convergence(x::Array,
                            x_previous::Array,
                            f_x::Real,
                            f_x_previous::Real,
                            gr::Array,
                            xtol::Real,
                            ftol::Real,
                            grtol::Real)
    # Step-size criterion.
    x_converged = maxdiff(x, x_previous) < xtol

    # Objective criterion: a relative (not absolute) tolerance, with `ftol`
    # added to the denominator so that f_x == 0 does not divide by zero.
    relative_change = abs(f_x - f_x_previous) / (abs(f_x) + ftol)
    f_converged = relative_change < ftol || nextfloat(f_x) >= f_x_previous

    # Gradient criterion.
    gr_converged = norm(vec(gr), Inf) < grtol

    converged = x_converged || f_converged || gr_converged

    return x_converged, f_converged, gr_converged, converged
end

# Record the current optimiser state in the trace, if tracing is enabled.
#
# The expansion intentionally uses variables of the scope in which the macro is
# invoked: it reads `tracing`, `extended_trace`, `x`, `gr`, `tr`, `iteration`,
# `f_x`, `store_trace` and `show_trace`, and calls `norm` and `update!`.
#
# The whole quoted body is wrapped in `esc` so that these names are looked up
# at the call site. Without `esc`, macro hygiene treats names that are only
# read as globals of the module defining the macro, so the caller's locals are
# not guaranteed to be seen. A side effect of escaping everything is that `dt`
# and `grnorm` are assigned in the caller's scope as well.
macro gdtrace()
    return esc(quote
        if tracing
            dt = Dict()
            if extended_trace
                # Copies, so later changes to x / gr do not alter the trace.
                dt["x"] = copy(x)
                dt["g(x)"] = copy(gr)
            end
            grnorm = norm(gr, Inf)
            update!(tr,
                    iteration,
                    f_x,
                    grnorm,
                    dt,
                    store_trace,
                    show_trace)
        end
    end)
end



## AdaDelta algorithm from [this paper](http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf)

Note:

 - Does not use a line search.
    - All the Optim algorithms that do use a line search make many calls to `d.fg!` per iteration.
    - This one makes only one, and can therefore afford to do many more iterations (in the same amount of time).
 - Has two hyperparameters: the decay constant ρ and the smoothing constant ϵ.
    - It is not highly sensitive to their values.
    


In [2]:

# Minimise `d` with the AdaDelta update rule, starting from `initial_x`.
#
# Each iteration makes exactly one call to `d.fg!(x, gr)`, whose return value
# is used as the objective value at `x` (it is presumably also expected to
# write the gradient into `gr` - confirm against the Optim function types).
# No line search is performed.
#
# Arguments
#   d              - objective exposing an `fg!` field
#   initial_x      - starting point; it is copied, not modified
# Keyword arguments
#   xtol, ftol, grtol - tolerances forwarded to `assess_convergence`
#   iterations     - maximum number of update steps
#   store_trace, show_trace, extended_trace - tracing switches used by `@gdtrace`
#   ρ              - decay constant of the running averages, must be in (0, 1)
#   ϵ              - smoothing constant added inside the RMS, must be > 0
#
# Returns a `MultivariateOptimizationResults` labelled "AdaDelta". The reported
# minimiser and minimum are the best iterate seen (lowest `f_x`), while the
# convergence flags describe the last step taken.
function adadelta{T}(d::Union(DifferentiableFunction,
                                      TwiceDifferentiableFunction),
                             initial_x::Array{T};
                             xtol::Real = 1e-32,
                             ftol::Real = 1e-8,
                             grtol::Real = 1e-8,
                             iterations::Integer = 20*1000,
                             store_trace::Bool = false,
                             show_trace::Bool = false,
                             extended_trace::Bool = false,
                             ρ = 0.95, #decay constant
                             ϵ = 1e-6 #smoothing constant
    )

    @assert 0.0<ρ<1.0
    @assert 0.0<ϵ

    # Root-mean-square of a running average of squares; ϵ keeps it above zero.
    function rms(x²)
        √(x².+ϵ)
    end

    # Exponential moving average of squares: avg <- ρ*avg + (1-ρ)*g.^2,
    # written back into `avg`.
    function update_running_squared_average!(avg, g)
        avg[:].*=ρ
        avg[:].+=(1-ρ).*(g.^2)
    end

    # Maintain current state in x
    x = copy(initial_x)

    # Track calls to function and gradient
    f_calls, g_calls = 0, 0

    # Maintain current gradient in gr
    gr = similar(x)

    # Running averages of past squared gradients and squared updates
    E_gr² = zeros(size(x))
    E_Δx² = zeros(size(x))

    # Store f(x) in f_x
    f_x_previous = NaN
    f_x = d.fg!(x, gr)

    f_calls+=1
    g_calls+=1

    # Best point seen so far. Sharing the array with `x` is fine because the
    # update below rebinds `x` to a new array instead of mutating it.
    f_x_best = f_x
    x_best = x

    # Trace the history of states visited
    tr = OptimizationTrace()
    tracing = store_trace || show_trace || extended_trace
    @gdtrace

    # Assess multiple types of convergence
    x_converged, f_converged, gr_converged = false, false, false

    converged = false
    iteration = 0
    while !converged && iteration < iterations
        # Increment the number of steps we've had to perform
        iteration += 1

        # AdaDelta step: scale the gradient by RMS[Δx] of the previous steps
        # over RMS[g] including the current gradient.
        update_running_squared_average!(E_gr², gr)
        Δx = -rms(E_Δx²)./rms(E_gr²) .* gr
        update_running_squared_average!(E_Δx², Δx)

        # Update current position; `x += Δx` allocates a new array, so
        # x_previous (and x_best) keep referring to the earlier iterate.
        x_previous = x
        x+=Δx

        # Update the function value and gradient
        f_x_previous = f_x
        f_x = d.fg!(x, gr)
        f_calls+=1
        g_calls+=1

        # Steps are not guaranteed to decrease f, so remember the best point.
        if f_x<=f_x_best
            f_x_best = f_x
            x_best=x
        end

        x_converged,
        f_converged,
        gr_converged,
        converged = assess_convergence(x,
                                       x_previous,
                                       f_x,
                                       f_x_previous,
                                       gr,
                                       xtol,
                                       ftol,
                                       grtol)

        @gdtrace
    end

    return MultivariateOptimizationResults("AdaDelta",
                                           initial_x,
                                           x_best,
                                           f_x_best,
                                           iteration,
                                           iteration == iterations,
                                           x_converged,
                                           xtol,
                                           f_converged,
                                           ftol,
                                           gr_converged,
                                           grtol,
                                           tr,
                                           f_calls,
                                           g_calls)
end


adadelta (generic function with 1 method)

## Now let's test it

In [3]:
using Optim.UnconstrainedProblems
import Optim.UnconstrainedProblems.examples

# Run AdaDelta on every problem in `examples` and print, for each one, the
# optimisation result next to the problem's reference solution.
for (problemName, eg) in examples
    println("--- $problemName ---")

    objective = TwiceDifferentiableFunction(eg.f, eg.g!, eg.h!)

    # Time only the optimisation call itself.
    tic()
    res = adadelta(objective, eg.initial_x)
    toc()

    println(res)
    println()

    # Reference solution and its objective value, then what AdaDelta found.
    @show eg.solutions
    @show eg.f(eg.solutions)
    @show res.minimum
    @show res.f_minimum

    println("-"^50)
end
    

--- Exponential ---
elapsed time: 0.77886314 seconds
Results of Optimization Algorithm
 * Algorithm: AdaDelta
 * Starting Point: [0.0,0.0]
 * Minimum: [2.0,2.999765047042759]
 * Value of Function at Minimum: 2.000000
 * Iterations: 1813
 * Convergence: true
   * |x - x'| < 1.0e-32: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-08: true
   * |g(x)| < 1.0e-08: false
   * Exceeded Maximum Number of Iterations: false
 * Objective Function Calls: 1814
 * Gradient Call: 1814

eg.solutions => [2.0,3.0]
eg.f(eg.solutions) => 2.0
res.minimum => [2.0,2.999765047042759]
res.f_minimum => 2.000000055202894
--------------------------------------------------
--- Fletcher-Powell ---
elapsed time: 0.019652169 seconds
Results of Optimization Algorithm
 * Algorithm: AdaDelta
 * Starting Point: [-1.0,0.0,0.0]
 * Minimum: [-1.0,-6.91823692886755e-310,-7.013163e-317]
 * Value of Function at Minimum: 2500.000000
 * Iterations: 1
 * Convergence: true
   * |x - x'| < 1.0e-32: true
   * |f(x) - f(x')| / |f(x)| < 1.

## Discussion
Compared to the [benchmarks](https://github.com/JuliaOpt/Optim.jl/blob/master/benchmarks/results.tsv) from the other algorithms, it does awfully.
Honestly, I am not sure why. I suspect it needs a line search.

It could be that it is simply better suited to problems with very high dimensionality and complex functions/gradients, such as machine learning problems (like MNIST).