<a href="https://colab.research.google.com/github/pavansai26/ADAM-OPTIMIZER-IMPLEMENTATION/blob/master/ADAM_OPTIMIZER_IMPLEMENTATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We compute the exponentially decaying averages of past gradients, $m_{t}$, and of past squared gradients, $v_{t}$, as follows:

$$\begin{align}
m_{t} &= \beta_{1} m_{t-1} + (1 - \beta_{1}) g_{t}\\
v_{t} &= \beta_{2} v_{t-1} + (1 - \beta_{2}) g_{t}^{2}
\end{align}$$

where $g_{t}$ is the gradient of the loss with respect to the weights at step $t$:

$$ g_{t} = \frac{\partial \mathcal{L}(\mathbf{w}_{t})}{\partial \mathbf{w}_{t}} $$

- $m_{t}$: estimate of the first moment (the mean) of the gradients
- $v_{t}$: estimate of the second moment (the uncentered variance) of the gradients

Bias correction

$$\begin{align}
\hat{m}_{t} &= \frac{m_{t}}{1 - \beta_{1}^{t}}\\
\hat{v}_{t} &= \frac{v_{t}}{1 - \beta_{2}^{t}}
\end{align}$$

Weight update

$$\mathbf{w}_{t+1} = \mathbf{w}_{t} - \frac{\eta}{\sqrt{\hat{v}_{t}} + \epsilon} \hat{m}_{t}$$

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time

import numpy as np

Build an Optimizer

In [None]:
class AdamOptimizer():
  """Adam optimizer for a scalar objective of two variables (x, y).

  The optimizer keeps the current point in `self.vars` (a length-2 array),
  maintains the decaying first/second moment estimates of the gradient, and
  records the visited points and objective values while training.

  Args:
    func: callable `func(x, y)` returning the objective value z.
    gradients: callable `gradients(x, y)` returning an array-like [dx, dy].
    x_init: initial x; drawn uniformly from [-3, 3) when None.
    y_init: initial y; drawn uniformly from [-3, 3) when None.
    learning_rate: step size (eta).
    beta1: decay rate of the first moment estimate.
    beta2: decay rate of the second moment estimate.
    epsilon: small constant added to the denominator of the update.
  """

  def __init__(self, func, gradients, x_init=None, y_init=None,
               learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Store the user-supplied callables. (The objective must come from the
    # `func` parameter; there is no name `function` in scope.)
    self.f = func
    self.g = gradients
    scale = 3.0
    self.vars = np.zeros([2])
    if x_init is not None:
      self.vars[0] = x_init
    else:
      self.vars[0] = np.random.uniform(low=-scale, high=scale)
    if y_init is not None:
      self.vars[1] = y_init
    else:
      self.vars[1] = np.random.uniform(low=-scale, high=scale)

    print("x_init: {:.3f}".format(self.vars[0]))
    print("y_init: {:.3f}".format(self.vars[1]))

    self.lr = learning_rate
    self.grads_first_moment = np.zeros([2])
    self.grads_second_moment = np.zeros([2])
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon

    # for accumulation of the objective value and the path (x, y)
    self.z_history = []
    self.x_history = []
    self.y_history = []

  def func(self, variables):
    """Evaluate the objective function passed to the constructor.

    Args:
      variables: length-2 sequence (x, y), e.g. a 1-D np.array
        x: x-dimension of inputs
        y: y-dimension of inputs

    Returns:
      z: objective value at (x, y)
    """
    x, y = variables
    z = self.f(x, y)
    return z

  def gradients(self, variables):
    """Evaluate the gradient function passed to the constructor.

    Args:
      variables: length-2 sequence (x, y), e.g. a 1-D np.array
        x: x-dimension of inputs
        y: y-dimension of inputs

    Returns:
      grads: whatever the gradient callable returns for (x, y); the rest of
        the class indexes it as [dx, dy] and uses it in array arithmetic.
    """
    x, y = variables
    grads = self.g(x, y)
    return grads

  def weights_update(self, grads, time):
    """Weights update using Adam.

      g1 = beta1 * g1 + (1 - beta1) * grads
      g2 = beta2 * g2 + (1 - beta2) * grads**2
      g1_unbiased = g1 / (1 - beta1**time)
      g2_unbiased = g2 / (1 - beta2**time)
      w = w - lr * g1_unbiased / (sqrt(g2_unbiased) + epsilon)

    Args:
      grads: gradient at the current point, array [dx, dy].
      time: 1-based step counter used for bias correction. It must be >= 1,
        otherwise the bias-correction denominators are zero. (The name
        shadows the `time` module inside this method only.)
    """
    self.grads_first_moment = self.beta1 * self.grads_first_moment + \
                              (1. - self.beta1) * grads
    self.grads_second_moment = self.beta2 * self.grads_second_moment + \
                              (1. - self.beta2) * grads**2

    self.grads_first_moment_unbiased = self.grads_first_moment / (1. - self.beta1**time)
    self.grads_second_moment_unbiased = self.grads_second_moment / (1. - self.beta2**time)

    self.vars = self.vars - self.lr * self.grads_first_moment_unbiased /(np.sqrt(self.grads_second_moment_unbiased) + self.epsilon)

  def history_update(self, z, x, y):
    """Append one (z, x, y) sample to the recorded histories."""
    self.z_history.append(z)
    self.x_history.append(x)
    self.y_history.append(y)

  def train(self, max_steps):
    """Run up to `max_steps` Adam updates from the current point.

    Resets the histories, writes every visited (x, y) to 'adam.txt' in the
    current working directory, prints progress every 100 steps, and stops
    early once the objective changes by less than 1e-7 between two
    consecutive steps. Afterwards `self.x_history` / `self.y_history` are
    np.arrays and `self.path` is their 2 x N stack.

    Args:
      max_steps: maximum number of update steps to perform.
    """
    self.z_history = []
    self.x_history = []
    self.y_history = []
    # No previous objective value exists before the first step. Using None
    # (instead of 0.0) avoids a spurious "convergence" when the starting
    # objective value happens to be close to zero.
    pre_z = None
    print("steps: {}  z: {:.6f}  x: {:.5f}  y: {:.5f}".format(0, self.func(self.vars), self.x, self.y))

    # The context manager closes the file even if func/gradients raise.
    with open('adam.txt', 'w') as path_file:
      path_file.write("{:.5f}  {:.5f}\n".format(self.x, self.y))

      for step in range(max_steps):
        self.z = self.func(self.vars)
        self.history_update(self.z, self.x, self.y)

        self.grads = self.gradients(self.vars)
        self.weights_update(self.grads, step+1)
        path_file.write("{:.5f}  {:.5f}\n".format(self.x, self.y))

        if (step+1) % 100 == 0:
          print("steps: {}  z: {:.6f}  x: {:.5f}  y: {:.5f}  dx: {:.5f}  dy: {:.5f}".format(step+1, self.func(self.vars), self.x, self.y, self.dx, self.dy))

        if pre_z is not None and np.abs(pre_z - self.z) < 1e-7:
          print("Enough convergence")
          print("steps: {}  z: {:.6f}  x: {:.5f}  y: {:.5f}".format(step+1, self.func(self.vars), self.x, self.y))
          # Record the final point reached by the last update as well.
          self.z = self.func(self.vars)
          self.history_update(self.z, self.x, self.y)
          break

        pre_z = self.z

    self.x_history = np.array(self.x_history)
    self.y_history = np.array(self.y_history)
    # Row 0 holds the x coordinates, row 1 the y coordinates.
    self.path = np.concatenate((np.expand_dims(self.x_history, 1), np.expand_dims(self.y_history, 1)), axis=1).T

  @property
  def x(self):
    """Current x coordinate."""
    return self.vars[0]

  @property
  def y(self):
    """Current y coordinate."""
    return self.vars[1]

  @property
  def dx(self):
    """x component of the most recent gradient (set during `train`)."""
    return self.grads[0]

  @property
  def dy(self):
    """y component of the most recent gradient (set during `train`)."""
    return self.grads[1]
