In [1]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [2]:
class Adam(object):
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
        """
        Use recommended parameters from paper of Adam: 
            -- https://arxiv.org/abs/1412.6980
        """
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.eps = epsilon
        self.iter = 1
    def update(self, params, grads):
        f_param = params.ravel()
        f_grad  = grads.ravel()
        if not hasattr(self, 'ms'):
            self.ms = np.zeros_like(f_param)
            self.vs = np.zeros_like(f_param)
        for i, (x, dx, m, v) in enumerate(zip(f_param, f_grad, self.ms, self.vs)):    
            # Evaluate:
            m = self.beta_1*m + (1-self.beta_1)*dx # m_t = b1*m_t-1 + (1-b1)*g
            mt = m / (1-self.beta_1**self.iter) # m_t_h = m_t / (1-b1^t)
            v = self.beta_2*v + (1-self.beta_2)*(dx**2) # v_t = b2*v_t-1 + (1-b2)*g^2
            vt = v / (1-self.beta_2**self.iter) # v_t_h = v_t / (1-b2^t)
            
            # Update:
            f_param[i] -= self.lr * mt / (np.sqrt(vt) + self.eps) # theta = -lr * m_t_h / (sqrt(v_t_h) + eps)
            self.ms[i] = m # write m_t to memory (update from m_t-1 to m_t)
            self.vs[i] = v # write v_t to memory (update from v_t-1 to v_t)
        self.iter += 1

In [3]:
class Adagrad(object):
    def __init__(self, lr=0.001, decay_rate=0.9, epsilon=1e-8):
        """
        Ref from CS231n:
        http://cs231n.github.io/neural-networks-3
        cache = decay_rate * cache + (1 - decay_rate) * dx**2
        x += - learning_rate * dx / (np.sqrt(cache) + eps)
        """
        self.lr = lr
        self.decay = decay_rate
        self.eps = epsilon
    def update(self, params, grads):
        f_param = params.ravel()
        f_grad  = grads.ravel()
        if not hasattr(self, 'cache'):
            self.cache = np.zeros_like(f_param)
        for i, (x, dx, c) in enumerate(zip(f_param, f_grad, self.cache)):    
            # Evaluate:
            c_t = self.decay * c + (1 - self.decay) * dx**2
            
            # Update:
            f_param[i] -= self.lr * dx / (np.sqrt(c_t) + self.eps) 
            self.cache[i] = c_t # update cache

In [4]:
def minimize(x, g_f, iterations=2000, optimizer=Adam(), approximate_gradient=False, f=None, x_eps=1e-8, return_seq=False):
    x = x.copy()
    if return_seq:
        xt = [x]
        yt = [f(x)]
    for _ in range(iterations):
        if approximate_gradient:
            grad = (f(x+x_eps) - f(x)) / x_eps 
        else:
            grad = g_f(x)
        optimizer.update(x, grad)
        if return_seq:
            xt.append(x)
            yt.append(f(x))
    if return_seq:
        return x, xt, yt
    else:
        return x

In [5]:
# Basic definition of fx funciton
# fx = lambda x: x**4 - 3*(x**2) + 2
# g_fx = lambda x: 4*x**3 - 6*x
def fx(x):
    return np.squeeze(x**4 - 3*(x**2) + 2)

def g_fx(x):
    ret = np.zeros_like(x)
    ret[...] = 4*x**3 - 6*x
    return ret

# Basic definition of rosenbrock function
# rosenbrock = lambda x1, x2: 100*(x2-x1)**2 + (1-x1)**2
# g_rosenbrock = lambda x1, x2: (202*x1 - 200*x2 - 2, -200*(x1-x2)) # partial_x1, partial_x2
def rosenbrock(x):
    return np.squeeze(100*(x[...,1]-x[...,0])**2 + (1-x[...,0])**2) # (batch_size)

def g_rosenbrock(x):
    ret = np.zeros_like(x)
    ret[...,0] = 202*x[...,0] - 200*x[...,1] - 2
    ret[...,1] = -200*(x[...,0]-x[...,1])
    return ret


In [6]:
def visualize(xs, ys, f):
    pass

In [7]:
fx_x = np.random.randn(1)
print('fx_x:', fx_x)

('fx_x:', array([1.53707316]))


In [8]:
rosenbrock_x = np.random.randn(2) + 1
print('rosenbrock_x:', rosenbrock_x)

('rosenbrock_x:', array([0.67353489, 0.84394122]))


In [9]:
optimizers = [Adagrad, Adam]
funcs = [fx, rosenbrock]
grads = [g_fx, g_rosenbrock]
xs    = [fx_x, rosenbrock_x]
iters = [10000, 100000]
for func, x, g_func, iter_ in zip(funcs, xs, grads, iters):
    for opt_class in optimizers:
        opt = opt_class()
        final_x, xs, ys = minimize(x, g_func, iterations=iter_, f=func, return_seq=True)
        visualize(xs, ys, func)
        print(type(opt).__name__, func.__name__, final_x, func(x), iter_)

('Adagrad', 'fx', array([1.22474487]), array(0.49406821), 10000)
('Adam', 'fx', array([1.22474487]), array(0.49406821), 10000)
('Adagrad', 'rosenbrock', array([0.84548636, 0.84394122]), 3.0104114236778665, 100000)
('Adam', 'rosenbrock', array([0.84548636, 0.84394122]), 3.0104114236778665, 100000)
