In [1]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [2]:
class Adam(object):
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
        """
        Use recommended parameters from paper of Adam: 
            -- https://arxiv.org/abs/1412.6980
        """
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.eps = epsilon
        self.iter = 1
    def update(self, params, grads):
        f_param = params.ravel()
        f_grad  = grads.ravel()
        if not hasattr(self, 'ms'):
            self.ms = np.zeros_like(f_param)
            self.vs = np.zeros_like(f_param)
        for i, (x, dx, m, v) in enumerate(zip(f_param, f_grad, self.ms, self.vs)):    
            # Evaluate:
            m = self.beta_1*m + (1-self.beta_1)*dx # m_t = b1*m_t-1 + (1-b1)*g
            mt = m / (1-self.beta_1**self.iter) # m_t_h = m_t / (1-b1^t)
            v = self.beta_2*v + (1-self.beta_2)*(dx**2) # v_t = b2*v_t-1 + (1-b2)*g^2
            vt = v / (1-self.beta_2**self.iter) # v_t_h = v_t / (1-b2^t)
            
            # Update:
            f_param[i] -= self.lr * mt / (np.sqrt(vt) + self.eps) # theta = -lr * m_t_h / (sqrt(v_t_h) + eps)
            self.ms[i] = m # write m_t to memory (update from m_t-1 to m_t)
            self.vs[i] = v # write v_t to memory (update from v_t-1 to v_t)
        self.iter += 1

def train(x, g_f, batch_size=1, iterations=2000, optimizer=Adam(), approximate_gradient=False, f=None, x_eps=1e-7):
    x = x.copy()
    for _ in range(iterations):
        if approximate_gradient:
            grad = np.repeat(( (f(x+x_eps) - f(x)) / x_eps ).mean(axis=0, keepdims=True), x.shape[0], axis=0)
        else:
            grad= np.repeat(g_f(x).mean(axis=0, keepdims=True), x.shape[0], axis=0)
        optimizer.update(x, grad)
    return x.mean(axis=0)

In [3]:
# Basic definition of fx funciton
# fx = lambda x: x**4 - 3*(x**2) + 2
# g_fx = lambda x: 4*x**3 - 6*x
def fx(x):
    return x**4 - 3*(x**2) + 2

def g_fx(x):
    return 4*x**3 - 6*x

# Basic definition of rosenbrock function
# rosenbrock = lambda x1, x2: 100*(x2-x1)**2 + (1-x1)**2
# g_rosenbrock = lambda x1, x2: (202*x1 - 200*x2 - 2, -200*(x1-x2)) # partial_x1, partial_x2
def rosenbrock(x):
    return 100*(x[...,1]-x[...,0])**2 + (1-x[...,0])**2 # (batch_size)

def g_rosenbrock(x):
    ret = np.zeros_like(x)
    ret[...,0] = 202*x[...,0] - 200*x[...,1] - 2
    ret[...,1] = -200*(x[...,0]-x[...,1])
    return ret


In [4]:
BATCH_SIZE = 1

In [5]:
fx_x = np.random.randn(BATCH_SIZE,1)
print('fx_x:', fx_x)

('fx_x:', array([[-0.78910299]]))


In [6]:
rosenbrock_x = np.random.randn(BATCH_SIZE,2)
print('rosenbrock_x:', rosenbrock_x)

('rosenbrock_x:', array([[ 0.11219411, -0.38462552]]))


In [7]:
# min_fx = train(fx_x, g_fx, 20000, approximate_gradient=True, f=fx)
# min_rosenbrock = train(rosenbrock_x, g_rosenbrock, 200000, approximate_gradient=True, f=rosenbrock)
min_fx = train(fx_x, g_fx, 20000)
min_rosenbrock = train(rosenbrock_x, g_rosenbrock, 200000)

In [8]:
print('min_fx:', min_fx)
print('min_rosenbrock', min_rosenbrock)

('min_fx:', array([-1.22474487]))
('min_rosenbrock', array([-0.37091635, -0.38462552]))
