In [9]:
import numpy as np

In [3]:
def gradient_descent(f, gradient, x0, alpha, eps, iters):
    x = x0
    for i in range(iters):
        x_new = x - alpha * gradient(x)

        if abs(f(x_new) - f(x)) < eps:
            break

        x = x_new
        
    result = {}
    result['converged'] = i != iters
    result['num_iters'] = i
    result['x'] = x_new
        
    return result

In [5]:
def f(x):
    return 0.5*(x[0]**2 + 10*x[1]**2)

In [7]:
def gradient(x):
    return np.array([x[0], 10*x[1]])

In [23]:
x0 = np.array([3,4])
eps = 0.00001
iters = 1000
alpha = 0.1

gradient_descent(f, gradient, x0, alpha, eps, iters)

{'converged': True, 'num_iters': 54, 'x': array([0.00912976, 0.        ])}

In [13]:
def momentum(f, gradient, x0, alpha, eps, iters, beta):
    x=x0
    d = 0
    
    for i in range(iters):
        d = beta*d - alpha*gradient(x)
        x_new = x + d
        
        if abs(f(x_new) - f(x)) < eps:
            break
            
        x = x_new
    
    result = {}
    result['converged'] = i != iters
    result['num_iters'] = i
    result['x'] = x_new
        
    return result

In [22]:
momentum(f, gradient, x0, alpha, eps, iters, beta=0.9)

{'converged': True, 'num_iters': 126, 'x': array([-0.00372751,  0.00300256])}

In [28]:
def nesterov(f, gradient, x0, alpha, eps, iters, beta):
    x=x0
    d = 0
    
    for i in range(iters):
        d = beta*d - alpha*gradient(x - beta*d)
        x_new = x + d
        
        if abs(f(x_new) - f(x)) < eps:
            break
            
        x = x_new
    
    result = {}
    result['converged'] = i != iters
    result['num_iters'] = i
    result['x'] = x_new
        
    return result

In [29]:
nesterov(f, gradient, x0, alpha, eps, iters, beta=0.9)

{'converged': True,
 'num_iters': 999,
 'x': array([-7.79871912e-003,  1.98559672e+128])}

In [37]:
def adam(f, gradient, x0, alpha, eps, iters, beta1, beta2):
    result = {}
    x = x0
    m = 0
    v = 0
    
    for i in range(iters):
        grad = gradient(x)
        m = beta1*m + (1-beta1)*grad
        v = beta2*v + (1-beta2)*grad**2
        
        m_hat = m/(1-beta1**i)
        v_hat = v/(1-beta2**i)
        
        x_new = x - alpha*m_hat / (np.sqrt(v_hat) + eps)
        

        if abs(f(x_new) - f(x)) < eps:
            result['converged'] = True
            break
        x = x_new
        
    if 'converged' not in result:
        result['converged'] = False
        result['iter'] = i
        result['x'] = x
            
    return result

In [38]:
adam(x0=x0,
     f=f,
     gradient=gradient,
     alpha=0.5, # u stvarnim primenama bi alpha bilo mnogo manje, npr. 0.001
     iters=1000,
     beta1=0.9,
     beta2=0.999,
     eps=1e-6)

  m_hat = m/(1-beta1**i)
  v_hat = v/(1-beta2**i)
  x_new = x - alpha*m_hat / (np.sqrt(v_hat) + eps)


{'converged': False, 'iter': 999, 'x': array([nan, nan])}