In [1]:
from NelderMead import minimize
import time
import math
import numpy as np

def GD(gradient_function, gamma, x1, steps, timelimit=None):
    t = time.time()
    
    all_points=[np.array(x1)]
    for i in range(1,steps+1):
        x=all_points[i-1]-gamma*gradient_function(all_points[i-1])
        #x=pd(x) #if necessery add PD function
        all_points.append(x)
        
        if timelimit != None and time.time() - t > timelimit:
                print('    Time limit reached!')
                break
        
    return all_points
        
def PGD(gradient_function, gamma, mu, x1, steps, timelimit=None):
    t = time.time()
    
    all_points=[x1, x1]
    x=x1
    for i in range(2,steps+2):
        x = all_points[i-1]-gamma*gradient_function(all_points[i-1])+mu*(all_points[i-1]-all_points[i-2])
        #x=pd(x) #if necessery add PD function
        all_points.append(x)
        
        if timelimit != None and time.time() - t > timelimit:
                print('    Time limit reached!')
                break
        
    return all_points
        
def NGD(gradient_function, gamma, mu, x1, steps, timelimit=None):
    t = time.time()
    
    all_points=[x1, x1]
    x=x1
    for i in range(2,steps+2):
        x = all_points[i-1] - \
            gamma*gradient_function(all_points[i-1] + mu*(all_points[i-1]-all_points[i-2])) + \
            mu*(all_points[i-1]-all_points[i-2])
        #x=pd(x) #if necessery add PD function
        all_points.append(x)
        
        if timelimit != None and time.time() - t > timelimit:
                print('    Time limit reached!')
                break
        
    return all_points
        
def AGD(gradient_function, gamma, x1, steps, timelimit=None):
    t = time.time()
    
    all_points=[x1]
    all_gradients=np.ones(len(x1))
    
    for i in range(1,steps+1):
        gradient_step=gradient_function(all_points[i-1])
        D=np.diag(1/np.sqrt(all_gradients))
        x = all_points[i-1] - gamma*D*gradient_step
            
        #x=pd(x) #if necessery add PD function
        all_gradients+=gradient_step**2
        all_points.append(x)
        
        if timelimit != None and time.time() - t > timelimit:
                print('    Time limit reached!')
                break
        
    return all_points

def NewtonMethod(hessian_matrix, gradient_function, x1, steps, timelimit=None):
    #all_points=np.zeros((steps+1, len(x1)))
    t = time.time()
    
    all_points=[x1]
    for i in range(1,steps+1):
        x=all_points[i-1]-np.dot(np.linalg.inv(hessian_matrix(all_points[i-1])),gradient_function(all_points[i-1]))
        #print(x)
        #x=pd(x) #if necessery add PD function
        #all_points[i,:]=x
        all_points.append(x)
        
        if timelimit != None and time.time() - t > timelimit:
                print('    Time limit reached!')
                break
        
    return all_points

def BFGS(gradient_function, x1, steps, timelimit=None):
    t = time.time()
    
    B=np.eye(len(x1))
    all_points=[x1, x1-np.dot(B,gradient_function(x1))]
    
    for i in range(steps):
        xk, xk_1 = all_points[-1], all_points[-2]
        grad_xk, grad_xk_1 = gradient_function(xk), gradient_function(xk_1)
        
        gamma = grad_xk-grad_xk_1
        delta = xk-xk_1
        
        #print(all_points)
        #print(grad_xk)
        #print(grad_xk_1)
        gamma=gamma[..., None]
        delta=delta[..., None]
        
        if delta.T.dot(gamma) == 0:
            print('    BFGS OVERFLOW!!!')
            return all_points
        
        B_new = B - (delta.dot(gamma.T.dot(B)) + B.dot(gamma).dot(delta.T)) / (delta.T.dot(gamma)) + \
             (1 + (gamma.T.dot(B).dot(gamma)) / (delta.T.dot(gamma))) * (delta * delta.T) / (delta.T.dot(gamma))      
        
        all_points.append(xk-B_new.dot(gradient_function(xk)))
        B=B_new
        
        #print(time.time() - t)
        if timelimit != None and time.time() - t > timelimit:
                print('    Time limit reached!')
                break
        
    return all_points

### Function to compare them all using different number of steps

In [2]:
def commpare_all(fun, hessian_function, gradient_function, x1, gamma, mu, actual_min, delta):
    
    print(f"Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*")
    print(f"    Gradient descend: {np.sum(np.square(GD(gradient_function, gamma, x1, 10)[-1]- actual_min)):.5f}")
    print(f"    Polyak gradient: {np.sum(np.square(PGD(gradient_function, gamma, mu, x1, 10)[-1]- actual_min)):.5f}")
    print(f"    Nestorov gradient descend: {np.sum(np.square(NGD(gradient_function, gamma, mu, x1, 10)[-1]- actual_min)):.5f}")
    print(f"    Newton method: {np.sum(np.square(NewtonMethod(hessian_function, gradient_function, x1, 10)[-1]- actual_min))}")
    print(f"    BFGS method: {np.sum(np.square(BFGS(gradient_function, x1, 10)[-1]- actual_min))}")
    print(f"    NelderMead method: {np.sum(np.square(minimize(fun, x1, max_iterations=10, delta=delta)- actual_min))}")
    
    print(f"Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*")
    print(f"    Gradient descend: {np.sum(np.square(GD(gradient_function, gamma, x1, 100)[-1]- actual_min)):.5f}")
    print(f"    Polyak gradient: {np.sum(np.square(PGD(gradient_function, gamma, mu, x1, 100)[-1]- actual_min)):.5f}")
    print(f"    Nestorov gradient descend: {np.sum(np.square(NGD(gradient_function, gamma, mu, x1, 100)[-1]- actual_min)):.5f}")
    print(f"    Newton method: {np.sum(np.square(NewtonMethod(hessian_function, gradient_function, x1, 100)[-1]- actual_min)):.5f}")
    print(f"    BFGS method: {np.sum(np.square(BFGS(gradient_function, x1, 100)[-1]- actual_min)):.5f}")
    print(f"    NelderMead method: {np.sum(np.square(minimize(fun, x1, max_iterations=100, delta=delta)- actual_min))}")
    
    print(f"Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*")
    print(f"    Gradient descend: {np.sum(np.square(GD(gradient_function, gamma, x1, 1000)[-1]- actual_min)):.5f}")
    print(f"    Polyak gradient: {np.sum(np.square(PGD(gradient_function, gamma, mu, x1, 1000)[-1]- actual_min)):.5f}")
    print(f"    Nestorov gradient descend: {np.sum(np.square(NGD(gradient_function, gamma, mu, x1, 1000)[-1]- actual_min)):.5f}")
    print(f"    Newton method: {np.sum(np.square(NewtonMethod(hessian_function, gradient_function, x1, 1000)[-1]- actual_min)):.5f}")
    print(f"    BFGS method: {np.sum(np.square(BFGS(gradient_function, x1, 1000)[-1]- actual_min)):.5f}")
    print(f"    NelderMead method: {np.sum(np.square(minimize(fun, x1, max_iterations=1000, delta=delta)- actual_min))}")

### First function
The first function on which we are going to compare all approaches is:
$$f(x,y,z)=(x-z)^{2}+(2y+z)^{2}+(4x-2y+z)^{2}+x+y$$
First we will start with 
$$x_{1}=(0,0,0)$$

In [7]:
def gradient_function_a_part(X):
    x,y,z = X[0], X[1], X[2]
    return np.array([34*x-16*y+6*z+1, -16*x+16*y+1, 6*x+6*z])

def hessian_function_a_part(_):
    return np.array([
        np.array([34, -16, 6]),
        np.array([-16, 16, 0]),
        np.array([6, 0, 6]),
    ])

def fun_a_part(X):
    x,y,z = X[0], X[1], X[2]
    return (x-z)**2+(2*y+z)**2+(4*x-2*y+z)**2+x+y

gamma = 0.01
mu = 0.05
x1=np.array([0,0,0])
actual_min = np.array([-1/6, -11/48, 1/6])
delta=0.1

commpare_all(fun_a_part,hessian_function_a_part, gradient_function_a_part, x1, gamma, mu, actual_min, delta)

Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.05475
    Polyak gradient: 0.05305
    Nestorov gradient descend: 0.05311
    Newton method: 1.5407439555097887e-33
    BFGS method: 8.520524277920641e-16
    NelderMead method: 0.0015423657682602562
Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.00027
    Polyak gradient: 0.00019
    Nestorov gradient descend: 0.00020
    Newton method: 0.00000
    BFGS OVERFLOW!!!
    BFGS method: 0.00000
    NelderMead method: 9.887765520216837e-10
Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.00000
    Polyak gradient: 0.00000
    Nestorov gradient descend: 0.00000
    Newton method: 0.00000
    BFGS OVERFLOW!!!
    BFGS method: 0.00000
    NelderMead method: 9.887765520216837e-10


and then 
$$x_{1}=(1,1,0)$$

In [8]:
gamma = 0.01
mu = 0.05
x1=np.array([1,1,0])
actual_min = np.array([-1/6, -11/48, 1/6])
delta=0.1

commpare_all(fun_a_part,hessian_function_a_part, gradient_function_a_part, x1, gamma, mu, actual_min, delta)

Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.99750
    Polyak gradient: 0.95503
    Nestorov gradient descend: 0.95750
    Newton method: 2.311115933264683e-33
    BFGS method: 2.311115933264683e-33
    NelderMead method: 1.6594534226870836
Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.00407
    Polyak gradient: 0.00295
    Nestorov gradient descend: 0.00298
    Newton method: 0.00000
    BFGS OVERFLOW!!!
    BFGS method: 0.00000
    NelderMead method: 1.7830876131816685e-09
Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.00000
    Polyak gradient: 0.00000
    Nestorov gradient descend: 0.00000
    Newton method: 0.00000
    BFGS OVERFLOW!!!
    BFGS method: 0.00000
    NelderMead method: 1.7830876131816685e-09


### Second function

The second function on which we are going to compare all approaches is:
$$f(x,y,z)=(x-1)^{2}+(y-1)^{2}+100(y-x^{2})^{2}+100(z-y^{2})^{2}$$
First we will start with 
$$x_{1}=(1.2, 1.2, 1.2)$$


In [9]:
def gradient_function_b_part(X):
    x, y, z = X[0], X[1], X[2]
    return np.array([2*(x-1)-400*x*(y-x**2), 2*(y-1)+200*(y-x**2)-400*y*(z-y**2), 200*(z-y**2)])

def hessian_function_b_part(X):
    x, y, z = X[0], X[1], X[2]
    return np.array([
        [-400*(y-x**2) + 800*x**2 + 2, -400*x, 0],
        [-400*x, -400*(z-y**2) + 800*y**2 + 202, -400*y],
        [0, -400*y, 200]
    ])

def fun_b_part(X):
    x, y, z = X[0], X[1], X[2]
    return (x-1)**2+(y-1)**2+100*(y-x**2)**2+100*(z-y**2)**2

gamma = 0.0001
mu = 0.1
x1=np.array([1.2, 1.2, 1.2])
actual_min = np.array([1, 1, 1])

delta=0.1

commpare_all(fun_b_part,hessian_function_a_part, gradient_function_a_part, x1, gamma, mu, actual_min, delta)

Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.10314
    Polyak gradient: 0.10160
    Nestorov gradient descend: 0.10161
    Newton method: 3.5664062500000004
    BFGS method: 3.5664062500000004
    NelderMead method: 0.1489573066436348
Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.03639
    Polyak gradient: 0.03589
    Nestorov gradient descend: 0.03587
    Newton method: 3.56641
    BFGS OVERFLOW!!!
    BFGS method: 3.56641
    NelderMead method: 2.106808024003783e-07
Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 1.10751
    Polyak gradient: 1.26815
    Nestorov gradient descend: 1.26799
    Newton method: 3.56641
    BFGS OVERFLOW!!!
    BFGS method: 3.56641
    NelderMead method: 4.028426536812032e-09


and then,
$$x_{1}=(-1, 1.2, 1.2)$$

In [10]:
x1=np.array([-1, 1.2, 1.2])
gamma = 0.0001
mu = 0.1
actual_min = np.array([1, 1, 1])
delta=0.1

commpare_all(fun_b_part,hessian_function_a_part, gradient_function_a_part, x1, gamma, mu, actual_min, delta)

Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 3.89218
    Polyak gradient: 3.87428
    Nestorov gradient descend: 3.87437
    Newton method: 3.5664062500000004
    BFGS OVERFLOW!!!
    BFGS method: 3.5664062500000004
    NelderMead method: 4.339876543209876
Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 2.73333
    Polyak gradient: 2.64018
    Nestorov gradient descend: 2.64065
    Newton method: 3.56641
    BFGS OVERFLOW!!!
    BFGS method: 3.56641
    NelderMead method: 2.0383914617433274
Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 2.27616
    Polyak gradient: 2.38798
    Nestorov gradient descend: 2.38788
    Newton method: 3.56641
    BFGS OVERFLOW!!!
    BFGS method: 3.56641
    NelderMead method: 2.8779533936226077e-09


The third function on which we are going to compare all approaches is:
$$(1.5-x+xy)^{2}+(2.25-x+xy^{2})^{2}+(2.625-x+xy^{3})^{2}$$
First we will start with 
$$x_{1}=(1, 1)$$

In [13]:
def gradient_function_c_part(X):
    x, y = X[0], X[1]
    return np.array([2*(1.5-x+x*y)*(y-1)+2*(2.25-x+x*y**2)*(y**2-1)+2*(2.625-x+x*y**3)*(y**3-1), \
                    2*(1.5-x+x*y)*x+4*(2.25-x+x*y**2)*(x*y)+6*(2.625-x+x*y**3)*(x*y**2)])

def hessian_matrix_c_part(X):
    x, y = X[0], X[1]
    return np.array([[2*(y**6+y**4-2*y**3-y**2-2*y+3), 2*x*(6*y**5+4*y**3-6*y**2-2*y-2)+15.75*y**2+9*y+3], \
                    [2*x*(6*y**5+4*y**3-6*y**2-2*y-2)+15.75*y**2+9*y+3, 2*x*(x*(15*y**4+6*y**2-6*y-1)+6*2.625*y+4.5*y)]])

def fun_c_part(X):
    x,y = X[0], X[1]
    return (1.5-x+x*y)**2+(2.25-x+x*y**2)**2+(2.625-x+x*y**3)**2


gamma = 0.0001
mu = 0.1
x1=np.array([1,1])
actual_min = np.array([3, 0.5])
delta = 0.1

commpare_all(fun_c_part, hessian_matrix_c_part, gradient_function_c_part, x1, gamma, mu, actual_min, delta)

Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 4.22247
    Polyak gradient: 4.21979
    Nestorov gradient descend: 4.21978
    Newton method: 9.25
    BFGS method: nan
    NelderMead method: 0.5049041771888727
Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 3.98408
    Polyak gradient: 3.95600
    Nestorov gradient descend: 3.95591
    Newton method: 9.25000
    BFGS method: nan
    NelderMead method: 3.4633924853168536e-08
Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 2.35085
    Polyak gradient: 2.22826
    Nestorov gradient descend: 2.22818
    Newton method: 9.25000
    BFGS method: nan
    NelderMead method: 3.4633924853168536e-08


  return np.array([2*(1.5-x+x*y)*(y-1)+2*(2.25-x+x*y**2)*(y**2-1)+2*(2.625-x+x*y**3)*(y**3-1), \
  2*(1.5-x+x*y)*x+4*(2.25-x+x*y**2)*(x*y)+6*(2.625-x+x*y**3)*(x*y**2)])


and then
$$x_{1}=(4.5, 4.5)$$

In [14]:
gamma = 1e-6
mu = 0.5
x1=np.array([4.5, 4.5])
actual_min = np.array([3, 0.5])
delta = 0.1

commpare_all(fun_c_part, hessian_matrix_c_part, gradient_function_c_part, x1, gamma, mu, actual_min, delta)

Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 9.82881
    Polyak gradient: 6.85520
    Nestorov gradient descend: 7.32214
    Newton method: 8.582711040688768
    BFGS method: nan
    NelderMead method: 4.025334472656165
Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 3.58792
    Polyak gradient: 2.41752
    Nestorov gradient descend: 2.46649
    Newton method: 9.25000
    BFGS method: nan
    NelderMead method: 5.2163738614556345e-09
Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 1.14620
    Polyak gradient: 0.83584
    Nestorov gradient descend: 0.85289
    Newton method: 9.25000
    BFGS method: nan
    NelderMead method: 5.2163738614556345e-09


  return np.array([2*(1.5-x+x*y)*(y-1)+2*(2.25-x+x*y**2)*(y**2-1)+2*(2.625-x+x*y**3)*(y**3-1), \
  2*(1.5-x+x*y)*x+4*(2.25-x+x*y**2)*(x*y)+6*(2.625-x+x*y**3)*(x*y**2)])
