### Importing all functions first

In [12]:
from NelderMead import minimize
import time
import math
import numpy as np

def GD(gradient_function, gamma, x1, steps, timelimit=None):
    """Run plain gradient descent with a constant step size.

    Args:
        gradient_function: Callable mapping a point to the gradient at that point.
        gamma: Step size multiplied with the gradient on every update.
        x1: Starting point; it is copied into a NumPy array.
        steps: Maximum number of updates to perform.
        timelimit: Optional wall-clock budget in seconds. It is checked after
            each update, so at least one update is made when ``steps > 0``.

    Returns:
        List of visited points, starting with the initial point.
    """
    started = time.time()

    current = np.array(x1)
    all_points = [current]
    for _ in range(steps):
        current = current - gamma * gradient_function(current)
        # current = pd(current)  # if necessary, apply the PD function here
        all_points.append(current)

        out_of_time = timelimit is not None and time.time() - started > timelimit
        if out_of_time:
            print('    Time limit reached!')
            break

    return all_points
        
def PGD(gradient_function, gamma, mu, x1, steps, timelimit=None):
    """Run gradient descent with Polyak (heavy-ball) momentum.

    Each update is
    ``x_new = x - gamma * grad(x) + mu * (x - x_prev)``.

    Args:
        gradient_function: Callable mapping a point to the gradient at that point.
        gamma: Step size multiplied with the gradient.
        mu: Momentum coefficient multiplied with the previous displacement.
        x1: Starting point (any array-like); it is copied into a NumPy array.
        steps: Maximum number of updates to perform.
        timelimit: Optional wall-clock budget in seconds, checked after each update.

    Returns:
        List of visited points. The starting point appears twice at the front
        (it doubles as the "previous" point of the first update), so a full run
        returns ``steps + 2`` entries.
    """
    started = time.time()

    # Copy into an ndarray, as GD does, so list/tuple starting points support
    # the arithmetic below instead of failing on ``list - list``.
    start = np.array(x1)
    all_points = [start, start]
    for _ in range(steps):
        current, previous = all_points[-1], all_points[-2]
        x = current - gamma * gradient_function(current) + mu * (current - previous)
        # x = pd(x)  # if necessary, apply the PD function here
        all_points.append(x)

        if timelimit is not None and time.time() - started > timelimit:
            print('    Time limit reached!')
            break

    return all_points
        
def NGD(gradient_function, gamma, mu, x1, steps, timelimit=None):
    """Run gradient descent with Nesterov momentum.

    Each update evaluates the gradient at the look-ahead point:
    ``x_new = x - gamma * grad(x + mu * (x - x_prev)) + mu * (x - x_prev)``.

    Args:
        gradient_function: Callable mapping a point to the gradient at that point.
        gamma: Step size multiplied with the gradient.
        mu: Momentum coefficient multiplied with the previous displacement.
        x1: Starting point (any array-like); it is copied into a NumPy array.
        steps: Maximum number of updates to perform.
        timelimit: Optional wall-clock budget in seconds, checked after each update.

    Returns:
        List of visited points. The starting point appears twice at the front
        (it doubles as the "previous" point of the first update), so a full run
        returns ``steps + 2`` entries.
    """
    started = time.time()

    # Copy into an ndarray, as GD does, so list/tuple starting points support
    # the arithmetic below instead of failing on ``list - list``.
    start = np.array(x1)
    all_points = [start, start]
    for _ in range(steps):
        current, previous = all_points[-1], all_points[-2]
        momentum = mu * (current - previous)  # computed once, used twice
        x = current - gamma * gradient_function(current + momentum) + momentum
        # x = pd(x)  # if necessary, apply the PD function here
        all_points.append(x)

        if timelimit is not None and time.time() - started > timelimit:
            print('    Time limit reached!')
            break

    return all_points
        
def AGD(gradient_function, gamma, x1, steps, timelimit=None):
    """Run AdaGrad-style gradient descent with per-coordinate step scaling.

    Every coordinate of the gradient is divided by the square root of an
    accumulator that starts at 1 and grows by the squared gradient after each
    update:
    ``x_new = x - gamma * grad(x) / sqrt(accumulator)``.

    Args:
        gradient_function: Callable mapping a point to the gradient at that point.
        gamma: Base step size.
        x1: Starting point (any array-like); it is copied into a NumPy array.
        steps: Maximum number of updates to perform.
        timelimit: Optional wall-clock budget in seconds, checked after each update.

    Returns:
        List of visited points, starting with the initial point. Every entry
        has the same shape as the starting point.
    """
    started = time.time()

    current = np.array(x1)
    all_points = [current]
    squared_gradient_sum = np.ones(len(current))

    for _ in range(steps):
        gradient_step = gradient_function(current)
        # Scale each coordinate separately. This is the matrix-vector product
        # diag(1/sqrt(s)) @ g; multiplying the diagonal *matrix* by the vector
        # with `*` would broadcast into an (n, n) matrix instead of a vector.
        current = current - gamma * gradient_step / np.sqrt(squared_gradient_sum)

        # current = pd(current)  # if necessary, apply the PD function here
        squared_gradient_sum += gradient_step ** 2
        all_points.append(current)

        if timelimit is not None and time.time() - started > timelimit:
            print('    Time limit reached!')
            break

    return all_points

def NewtonMethod(hessian_matrix, gradient_function, x1, steps, timelimit=None):
    """Run Newton's method with full (undamped) steps.

    Each update is ``x_new = x - H(x)^{-1} grad(x)``. The Newton direction is
    obtained by solving the linear system ``H(x) d = grad(x)`` rather than
    forming the explicit inverse of the Hessian.

    Args:
        hessian_matrix: Callable mapping a point to the Hessian at that point.
        gradient_function: Callable mapping a point to the gradient at that point.
        x1: Starting point (any array-like); it is copied into a NumPy array.
        steps: Maximum number of updates to perform.
        timelimit: Optional wall-clock budget in seconds, checked after each update.

    Returns:
        List of visited points, starting with the initial point.

    Raises:
        numpy.linalg.LinAlgError: If the Hessian at an iterate is singular.
    """
    started = time.time()

    current = np.array(x1)
    all_points = [current]
    for _ in range(steps):
        newton_direction = np.linalg.solve(hessian_matrix(current), gradient_function(current))
        current = current - newton_direction
        # current = pd(current)  # if necessary, apply the PD function here
        all_points.append(current)

        if timelimit is not None and time.time() - started > timelimit:
            print('    Time limit reached!')
            break

    return all_points

def BFGS(gradient_function, x1, steps, timelimit=None):
    """Run BFGS with unit steps, starting from an identity inverse-Hessian estimate.

    ``B`` approximates the inverse Hessian. The first point after ``x1`` is a
    plain gradient step (``B`` is the identity). Every iteration then updates
    ``B`` from the last displacement ``delta`` and gradient change ``gamma``
    and takes the step ``x_new = x - B @ grad(x)``.

    The gradient is evaluated once per iterate and reused, so a full run makes
    ``1 + steps`` calls to ``gradient_function``.

    Args:
        gradient_function: Callable mapping a point to the gradient at that point.
        x1: Starting point (any array-like); it is copied into a NumPy array.
        steps: Maximum number of BFGS updates to perform.
        timelimit: Optional wall-clock budget in seconds, checked after each update.

    Returns:
        List of visited points: the start, the initial gradient step, then one
        point per completed update. The run stops early (with a printed
        notice) when ``delta^T gamma`` is exactly zero, because the update
        would divide by zero.
    """
    started = time.time()

    start = np.array(x1)
    B = np.eye(len(start))
    grad_prev = gradient_function(start)
    all_points = [start, start - np.dot(B, grad_prev)]

    for _ in range(steps):
        xk, xk_1 = all_points[-1], all_points[-2]
        # grad_prev already holds the gradient at xk_1; only xk is new.
        grad_xk = gradient_function(xk)

        # Column vectors of shape (n, 1), so that `delta * delta.T` below
        # broadcasts to the (n, n) outer product.
        gamma = (grad_xk - grad_prev)[..., None]
        delta = (xk - xk_1)[..., None]

        curvature = delta.T.dot(gamma)  # 1x1 array holding delta^T gamma
        if curvature == 0:
            print('    BFGS OVERFLOW!!!')
            return all_points

        B = B - (delta.dot(gamma.T.dot(B)) + B.dot(gamma).dot(delta.T)) / curvature + \
            (1 + (gamma.T.dot(B).dot(gamma)) / curvature) * (delta * delta.T) / curvature

        all_points.append(xk - B.dot(grad_xk))
        grad_prev = grad_xk

        if timelimit is not None and time.time() - started > timelimit:
            print('    Time limit reached!')
            break

    return all_points

### Function to compare all methods using different numbers of steps

In [17]:
def commpare_all(fun, hessian_function, gradient_function, x1, gamma, mu, actual_min, delta,
                 step_counts=(10, 100, 1000)):
    """Print how close each optimizer gets to a known minimizer.

    For every step budget in ``step_counts`` the function runs GD, PGD, NGD,
    NewtonMethod, BFGS and the Nelder-Mead ``minimize`` from the same start and
    prints the squared Euclidean distance between the final point and
    ``actual_min``.

    Number formatting mirrors the original report: first-order methods are
    always rounded to 5 decimals, Nelder-Mead is never rounded, and
    Newton/BFGS are printed unrounded for the first budget only (rounded to
    5 decimals afterwards).

    Args:
        fun: Objective function, passed to ``minimize``.
        hessian_function: Callable returning the Hessian at a point.
        gradient_function: Callable returning the gradient at a point.
        x1: Starting point shared by all methods.
        gamma: Step size for GD, PGD and NGD.
        mu: Momentum coefficient for PGD and NGD.
        actual_min: Known minimizer used as the reference point.
        delta: Forwarded unchanged to ``minimize`` as its ``delta`` argument.
        step_counts: Iteration budgets to test, in order.
    """
    def squared_error(point):
        # Squared Euclidean distance to the reference minimizer.
        return np.sum(np.square(point - actual_min))

    for position, steps in enumerate(step_counts):
        # Newton/BFGS are shown at full precision for the first budget only.
        second_order_spec = "" if position == 0 else ".5f"

        print(f"Testing all the methods on {steps} steps, shown bellow is the distance from the actual minimum x*")

        # Each method is run immediately before its line is printed, so any
        # message printed by the method itself appears directly above its result.
        gd_error = squared_error(GD(gradient_function, gamma, x1, steps)[-1])
        print(f"    Gradient descend: {gd_error:.5f}")

        pgd_error = squared_error(PGD(gradient_function, gamma, mu, x1, steps)[-1])
        print(f"    Polyak gradient: {pgd_error:.5f}")

        ngd_error = squared_error(NGD(gradient_function, gamma, mu, x1, steps)[-1])
        print(f"    Nestorov gradient descend: {ngd_error:.5f}")

        newton_error = squared_error(NewtonMethod(hessian_function, gradient_function, x1, steps)[-1])
        print(f"    Newton method: {format(newton_error, second_order_spec)}")

        bfgs_error = squared_error(BFGS(gradient_function, x1, steps)[-1])
        print(f"    BFGS method: {format(bfgs_error, second_order_spec)}")

        nelder_mead_error = squared_error(minimize(fun, x1, max_iterations=steps, delta=delta))
        print(f"    NelderMead method: {nelder_mead_error}")

### First function
The first function on which we are going to compare all approaches is:
$$f(x,y,z)=(x-z)^{2}+(2y+z)^{2}+(4x-2y+z)^{2}+x+y$$
We start from the initial point
$$x_{1}=(0,0,0)$$

In [18]:
def gradient_function_a_part(X):
    """Gradient of f(x, y, z) = (x-z)^2 + (2y+z)^2 + (4x-2y+z)^2 + x + y at X.

    Args:
        X: Indexable holding x, y and z at positions 0, 1 and 2.

    Returns:
        NumPy array ``[df/dx, df/dy, df/dz]``.
    """
    x = X[0]
    y = X[1]
    z = X[2]
    df_dx = 34*x - 16*y + 6*z + 1
    df_dy = -16*x + 16*y + 1
    df_dz = 6*x + 6*z
    return np.array([df_dx, df_dy, df_dz])

def hessian_function_a_part(_):
    """Hessian of f(x, y, z) = (x-z)^2 + (2y+z)^2 + (4x-2y+z)^2 + x + y.

    The argument is ignored: the same 3x3 matrix is returned for every point.
    """
    rows = [
        [34, -16, 6],
        [-16, 16, 0],
        [6, 0, 6],
    ]
    return np.array(rows)

def fun(X):
    """Evaluate f(x, y, z) = (x-z)^2 + (2y+z)^2 + (4x-2y+z)^2 + x + y.

    Args:
        X: Indexable holding x, y and z at positions 0, 1 and 2.
    """
    x = X[0]
    y = X[1]
    z = X[2]
    first_square = (x - z)**2
    second_square = (2*y + z)**2
    third_square = (4*x - 2*y + z)**2
    return first_square + second_square + third_square + x + y

# Problem data: start at the origin; the reference minimizer is the point
# where gradient_function_a_part vanishes.
x1 = np.zeros(3, dtype=int)
actual_min = np.array([-1/6, -11/48, 1/6])

# Method parameters: step size and momentum for the gradient methods,
# and the `delta` value forwarded to the Nelder-Mead `minimize` call.
gamma = 0.01
mu = 0.05
delta = 5

commpare_all(
    fun,
    hessian_function_a_part,
    gradient_function_a_part,
    x1,
    gamma,
    mu,
    actual_min,
    delta,
)

Testing all the methods on 10 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.05475
    Polyak gradient: 0.05305
    Nestorov gradient descend: 0.05311
    Newton method: 1.5407439555097887e-33
    BFGS method: 8.520524277920641e-16
2
    NelderMead method: 59.63368055555556
Testing all the methods on 100 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.00027
    Polyak gradient: 0.00019
    Nestorov gradient descend: 0.00020
    Newton method: 0.00000
    BFGS OVERFLOW!!!
    BFGS method: 0.00000
2
    NelderMead method: 59.63368055555556
Testing all the methods on 1000 steps, shown bellow is the distance from the actual minimum x*
    Gradient descend: 0.00000
    Polyak gradient: 0.00000
    Nestorov gradient descend: 0.00000
    Newton method: 0.00000
    BFGS OVERFLOW!!!
    BFGS method: 0.00000
2
    NelderMead method: 59.63368055555556
