In [None]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import torch

# Prefer the GPU when PyTorch reports one as available, otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device {device}")


# Optimizers

In [None]:
from abc import ABC, abstractmethod

class Optimiser(ABC):
    """Abstract base for the optimisers in this notebook.

    Subclasses implement :meth:`step`; an instance is callable and forwards
    ``optimiser(weight, grad)`` to ``step(weight, grad)``.
    """

    def __init__(self, weight, lr=0.1):
        """Store the weight reference and the learning rate.

        Args:
            weight: the parameter the optimiser is built for.
            lr: learning rate (step size).
        """
        self.lr = lr
        self.weight = weight

    @abstractmethod
    def step(self, weight, grad):
        """Apply one update to ``weight`` given ``grad`` and return the result."""

    def __call__(self, weight, grad):
        """Shorthand for :meth:`step`."""
        updated = self.step(weight, grad)
        return updated

In [None]:
class vanillaSGD(Optimiser):
    """Plain gradient descent: ``w <- w - lr * grad``."""

    def __init__(self, weight, lr=0.1):
        """Keep only the learning rate; ``weight`` is accepted but not used."""
        self.lr = lr

    def step(self, weight, grad):
        """Subtract ``lr * grad`` from ``weight`` in place and return ``weight``."""
        scaled_grad = self.lr * grad
        weight -= scaled_grad
        return weight

In [None]:
class SignSGD(Optimiser):
    """Sign descent: every coordinate moves by ``lr`` against the sign of its gradient."""

    def __init__(self, weight, lr=0.1):
        """Keep only the learning rate; ``weight`` is accepted but not used."""
        self.lr = lr

    def step(self, weight, grad):
        """Subtract ``lr * sign(grad)`` from ``weight`` in place and return ``weight``."""
        direction = torch.sign(grad)
        weight -= self.lr * direction
        return weight

In [None]:
class MomentumSGD(Optimiser):
    """Heavy-ball momentum: ``v <- mu * v + grad``; ``w <- w - lr * v``."""

    def __init__(self, weight, lr=0.1, momentum_coeff=0.9):
        """Create a zero velocity buffer shaped like ``weight``.

        Args:
            weight: tensor whose shape/dtype the velocity buffer copies.
            lr: learning rate.
            momentum_coeff: decay ``mu`` applied to the previous velocity.
        """
        self.lr = lr
        self.momentum_coeff = momentum_coeff
        self.weight_momentum = torch.zeros_like(weight)

    def step(self, weight, grad):
        """Refresh the velocity, apply it to ``weight`` in place and return ``weight``."""
        velocity = self.momentum_coeff * self.weight_momentum + grad
        self.weight_momentum = velocity
        weight -= self.lr * velocity
        return weight

In [None]:
class NAG(Optimiser):  # Nesterov
    """Nesterov-style momentum.

    The velocity is ``v <- mu * v + grad`` and the applied step is
    ``lr * (grad + mu * v)``, i.e. the gradient plus a look-ahead along
    the freshly updated velocity.
    """

    def __init__(self, weight, lr=0.1, momentum_coeff=0.9):
        """Create a zero velocity buffer shaped like ``weight``."""
        self.lr = lr
        self.momentum_coeff = momentum_coeff
        self.weight_momentum = torch.zeros_like(weight)

    def step(self, weight, grad):
        """Update the velocity, apply the look-ahead step in place and return ``weight``."""
        velocity = self.momentum_coeff * self.weight_momentum + grad
        self.weight_momentum = velocity
        lookahead = grad + self.momentum_coeff * velocity
        weight -= self.lr * lookahead
        return weight


In [None]:
class QHM(Optimiser):  # Quasi-Hyperbolic Momentum
    """Quasi-hyperbolic momentum.

    Keeps an exponential moving average ``m`` of the gradients and steps along
    ``(1 - nu) * grad + nu * m``: ``nu = 0`` gives plain gradient descent and
    ``nu = 1`` gives a step along the averaged gradient only.
    """

    def __init__(self, weight, lr=0.1, momentum_coeff=0.9, nu=0.7):
        """Create a zero momentum buffer shaped like ``weight``.

        Args:
            weight: tensor whose shape/dtype the momentum buffer copies.
            lr: learning rate.
            momentum_coeff: decay of the gradient moving average.
            nu: interpolation weight between the raw gradient and the average.
        """
        self.lr = lr
        self.momentum_coeff = momentum_coeff
        self.nu = nu
        # Optimiser state is never differentiated, so the buffer must not
        # require grad (otherwise a step taken outside ``torch.no_grad()``
        # would start recording an autograd graph through the state).
        self.weight_momentum = torch.zeros_like(weight)

    def step(self, weight, grad):
        """Update the moving average, apply the blended step in place and return ``weight``."""
        self.weight_momentum = (
            self.momentum_coeff * self.weight_momentum
            + (1 - self.momentum_coeff) * grad
        )
        update = (1 - self.nu) * grad + self.nu * self.weight_momentum
        weight -= self.lr * update
        return weight


In [None]:
# Small constant added to denominators to avoid division by zero.
EPSILON = 1e-10


class RMSprop(Optimiser):
    """RMSprop: scale the gradient by the root of a moving average of its square."""

    def __init__(self, weight, lr=0.01, momentum_coeff=0.9):
        """Create a zero squared-gradient average shaped like ``weight``.

        Args:
            weight: tensor whose shape/dtype the buffer copies.
            lr: learning rate.
            momentum_coeff: decay of the squared-gradient moving average.
        """
        self.lr = lr
        self.momentum_coeff = momentum_coeff
        # Despite the name, this buffer holds the running mean of grad**2.
        self.weight_momentum = torch.zeros_like(weight)

    def step(self, weight, grad):
        """Update the squared-gradient average, apply the scaled step in place and return ``weight``."""
        decay = self.momentum_coeff
        sq_avg = decay * self.weight_momentum + (1 - decay) * grad**2
        self.weight_momentum = sq_avg
        weight -= self.lr * grad / (torch.sqrt(sq_avg) + EPSILON)
        return weight

In [None]:
class AdaGrad(Optimiser):
    """AdaGrad: scale the gradient by the root of the running sum of squared gradients."""

    def __init__(self, weight, lr=0.1):
        """Create a zero accumulator shaped like ``weight``."""
        self.lr = lr
        # Despite the name, this buffer holds the cumulative sum of grad**2.
        self.weight_momentum = torch.zeros_like(weight)

    def step(self, weight, grad):
        """Accumulate ``grad**2``, apply the scaled step in place and return ``weight``."""
        sq_sum = self.weight_momentum + grad**2
        self.weight_momentum = sq_sum
        weight -= self.lr * grad / (torch.sqrt(sq_sum) + EPSILON)
        return weight


In [None]:
# Stabiliser added inside both square roots of the AdaDelta update.
DELTA_EPSILON = 1e-6


class AdaDelta(Optimiser):
    """AdaDelta: the step size is the ratio of two running RMS values.

    ``grad_momentum`` is a moving average of squared gradients and
    ``weight_momentum`` a moving average of squared (unscaled) updates.
    """

    def __init__(self, weight, lr=1.0, momentum_coeff=0.75):
        """Create zero accumulators shaped like ``weight``.

        Args:
            weight: tensor whose shape/dtype the buffers copy.
            lr: multiplier applied to the AdaDelta update; ``1.0`` applies the
                update unscaled.
            momentum_coeff: decay of both moving averages.
        """
        self.momentum_coeff = momentum_coeff
        self.weight_momentum = torch.zeros_like(weight)
        self.grad_momentum = torch.zeros_like(weight)
        self.lr = lr

    def step(self, weight, grad):
        """Update both averages, apply ``lr * delta_w`` in place and return ``weight``."""
        self.grad_momentum = self.momentum_coeff * self.grad_momentum + (1 - self.momentum_coeff) * grad**2
        delta_w = - torch.sqrt(self.weight_momentum + DELTA_EPSILON) / torch.sqrt(self.grad_momentum + DELTA_EPSILON) * grad
        # The update average tracks the unscaled delta, independent of lr.
        self.weight_momentum = self.momentum_coeff * self.weight_momentum + (1 - self.momentum_coeff) * delta_w**2
        # Honour the learning rate: previously ``lr`` was stored but never
        # applied, so passing e.g. ``lr=0.1`` silently had no effect.
        weight += self.lr * delta_w
        return weight

In [None]:
class Adam(Optimiser):
    """Adam: bias-corrected moving averages of the gradient and of its square."""

    def __init__(self, weight, lr=0.001, beta_1=0.9, beta_2=0.999):
        """Create zero moment buffers shaped like ``weight`` and reset the step counter.

        Args:
            weight: tensor whose shape/dtype the buffers copy.
            lr: learning rate.
            beta_1: decay of the gradient average.
            beta_2: decay of the squared-gradient average.
        """
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.t = 0  # number of steps taken so far
        self.weight_momentum = torch.zeros_like(weight)  # average of grad
        self.grad_momentum = torch.zeros_like(weight)    # average of grad**2

    def step(self, weight, grad):
        """Advance both averages, apply the bias-corrected step in place and return ``weight``."""
        self.t += 1
        self.weight_momentum = self.beta_1 * self.weight_momentum + (1 - self.beta_1) * grad
        self.grad_momentum = self.beta_2 * self.grad_momentum + (1 - self.beta_2) * grad**2
        # Bias correction compensates for the zero initialisation of the averages.
        m_hat = self.weight_momentum / (1 - self.beta_1**self.t)
        v_hat = self.grad_momentum / (1 - self.beta_2**self.t)
        denom = torch.sqrt(v_hat) + EPSILON
        weight -= self.lr * m_hat / denom
        return weight

In [None]:
class Nadam(Optimiser):
    """Adam with a Nesterov-style numerator.

    The step direction is ``beta_1 * m_hat + (1 - beta_1) * grad / (1 - beta_1**t)``
    divided by ``sqrt(v_hat) + EPSILON``.
    """

    def __init__(self, weight, lr=0.002, beta_1=0.9, beta_2=0.999):
        """Create zero moment buffers shaped like ``weight`` and reset the step counter."""
        super().__init__(weight, lr)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.t = 0  # number of steps taken so far
        self.weight_momentum = torch.zeros_like(weight)  # average of grad
        self.grad_momentum = torch.zeros_like(weight)    # average of grad**2

    def step(self, weight, grad):
        """Advance both averages, apply the Nesterov-corrected step in place and return ``weight``."""
        self.t += 1
        self.weight_momentum = self.beta_1 * self.weight_momentum + (1 - self.beta_1) * grad
        self.grad_momentum = self.beta_2 * self.grad_momentum + (1 - self.beta_2) * grad**2
        first_correction = 1 - self.beta_1**self.t
        m_hat = self.weight_momentum / first_correction
        v_hat = self.grad_momentum / (1 - self.beta_2**self.t)
        # Current-gradient term, bias-corrected like the first moment.
        grad_term = (1 - self.beta_1) * grad / first_correction
        numerator = self.beta_1 * m_hat + grad_term
        weight -= self.lr * numerator / (torch.sqrt(v_hat) + EPSILON)
        return weight


In [None]:
class AMSGrad(Optimiser):
    """Adam variant that divides by the running maximum of the squared-gradient average."""

    def __init__(self, weight, lr=0.001, beta_1=0.9, beta_2=0.999):
        """Create zero buffers shaped like ``weight`` and reset the step counter."""
        super().__init__(weight, lr)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.t = 0  # number of steps taken so far
        self.weight_momentum = torch.zeros_like(weight)  # average of grad
        self.grad_momentum = torch.zeros_like(weight)    # average of grad**2
        self.v = torch.zeros_like(weight)                # element-wise max of grad_momentum so far

    def step(self, weight, grad):
        """Advance the averages and the running max, apply the step in place and return ``weight``."""
        self.t += 1
        self.weight_momentum = self.beta_1 * self.weight_momentum + (1 - self.beta_1) * grad
        self.grad_momentum = self.beta_2 * self.grad_momentum + (1 - self.beta_2) * grad**2
        # The denominator can only grow: keep the largest second moment seen.
        self.v = torch.max(self.v, self.grad_momentum)
        # Only the first moment is bias-corrected here.
        m_hat = self.weight_momentum / (1 - self.beta_1**self.t)
        denom = torch.sqrt(self.v) + EPSILON
        weight -= self.lr * m_hat / denom
        return weight


In [None]:
class BarzilaiBorwein(Optimiser):
    """Gradient descent with a Barzilai-Borwein step size.

    With ``s = w_k - w_{k-1}`` and ``y = g_k - g_{k-1}`` the step size is
    ``alpha = (s . s) / (s . y)``. It falls back to ``lr`` on the first call,
    when ``|s . y|`` is below ``EPSILON``, or when the resulting ``alpha`` is
    not positive.
    """

    def __init__(self, weight, lr=0.1):
        """Store the fallback step size; no history exists before the first step."""
        self.lr = lr
        self.alpha = lr
        self.prev_weight = None
        self.prev_grad = None

    def step(self, weight, grad):
        """Pick the step size from the last two iterates, update ``weight`` in place and return it."""
        no_history = self.prev_weight is None or self.prev_grad is None
        if no_history:
            self.alpha = self.lr
        else:
            # The stored history is already a detached copy, so it is used directly.
            s = weight - self.prev_weight
            y = grad - self.prev_grad
            curvature = torch.sum(s * y)
            if abs(curvature.item()) < EPSILON:
                self.alpha = self.lr
            else:
                self.alpha = torch.sum(s * s) / (curvature + EPSILON)
                if self.alpha <= 0:
                    self.alpha = self.lr
        # Snapshot the pre-update iterate and gradient for the next call.
        self.prev_weight = weight.clone().detach()
        self.prev_grad = grad.clone().detach()
        weight -= self.alpha * grad
        return weight


In [None]:
def newton_schulz(M, num_iters=5, eps=None, coeffs=(3.4445, -4.7750, 2.0315)):
    """Approximately orthogonalise ``M`` with a quintic Newton-Schulz iteration.

    Each matrix is first divided by its Frobenius norm and then updated
    ``num_iters`` times with ``X <- a*X + b*(X X^T) X + c*(X X^T)^2 X``.

    Args:
        M: tensor to orthogonalise. Tensors with fewer than two dimensions are
            returned unchanged; for more than two dimensions the leading
            dimensions are treated as a batch of matrices.
        num_iters: number of polynomial iterations.
        eps: value added to the norm before dividing; ``None`` uses the
            module-level ``EPSILON``.
        coeffs: the polynomial coefficients ``(a, b, c)``.

    Returns:
        A tensor with the same shape as ``M`` (``M`` itself when ``M.dim() < 2``).
    """
    if M.dim() < 2:
        return M
    if eps is None:
        eps = EPSILON
    a, b, c = coeffs
    # Normalise every matrix by its own Frobenius norm (identical to the
    # whole-tensor norm for a single 2-D matrix) so that one large matrix in
    # a batch cannot shrink the others.
    norm = torch.linalg.matrix_norm(M, ord='fro', keepdim=True)
    X = M / (norm + eps)
    for _ in range(num_iters):
        gram = X @ X.transpose(-2, -1)  # X X^T
        X = a * X + b * (gram @ X) + c * ((gram @ gram) @ X)
    return X

# https://arxiv.org/pdf/2502.16982v1  24 Feb 2025
class Muon(Optimiser):
    """Momentum whose buffer is passed through ``newton_schulz`` before the step.

    ``newton_schulz`` returns tensors with fewer than two dimensions unchanged,
    so for 1-D weights the update is ``lr`` times the raw momentum buffer.
    """

    def __init__(self, weight, lr=0.1, momentum_coeff=0.75, num_iters=5):
        """Create a zero momentum buffer shaped like ``weight``.

        Args:
            weight: tensor whose shape/dtype the buffer copies.
            lr: learning rate.
            momentum_coeff: decay applied to the previous buffer.
            num_iters: iteration count forwarded to ``newton_schulz``.
        """
        super().__init__(weight, lr)
        self.num_iters = num_iters
        self.momentum_coeff = momentum_coeff
        self.weight_momentum = torch.zeros_like(weight)

    def step(self, weight, grad):
        """Update the buffer, step along its processed version in place and return ``weight``."""
        buffer = self.momentum_coeff * self.weight_momentum + grad
        self.weight_momentum = buffer
        direction = newton_schulz(buffer, num_iters=self.num_iters)
        weight -= self.lr * direction
        return weight

# 1-D

In [None]:
def f(x):
    """1-D quadratic objective ``x**2``."""
    squared = x**2
    return squared

# Start at x = -5 and take 20 plain gradient-descent steps on f.
x = torch.tensor([-5.0], requires_grad=True)
opt = vanillaSGD(x, lr=0.1)
zero = torch.tensor([0.0])  # template used to reset x.grad after each step

# Visited points and their objective values, recorded before each update.
x_vals, f_vals = [], []

for i in range(20):
    y = f(x)
    x_vals += [x.item()]
    f_vals += [y.item()]
    y.backward()
    # The optimiser modifies x in place, which must not be tracked by autograd.
    with torch.no_grad():
        x = opt(x, x.grad)
        x.grad = zero.clone()

# Dense curve of the objective drawn behind the trajectory.
x_range = np.linspace(-6, 6, 400)
y_range = x_range**2

plt.figure(figsize=(8, 6))
plt.plot(x_range, y_range, label='Loss')
plt.plot(x_vals, f_vals, 'ro-', label='Vanilla')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend()
plt.show()

In [None]:
def f(x):
    """1-D quadratic with a superimposed ``sin(10 x)`` ripple."""
    bowl = x**2
    ripple = torch.sin(10 * x)
    return bowl + ripple

# Start at x = -5 and take 20 plain gradient-descent steps on the rippled objective.
x = torch.tensor([-5.0], requires_grad=True)
opt = vanillaSGD(x, lr=0.05)
zero = torch.tensor([0.0])  # template used to reset x.grad after each step

# Visited points and their objective values, recorded before each update.
x_vals, f_vals = [], []

for i in range(20):
    y = f(x)
    x_vals += [x.item()]
    f_vals += [y.item()]
    y.backward()
    # The optimiser modifies x in place, which must not be tracked by autograd.
    with torch.no_grad():
        x = opt(x, x.grad)
        x.grad = zero.clone()

# Dense curve of the objective drawn behind the trajectory.
x_range = torch.linspace(-6, 6, 400)
y_range = f(x_range)

plt.figure(figsize=(8, 6))
plt.plot(x_range, y_range, label='Loss')
plt.plot(x_vals, f_vals, 'ro-', label='Optimizer')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend()
plt.show()

In [None]:
def f(x):
    """1-D quadratic with a superimposed ``sin(10 x)`` ripple."""
    quadratic_part = x**2
    oscillation = torch.sin(10 * x)
    return quadratic_part + oscillation

# Same rippled objective, this time with heavy-ball momentum (mu = 0.5).
x = torch.tensor([-5.0], requires_grad=True)
opt = MomentumSGD(x, lr=0.05, momentum_coeff=0.5)
zero = torch.tensor([0.0])  # template used to reset x.grad after each step

# Visited points and their objective values, recorded before each update.
x_vals, f_vals = [], []

for i in range(20):
    y = f(x)
    x_vals += [x.item()]
    f_vals += [y.item()]
    y.backward()
    # The optimiser modifies x in place, which must not be tracked by autograd.
    with torch.no_grad():
        x = opt(x, x.grad)
        x.grad = zero.clone()

# Dense curve of the objective drawn behind the trajectory.
x_range = torch.linspace(-6, 6, 400)
y_range = f(x_range)

plt.figure(figsize=(8, 6))
plt.plot(x_range, y_range, label='Loss')
plt.plot(x_vals, f_vals, 'ro-', label='Optimizer')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.legend()
plt.show()

# Optimization loop

In [None]:
# Default initial weight for the 2-D experiments.
STARTING_POINT = torch.tensor([5.0, 5.0])
# Zero tensor with the same shape and dtype; used as the shape template for
# optimiser state and to reset gradients.
ZERO_ZERO = STARTING_POINT.new_zeros(STARTING_POINT.shape)

In [None]:
def run_optimization(optimizer, function, starting_point=None, N=30):
    """Run ``N`` optimisation steps on ``function`` and record the trajectory.

    Args:
        optimizer: callable ``optimizer(weight, grad)`` that returns the
            updated weight. The optimisers in this notebook update ``weight``
            in place and return the same tensor.
        function: maps the weight tensor to a scalar loss tensor.
        starting_point: initial weight; it is cloned, so the caller's tensor is
            not modified. ``None`` uses the module-level ``STARTING_POINT``
            (looked up at call time).
        N: number of steps.

    Returns:
        ``(weights, losses)`` as NumPy arrays: ``weights[i]`` is the weight
        before step ``i`` and ``losses[i]`` the loss at that weight.
    """
    if starting_point is None:
        starting_point = STARTING_POINT
    weight = starting_point.clone().requires_grad_()

    weights = []
    losses = []

    for _ in range(N):
        znow = function(weight)
        znow.backward()

        # Record the state before the update so index 0 is the start point.
        losses.append(znow.detach().item())
        weights.append(weight.detach().clone())

        with torch.no_grad():
            weight = optimizer(weight, weight.grad)
            # Fresh zero gradient matching the weight itself, so any weight
            # shape works (the previous fixed 2-element template raised for
            # every other shape).
            weight.grad = torch.zeros_like(weight)

    weights = torch.stack(weights).detach().numpy()
    losses = torch.tensor(losses).detach().numpy()

    return weights, losses


In [None]:
def f(x):
    """2-D quadratic bowl ``x[0]**2 + x[1]**2 / 2``."""
    first_term = x[0]**2
    second_term = x[1]**2 / 2
    return first_term + second_term

# Single sanity run: plain SGD on the quadratic bowl from the default start point.
path, losses = run_optimization(
    vanillaSGD(ZERO_ZERO, lr=0.05),
    f,
)

In [None]:
# One instance of every optimiser, keyed by the label shown in the plots.
# ZERO_ZERO only supplies the shape of the per-weight state buffers.
OPTIMIZERS = dict(
    vanillaSGD=vanillaSGD(ZERO_ZERO, lr=0.1),
    momentumSGD=MomentumSGD(ZERO_ZERO, lr=0.05, momentum_coeff=0.75),
    nesterovSGD=NAG(ZERO_ZERO, lr=0.1),
    QHM=QHM(ZERO_ZERO, lr=0.1),
    RMSprop=RMSprop(ZERO_ZERO, lr=0.1),
    Adagrad=AdaGrad(ZERO_ZERO, lr=0.1),
    Adadelta=AdaDelta(ZERO_ZERO, lr=0.1),
    Adam=Adam(ZERO_ZERO, lr=0.1),
    NAdam=Nadam(ZERO_ZERO, lr=0.1),
    AMSGrad=AMSGrad(ZERO_ZERO, lr=0.1),
    BarzilaiBorwein=BarzilaiBorwein(ZERO_ZERO, lr=0.1),
    Muon=Muon(ZERO_ZERO, lr=0.1),
)

def run_optimizations(f, starting_point = STARTING_POINT, optims=OPTIMIZERS,N = 30):
    """Run every optimiser in ``optims`` on ``f`` from the same start point.

    Args:
        f: maps a weight tensor to a scalar loss tensor.
        starting_point: initial weight shared by all runs.
        optims: mapping ``label -> optimiser``. The optimisers are not
            modified: each run works on a deep copy.
        N: number of steps per optimiser.

    Returns:
        Dict ``label -> (weights, losses)`` as returned by ``run_optimization``.
    """
    results = {}
    for name, opt in optims.items():
        print(f"running optimizer: {name}")
        # The optimisers carry state (momentum buffers, step counters, previous
        # iterates). Running on a copy keeps the caller's instances, and in
        # particular the shared default dict, pristine, so a second call does
        # not start from the state left behind by the first.
        fresh_opt = copy.deepcopy(opt)
        weights, losses = run_optimization(fresh_opt, f, starting_point=starting_point, N=N)
        results[name] = (weights, losses)
    return results

results = run_optimizations(f, N=50)

# Visualise

In [None]:
def plot(results, func=f, min_point=None, size=5):
    """Draw the loss surface of ``func`` with every optimiser trajectory on top.

    Args:
        results: dict ``label -> (weights, losses)`` where ``weights`` is an
            array whose columns 0 and 1 are the two coordinates and ``losses``
            holds the matching loss values.
        func: objective evaluated on the grid; it receives a tensor whose
            index 0 is the X grid and index 1 the Y grid.
        min_point: optional ``(x, y)`` point marked in red on the surface.
        size: the surface is drawn over ``[-size, size]`` on both axes.
    """
    # One NumPy axis for both directions, evaluated through a single stacked
    # tensor (avoids mixing torch/NumPy ranges and the slow
    # tensor-from-list-of-arrays conversion).
    axis = np.linspace(-1 * size, size, 100)
    X, Y = np.meshgrid(axis, axis)
    Z = func(torch.from_numpy(np.stack([X, Y])))

    fig = go.Figure()
    fig.add_trace(go.Surface(x=X, y=Y, z=Z.numpy(), colorscale='Viridis', opacity=0.8, showscale=False))
    if min_point is not None:
        # Height of the marker comes from the plotted objective ``func``
        # (previously the global ``f`` was used regardless of ``func``).
        fig.add_trace(go.Scatter3d(x=[min_point[0]], y=[min_point[1]], z=[float(func(min_point))],
                                   mode='markers', marker=dict(color='red', size=5), name='Local Min'))
    for name, (weights, losses) in results.items():
        fig.add_trace(go.Scatter3d(x=weights[:, 0], y=weights[:, 1], z=losses,
                                   mode='lines', name=name))
    fig.update_layout(
        title='Optimizers',
        scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Loss',
                   aspectmode='manual', aspectratio=dict(x=1, y=1, z=0.5))
    )
    fig.update_layout(width=1000, height=1000)
    fig.show()

plot(results)

In [None]:
def animate(results, min_point=None, func=None, size=5):
    """Animate the optimiser trajectories step by step over a loss surface.

    Args:
        results: dict ``label -> (weights, losses)``; the number of frames is
            taken from the first entry.
        min_point: optional ``(x, y)`` point marked in red on the surface.
        func: objective used for the surface and the marker height. It is
            called with a list ``[X, Y]`` of NumPy grids. ``None`` uses the
            global ``f`` at call time.
        size: the surface is drawn over ``[-size, size]`` on both axes.
    """
    if func is None:
        func = f
    axis = np.linspace(-size, size, 100)
    X, Y = np.meshgrid(axis, axis)
    Z = func([X, Y])
    num_frames = list(results.values())[0][0].shape[0]
    optimizers = list(results.keys())
    fig = go.Figure()
    fig.add_trace(go.Surface(x=X, y=Y, z=Z, colorscale='Viridis', opacity=0.8, showscale=False))
    # Index of the first trajectory trace: 0 is the surface, 1 the optional marker.
    offset = 1
    if min_point is not None:
        fig.add_trace(go.Scatter3d(x=[min_point[0]], y=[min_point[1]], z=[func(min_point)],
                                   mode='markers', marker=dict(color='red', size=5), name='Local Min'))
        offset = 2
    # Initial state: every trajectory consists of its start point only.
    for name in optimizers:
        weights, losses = results[name]
        fig.add_trace(go.Scatter3d(x=[weights[0, 0]], y=[weights[0, 1]], z=[losses[0]],
                                   mode='lines', name=name))
    # Frame i redraws each trajectory up to and including step i.
    frames = []
    for i in range(num_frames):
        data = []
        for name in optimizers:
            weights, losses = results[name]
            data.append(go.Scatter3d(x=weights[:i+1, 0], y=weights[:i+1, 1], z=losses[:i+1],
                                     mode='lines', name=name))
        frames.append(go.Frame(data=data, name=str(i), traces=list(range(offset, offset+len(optimizers)))))
    fig.frames = frames
    fig.update_layout(
        title='Optimizers',
        scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Loss',
                   aspectmode='manual', aspectratio=dict(x=1, y=1, z=0.5)),
        updatemenus=[dict(
            type='buttons',
            showactive=False,
            y=1,
            x=1.3,
            xanchor='right',
            yanchor='top',
            buttons=[
                dict(label='Play', method='animate', args=[None, {"frame": {"duration": 50, "redraw": True},
                                                                   "fromcurrent": True, "transition": {"duration": 0}}]),
                dict(label='Pause', method='animate', args=[[None], {"frame": {"duration": 0, "redraw": False},
                                                                      "mode": "immediate", "transition": {"duration": 0}}])
            ]
        )],
        sliders=[dict(
            steps=[dict(method='animate', args=[[str(i)], dict(mode='immediate', frame={'duration': 50, 'redraw': True},
                                                              transition={'duration': 0})], label=str(i))
                   for i in range(num_frames)],
            active=0,
            transition={'duration': 0},
            x=0.1,
            y=0,
            currentvalue=dict(font=dict(size=12), prefix='Iteration: ', visible=True, xanchor='center'),
            len=0.9
        )]
    )
    fig.show()

animate(results)

# Hyperparameters

In [None]:
# Learning-rate sweep for plain SGD on the quadratic bowl.
learning_rates = [0.01, 0.1, 0.5, 1.0]
optimizers = dict(
    (f"vanillaSGD_lr_{lr}", vanillaSGD(ZERO_ZERO, lr=lr))
    for lr in learning_rates
)

# The result name is shared with the momentum sweeps below; here it holds vanilla-SGD runs.
results_momentum_lrs = run_optimizations(f, N=50, optims=optimizers)
plot(results_momentum_lrs)

In [None]:
# Learning-rate sweep for momentum SGD with a fixed momentum coefficient of 0.8.
learning_rates = [0.01, 0.03, 0.1]
optimizers = dict(
    (f"momentumSGD_lr_{lr}", MomentumSGD(ZERO_ZERO, lr=lr, momentum_coeff=0.8))
    for lr in learning_rates
)

results_momentum_lrs = run_optimizations(f, N=50, optims=optimizers)
plot(results_momentum_lrs)

In [None]:
# Momentum-coefficient sweep for momentum SGD at a fixed learning rate.
learning_rate = 0.1
momentum_coeffs = [0.1, 0.3, 0.6, 0.7, 0.8, 0.95]
optimisers = dict(
    (f"momentumSGD_{coeff}", MomentumSGD(ZERO_ZERO, lr=learning_rate, momentum_coeff=coeff))
    for coeff in momentum_coeffs
)

results_momentum_lrs = run_optimizations(f, N=50, optims=optimisers)
plot(results_momentum_lrs)

# Other surfaces

In [None]:
def himmelblau(x):
    """Himmelblau test function of ``x[0]`` and ``x[1]``.

    Returns ``(x0**2 + x1 - 11)**2 + (x0 + x1**2 - 7)**2``.
    """
    u, v = x[0], x[1]
    first = u**2 + v - 11
    second = u + v**2 - 7
    return first**2 + second**2

# Himmelblau from (5, 5): the momentum family uses small steps, while the
# adaptive methods use their own learning rates.
learning_rate = 0.1
optimizers = dict(
    vanillaSGD=vanillaSGD(ZERO_ZERO, lr=0.01),
    momentumSGD=MomentumSGD(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    nesterovSGD=NAG(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    QHM=QHM(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    Muon=Muon(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    RMSprop=RMSprop(ZERO_ZERO, lr=learning_rate),
    Adagrad=AdaGrad(ZERO_ZERO, lr=0.5),
    Adadelta=AdaDelta(ZERO_ZERO, lr=1.0),
    Adam=Adam(ZERO_ZERO, lr=learning_rate),
    AMSGrad=AMSGrad(ZERO_ZERO, lr=0.1),
    BarzilaiBorwein=BarzilaiBorwein(ZERO_ZERO, lr=0.01),
)

results_himmelblau = run_optimizations(
    himmelblau,
    N=50,
    starting_point=torch.tensor([5.0, 5.0]),
    optims=optimizers,
)
plot(results_himmelblau, size=5, func=himmelblau)


In [None]:

# Himmelblau again, now from (0, -6) with smaller steps and more momentum
# for the momentum family.
learning_rate = 0.1
optimizers = dict(
    vanillaSGD=vanillaSGD(ZERO_ZERO, lr=0.005),
    momentumSGD=MomentumSGD(ZERO_ZERO, lr=0.005, momentum_coeff=0.5),
    nesterovSGD=NAG(ZERO_ZERO, lr=0.005, momentum_coeff=0.5),
    QHM=QHM(ZERO_ZERO, lr=0.005, momentum_coeff=0.5),
    Muon=Muon(ZERO_ZERO, lr=0.005, momentum_coeff=0.5),
    RMSprop=RMSprop(ZERO_ZERO, lr=learning_rate),
    Adagrad=AdaGrad(ZERO_ZERO, lr=0.5),
    Adadelta=AdaDelta(ZERO_ZERO, lr=1.0),
    Adam=Adam(ZERO_ZERO, lr=learning_rate),
    AMSGrad=AMSGrad(ZERO_ZERO, lr=0.1),
    BarzilaiBorwein=BarzilaiBorwein(ZERO_ZERO, lr=0.01),
)

results_himmelblau = run_optimizations(
    himmelblau,
    N=50,
    starting_point=torch.tensor([0.0, -6.0]),
    optims=optimizers,
)
plot(results_himmelblau, size=6.5, func=himmelblau)

In [None]:

def local_min(x):
    """Surface ``x0**2 - x1**2 + x1**4 / 50``.

    Quadratic in ``x[0]``; along ``x[1]`` the negative quadratic term is
    countered by the quartic term.
    """
    u, v = x[0], x[1]
    return u**2 - v**2 + (v**4) / 50



# local_min surface, starting at (6, 0.05): far out along x[0], just off the x[1] = 0 line.
learning_rate = 0.1

optimizers = dict(
    vanillaSGD=vanillaSGD(ZERO_ZERO, lr=0.01),
    momentumSGD=MomentumSGD(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    nesterovSGD=NAG(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    QHM=QHM(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    Muon=Muon(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    RMSprop=RMSprop(ZERO_ZERO, lr=learning_rate),
    Adagrad=AdaGrad(ZERO_ZERO, lr=0.5),
    Adadelta=AdaDelta(ZERO_ZERO, lr=1.0),
    Adam=Adam(ZERO_ZERO, lr=learning_rate),
    AMSGrad=AMSGrad(ZERO_ZERO, lr=0.1),
    BarzilaiBorwein=BarzilaiBorwein(ZERO_ZERO, lr=0.005),
)

results_sphere = run_optimizations(
    local_min,
    N=150,
    starting_point=torch.tensor([6.0, 0.05]),
    optims=optimizers,
)
plot(results_sphere, size=6, func=local_min)

In [None]:

def local_min(x):
    """Surface ``x0**2 - x1**2 + x1**4 / 50`` (same definition as in the previous cell)."""
    first, second = x[0], x[1]
    quartic_term = (second**4) / 50
    return first**2 - second**2 + quartic_term



# Same surface and optimiser settings, now starting closer in at (1.5, 0.01).
learning_rate = 0.1
optimizers = dict(
    vanillaSGD=vanillaSGD(ZERO_ZERO, lr=0.01),
    momentumSGD=MomentumSGD(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    nesterovSGD=NAG(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    QHM=QHM(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    Muon=Muon(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    RMSprop=RMSprop(ZERO_ZERO, lr=learning_rate),
    Adagrad=AdaGrad(ZERO_ZERO, lr=0.5),
    Adadelta=AdaDelta(ZERO_ZERO, lr=1.0),
    Adam=Adam(ZERO_ZERO, lr=learning_rate),
    AMSGrad=AMSGrad(ZERO_ZERO, lr=0.1),
    BarzilaiBorwein=BarzilaiBorwein(ZERO_ZERO, lr=0.005),
)

results_sphere = run_optimizations(
    local_min,
    N=150,
    starting_point=torch.tensor([1.5, 0.01]),
    optims=optimizers,
)
plot(results_sphere, size=6, func=local_min)

In [None]:
def local_min(x):
    """Quadratic bowl centred at (2, 2) minus a well ``5 / (0.1 + x0**4 + x1**4)`` around the origin."""
    u, v = x[0], x[1]
    bowl = (u - 2)**2 + (v - 2)**2
    well = 5 * (1 / (0.1 + u**4 + v**4))
    return bowl - well

# Bowl-plus-well surface, starting at (-3, 3).
# Defined here so the cell does not depend on a value left behind by an
# earlier cell (same value as used there).
learning_rate = 0.1

optimizers = {
    "vanillaSGD": vanillaSGD(ZERO_ZERO, lr=0.01),
    "momentumSGD": MomentumSGD(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    "nesterovSGD": NAG(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    "QHM": QHM(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    "Muon": Muon(ZERO_ZERO, lr=0.01, momentum_coeff=0.2),
    "RMSprop": RMSprop(ZERO_ZERO, lr=learning_rate),
    "Adagrad": AdaGrad(ZERO_ZERO, lr=0.5),
    "Adadelta": AdaDelta(ZERO_ZERO, lr=1.0),
    "Adam": Adam(ZERO_ZERO, lr=learning_rate),
    "AMSGrad": AMSGrad(ZERO_ZERO, lr=0.1),
    "BarzilaiBorwein": BarzilaiBorwein(ZERO_ZERO, lr=0.01),
}

results_sphere = run_optimizations(local_min, N=150, starting_point=torch.tensor([-3.0, 3.0]), optims=optimizers)
plot(results_sphere, size=3, func=local_min)