# Super-Universal Regularized Newton Method

Playing around with Nikita Doikov, Konstantin Mishchenko, and Yurii Nesterov's [new Newton method](https://arxiv.org/pdf/2208.05888.pdf). The implementation is based on the [code they published](https://github.com/doikov/super-newton/blob/main/methods.py) with the paper.

---

## Representing an objective

Just some classes to represent functions, their gradients, and their Hessians.

In [1]:
import numpy as np

from dataclasses import dataclass
import abc


class Objective(abc.ABC):
    """Abstract objective exposing a value and its first two derivatives.

    Subclasses implement ``f``, ``df`` and ``ddf``.  The arithmetic operators
    build composite objectives (``SumObjective``, ``ScalarMulObjective`` and
    ``ConstantObjective``), so an expression such as ``2 * obj - 1`` is itself
    an ``Objective``.
    """

    @abc.abstractmethod
    def f(self, arg):
        """Return the objective value at ``arg``."""

    @abc.abstractmethod
    def df(self, arg):
        """Return the first derivative at ``arg``."""

    @abc.abstractmethod
    def ddf(self, arg):
        """Return the second derivative at ``arg``."""

    def __add__(self, other):
        """Return ``self + other``.

        Non-``Objective`` operands are wrapped in ``ConstantObjective``; a
        single-valued zero is dropped so that ``sum``/``reduce`` with a ``0``
        start value returns the objective unchanged.
        """
        if isinstance(other, Objective):
            return SumObjective(self, other)
        # ``other == 0`` on a multi-element array yields an array whose truth
        # value is ambiguous, so the shortcut is limited to single values.
        if np.size(other) == 1 and np.all(other == 0):
            return self
        return SumObjective(self, ConstantObjective(other))

    def __radd__(self, other):
        """Return ``other + self`` (addition is commutative here)."""
        return self + other

    def __sub__(self, other):
        """Return ``self - other`` as ``self + (-other)``."""
        return self + (-other)

    def __rsub__(self, other):
        """Return ``other - self`` as ``(-self) + other``."""
        return -self + other

    def __mul__(self, lmbda):
        """Return ``self * lmbda``, the same scaling as ``lmbda * self``."""
        return ScalarMulObjective(self, lmbda)

    def __rmul__(self, lmbda):
        """Return ``lmbda * self`` as a scaled objective."""
        return ScalarMulObjective(self, lmbda)

    def __neg__(self):
        """Return ``-self`` as the objective scaled by ``-1``."""
        return ScalarMulObjective(self, -1)


@dataclass(slots=True)
class ConstantObjective(Objective):
    """Objective whose value is the fixed ``const``, whatever the argument."""

    # The constant returned by ``f``; also fixes the shape of the zero
    # derivatives below.
    const: np.ndarray | float

    def f(self, arg):
        """Return the stored constant; ``arg`` is ignored."""
        value = self.const
        return value

    def df(self, arg):
        """Return zeros shaped like ``const`` (not like ``arg``)."""
        zeros = np.zeros_like(self.const)
        return zeros

    def ddf(self, arg):
        """Return the outer sum of two zero arrays shaped like ``const``."""
        zeros = np.zeros_like(self.const)
        # ``add.outer`` of zeros is a zero array with the two shapes
        # concatenated.
        return np.add.outer(zeros, zeros)


@dataclass(slots=True)
class SumObjective(Objective):
    """Pointwise sum of two objectives."""

    # The two summands; both are evaluated at the same argument.
    left: Objective
    right: Objective

    def f(self, arg):
        """Return ``left.f(arg) + right.f(arg)``."""
        lhs = self.left.f(arg)
        rhs = self.right.f(arg)
        return lhs + rhs

    def df(self, arg):
        """Return the sum of the two first derivatives at ``arg``."""
        lhs = self.left.df(arg)
        rhs = self.right.df(arg)
        return lhs + rhs

    def ddf(self, arg):
        """Return the sum of the two second derivatives at ``arg``."""
        lhs = self.left.ddf(arg)
        rhs = self.right.ddf(arg)
        return lhs + rhs


@dataclass(slots=True)
class ScalarMulObjective(Objective):
    """An objective multiplied by the fixed factor ``lmbda``."""

    # The wrapped objective and the factor applied to its value and
    # derivatives.
    obj: Objective
    lmbda: float

    def f(self, arg):
        """Return ``lmbda * obj.f(arg)``."""
        factor = self.lmbda
        return factor * self.obj.f(arg)

    def df(self, arg):
        """Return ``lmbda * obj.df(arg)``."""
        factor = self.lmbda
        return factor * self.obj.df(arg)

    def ddf(self, arg):
        """Return ``lmbda * obj.ddf(arg)``."""
        factor = self.lmbda
        return factor * self.obj.ddf(arg)

---

## Minimize routine

Implementation of the Super-Universal Regularized Newton Method, based on the authors' code. Notice that the implementation is ridiculously simple!

In [2]:
import numpy as np
import scipy as sp


def minimize(obj, initial, alpha=2/3, tol=1e-15, outer=100, inner=10):
    """Minimize ``obj`` with a gradient-regularized Newton iteration.

    Every outer iteration solves ``(H + lam * I) delta = -g`` through a
    Cholesky factorization, where ``g`` and ``H`` are ``obj.df`` and
    ``obj.ddf`` at the current point and ``lam = h * ||g|| ** alpha``.  The
    inner loop multiplies ``h`` by four until the step passes the acceptance
    test ``-g . delta >= ||g|| ** 2 / (4 * lam)``; an accepted step divides
    ``h`` by four (never below ``1e-5``).

    Args:
        obj: Object providing ``df(x)`` and ``ddf(x)``.
        initial: One-dimensional starting point.  It is copied, so the
            caller's array is never modified.
        alpha: Exponent applied to the gradient norm in the regularizer.
        tol: Iteration stops once the gradient norm drops below this value.
        outer: Maximum number of Newton steps.
        inner: Maximum number of regularization attempts per Newton step.

    Returns:
        The final iterate as a new float array.

    Raises:
        numpy.linalg.LinAlgError: If ``H + lam * I`` is not positive definite
            (raised by the Cholesky factorization).
    """
    # Import the submodule explicitly: older SciPy releases do not expose
    # ``scipy.linalg`` after a bare ``import scipy``.
    from scipy.linalg import cho_factor, cho_solve

    h = 1.0
    # Private float copy: the in-place update below must not write into the
    # caller's array, and must also work for integer starting points.
    argmin = np.array(initial, dtype=float)
    eye = np.eye(argmin.shape[0])
    half_alpha = alpha / 2

    for _ in range(outer):
        hess = obj.ddf(argmin)
        ngrad = -obj.df(argmin)
        sqnorm = ngrad.dot(ngrad)

        if sqnorm < tol ** 2:
            break

        for _ in range(inner):
            # sqnorm ** (alpha / 2) == ||g|| ** alpha
            lam = h * sqnorm ** half_alpha
            cho = cho_factor(hess + lam * eye, lower=False)
            delta = cho_solve(cho, ngrad)

            # NOTE(review): this test uses the gradient at the current point;
            # the reference method appears to test the gradient at the trial
            # point instead -- confirm against the paper before changing.
            if ngrad.dot(delta) >= sqnorm / (4 * lam):
                h = max(h / 4, 1e-5)
                break

            h *= 4

        # If every inner attempt was rejected, the most regularized trial
        # step is still applied.
        argmin += delta

    return argmin

Minimize squared norm, unconstrained.

In [3]:
@dataclass
class sqdist(Objective):
    """Squared Euclidean distance to ``origin``: ``(x - origin) . (x - origin)``."""

    # Point the distance is measured from.
    origin: np.ndarray

    def f(self, arg):
        """Return the squared distance between ``arg`` and ``origin``."""
        x = arg - self.origin
        return x.dot(x)

    def df(self, arg):
        """Return the gradient ``2 * (arg - origin)``."""
        return 2 * (arg - self.origin)

    def ddf(self, arg):
        """Return the Hessian ``2 * I``, consistent with the gradient above."""
        return 2 * np.eye(len(arg))


def sqnorm(n):
    """Return the squared-distance objective measured from the zero vector of length ``n``."""
    zero_origin = np.zeros(n)
    return sqdist(zero_origin)

In [4]:
# Unconstrained check: starting from (1, 1), the minimizer of the squared
# norm should come out (numerically) at the origin.
minimize(sqnorm(2), np.ones(2))

array([-5.64415714e-17, -5.64415714e-17])

---

## Barrier method

Solving constrained problems.

In [5]:
from functools import reduce
from operator import add


@dataclass(slots=True)
class barrier(Objective):
    """Logarithmic barrier ``-log(-h(x))`` for the constraint ``h(x) < 0``.

    The derivatives divide by ``h.f(arg)`` using plain broadcasting, so they
    form a proper gradient/Hessian when ``h`` yields a single value.
    """

    # Constraint function; the barrier is finite only where ``h.f(arg) < 0``.
    h: Objective

    def f(self, arg):
        """Return ``-sum(log(-h.f(arg)))``."""
        slack = -self.h.f(arg)
        return -np.log(slack).sum()

    def df(self, arg):
        """Return ``-h.df(arg) / h.f(arg)``."""
        grad = self.h.df(arg)
        value = self.h.f(arg)
        return -grad / value

    def ddf(self, arg):
        """Return ``outer(h', h') / h**2 - h'' / h`` evaluated at ``arg``."""
        value = self.h.f(arg)
        grad = self.h.df(arg)
        # Rank-one term from differentiating 1 / h.
        rank_one = np.multiply.outer(grad, grad) / value ** 2
        # Curvature of the constraint itself.
        curvature = self.h.ddf(arg) / value
        return rank_one - curvature


@dataclass(slots=True)
class affine(Objective):
    """Affine map ``x -> A @ x + b``."""

    # Coefficient matrix and offset of the map.
    A: np.ndarray
    b: np.ndarray

    def f(self, arg):
        """Return ``A @ arg + b``."""
        product = self.A @ arg
        return product + self.b

    def df(self, arg):
        """Return the rows of ``A`` summed into one vector.

        For a single-row ``A`` this is exactly that row, i.e. the gradient of
        the one affine component.
        """
        row_sum = self.A.sum(axis=0)
        return row_sum

    def ddf(self, arg):
        """Return the all-zero square matrix sized by the columns of ``A``."""
        width = self.A.shape[1]
        return np.zeros(shape=(width, width), dtype=self.A.dtype)


def constrained(obj, initial, barriers, mu=1.5, outer=20):
    """Minimize ``obj`` subject to barrier-encoded constraints.

    The barriers are summed into one objective ``b`` and the problem
    ``t * obj + b`` is solved for ``outer`` values of ``t`` spaced
    geometrically from ``mu ** -1`` to ``mu ** outer``, each solve starting
    from the previous solution.

    Args:
        obj: Objective to minimize.
        initial: Starting point; it is copied, so the caller's array is never
            modified.  With barriers it has to lie where every barrier is
            finite.
        barriers: Sequence of barrier objectives; when empty, ``obj`` is
            minimized without constraints.
        mu: Base of the geometric schedule for ``t``.
        outer: Number of values of ``t`` (and largest exponent of ``mu``).

    Returns:
        The final iterate as a float array.

    Raises:
        RuntimeError: If the summed barrier is not finite at an intermediate
            solution.
    """
    # Work on a private float copy so the caller's ``initial`` stays intact
    # even if the inner solver updates its iterate in place.
    argmin = np.array(initial, dtype=float)

    if not barriers:
        return minimize(obj, argmin)

    # Starting the fold at 0 relies on ``0 + objective`` returning the
    # objective itself.
    b = reduce(add, barriers, 0)

    for t in np.logspace(-1, outer, outer, base=mu):
        argmin = minimize(t * obj + b, argmin)

        # A non-finite barrier value means the iterate left the region where
        # the barrier is defined.
        if not np.isfinite(b.f(argmin)):
            raise RuntimeError(f"mu too large: {mu}")

    return argmin

Minimize distance objective with linear constraint:

In [6]:
def linearb(A, b):
    """Build the log-barrier for the linear constraint ``A @ x - b < 0``."""
    constraint = affine(A, -b)
    return barrier(constraint)

# One linear constraint: a single row of A and a single entry of b, which
# linearb turns into the barrier for x0 + x1 - 1 < 0.
A = np.asarray([[1, 1]])
b = np.asarray([1])

# Find the point closest to (2, 2) under that constraint, starting from the
# origin (where x0 + x1 - 1 = -1 < 0).
n = 2
constrained(
    obj=sqdist(2 * np.ones(n)),
    initial=np.zeros(n),
    barriers=[linearb(A, b)],
)

array([0.49994988, 0.49994988])

Minimize distance objective with L2 norm constraint:

In [7]:
# Find the point closest to (1, 1) subject to ||x||^2 - 1 < 0, starting from
# the origin; the constraint function is the squared norm shifted by -1.
constrained(
    obj=sqdist(np.ones(n)),
    initial=np.zeros(n),
    barriers=[barrier(sqnorm(2) - 1)],
)

array([0.70685036, 0.70685036])