# Numba-compatible Optimisation

> **Created** by Mario Boley on 2025-06-09

In the future we might want to replace `scipy.optimize` with compiled `numba` code. This could be especially useful for Shahrzad's LLTBoost algorithm.

In [None]:
import numpy as np
from numba import njit

@njit
def generate_logreg_data(n, p, seed=0):
    """Draw a synthetic logistic-regression data set.

    Parameters
    ----------
    n : int
        Number of observations (rows of the feature matrix).
    p : int
        Number of features (columns of the feature matrix).
    seed : int, default 0
        Value passed to ``np.random.seed`` before any sampling.

    Returns
    -------
    tuple
        ``(features, labels, weights)``: an ``(n, p)`` standard-normal
        feature matrix, a length-``n`` float vector of 0.0/1.0 labels, and
        the length-``p`` standard-normal weight vector the labels were
        generated from.
    """
    np.random.seed(seed)
    # Sampling order matters for reproducibility: features first, then the
    # weights, then one uniform draw per label.
    features = np.random.randn(n, p)
    weights = np.random.randn(p)
    prob_one = 1 / (1 + np.exp(-(features @ weights)))
    labels = np.empty(n)
    for i in range(n):
        if np.random.rand() < prob_one[i]:
            labels[i] = 1.0
        else:
            labels[i] = 0.0
    return features, labels, weights

# Small problem: 10 observations, 2 features (default seed); `beta` holds the
# weights the labels were generated from.
x, y, beta = generate_logreg_data(10, 2)
x, y, beta

(array([[ 1.76405235,  0.40015721],
        [ 0.97873798,  2.2408932 ],
        [ 1.86755799, -0.97727788],
        [ 0.95008842, -0.15135721],
        [-0.10321885,  0.4105985 ],
        [ 0.14404357,  1.45427351],
        [ 0.76103773,  0.12167502],
        [ 0.44386323,  0.33367433],
        [ 1.49407907, -0.20515826],
        [ 0.3130677 , -0.85409574]]),
 array([0., 0., 0., 0., 1., 1., 0., 0., 0., 0.]),
 array([-2.55298982,  0.6536186 ]))

## Numba Implementation of Raw Newton

In [3]:
import numpy as np
from numba import njit

@njit
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))`` evaluated at ``z``."""
    denom = 1 + np.exp(-z)
    return 1 / denom

@njit
def numba_rawnewton_logreg(x, y, lam, max_iter=100, tol=1e-6):
    """Fit ridge-penalised logistic regression with plain Newton steps.

    Starting from the zero vector, every iteration solves
    ``hessian @ step = gradient`` and subtracts ``step`` from the current
    coefficients. The ``2 * lam`` terms in gradient and Hessian correspond
    to a penalty of ``lam * beta @ beta``. No line search or step damping
    is applied.

    Parameters
    ----------
    x : 2-D array
        Feature matrix, one row per observation.
    y : 1-D array
        Labels, one entry per row of ``x``.
    lam : float
        Penalty weight.
    max_iter : int, default 100
        Maximum number of Newton iterations.
    tol : float, default 1e-6
        Iteration stops once the Euclidean norm of the Newton step falls
        below this value.

    Returns
    -------
    1-D array
        The coefficient vector after the last executed step.
    """
    n_features = x.shape[1]
    beta = np.zeros(n_features)
    # The penalty's contribution to the Hessian does not depend on beta.
    ridge = 2 * lam * np.eye(n_features)

    for _ in range(max_iter):
        prob = sigmoid(x.dot(beta))
        gradient = x.T @ (prob - y) + 2 * lam * beta
        curvature = prob * (1 - prob)
        # Scale each row of x by its curvature weight before forming x^T W x.
        weighted_x = x * curvature[:, None]
        hessian = x.T @ weighted_x + ridge
        step = np.linalg.solve(hessian, gradient)
        beta -= step
        if np.linalg.norm(step) < tol:
            break
    return beta

numba_rawnewton_logreg(x, y, 0.1)

array([-2.86478563,  0.74649916])

In [4]:
@njit
def numba_rawnewton_logreg_loopfused(x, y, lam, max_iter=100, tol=1e-6):
    """Newton solver for ridge-penalised logistic regression with a
    hand-written Hessian loop.

    Same iteration as the matrix-product variant, except that the weighted
    Gramian ``x^T diag(s) x`` is accumulated with explicit loops over the
    lower triangle and then mirrored, instead of materialising the
    row-scaled copy of ``x``.

    Parameters
    ----------
    x : 2-D array
        Feature matrix, one row per observation.
    y : 1-D array
        Labels, one entry per row of ``x``.
    lam : float
        Penalty weight; ``2 * lam`` is added to the Hessian diagonal.
    max_iter : int, default 100
        Maximum number of Newton iterations.
    tol : float, default 1e-6
        Iteration stops once the Euclidean norm of the Newton step falls
        below this value.

    Returns
    -------
    1-D array
        The coefficient vector after the last executed step.
    """
    n_rows, n_cols = x.shape
    beta = np.zeros(n_cols)
    diag_shift = 2 * lam

    for _ in range(max_iter):
        prob = sigmoid(x @ beta)
        gradient = x.T @ (prob - y) + 2 * lam * beta
        weights = prob * (1 - prob)

        # Accumulate the lower triangle (including the diagonal) row by row.
        hessian = np.zeros((n_cols, n_cols))
        for row in range(n_rows):
            w_row = weights[row]
            for a in range(n_cols):
                scaled = w_row * x[row, a]
                for b in range(a + 1):
                    hessian[a, b] += scaled * x[row, b]

        # Add the penalty to the diagonal and mirror into the upper triangle.
        for a in range(n_cols):
            hessian[a, a] += diag_shift
            for b in range(a):
                hessian[b, a] = hessian[a, b]

        step = np.linalg.solve(hessian, gradient)
        beta -= step
        if np.linalg.norm(step) < tol:
            break
    return beta

numba_rawnewton_logreg_loopfused(x, y, 0.1)

array([-2.86478563,  0.74649916])

## SciPy Implementation with Conjugate Gradient Newton

In [5]:
from scipy.optimize import minimize

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))`` in plain NumPy.

    NOTE(review): this rebinds the notebook-level name ``sigmoid`` that an
    earlier cell bound to an ``@njit``-compiled function with the same
    formula; consider giving one of the two a distinct name.
    """
    denom = 1 + np.exp(-z)
    return 1 / denom

def logreg_objective(w, X, y, lam):
    """Ridge-penalised logistic negative log-likelihood.

    Evaluates ``sum(y * log(1 + exp(-z)) + (1 - y) * log(1 + exp(z)))`` with
    ``z = X @ w`` and adds the penalty ``lam * w @ w``.

    The two ``log(1 + exp(.))`` terms are computed with ``np.logaddexp`` so
    that large ``|z|`` does not overflow to ``inf``; with the naive formula
    an ``inf`` term multiplied by a zero label weight turns the whole
    objective into ``nan``.

    Parameters
    ----------
    w : 1-D array
        Coefficient vector.
    X : 2-D array
        Feature matrix, one row per observation.
    y : 1-D array
        Labels, one entry per row of ``X``.
    lam : float
        Penalty weight.

    Returns
    -------
    float
        Penalised negative log-likelihood at ``w``.
    """
    z = X @ w
    # logaddexp(0, t) == log(1 + exp(t)), evaluated without overflow.
    neg_log_likelihood = np.sum(
        y * np.logaddexp(0.0, -z) + (1 - y) * np.logaddexp(0.0, z)
    )
    penalty = lam * np.dot(w, w)
    return neg_log_likelihood + penalty

def logreg_grad(w, X, y, lam):
    """Gradient of the ridge-penalised logistic loss with respect to ``w``.

    Returns ``X.T @ (sigmoid(X @ w) - y) + 2 * lam * w``.
    """
    residual = sigmoid(X @ w) - y
    return X.T @ residual + 2 * lam * w

def logreg_hess(w, X, y, lam):
    """Hessian of the ridge-penalised logistic loss with respect to ``w``.

    Returns ``X.T @ diag(p * (1 - p)) @ X + 2 * lam * I`` with
    ``p = sigmoid(X @ w)``. ``y`` is accepted for a uniform call signature
    but is not used in the computation.
    """
    prob = sigmoid(X @ w)
    curvature = prob * (1 - prob)
    n_features = X.shape[1]
    # Row-scale X by the curvature weights instead of building diag(.).
    return X.T @ (X * curvature[:, None]) + 2 * lam * np.eye(n_features)

def scipy_newtoncg_logreg(X, y, lam):
    """Fit ridge-penalised logistic regression with SciPy's Newton-CG.

    Minimises ``logreg_objective`` from a zero start vector, supplying the
    analytic gradient and Hessian, and returns the solution vector
    (``res.x``) reported by ``scipy.optimize.minimize``. The optimiser's
    success flag is not inspected.
    """
    start = np.zeros(X.shape[1])
    solver_options = {'xtol': 1e-6, 'disp': False}
    result = minimize(
        logreg_objective,
        start,
        args=(X, y, lam),
        method='Newton-CG',
        jac=logreg_grad,
        hess=logreg_hess,
        options=solver_options,
    )
    return result.x

scipy_newtoncg_logreg(x, y, 0.1)

array([-2.86478563,  0.74649916])

In [6]:
%timeit scipy_newtoncg_logreg(x, y, 0.1)

276 μs ± 6.03 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
%timeit numba_rawnewton_logreg(x, y, 0.1)

21.6 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [8]:
%timeit numba_rawnewton_logreg_loopfused(x, y, 0.1)

20.5 μs ± 258 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## Medium Size Problem

In [9]:
# Medium problem: 200 observations, 10 features (default seed); display the
# generating weights for comparison with the fitted coefficients below.
x_med, y_med, beta_med = generate_logreg_data(200, 10)
beta_med

array([-1.53292105, -1.71197016,  0.04613506, -0.95837448, -0.08081161,
       -0.70385904, -0.7707843 , -0.48084534,  0.70358555,  0.92914515])

In [10]:
numba_rawnewton_logreg(x_med, y_med, 0.01)

array([-2.11985398, -1.95074541, -0.06884512, -0.94531301, -0.06184398,
       -0.83223477, -0.90754435, -0.48258231,  1.00053492,  0.84389168])

In [11]:
scipy_newtoncg_logreg(x_med, y_med, 0.01)

array([-2.11985398, -1.95074541, -0.06884512, -0.94531301, -0.06184398,
       -0.83223477, -0.90754435, -0.48258231,  1.00053492,  0.84389168])

In [12]:
numba_rawnewton_logreg_loopfused(x_med, y_med, 0.01)

array([-2.11985398, -1.95074541, -0.06884512, -0.94531301, -0.06184398,
       -0.83223477, -0.90754435, -0.48258231,  1.00053492,  0.84389168])

In [13]:
%timeit scipy_newtoncg_logreg(x_med, y_med, 0.01)

611 μs ± 8.17 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [14]:
%timeit numba_rawnewton_logreg(x_med, y_med, 0.01)

57.1 μs ± 213 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [15]:
%timeit numba_rawnewton_logreg_loopfused(x_med, y_med, 0.01)

51.6 μs ± 179 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [29]:
# Large problem: 2000 observations, 100 features (default seed); display the
# generating weights.
x_large, y_large, beta_large = generate_logreg_data(2000, 100)
beta_large

array([ 0.0395099 ,  0.33837761, -0.84218319, -0.04963228, -1.23024536,
       -0.98879267,  0.60207459, -0.1595486 , -1.28334796, -1.23822008,
        0.7172287 , -0.48890867,  1.6726215 ,  0.54363772,  0.44157299,
       -0.3919033 , -1.31489467,  0.55313608,  0.20199444,  2.03256366,
       -1.05225342, -1.3299612 ,  1.38865101, -0.24899491, -0.5529632 ,
        0.4356544 ,  0.50993161,  1.8285584 ,  0.60777267,  0.62245265,
        1.21324199,  0.39491096, -1.77359641,  0.04892789, -1.57725036,
       -0.71665916,  0.15661669, -0.00540036,  1.35534798, -1.4107126 ,
       -0.48702981, -1.08426052, -1.16806619, -0.66418541, -1.86610036,
       -1.34430408,  0.44725004,  0.07811359, -0.76761874,  0.64854971,
        1.35362042,  1.41406129,  1.80486759, -1.69093891, -0.15198415,
       -1.64778682, -0.24812456, -0.52488949, -0.90949058,  1.14413438,
       -1.1993784 , -0.10774471, -0.72829357, -0.1086098 ,  0.44094439,
       -0.45808654,  0.96875567, -1.40368748,  1.44444663, -0.10

In [31]:
# Mean squared difference between the generating weights and each solver's
# estimate (lam = 0.001), one value per line in the order: SciPy Newton-CG,
# Numba raw Newton, Numba raw Newton with the loop-fused Hessian.
print(
    *(
        np.mean((beta_large - fit(x_large, y_large, 0.001)) ** 2)
        for fit in (
            scipy_newtoncg_logreg,
            numba_rawnewton_logreg,
            numba_rawnewton_logreg_loopfused,
        )
    ),
    sep="\n",
)

0.12124282326377242
0.12124283497888044
0.12124283497888026


In [32]:
%timeit scipy_newtoncg_logreg(x_large, y_large, 0.001)

8.79 ms ± 447 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
%timeit numba_rawnewton_logreg(x_large, y_large, 0.001)

5.15 ms ± 20.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
%timeit numba_rawnewton_logreg_loopfused(x_large, y_large, 0.001)

13.7 ms ± 346 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
