# Maintainable Architecture for Boosting Variants

This study explores options for a maintainable but computationally efficient code architecture to provide a range of boosting options. Specifically, we focus here on the choice of rule condition optimisation algorithm. However, the goal is to establish an architectural development direction that is also able to integrate different objective functions, different weight optimisers, and so on.

The main tension is that:

1. For efficiency, the main boosting loop should be implemented as a Numba `njit` function or `jitclass` method.
2. Numba does not support certain mechanisms, especially those related to class-based polymorphism, that are usually used to provide a clean architectural framework for extensions without code duplication.

## Numba supports higher order functions with type-based specialisation

In [18]:
import numpy as np
from numba import njit

@njit
def sum_aggregator(a=0, b=0):
    """Return ``a + b``; called without arguments it yields 0."""
    total = a + b
    return total

@njit
def product_aggregator(a=1, b=1):
    """Return ``a * b``; called without arguments it yields 1."""
    product = a * b
    return product

@njit
def aggregate(a, aggregator):
    """Fold the elements of ``a`` with the binary function ``aggregator``.

    The accumulator is initialised with ``aggregator()`` (i.e. the
    aggregator's default arguments) and then combined with every element
    of ``a`` in iteration order.
    """
    acc = aggregator()
    for item in a:
        acc = aggregator(acc, item)
    return acc

# Demo: the same jitted `aggregate` is called with two different aggregator functions.
a = np.array([1, 2, 3, 4, 5])
print(aggregate(a, sum_aggregator))
print(aggregate(a, product_aggregator))


15
120


In [19]:
from numba.typed import List
from numba.types import unicode_type

@njit
def concat_aggregator(a='', b=''):
    """Return the concatenation ``a + b``; without arguments this is the empty string."""
    joined = a + b
    return joined

# Typed list of strings to aggregate. It is deliberately not called `list`:
# rebinding that name would shadow the builtin for every later cell.
strings = List.empty_list(unicode_type)
strings.append('a')
strings.append('b')
strings.append('c')

aggregate(strings, concat_aggregator)

'abc'

In [26]:
def aggregation_function(aggregator):
    """Return a jitted one-argument function that folds its input with ``aggregator``.

    The returned function calls ``aggregate(a, aggregator)``; ``aggregator``
    is captured from the enclosing scope.
    """

    @njit
    def folded(a):
        return aggregate(a, aggregator)

    return folded

# Demo: specialise the generic fold to summation and apply it to an integer array.
sum_aggregate = aggregation_function(sum_aggregator)
sum_aggregate(np.array([1, 2, 3, 4]))

10

In [27]:
def outer(f, c):
    """Return a jitted function ``x -> f(x, c)`` with ``f`` and ``c`` captured by closure."""

    @njit
    def bound(x):
        return f(x, c)

    return bound

@njit
def f(x, y):
    """Return ``x + y``."""
    result = x + y
    return result

# Demo: fix the second argument of f to 2, then evaluate the resulting function at 1.
g = outer(f, 2)
g(1)


3

## Jitclasses as Members do not Provide Polymorphism

The main reason is that jitclasses cannot store functions as fields and, when they store other jitclass objects, each such field has to be declared with a single fixed type. Since jitclasses also do not support subclassing, this likely leaves explicit behaviour lists defined within the class as the only option for configuring behaviour.

In [48]:
from numba import int64

@jitclass
class Foo1:
    """Jitclass holding an integer ``n``; ``compute`` returns ``n + 1``."""

    n: int64

    def __init__(self, n):
        self.n = n

    def compute(self):
        incremented = self.n + 1
        return incremented
    
@jitclass
class Foo2:
    """Jitclass holding an integer ``n``; ``compute`` returns ``2 * n``."""

    n: int64

    def __init__(self, n):
        self.n = n

    def compute(self):
        doubled = 2 * self.n
        return doubled

@jitclass
class Ba:
    """Jitclass that forwards ``compute`` to a member jitclass object."""

    # The field has to be typed as either Foo1 or Foo2; it cannot accept both.
    foo: Foo1

    def __init__(self, foo):
        self.foo = foo

    def compute(self):
        delegated = self.foo.compute()
        return delegated
    
# Demo: Ba delegates to the wrapped Foo1 instance, so compute() gives 10 + 1.
ba = Ba(Foo1(10))
ba.compute()


11

## Jitclasses Support Flexible Init Arguments

In [None]:
from numba.types import unicode_type

@jitclass
class Foo3:
    """Jitclass with a string field that is filled from the constructor argument via ``str``."""

    n: unicode_type

    def __init__(self, n):
        # The argument is converted before it is stored, so its type need not
        # match the declared field type.
        as_text = str(n)
        self.n = as_text

# Demo: an integer argument is stored in the string field as its str() conversion.
foo3 = Foo3(3)
foo3.n

'3'

In [66]:
import numpy as np
from numba import int64, float64, boolean, njit
from numba.experimental import jitclass
from optikon import Propositionalization, max_weighted_support_bb, max_weighted_support_greedy, equal_width_propositionalization, full_propositionalization
from numba.typed import List
from numba.types import unicode_type
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin

# Field layout of RegressionSpec as (name, numba type) pairs.
regression_spec_spec = [
    ('y', float64[:]), ('x', float64[:, :]),
    ('max_features', int64), ('intercept', boolean),
    ('lam', float64),
]


@jitclass(regression_spec_spec)
class RegressionSpec:
    """Immutable-by-convention description of a regression boosting problem.

    Fields: target vector ``y``, input matrix ``x``, the number of rules to
    fit (``max_features``), whether an intercept column is used
    (``intercept``) and the regularisation parameter ``lam``.
    """

    def __init__(self, y, x, max_features, intercept, lam):
        # Data
        self.x = x
        self.y = y
        # Configuration
        self.lam = lam
        self.intercept = intercept
        self.max_features = max_features

# Field layout of ClassificationSpec as (name, numba type) pairs.
classification_spec_spec = [
    ('y', int64[:]),
    ('x', float64[:, :]),
    ('max_features', int64),
    ('intercept', boolean),
    ('lam', float64),
    ('max_iter', int64),
    ('tol', float64)
]

@jitclass(classification_spec_spec)
class ClassificationSpec:
    """Description of a classification boosting problem.

    Parameters
    ----------
    y : int64 array
        Target labels.
    x : float64 2-d array
        Input matrix.
    max_features : int
        Number of rules to fit.
    intercept : bool
        Whether an intercept column is used.
    lam : float
        Regularisation parameter.
    max_iter : int, default=100
        Iteration limit read by ``fit_min_logistic_loss_coefs``.
    tol : float, default=1e-6
        Step-norm threshold read by ``fit_min_logistic_loss_coefs`` to stop early.
    """

    def __init__(self, y, x, max_features, intercept, lam, max_iter=100, tol=1e-6):
        self.y = y
        self.x = x
        self.max_features = max_features
        self.intercept = intercept
        self.lam = lam
        # Previously hard-coded; the defaults keep five-argument calls unchanged.
        self.max_iter = max_iter
        self.tol = tol

# Fields shared by all boosting states as (name, numba type) pairs.
state_spec = [
    ('phi', float64[:, :]), ('coef', float64[:]),
    ('current_features', int64),
]


@jitclass(state_spec)
class BoostingState:
    """Mutable boosting state: feature matrix, coefficients and number of columns in use."""

    def __init__(self, phi, coef, current_features):
        self.current_features = current_features
        self.coef = coef
        self.phi = phi

    @staticmethod
    def from_spec(spec):
        """Allocate a zero-initialised state sized for ``spec``.

        The number of columns is ``spec.max_features`` plus one if
        ``spec.intercept`` is set; no feature is active initially.
        """
        n_cols = spec.max_features+spec.intercept
        n_rows = len(spec.y)
        return BoostingState(np.zeros(shape=(n_rows, n_cols)), np.zeros(n_cols), 0)

# Common state fields plus the matrices maintained by the incremental
# least-squares fit.
incremental_ls_spec = [
    *state_spec,
    ('gram', float64[:, :]), ('chol', float64[:, :]),
]


@jitclass(incremental_ls_spec)
class IncrementalLeastSquaresBoostingState:
    """Boosting state extended with square ``gram`` and ``chol`` work matrices."""

    def __init__(self, phi, coef, current_features, gram, chol):
        self.current_features = current_features
        self.coef = coef
        self.phi = phi
        self.chol = chol
        self.gram = gram

    @staticmethod
    def from_spec(spec):
        """Allocate a zero-initialised state sized for ``spec``.

        With ``p = spec.max_features + spec.intercept``, ``phi`` has ``p``
        columns, ``coef`` has length ``p`` and ``gram``/``chol`` are ``p x p``.
        """
        p = spec.max_features+spec.intercept
        n_rows = len(spec.y)
        features = np.zeros(shape=(n_rows, p))
        gram_matrix = np.zeros((p, p))
        chol_factor = np.zeros((p, p))
        weights = np.zeros(p)
        return IncrementalLeastSquaresBoostingState(features, weights, 0, gram_matrix, chol_factor)

@njit
def gradient_least_squares(spec, state):
    """Return prediction minus target, using only the currently active features."""
    k = state.current_features
    prediction = state.phi[:, :k].dot(state.coef[:k])
    return prediction - spec.y

@njit
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

@njit
def gradient_logistic_loss(spec, state):
    """Return sigmoid of the current linear score minus the target labels.

    Only the first ``state.current_features`` columns and coefficients are used.
    """
    k = state.current_features
    score = state.phi[:, :k].dot(state.coef[:k])
    return sigmoid(score) - spec.y

@njit
def fit_minimum_squared_loss_coefs_incrementally(spec, state):
    """Refit the active coefficients after one new feature column was added.

    The newest column is ``j = state.current_features - 1``. The function
    extends ``state.gram`` by row/column ``j``, appends row ``j`` to the lower
    triangular factor ``state.chol``, and then solves the resulting system
    with one forward and one backward substitution. The solution is written
    in place into ``state.coef[:state.current_features]``; nothing is returned.

    Rows ``< j`` of ``state.chol`` are read but not recomputed, so the function
    relies on having been called once for each earlier column, in order.
    """
    x, y = state.phi, spec.y
    g, l = state.gram, state.chol
    # View into state.coef: assignments below update the state in place.
    coef = state.coef[:state.current_features]
    j = state.current_features - 1

    # Update Gramian: fill row j and column j (symmetric) and the diagonal entry.
    g[j, :j] = x[:, :j].T @ x[:, j]
    g[:j, j] = g[j, :j]
    g[j, j] = x[:, j] @ x[:, j]

    # Regularise every diagonal entry except column 0 when it is the intercept.
    if j!=0 or not spec.intercept:
        g[j, j] += spec.lam

    # Compute RHS: inner product of each active column with the target.
    b = np.zeros(j + 1)
    for i in range(j + 1):
        b[i] = x[:, i] @ y

    # Cholesky update: compute row j of l from the rows already present.
    for k in range(j):
        s = 0.0
        for m in range(k):
            s += l[j, m] * l[k, m]
        l[j, k] = (g[j, k] - s) / l[k, k]
    s = 0.0
    for m in range(j):
        s += l[j, m] ** 2
    l[j, j] = np.sqrt(g[j, j] - s)

    # Solve l z = b  (forward solve writing z into coef)
    for i in range(j + 1):
        s = 0.0
        for k in range(i):
            s += l[i, k] * coef[k]
        coef[i] = (b[i] - s) / l[i, i]

    # Solve l' coef = z  (backward solve, in-place)
    for i in range(j, -1, -1):
        s = 0.0
        for k in range(i + 1, j + 1):
            s += l[k, i] * coef[k]
        coef[i] = (coef[i] - s) / l[i, i]

@njit
def fit_min_logistic_loss_coefs(spec, state):
    """Refit the active coefficients by Newton iterations on the penalised logistic loss.

    Works in place on ``state.coef[:state.current_features]``. At most
    ``spec.max_iter`` steps are taken; iteration stops early once the norm of
    a step falls below ``spec.tol``. The ``lam`` penalty term is applied to
    all active coefficients.
    """
    phi = state.phi[:, :state.current_features]
    d = phi.shape[1]
    # View into state.coef, so the in-place update below modifies the state.
    beta = state.coef[:d]
    penalty = 2 * spec.lam

    for _ in range(spec.max_iter):
        prob = sigmoid(phi.dot(beta))
        weights = prob * (1 - prob)
        gradient = phi.T @ (prob - spec.y) + penalty * beta
        hessian = phi.T @ (phi * weights[:, None]) + penalty * np.eye(d)
        step = np.linalg.solve(hessian, gradient)
        beta -= step
        if np.linalg.norm(step) < spec.tol:
            break

@jitclass
class BranchAndBoundGradientSumBaseLearner:
    """Base learner that selects a rule condition via ``max_weighted_support_bb``.

    The propositionalization of ``spec.x`` is built once in the constructor
    by ``prop_fac`` and reused for every call to ``compute``.
    """

    max_depth: int64
    props: Propositionalization

    def __init__(self, spec, max_depth=5, prop_fac=equal_width_propositionalization):
        self.props = prop_fac(spec.x)
        self.max_depth = max_depth

    def compute(self, spec, state, gradient_function):
        """Return the better of the two searches run on the gradient and its negation.

        Ties are resolved in favour of the result for the un-negated gradient.
        """
        grad = gradient_function(spec, state)

        q_pos, val_pos, _ = max_weighted_support_bb(spec.x, grad, self.props, self.max_depth)
        q_neg, val_neg, _ = max_weighted_support_bb(spec.x, -grad, self.props, self.max_depth)
        if val_pos >= val_neg:
            return q_pos
        return q_neg


def bb_baselearner_function_maker(spec, gradient_function, max_depth=4, prop='equal_width'):
    """Build a jitted ``state -> rule condition`` function based on ``max_weighted_support_bb``.

    ``prop`` selects the propositionalization of ``spec.x`` (``'equal_width'``
    or ``'full'``); any other value raises ``NotImplementedError``. The
    propositionalization is computed once here and captured by the closure.
    """
    if prop == 'equal_width':
        make_props = equal_width_propositionalization
    elif prop == 'full':
        make_props = full_propositionalization
    else:
        raise NotImplementedError(prop)
    props = make_props(spec.x)

    @njit
    def res(state):
        grad = gradient_function(spec, state)

        # Search on the gradient and on its negation; keep the better result,
        # preferring the un-negated one on ties.
        q_pos, val_pos, _ = max_weighted_support_bb(spec.x, grad, props, max_depth)
        q_neg, val_neg, _ = max_weighted_support_bb(spec.x, -grad, props, max_depth)
        if val_pos >= val_neg:
            return q_pos
        return q_neg

    return res


def greedy_baselearner_function_maker(spec, gradient_function, max_depth=4):
    """Build a jitted ``state -> rule condition`` function based on ``max_weighted_support_greedy``.

    Parameters
    ----------
    spec : problem specification providing ``x``; captured by the closure.
    gradient_function : callable ``(spec, state) -> gradient`` used inside the closure.
    max_depth : int, default=4
        Depth limit forwarded to ``max_weighted_support_greedy``.

    Returns
    -------
    A jitted function taking the boosting state and returning the selected
    rule condition.
    """

    @njit
    def res(state):
        g = gradient_function(spec, state)

        # Use the closed-over `max_depth`: this is a plain closure, so there is
        # no `self` to read the depth from.
        opt_q_pos, opt_val_pos, _ = max_weighted_support_greedy(spec.x, g, max_depth)
        opt_q_neg, opt_val_neg, _ = max_weighted_support_greedy(spec.x, -g, max_depth)
        if opt_val_pos >= opt_val_neg:
            return opt_q_pos
        else:
            return opt_q_neg

    return res


@jitclass
class GreedyGradientSumBaseLearner:
    """Base learner that selects a rule condition via ``max_weighted_support_greedy``.

    The constructor accepts ``spec`` and ``prop_factory`` but does not use
    them; only ``max_depth`` is stored.
    """

    max_depth: int64

    def __init__(self, spec, max_depth=5, prop_factory=None):
        self.max_depth = max_depth

    def compute(self, spec, state, gradient_function):
        """Return the better of the two searches run on the gradient and its negation.

        Ties are resolved in favour of the result for the un-negated gradient.
        """
        grad = gradient_function(spec, state)

        q_pos, val_pos, _ = max_weighted_support_greedy(spec.x, grad, self.max_depth)
        q_neg, val_neg, _ = max_weighted_support_greedy(spec.x, -grad, self.max_depth)
        if val_pos >= val_neg:
            return q_pos
        return q_neg
        
@njit
def gradient_sum_rule_ensemble(spec, state, fit_function, base_learner, gradient_function):
    """Run the boosting loop and return ``(state.coef, qs)``.

    If ``spec.intercept`` is set, an all-ones column with an empty condition
    is added first. Then ``spec.max_features`` rounds follow; each round asks
    ``base_learner`` for a condition, marks the rows it selects in the next
    free column of ``state.phi`` and refits the coefficients with
    ``fit_function``.
    """
    qs = List()
    if spec.intercept:
        # Condition built from three empty arrays, paired with the constant column.
        qs.append(Propositionalization(
            np.empty(0, dtype=np.int64),
            np.empty(0, dtype=np.float64),
            np.empty(0, dtype=np.int64),
        ))
        state.phi[:, state.current_features] = 1
        state.current_features += 1
        fit_function(spec, state)

    for _ in range(spec.max_features):
        rule = base_learner.compute(spec, state, gradient_function)
        qs.append(rule)

        selected = rule.support_all(spec.x)
        state.phi[selected, state.current_features] = 1
        state.current_features += 1

        fit_function(spec, state)
    return state.coef, qs

class BaseRuleBoostingEstimator(BaseEstimator):
    """Shared scikit-learn front end for the jitted rule boosting loop.

    Parameters
    ----------
    spec_factory : callable
        Called as ``spec_factory(y, x, num_rules, fit_intercept, lam)`` to
        build the problem specification.
    state_factory : callable
        Called as ``state_factory(spec)`` to build the initial boosting state.
    gradient_function : callable
        Gradient function passed on to the boosting loop.
    fit_function : callable
        Coefficient fitting function passed on to the boosting loop.
    num_rules : int, default=3
        Number of rules to fit.
    fit_intercept : bool, default=True
        Whether to include an intercept term.
    lam : float, default=0.0
        Regularisation parameter.
    prop : str, default='equal_width'
        Key into ``prop_options``.
    baselearner : str, default='greedy'
        Key into ``baselearner_options``.
    max_depth : int, default=5
        Depth limit passed to the base learner.

    Attributes
    ----------
    coef_ : coefficient array returned by the boosting loop (set by ``fit``).
    q_ : list of rule conditions returned by the boosting loop (set by ``fit``).
    """

    prop_options = {
        'equal_width': equal_width_propositionalization,
        'full': full_propositionalization
    }

    baselearner_options = {
        'greedy': GreedyGradientSumBaseLearner,
        'bb': BranchAndBoundGradientSumBaseLearner,
    }

    def __init__(self,
                 spec_factory,
                 state_factory,
                 gradient_function,
                 fit_function, num_rules=3,
                 fit_intercept=True,
                 lam=0.0,
                 prop='equal_width',
                 baselearner='greedy',
                 max_depth=5):
        self.num_rules = num_rules
        self.fit_intercept = fit_intercept
        self.prop = prop
        self.baselearner = baselearner
        self.max_depth = max_depth
        self.lam = lam
        self.spec_factory = spec_factory
        self.state_factory = state_factory
        self.gradient_function = gradient_function
        self.fit_function = fit_function

    def fit(self, x, y):
        """Fit the rule ensemble on ``(x, y)`` and return ``self``.

        Raises ``KeyError`` if ``prop`` or ``baselearner`` is not a known option.
        """
        spec = self.spec_factory(y, x, self.num_rules, self.fit_intercept, self.lam)
        base_learner = self.baselearner_options[self.baselearner](
            spec, self.max_depth, self.prop_options[self.prop])
        state = self.state_factory(spec)
        self.coef_, self.q_ = gradient_sum_rule_ensemble(
            spec, state, self.fit_function, base_learner, self.gradient_function)
        return self

    def predict(self, x):
        """Return the linear score: rule indicator matrix times ``coef_``."""
        q_matrix = self.transform(x)
        return q_matrix.dot(self.coef_)

    def transform(self, x):
        """Return the 0/1 matrix with one column per fitted rule condition."""
        n = len(x)
        q_matrix = np.zeros(shape=(n, len(self.q_)))
        for i, q in enumerate(self.q_):
            q_matrix[q.support_all(x), i] = 1
        return q_matrix

    def rules_str(self):
        """Return one line per rule of the form ``'<coef> if <condition> '``.

        Lines are separated by newlines, with no trailing newline.
        """
        # Built with str.join rather than a nested-quote f-string so the module
        # also parses on Python versions before 3.12.
        lines = [
            f'{self.coef_[i]:+.3f} if {q.str_from_conj(np.arange(len(q)))} '
            for i, q in enumerate(self.q_)
        ]
        return '\n'.join(lines)

class RuleBoostingRegressor(BaseRuleBoostingEstimator, RegressorMixin):
    """
    Rule-based regressor using gradient boosting with greedy or branch-and-bound search for conjunctive conditions.

    Parameters
    ----------
    num_rules : int, default=3
        Maximum number of rules to fit.
    fit_intercept : bool, default=True
        Whether to include an intercept term.
    lam : float, default=1.0
        L2 regularization parameter.
    prop : {'equal_width', 'full'}, default='equal_width'
        Name of the propositionalization passed to the base learner.
    baselearner : {'greedy', 'bb'}, default='greedy'
        Name of the base learner used to search rule conditions.
    max_depth : int, default=4
        Maximum depth of rule condition tree search.

    Examples
    --------
    >>> from ruleboost import RuleBoostingRegressor
    >>> import numpy as np
    >>> x = np.array([[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9]])
    >>> y = np.array([0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 4.0, 4.0, 4.0])
    >>> model = RuleBoostingRegressor(num_rules=2, lam=0.0, fit_intercept=True, prop='full', baselearner='bb').fit(x, y)
    >>> print(model.rules_str()) # doctest: +NORMALIZE_WHITESPACE
        +4.000 if  
        -3.000 if x1 <= 0.600 
        -1.000 if x1 <= 0.300 
    >>> np.round(model.predict(x), 3)
    array([0., 0., 0., 1., 1., 1., 4., 4., 4.])
    """

    # NOTE(review): the example now selects options by name ('full', 'bb'), as
    # the option dictionaries are keyed by strings; the expected output was
    # carried over unchanged and should be re-run to confirm.

    def __init__(self, num_rules=3, fit_intercept=True, lam=1.0, prop='equal_width', baselearner='greedy', max_depth=4):
        super().__init__(spec_factory=RegressionSpec,
                         state_factory=IncrementalLeastSquaresBoostingState.from_spec,
                         gradient_function=gradient_least_squares,
                         fit_function=fit_minimum_squared_loss_coefs_incrementally,
                         num_rules=num_rules,
                         fit_intercept=fit_intercept,
                         lam=lam,
                         prop=prop,
                         baselearner=baselearner,
                         max_depth=max_depth)

class RuleBoostingClassifier(BaseRuleBoostingEstimator, ClassifierMixin):
    """
    Rule-based binary classifier using gradient boosting with greedy or branch-and-bound search for conjunctive conditions.

    Parameters
    ----------
    num_rules : int, default=3
        Maximum number of rules to fit.
    fit_intercept : bool, default=True
        Whether to include an intercept term.
    lam : float, default=1.0
        L2 regularization parameter.
    prop : {'equal_width', 'full'}, default='equal_width'
        Name of the propositionalization passed to the base learner.
    baselearner : {'greedy', 'bb'}, default='greedy'
        Name of the base learner used to search rule conditions.
    max_depth : int, default=4
        Maximum depth of rule condition tree search.

    Examples
    --------
    >>> from ruleboost import RuleBoostingClassifier
    >>> import numpy as np
    >>> x = np.array([[0.1], [0.2], [0.3], [0.4], [0.5], [0.6], [0.7], [0.8], [0.9]])
    >>> y = np.array([0, 0, 0, 1, 1, 1, 0, 0, 0])
    >>> model = RuleBoostingClassifier(num_rules=1, fit_intercept=True, prop='full', baselearner='bb').fit(x, y)
    >>> print(model.rules_str()) # doctest: +NORMALIZE_WHITESPACE
        -0.475 if  
        +0.675 if x1 >= 0.400 & x1 <= 0.600
    >>> model.predict(x)
    array([0, 0, 0, 1, 1, 1, 0, 0, 0])
    >>> np.round(model.predict_proba(x)[:, 1], 2)
    array([0.38, 0.38, 0.38, 0.55, 0.55, 0.55, 0.38, 0.38, 0.38])
    """

    # NOTE(review): the example now selects options by name ('full', 'bb'), as
    # the option dictionaries are keyed by strings; the expected output was
    # carried over unchanged and should be re-run to confirm.

    def __init__(self, num_rules=3, fit_intercept=True, lam=1.0, prop='equal_width', baselearner='greedy', max_depth=4):
        super().__init__(spec_factory=ClassificationSpec,
                         state_factory=BoostingState.from_spec,
                         gradient_function=gradient_logistic_loss,
                         fit_function=fit_min_logistic_loss_coefs,
                         num_rules=num_rules,
                         fit_intercept=fit_intercept,
                         lam=lam,
                         prop=prop,
                         baselearner=baselearner,
                         max_depth=max_depth)

    def fit(self, x, y):
        """Encode the labels as 0/1 and fit the rule ensemble; returns ``self``.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct classes. The
            logistic gradient and ``predict_proba`` only handle the binary case.
        """
        classes, y_encoded = np.unique(y, return_inverse=True)
        if len(classes) != 2:
            raise ValueError(
                f'RuleBoostingClassifier supports binary targets only; got {len(classes)} distinct class(es)'
            )
        self.classes_ = classes
        return super().fit(x, y_encoded)

    def predict_proba(self, x):
        """Return an ``(n, 2)`` array with the probabilities of ``classes_[0]`` and ``classes_[1]``."""
        res = np.zeros((len(x), len(self.classes_)))
        # The linear score is mapped through the sigmoid to the probability of the second class.
        res[:, 1] = sigmoid(super().predict(x))
        res[:, 0] = 1 - res[:, 1]
        return res

    def predict(self, x):
        """Return the predicted class label: ``classes_[1]`` where the linear score is >= 0."""
        return self.classes_[(super().predict(x)>=0.0).astype(np.int64)]


In [67]:
from sklearn.datasets import load_breast_cancer
# Demo: fit the classifier with the branch-and-bound base learner on the
# breast-cancer data and show the learned rules and the estimator itself.
x_class, y_class = load_breast_cancer(return_X_y=True)
classifier_bb = RuleBoostingClassifier(baselearner='bb', max_depth=2)
classifier_bb.fit(x_class, y_class)
print(classifier_bb.rules_str())
classifier_bb

-0.741 if  
+2.459 if x1 <= 15.844 & x28 <= 0.140 
-2.507 if x2 >= 16.504 & x21 >= 14.905 
+2.234 if x5 <= 0.121 & x24 <= 871.498 


0,1,2
,num_rules,3
,fit_intercept,True
,lam,1.0
,prop,'equal_width'
,baselearner,'bb'
,max_depth,2


In [63]:
# Demo: same data as the previous classifier cell, but with the greedy base learner.
classifier_greedy = RuleBoostingClassifier(baselearner='greedy', max_depth=2)
classifier_greedy.fit(x_class, y_class)
print(classifier_greedy.rules_str())
classifier_greedy

-0.649 if  
+2.797 if x23 <= 105.950 & x28 <= 0.159 
-2.906 if x8 >= 0.049 & x22 >= 20.695 
+2.317 if x24 <= 953.700 & x2 <= 21.315 


0,1,2
,num_rules,3
,fit_intercept,True
,lam,1.0
,prop,'equal_width'
,baselearner,'greedy'
,max_depth,2


In [54]:
from sklearn.datasets import load_diabetes
# Demo: fit the regressor with the branch-and-bound base learner on the diabetes data.
x_regr, y_regr = load_diabetes(return_X_y=True)
regressor_bb = RuleBoostingRegressor(baselearner='bb', max_depth=2)
regressor_bb.fit(x_regr, y_regr)
print(regressor_bb.rules_str())

+138.250 if  
+57.165 if x3 >= -0.022 & x9 >= -0.005 
+46.005 if x3 >= 0.012 & x4 >= -0.055 
-38.181 if x4 <= 0.039 & x10 <= 0.042 


In [55]:
# Demo: same data as the previous regressor cell, but with the greedy base learner.
regressor_greedy = RuleBoostingRegressor(baselearner='greedy', max_depth=2)
regressor_greedy.fit(x_regr, y_regr)
print(regressor_greedy.rules_str())

+176.300 if  
+61.961 if x9 >= -0.004 & x3 >= -0.023 
-48.262 if x3 <= 0.018 & x10 <= 0.042 
-31.549 if x4 <= 0.024 & x1 >= -0.087 
