diff --git a/docs/basic/grammar.md b/docs/basic/grammar.md index 243ed2d..62d3229 100644 --- a/docs/basic/grammar.md +++ b/docs/basic/grammar.md @@ -59,6 +59,7 @@ that have *not* been implemented by `formualaic` are explicitly noted also. | `center(...)` | Shift column data so mean is zero. | ✓ | ✓ | 🗙 | | `scale(...)` | Shift column so mean is zero and variance is 1. | ✓ | ✓[^7] | ✓ | | `standardize(...)` | Alias of `scale`. | 🗙 | ✓ | 🗙 | +| `poly(...)` | Generates a polynomial basis, allowing non-linear fits. | ✓ | 🗙 | ✓ | | `bs(...)` | Generates a B-Spline basis, allowing non-linear fits. | ✓ | ✓ | ✓ | | `cr(...)` | Generates a natural cubic spline basis, allowing non-linear fits. | 🗙 | ✓ | ✓ | | `cc(...)` | Generates a cyclic cubic spline basis, allowing non-linear fits. | 🗙 | ✓ | ✓ | diff --git a/formulaic/transforms/__init__.py b/formulaic/transforms/__init__.py index a54a527..3f2d525 100644 --- a/formulaic/transforms/__init__.py +++ b/formulaic/transforms/__init__.py @@ -1,12 +1,14 @@ from .basis_spline import basis_spline from .identity import identity from .encode_categorical import encode_categorical +from .poly import poly from .scale import center, scale TRANSFORMS = { "bs": basis_spline, "center": center, + "poly": poly, "scale": scale, "C": encode_categorical, "I": identity, diff --git a/formulaic/transforms/poly.py b/formulaic/transforms/poly.py new file mode 100644 index 0000000..998f27a --- /dev/null +++ b/formulaic/transforms/poly.py @@ -0,0 +1,108 @@ +from formulaic.utils.stateful_transforms import stateful_transform + +import numpy +import numpy.typing + + +@stateful_transform +def poly( + x: numpy.typing.ArrayLike, degree: int = 1, raw: bool = False, _state=None +) -> numpy.ndarray: + """ + Generate a basis for a polynomial vector-space representation of `x`. + + The basis vectors returned by this transform can be used, for example, to + capture non-linear dependence on `x` in a linear regression. + + Args: + x: The vector for which a polynomial vector space should be generated. + degree: The degree of the polynomial vector space. + raw: Whether to return "raw" basis vectors (e.g. `[x, x**2, x**3]`). If + `False`, an orthonormal set of basis vectors is returned instead + (see notes below for more information). + + Returns: + A two-dimensional numpy array with `len(x)` rows, and `degree` columns. + The columns represent the basis vectors of the polynomial vector-space. + + Notes: + This transform is an implementation of the "three-term recurrence + relation" for monic orthogonal polynomials. There are many good + introductions to these recurrence relations, including: + https://dec41.user.srcf.net/h/IB_L/numerical_analysis/2_3 + Another common approach is QR factorisation, where the columns of Q are + the orthogonal basis vectors. However, our implementation outperforms + numpy's QR decomposition, and does not require needless computation of + the R matrix. It should also be noted that orthogonal polynomial bases + are unique up to the choice of inner-product and scaling, and so all + methods will result in the same set of polynomials. + + When used as a stateful transform, we retain the coefficients that + uniquely define the polynomials; and so new data will be evaluated + against the same polynomial bases as the original dataset. However, + the polynomial basis will almost certainly *not* be orthogonal for the + new data. This is because changing the incoming dataset is equivalent to + changing your choice of inner product. + + Using orthogonal basis vectors (as compared to the "raw" vectors) allows + you to increase the degree of the polynomial vector space without + affecting the coefficients of lower-order components in a linear + regression. This stability is often attractive during exploratory data + analysis, but does not otherwise change the results of a linear + regression. + + `nan` values in `x` will be ignored and progagated through to generated + polynomials. + + The signature of this transform is intentionally chosen to be compatible + with R. + """ + + if raw: + return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1) + + x = numpy.array(x) + + # Check if we already have already generated the alpha and beta coefficients. + # If not, we enter "training" mode. + training = False + alpha = _state.get("alpha") + norms2 = _state.get("norms2") + + if alpha is None: + training = True + alpha = {} + norms2 = {} + + # Build polynomials iteratively using the monic three-term recurrence relation + # Note that alpha and beta are fixed if not in "training" mode. + P = numpy.empty((x.shape[0], degree + 1)) + P[:, 0] = 1 + + def get_alpha(k): + if training and k not in alpha: + alpha[k] = numpy.sum(x * P[:, k] ** 2) / numpy.sum(P[:, k] ** 2) + return alpha[k] + + def get_norm(k): + if training and k not in norms2: + norms2[k] = numpy.sum(P[:, k] ** 2) + return norms2[k] + + def get_beta(k): + return get_norm(k) / get_norm(k - 1) + + for i in range(1, degree + 1): + P[:, i] = (x - get_alpha(i - 1)) * P[:, i - 1] + if i >= 2: + P[:, i] -= get_beta(i - 1) * P[:, i - 2] + + # Renormalize so we provide an orthonormal basis. + P /= numpy.array([numpy.sqrt(get_norm(k)) for k in range(0, degree + 1)]) + + if training: + _state["alpha"] = alpha + _state["norms2"] = norms2 + + # Return basis dropping the first (constant) column + return P[:, 1:] diff --git a/tests/transforms/test_poly.py b/tests/transforms/test_poly.py new file mode 100644 index 0000000..d89d17f --- /dev/null +++ b/tests/transforms/test_poly.py @@ -0,0 +1,155 @@ +import numpy +import pytest + +from formulaic.transforms.poly import poly + + +class TestPoly: + @pytest.fixture(scope="session") + def data(self): + return numpy.linspace(0, 1, 21) + + def test_basic(self, data): + state = {} + V = poly(data, _state=state) + + assert V.shape[1] == 1 + + # Comparison data copied from R output of: + # > data = seq(from=0, to=1, by=0.05) + # > poly(data) + r_reference = [ + -3.603750e-01, + -3.243375e-01, + -2.883000e-01, + -2.522625e-01, + -2.162250e-01, + -1.801875e-01, + -1.441500e-01, + -1.081125e-01, + -7.207500e-02, + -3.603750e-02, + -3.000725e-17, + 3.603750e-02, + 7.207500e-02, + 1.081125e-01, + 1.441500e-01, + 1.801875e-01, + 2.162250e-01, + 2.522625e-01, + 2.883000e-01, + 3.243375e-01, + 3.603750e-01, + ] + + assert numpy.allclose( + V[:, 0], + r_reference, + ) + + assert pytest.approx(state["alpha"], {0: 0.5}) + assert pytest.approx(state["norms2"], {0: 21.0, 2: 1.925}) + + assert numpy.allclose( + poly(data, _state=state)[:, 0], + r_reference, + ) + + def test_degree(self, data): + state = {} + V = poly(data, degree=3, _state=state) + + assert V.shape[1] == 3 + + # Comparison data copied from R output of: + # > data = seq(from=0, to=1, by=0.05) + # > poly(data, 3) + r_reference = numpy.array( + [ + [-3.603750e-01, 0.42285541, -4.332979e-01], + [-3.243375e-01, 0.29599879, -1.733191e-01], + [-2.883000e-01, 0.18249549, 1.824412e-02], + [-2.522625e-01, 0.08234553, 1.489937e-01], + [-2.162250e-01, -0.00445111, 2.265312e-01], + [-1.801875e-01, -0.07789442, 2.584584e-01], + [-1.441500e-01, -0.13798440, 2.523770e-01], + [-1.081125e-01, -0.18472105, 2.158888e-01], + [-7.207500e-02, -0.21810437, 1.565954e-01], + [-3.603750e-02, -0.23813436, 8.209854e-02], + [-3.000725e-17, -0.24481103, -4.395626e-17], + [3.603750e-02, -0.23813436, -8.209854e-02], + [7.207500e-02, -0.21810437, -1.565954e-01], + [1.081125e-01, -0.18472105, -2.158888e-01], + [1.441500e-01, -0.13798440, -2.523770e-01], + [1.801875e-01, -0.07789442, -2.584584e-01], + [2.162250e-01, -0.00445111, -2.265312e-01], + [2.522625e-01, 0.08234553, -1.489937e-01], + [2.883000e-01, 0.18249549, -1.824412e-02], + [3.243375e-01, 0.29599879, 1.733191e-01], + [3.603750e-01, 0.42285541, 4.332979e-01], + ] + ) + + assert numpy.allclose( + V, + r_reference, + ) + + assert pytest.approx(state["alpha"], {0: 0.5, 1: 0.5, 2: 0.5}) + assert pytest.approx( + state["norms2"], {1: 0.09166666666666667, 2: 0.07283333333333333} + ) + + assert numpy.allclose( + poly(data, degree=3, _state=state), + r_reference, + ) + + def test_reuse_state(self, data): + state = {} + V = poly(data, degree=3, _state=state) # as tested above + + # Reuse state but with different data + V = poly(data ** 2, degree=3, _state=state) + + assert V.shape[1] == 3 + + # Comparison data copied from R output of: + # > data = seq(from=0, to=1, by=0.05) + # > coefs = attr(poly(data, 3), 'coefs') + # > poly(data^2, 3, coefs=coefs) + r_reference = numpy.array( + [ + [-0.36037499, 0.422855413, -0.43329786], + [-0.35857311, 0.416195441, -0.41855671], + [-0.35316749, 0.396415822, -0.37546400], + [-0.34415811, 0.364117458, -0.30735499], + [-0.33154499, 0.320301848, -0.21959840], + [-0.31532811, 0.266371091, -0.11931132], + [-0.29550749, 0.204127887, -0.01496018], + [-0.27208311, 0.135775535, 0.08415243], + [-0.24505499, 0.063917934, 0.16851486], + [-0.21442312, -0.008440417, 0.22914758], + [-0.18018749, -0.077894418, 0.25845837], + [-0.14234812, -0.140638372, 0.25121156], + [-0.10090500, -0.192465980, 0.20561124], + [-0.05585812, -0.228770342, 0.12449854], + [-0.00720750, -0.244543962, 0.01666296], + [0.04504687, -0.234378741, -0.10173235], + [0.10090500, -0.192465980, -0.20561124], + [0.16036687, -0.112596382, -0.25933114], + [0.22343249, 0.011839952, -0.21491574], + [0.29010186, 0.187853517, -0.01017347], + [0.36037499, 0.422855413, 0.43329786], + ] + ) + + assert numpy.allclose( + V, + r_reference, + ) + + def test_raw(self, data): + assert numpy.allclose( + poly(data, 3, raw=True), numpy.array([data, data ** 2, data ** 3]).T + )