Add support for the poly transform.

matthewwardrop · Oct 16, 2021 · dc6e38d · dc6e38d
1 parent 279b97c
commit dc6e38d
Show file tree

Hide file tree

Showing 4 changed files with 266 additions and 0 deletions.
diff --git a/docs/basic/grammar.md b/docs/basic/grammar.md
@@ -59,6 +59,7 @@ that have *not* been implemented by `formualaic` are explicitly noted also.
 | `center(...)` | Shift column data so mean is zero. | ✓ | ✓ | 🗙 |
 | `scale(...)` | Shift column so mean is zero and variance is 1. | ✓ | ✓[^7] | ✓ |
 | `standardize(...)` | Alias of `scale`. | 🗙 | ✓ | 🗙 |
+| `poly(...)` | Generates a polynomial basis, allowing non-linear fits. | ✓ | 🗙 | ✓ |
 | `bs(...)` | Generates a B-Spline basis, allowing non-linear fits. | ✓ | ✓ | ✓ |
 | `cr(...)` | Generates a natural cubic spline basis, allowing non-linear fits. | 🗙 | ✓ | ✓ |
 | `cc(...)` | Generates a cyclic cubic spline basis, allowing non-linear fits. | 🗙 | ✓ | ✓ |

diff --git a/formulaic/transforms/__init__.py b/formulaic/transforms/__init__.py
@@ -1,12 +1,14 @@
 from .basis_spline import basis_spline
 from .identity import identity
 from .encode_categorical import encode_categorical
+from .poly import poly
 from .scale import center, scale
 
 
 TRANSFORMS = {
     "bs": basis_spline,
     "center": center,
+    "poly": poly,
     "scale": scale,
     "C": encode_categorical,
     "I": identity,

diff --git a/formulaic/transforms/poly.py b/formulaic/transforms/poly.py
@@ -0,0 +1,108 @@
+from formulaic.utils.stateful_transforms import stateful_transform
+
+import numpy
+import numpy.typing
+
+
+@stateful_transform
+def poly(
+    x: numpy.typing.ArrayLike, degree: int = 1, raw: bool = False, _state=None
+) -> numpy.ndarray:
+    """
+    Generate a basis for a polynomial vector-space representation of `x`.
+
+    The basis vectors returned by this transform can be used, for example, to
+    capture non-linear dependence on `x` in a linear regression.
+
+    Args:
+        x: The vector for which a polynomial vector space should be generated.
+        degree: The degree of the polynomial vector space.
+        raw: Whether to return "raw" basis vectors (e.g. `[x, x**2, x**3]`). If
+            `False`, an orthonormal set of basis vectors is returned instead
+            (see notes below for more information).
+
+    Returns:
+        A two-dimensional numpy array with `len(x)` rows, and `degree` columns.
+        The columns represent the basis vectors of the polynomial vector-space.
+
+    Notes:
+        This transform is an implementation of the "three-term recurrence
+        relation" for monic orthogonal polynomials. There are many good
+        introductions to these recurrence relations, including:
+            https://dec41.user.srcf.net/h/IB_L/numerical_analysis/2_3
+        Another common approach is QR factorisation, where the columns of Q are
+        the orthogonal basis vectors. However, our implementation outperforms
+        numpy's QR decomposition, and does not require needless computation of
+        the R matrix. It should also be noted that orthogonal polynomial bases
+        are unique up to the choice of inner-product and scaling, and so all
+        methods will result in the same set of polynomials.
+
+        When used as a stateful transform, we retain the coefficients that
+        uniquely define the polynomials; and so new data will be evaluated
+        against the same polynomial bases as the original dataset. However,
+        the polynomial basis will almost certainly *not* be orthogonal for the
+        new data. This is because changing the incoming dataset is equivalent to
+        changing your choice of inner product.
+
+        Using orthogonal basis vectors (as compared to the "raw" vectors) allows
+        you to increase the degree of the polynomial vector space without
+        affecting the coefficients of lower-order components in a linear
+        regression. This stability is often attractive during exploratory data
+        analysis, but does not otherwise change the results of a linear
+        regression.
+
+        `nan` values in `x` will be ignored and progagated through to generated
+        polynomials.
+
+        The signature of this transform is intentionally chosen to be compatible
+        with R.
+    """
+
+    if raw:
+        return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1)
+
+    x = numpy.array(x)
+
+    # Check if we already have already generated the alpha and beta coefficients.
+    # If not, we enter "training" mode.
+    training = False
+    alpha = _state.get("alpha")
+    norms2 = _state.get("norms2")
+
+    if alpha is None:
+        training = True
+        alpha = {}
+        norms2 = {}
+
+    # Build polynomials iteratively using the monic three-term recurrence relation
+    # Note that alpha and beta are fixed if not in "training" mode.
+    P = numpy.empty((x.shape[0], degree + 1))
+    P[:, 0] = 1
+
+    def get_alpha(k):
+        if training and k not in alpha:
+            alpha[k] = numpy.sum(x * P[:, k] ** 2) / numpy.sum(P[:, k] ** 2)
+        return alpha[k]
+
+    def get_norm(k):
+        if training and k not in norms2:
+            norms2[k] = numpy.sum(P[:, k] ** 2)
+        return norms2[k]
+
+    def get_beta(k):
+        return get_norm(k) / get_norm(k - 1)
+
+    for i in range(1, degree + 1):
+        P[:, i] = (x - get_alpha(i - 1)) * P[:, i - 1]
+        if i >= 2:
+            P[:, i] -= get_beta(i - 1) * P[:, i - 2]
+
+    # Renormalize so we provide an orthonormal basis.
+    P /= numpy.array([numpy.sqrt(get_norm(k)) for k in range(0, degree + 1)])
+
+    if training:
+        _state["alpha"] = alpha
+        _state["norms2"] = norms2
+
+    # Return basis dropping the first (constant) column
+    return P[:, 1:]
diff --git a/tests/transforms/test_poly.py b/tests/transforms/test_poly.py
@@ -0,0 +1,155 @@
+import numpy
+import pytest
+
+from formulaic.transforms.poly import poly
+
+
+class TestPoly:
+    @pytest.fixture(scope="session")
+    def data(self):
+        return numpy.linspace(0, 1, 21)
+
+    def test_basic(self, data):
+        state = {}
+        V = poly(data, _state=state)
+
+        assert V.shape[1] == 1
+
+        # Comparison data copied from R output of:
+        # > data = seq(from=0, to=1, by=0.05)
+        # > poly(data)
+        r_reference = [
+            -3.603750e-01,
+            -3.243375e-01,
+            -2.883000e-01,
+            -2.522625e-01,
+            -2.162250e-01,
+            -1.801875e-01,
+            -1.441500e-01,
+            -1.081125e-01,
+            -7.207500e-02,
+            -3.603750e-02,
+            -3.000725e-17,
+            3.603750e-02,
+            7.207500e-02,
+            1.081125e-01,
+            1.441500e-01,
+            1.801875e-01,
+            2.162250e-01,
+            2.522625e-01,
+            2.883000e-01,
+            3.243375e-01,
+            3.603750e-01,
+        ]
+
+        assert numpy.allclose(
+            V[:, 0],
+            r_reference,
+        )
+
+        assert pytest.approx(state["alpha"], {0: 0.5})
+        assert pytest.approx(state["norms2"], {0: 21.0, 2: 1.925})
+
+        assert numpy.allclose(
+            poly(data, _state=state)[:, 0],
+            r_reference,
+        )
+
+    def test_degree(self, data):
+        state = {}
+        V = poly(data, degree=3, _state=state)
+
+        assert V.shape[1] == 3
+
+        # Comparison data copied from R output of:
+        # > data = seq(from=0, to=1, by=0.05)
+        # > poly(data, 3)
+        r_reference = numpy.array(
+            [
+                [-3.603750e-01, 0.42285541, -4.332979e-01],
+                [-3.243375e-01, 0.29599879, -1.733191e-01],
+                [-2.883000e-01, 0.18249549, 1.824412e-02],
+                [-2.522625e-01, 0.08234553, 1.489937e-01],
+                [-2.162250e-01, -0.00445111, 2.265312e-01],
+                [-1.801875e-01, -0.07789442, 2.584584e-01],
+                [-1.441500e-01, -0.13798440, 2.523770e-01],
+                [-1.081125e-01, -0.18472105, 2.158888e-01],
+                [-7.207500e-02, -0.21810437, 1.565954e-01],
+                [-3.603750e-02, -0.23813436, 8.209854e-02],
+                [-3.000725e-17, -0.24481103, -4.395626e-17],
+                [3.603750e-02, -0.23813436, -8.209854e-02],
+                [7.207500e-02, -0.21810437, -1.565954e-01],
+                [1.081125e-01, -0.18472105, -2.158888e-01],
+                [1.441500e-01, -0.13798440, -2.523770e-01],
+                [1.801875e-01, -0.07789442, -2.584584e-01],
+                [2.162250e-01, -0.00445111, -2.265312e-01],
+                [2.522625e-01, 0.08234553, -1.489937e-01],
+                [2.883000e-01, 0.18249549, -1.824412e-02],
+                [3.243375e-01, 0.29599879, 1.733191e-01],
+                [3.603750e-01, 0.42285541, 4.332979e-01],
+            ]
+        )
+
+        assert numpy.allclose(
+            V,
+            r_reference,
+        )
+
+        assert pytest.approx(state["alpha"], {0: 0.5, 1: 0.5, 2: 0.5})
+        assert pytest.approx(
+            state["norms2"], {1: 0.09166666666666667, 2: 0.07283333333333333}
+        )
+
+        assert numpy.allclose(
+            poly(data, degree=3, _state=state),
+            r_reference,
+        )
+
+    def test_reuse_state(self, data):
+        state = {}
+        V = poly(data, degree=3, _state=state)  # as tested above
+
+        # Reuse state but with different data
+        V = poly(data ** 2, degree=3, _state=state)
+
+        assert V.shape[1] == 3
+
+        # Comparison data copied from R output of:
+        # > data = seq(from=0, to=1, by=0.05)
+        # > coefs = attr(poly(data, 3), 'coefs')
+        # > poly(data^2, 3, coefs=coefs)
+        r_reference = numpy.array(
+            [
+                [-0.36037499, 0.422855413, -0.43329786],
+                [-0.35857311, 0.416195441, -0.41855671],
+                [-0.35316749, 0.396415822, -0.37546400],
+                [-0.34415811, 0.364117458, -0.30735499],
+                [-0.33154499, 0.320301848, -0.21959840],
+                [-0.31532811, 0.266371091, -0.11931132],
+                [-0.29550749, 0.204127887, -0.01496018],
+                [-0.27208311, 0.135775535, 0.08415243],
+                [-0.24505499, 0.063917934, 0.16851486],
+                [-0.21442312, -0.008440417, 0.22914758],
+                [-0.18018749, -0.077894418, 0.25845837],
+                [-0.14234812, -0.140638372, 0.25121156],
+                [-0.10090500, -0.192465980, 0.20561124],
+                [-0.05585812, -0.228770342, 0.12449854],
+                [-0.00720750, -0.244543962, 0.01666296],
+                [0.04504687, -0.234378741, -0.10173235],
+                [0.10090500, -0.192465980, -0.20561124],
+                [0.16036687, -0.112596382, -0.25933114],
+                [0.22343249, 0.011839952, -0.21491574],
+                [0.29010186, 0.187853517, -0.01017347],
+                [0.36037499, 0.422855413, 0.43329786],
+            ]
+        )
+
+        assert numpy.allclose(
+            V,
+            r_reference,
+        )
+
+    def test_raw(self, data):
+        assert numpy.allclose(
+            poly(data, 3, raw=True), numpy.array([data, data ** 2, data ** 3]).T
+        )