Skip to content

Commit

Permalink
Add support for the poly transform.
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewwardrop committed Oct 16, 2021
1 parent 279b97c commit dc6e38d
Show file tree
Hide file tree
Showing 4 changed files with 266 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/basic/grammar.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ that have *not* been implemented by `formualaic` are explicitly noted also.
| `center(...)` | Shift column data so mean is zero. ||| 🗙 |
| `scale(...)` | Shift column so mean is zero and variance is 1. ||[^7] ||
| `standardize(...)` | Alias of `scale`. | 🗙 || 🗙 |
| `poly(...)` | Generates a polynomial basis, allowing non-linear fits. || 🗙 ||
| `bs(...)` | Generates a B-Spline basis, allowing non-linear fits. ||||
| `cr(...)` | Generates a natural cubic spline basis, allowing non-linear fits. | 🗙 |||
| `cc(...)` | Generates a cyclic cubic spline basis, allowing non-linear fits. | 🗙 |||
Expand Down
2 changes: 2 additions & 0 deletions formulaic/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from .basis_spline import basis_spline
from .identity import identity
from .encode_categorical import encode_categorical
from .poly import poly
from .scale import center, scale


TRANSFORMS = {
"bs": basis_spline,
"center": center,
"poly": poly,
"scale": scale,
"C": encode_categorical,
"I": identity,
Expand Down
108 changes: 108 additions & 0 deletions formulaic/transforms/poly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from formulaic.utils.stateful_transforms import stateful_transform

import numpy
import numpy.typing


@stateful_transform
def poly(
x: numpy.typing.ArrayLike, degree: int = 1, raw: bool = False, _state=None
) -> numpy.ndarray:
"""
Generate a basis for a polynomial vector-space representation of `x`.
The basis vectors returned by this transform can be used, for example, to
capture non-linear dependence on `x` in a linear regression.
Args:
x: The vector for which a polynomial vector space should be generated.
degree: The degree of the polynomial vector space.
raw: Whether to return "raw" basis vectors (e.g. `[x, x**2, x**3]`). If
`False`, an orthonormal set of basis vectors is returned instead
(see notes below for more information).
Returns:
A two-dimensional numpy array with `len(x)` rows, and `degree` columns.
The columns represent the basis vectors of the polynomial vector-space.
Notes:
This transform is an implementation of the "three-term recurrence
relation" for monic orthogonal polynomials. There are many good
introductions to these recurrence relations, including:
https://dec41.user.srcf.net/h/IB_L/numerical_analysis/2_3
Another common approach is QR factorisation, where the columns of Q are
the orthogonal basis vectors. However, our implementation outperforms
numpy's QR decomposition, and does not require needless computation of
the R matrix. It should also be noted that orthogonal polynomial bases
are unique up to the choice of inner-product and scaling, and so all
methods will result in the same set of polynomials.
When used as a stateful transform, we retain the coefficients that
uniquely define the polynomials; and so new data will be evaluated
against the same polynomial bases as the original dataset. However,
the polynomial basis will almost certainly *not* be orthogonal for the
new data. This is because changing the incoming dataset is equivalent to
changing your choice of inner product.
Using orthogonal basis vectors (as compared to the "raw" vectors) allows
you to increase the degree of the polynomial vector space without
affecting the coefficients of lower-order components in a linear
regression. This stability is often attractive during exploratory data
analysis, but does not otherwise change the results of a linear
regression.
`nan` values in `x` will be ignored and progagated through to generated
polynomials.
The signature of this transform is intentionally chosen to be compatible
with R.
"""

if raw:
return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1)

x = numpy.array(x)

# Check if we already have already generated the alpha and beta coefficients.
# If not, we enter "training" mode.
training = False
alpha = _state.get("alpha")
norms2 = _state.get("norms2")

if alpha is None:
training = True
alpha = {}
norms2 = {}

# Build polynomials iteratively using the monic three-term recurrence relation
# Note that alpha and beta are fixed if not in "training" mode.
P = numpy.empty((x.shape[0], degree + 1))
P[:, 0] = 1

def get_alpha(k):
if training and k not in alpha:
alpha[k] = numpy.sum(x * P[:, k] ** 2) / numpy.sum(P[:, k] ** 2)
return alpha[k]

def get_norm(k):
if training and k not in norms2:
norms2[k] = numpy.sum(P[:, k] ** 2)
return norms2[k]

def get_beta(k):
return get_norm(k) / get_norm(k - 1)

for i in range(1, degree + 1):
P[:, i] = (x - get_alpha(i - 1)) * P[:, i - 1]
if i >= 2:
P[:, i] -= get_beta(i - 1) * P[:, i - 2]

# Renormalize so we provide an orthonormal basis.
P /= numpy.array([numpy.sqrt(get_norm(k)) for k in range(0, degree + 1)])

if training:
_state["alpha"] = alpha
_state["norms2"] = norms2

# Return basis dropping the first (constant) column
return P[:, 1:]
155 changes: 155 additions & 0 deletions tests/transforms/test_poly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import numpy
import pytest

from formulaic.transforms.poly import poly


class TestPoly:
@pytest.fixture(scope="session")
def data(self):
return numpy.linspace(0, 1, 21)

def test_basic(self, data):
state = {}
V = poly(data, _state=state)

assert V.shape[1] == 1

# Comparison data copied from R output of:
# > data = seq(from=0, to=1, by=0.05)
# > poly(data)
r_reference = [
-3.603750e-01,
-3.243375e-01,
-2.883000e-01,
-2.522625e-01,
-2.162250e-01,
-1.801875e-01,
-1.441500e-01,
-1.081125e-01,
-7.207500e-02,
-3.603750e-02,
-3.000725e-17,
3.603750e-02,
7.207500e-02,
1.081125e-01,
1.441500e-01,
1.801875e-01,
2.162250e-01,
2.522625e-01,
2.883000e-01,
3.243375e-01,
3.603750e-01,
]

assert numpy.allclose(
V[:, 0],
r_reference,
)

assert pytest.approx(state["alpha"], {0: 0.5})
assert pytest.approx(state["norms2"], {0: 21.0, 2: 1.925})

assert numpy.allclose(
poly(data, _state=state)[:, 0],
r_reference,
)

def test_degree(self, data):
state = {}
V = poly(data, degree=3, _state=state)

assert V.shape[1] == 3

# Comparison data copied from R output of:
# > data = seq(from=0, to=1, by=0.05)
# > poly(data, 3)
r_reference = numpy.array(
[
[-3.603750e-01, 0.42285541, -4.332979e-01],
[-3.243375e-01, 0.29599879, -1.733191e-01],
[-2.883000e-01, 0.18249549, 1.824412e-02],
[-2.522625e-01, 0.08234553, 1.489937e-01],
[-2.162250e-01, -0.00445111, 2.265312e-01],
[-1.801875e-01, -0.07789442, 2.584584e-01],
[-1.441500e-01, -0.13798440, 2.523770e-01],
[-1.081125e-01, -0.18472105, 2.158888e-01],
[-7.207500e-02, -0.21810437, 1.565954e-01],
[-3.603750e-02, -0.23813436, 8.209854e-02],
[-3.000725e-17, -0.24481103, -4.395626e-17],
[3.603750e-02, -0.23813436, -8.209854e-02],
[7.207500e-02, -0.21810437, -1.565954e-01],
[1.081125e-01, -0.18472105, -2.158888e-01],
[1.441500e-01, -0.13798440, -2.523770e-01],
[1.801875e-01, -0.07789442, -2.584584e-01],
[2.162250e-01, -0.00445111, -2.265312e-01],
[2.522625e-01, 0.08234553, -1.489937e-01],
[2.883000e-01, 0.18249549, -1.824412e-02],
[3.243375e-01, 0.29599879, 1.733191e-01],
[3.603750e-01, 0.42285541, 4.332979e-01],
]
)

assert numpy.allclose(
V,
r_reference,
)

assert pytest.approx(state["alpha"], {0: 0.5, 1: 0.5, 2: 0.5})
assert pytest.approx(
state["norms2"], {1: 0.09166666666666667, 2: 0.07283333333333333}
)

assert numpy.allclose(
poly(data, degree=3, _state=state),
r_reference,
)

def test_reuse_state(self, data):
state = {}
V = poly(data, degree=3, _state=state) # as tested above

# Reuse state but with different data
V = poly(data ** 2, degree=3, _state=state)

assert V.shape[1] == 3

# Comparison data copied from R output of:
# > data = seq(from=0, to=1, by=0.05)
# > coefs = attr(poly(data, 3), 'coefs')
# > poly(data^2, 3, coefs=coefs)
r_reference = numpy.array(
[
[-0.36037499, 0.422855413, -0.43329786],
[-0.35857311, 0.416195441, -0.41855671],
[-0.35316749, 0.396415822, -0.37546400],
[-0.34415811, 0.364117458, -0.30735499],
[-0.33154499, 0.320301848, -0.21959840],
[-0.31532811, 0.266371091, -0.11931132],
[-0.29550749, 0.204127887, -0.01496018],
[-0.27208311, 0.135775535, 0.08415243],
[-0.24505499, 0.063917934, 0.16851486],
[-0.21442312, -0.008440417, 0.22914758],
[-0.18018749, -0.077894418, 0.25845837],
[-0.14234812, -0.140638372, 0.25121156],
[-0.10090500, -0.192465980, 0.20561124],
[-0.05585812, -0.228770342, 0.12449854],
[-0.00720750, -0.244543962, 0.01666296],
[0.04504687, -0.234378741, -0.10173235],
[0.10090500, -0.192465980, -0.20561124],
[0.16036687, -0.112596382, -0.25933114],
[0.22343249, 0.011839952, -0.21491574],
[0.29010186, 0.187853517, -0.01017347],
[0.36037499, 0.422855413, 0.43329786],
]
)

assert numpy.allclose(
V,
r_reference,
)

def test_raw(self, data):
assert numpy.allclose(
poly(data, 3, raw=True), numpy.array([data, data ** 2, data ** 3]).T
)

0 comments on commit dc6e38d

Please sign in to comment.