ENH add IsotonicRegression class (#78)
* MNT replace assert_almost_equal by assert_allclose

* ENH add IsotonicRegression class
lorentzenchr committed Jul 10, 2023
1 parent 14d5ee3 commit a66cb76
Showing 2 changed files with 209 additions and 19 deletions.
147 changes: 145 additions & 2 deletions src/model_diagnostics/_utils/isotonic.py
@@ -2,9 +2,11 @@

import numpy as np
import numpy.typing as npt
+import polars as pl
+from scipy.interpolate import interp1d
from scipy.stats import expectile

-from ._array import validate_2_arrays
+from ._array import length_of_second_dimension, validate_2_arrays


def pava(
@@ -300,7 +302,7 @@ def isotonic_regression(
x : (n_obs,) ndarray
        Isotonic regression solution, i.e. an increasing (or decreasing) array
        of the same length as y.
-    r : (n_blokcs+1,) ndarray
+    r : (n_blocks+1,) ndarray
Array of indices with the start position of each block / pool B.
For the j-th block, all values of `x[r[j]:r[j+1]]` are the same.
@@ -395,3 +397,144 @@ def quantile_upper(x, wx):
x = x[::-1]
r = r[-1] - r[::-1]
return x, r
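To make the documented `(x, r)` return pair above concrete, here is a minimal sketch (values taken from the tests below):

```python
import numpy as np

from model_diagnostics._utils.isotonic import isotonic_regression

# Example from Busing (2022), also used in the test suite:
y = np.array([8, 4, 8, 2, 2, 0, 8], dtype=np.float64)
x, r = isotonic_regression(y)
# The first six observations pool to their mean (8+4+8+2+2+0)/6 = 4:
# x == [4, 4, 4, 4, 4, 4, 8]
# r holds the block start positions plus the final end position:
# r == [0, 6, 7], i.e. blocks x[0:6] and x[6:7]
```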


class IsotonicRegression:
"""Isotonic regression model.
Parameters
----------
increasing : bool or 'auto', default=True
Determines whether the predictions should be constrained to increase
or decrease with `X`. 'auto' will decide based on the Spearman
correlation estimate's sign.
functional : str
The functional that is induced by the identification function `V`. Options are:
- `"mean"`. Argument `level` is neglected.
- `"median"`. Argument `level` is neglected.
- `"expectile"`
- `"quantile"`
    level : float
        The level of the expectile or quantile. (Often called \\(\\alpha\\).)
        It must be `0 <= level <= 1`.
        `level=0.5` and `functional="expectile"` gives the mean.
        `level=0.5` and `functional="quantile"` gives the median.

Attributes
----------
X_thresholds_ : ndarray of shape (n_thresholds,)
Unique ascending `X` values used to interpolate
the y = f(X) monotonic function.
y_thresholds_ : ndarray of shape (n_thresholds,)
De-duplicated `y` values suitable to interpolate the y = f(X)
monotonic function.
f_ : function
The stepwise interpolating function that covers the input domain ``X``.
increasing_ : bool
Inferred value for ``increasing``.
"""

def __init__(
self,
*,
increasing: bool = True,
functional: str = "mean",
level: float = 0.5,
):
self.increasing = increasing
self.functional = functional
self.level = level

def fit(
self,
X: npt.ArrayLike,
y: npt.ArrayLike,
sample_weight: Optional[npt.ArrayLike] = None,
):
"""Fit the model using X, y as training data.
Parameters
----------
X : array-like of shape (n_samples,) or (n_samples, 1)
Training data.
y : array-like of shape (n_samples,)
Training target.
        sample_weight : array-like of shape (n_samples,), default=None
            Weights. If set to None, all weights will be set to 1 (equal
            weights).

Returns
-------
self : object
Returns an instance of self.
"""
if (n_cols := length_of_second_dimension(X)) >= 2:
msg = f"X must have only one colume, got X.shape[1] = {n_cols}."
raise ValueError(msg)
if n_cols == 1:
X = np.asarray(X)[:, 0]
df = pl.DataFrame({"_X": X, "_target_y": y})
if sample_weight is not None:
df = df.hstack([pl.Series(name="_weights", values=sample_weight)])
# We deal with duplicate values in X by also sorting y.
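        # Sorting y within ties of X (descending iff `increasing`) forces tied
        # X values to be pooled into one block, making the fit independent of
        # the input order.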
df = df.sort(by=["_X", "_target_y"], descending=[False, self.increasing])
yy = df["_target_y"].to_numpy()
wy = df["_weights"].to_numpy() if sample_weight is not None else None

y_iso, r = isotonic_regression(
y=yy,
weights=wy,
increasing=self.increasing,
functional=self.functional,
level=self.level,
)

X_sorted = df.get_column("_X")
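        # Reduce each constant block to its boundary indices (first and, where
        # the block has length > 1, last); these become the thresholds below.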
idx_list = [r[0]]
for i in range(1, len(r) - 1):
# Check previous block has more than one element.
if r[i] - 1 - r[i - 1] >= 1:
idx_list.append(r[i] - 1)
idx_list.append(r[i])
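        # Append the final index only if it adds a new X threshold and the last
        # block has length > 1 (or the whole fit is constant, see below).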
# FIXME: Older versions of polars don't allow numpy integers as indices.
if (X_sorted[int(r[-1] - 1)] != X_sorted[int(r[-2])]) and (
y_iso[0] == y_iso[-1] or r[-1] - 1 - r[-2] >= 1
):
# In case all y values are the same, we include this index.
idx_list.append(r[-1] - 1)
idx = np.asarray(idx_list)

# Almost the same, might have some duplicates:
# idx = np.sort(np.r_[r[:-1], r[1:] - 1])

self.X_thresholds_, self.y_thresholds_ = X_sorted[idx].to_numpy(), y_iso[idx]

# Build the interpolation function
self.f_ = interp1d(
self.X_thresholds_,
self.y_thresholds_,
kind="linear",
bounds_error=False,
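            # Clamp predictions outside the fitted range to the boundary
            # values, like scikit-learn's out_of_bounds="clip".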
fill_value=(self.y_thresholds_[0], self.y_thresholds_[-1]),
)
return self

def predict(self, X):
"""Predict new data by linear interpolation.
Parameters
----------
        X : array-like of shape (n_samples,) or (n_samples, 1)
            Data to predict.

Returns
-------
y_pred : ndarray of shape (n_samples,)
Transformed data.
"""
return self.f_(X)
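Taken together, a minimal usage sketch of the new class (API as defined above; the median fit matches `test_isotonic_regression_class_median` below):

```python
import numpy as np

from model_diagnostics._utils.isotonic import IsotonicRegression

X = np.arange(7)
y = np.array([8, 4, 8, 2, 2, 0, 8], dtype=np.float64)

# Classical (mean) isotonic regression, linearly interpolated in predict:
m = IsotonicRegression().fit(X, y)
m.predict([0.5, 2.0, 10.0])  # values outside [0, 6] are clamped

# Median regression via the "quantile" functional:
m50 = IsotonicRegression(functional="quantile", level=0.5).fit(X, y)
m50.predict(X)  # == [3, 3, 3, 3, 3, 3, 8]
```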
81 changes: 64 additions & 17 deletions src/model_diagnostics/_utils/tests/test_isotonic.py
@@ -1,9 +1,15 @@
import numpy as np
import pytest
-from numpy.testing import assert_almost_equal, assert_array_equal
+from numpy.testing import assert_allclose, assert_array_equal
from scipy.optimize import minimize
from sklearn.isotonic import IsotonicRegression as skl_IsotonicRegression

-from model_diagnostics._utils.isotonic import gpava, isotonic_regression, pava
+from model_diagnostics._utils.isotonic import (
+    IsotonicRegression,
+    gpava,
+    isotonic_regression,
+    pava,
+)
from model_diagnostics.scoring import PinballLoss, SquaredError


@@ -221,9 +227,9 @@ def test_simple_isotonic_regression(w):
# https://doi.org/10.18637/jss.v102.c01
y = np.array([8, 4, 8, 2, 2, 0, 8], dtype=np.float64)
x, r = isotonic_regression(y, w)
-    assert_almost_equal(x, [4, 4, 4, 4, 4, 4, 8])
-    assert_almost_equal(np.add.reduceat(np.ones_like(y), r[:-1]), [6, 1])
-    assert_almost_equal(r, [0, 6, 7])
+    assert_allclose(x, [4, 4, 4, 4, 4, 4, 8])
+    assert_allclose(np.add.reduceat(np.ones_like(y), r[:-1]), [6, 1])
+    assert_allclose(r, [0, 6, 7])
# Assert that y was not overwritten
assert_array_equal(y, np.array([8, 4, 8, 2, 2, 0, 8], dtype=np.float64))

@@ -243,8 +249,8 @@ def test_simple_isotonic_regression_functionals(functional, level, x_res, r_res)
# https://doi.org/10.18637/jss.v102.c01
y = np.array([8, 4, 8, 2, 2, 0, 8], dtype=np.float64)
x, r = isotonic_regression(y, functional=functional, level=level)
-    assert_almost_equal(x, x_res)
-    assert_almost_equal(r, r_res)
+    assert_allclose(x, x_res)
+    assert_allclose(r, r_res)
# Assert that y was not overwritten
assert_array_equal(y, np.array([8, 4, 8, 2, 2, 0, 8], dtype=np.float64))

@@ -254,26 +260,26 @@ def test_linspace(increasing):
n = 10
y = np.linspace(0, 1, n) if increasing else np.linspace(1, 0, n)
x, r = isotonic_regression(y, increasing=increasing)
-    assert_almost_equal(x, y)
+    assert_allclose(x, y)
assert_array_equal(r, np.arange(n + 1))


def test_weights():
w = np.array([1, 2, 5, 0.5, 0.5, 0.5, 1, 3])
y = np.array([3, 2, 1, 10, 9, 8, 20, 10])
x, r = isotonic_regression(y, w)
-    assert_almost_equal(x, [12 / 8, 12 / 8, 12 / 8, 9, 9, 9, 50 / 4, 50 / 4])
-    assert_almost_equal(np.add.reduceat(w, r[:-1]), [8, 1.5, 4])
+    assert_allclose(x, [12 / 8, 12 / 8, 12 / 8, 9, 9, 9, 50 / 4, 50 / 4])
+    assert_allclose(np.add.reduceat(w, r[:-1]), [8, 1.5, 4])
assert_array_equal(r, [0, 3, 6, 8])

# weights are like repeated observations, we repeat the 3rd element 5
# times.
w2 = np.array([1, 2, 1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 1, 3])
y2 = np.array([3, 2, 1, 1, 1, 1, 1, 10, 9, 8, 20, 10])
x2, r2 = isotonic_regression(y2, w2)
-    assert_almost_equal(np.diff(x2[0:7]), 0)
-    assert_almost_equal(x2[4:], x)
-    assert_almost_equal(np.add.reduceat(w2, r2[:-1]), np.add.reduceat(w, r[:-1]))
+    assert_allclose(np.diff(x2[0:7]), 0)
+    assert_allclose(x2[4:], x)
+    assert_allclose(np.add.reduceat(w2, r2[:-1]), np.add.reduceat(w, r[:-1]))
assert_array_equal(r2 - [0, 4, 4, 4], r)


@@ -296,7 +302,7 @@ def test_against_R_monotone():
6.6666667,
6.6666667,
]
-    assert_almost_equal(x, res)
+    assert_allclose(x, res)
assert_array_equal(r, [0, 1, 7, 10])

n = 100
@@ -409,15 +415,56 @@ def test_against_R_monotone():
4.8656413,
4.8656413,
]
-    assert_almost_equal(x, res)
+    assert_allclose(x, res, rtol=2e-7)

# Test increasing
assert np.all(np.diff(x) >= 0)

# Test balance property: sum(y) == sum(x)
-    assert_almost_equal(np.sum(x), np.sum(y))
+    assert_allclose(np.sum(x), np.sum(y))

# Reverse order
x, rinv = isotonic_regression(-y, increasing=False)
-    assert_almost_equal(-x, res)
+    assert_allclose(-x, res, rtol=2e-7)
assert_array_equal(rinv, r)


@pytest.mark.parametrize("increasing", [True, False])
def test_isotonic_regression_class(increasing):
"""Test that IsotonicRegression gives the same as the scikit-learn version."""
X = [0, 2, 4, -1, -2, 3, 2, 2, 1, 4]
y = np.arange(10)
m_skl = skl_IsotonicRegression(increasing=increasing, out_of_bounds="clip").fit(
X, y
)
m = IsotonicRegression(increasing=increasing).fit(X, y)

assert_allclose(m.X_thresholds_, m_skl.X_thresholds_)
assert_allclose(m.y_thresholds_, m_skl.y_thresholds_)

assert m.X_thresholds_[0] == m_skl.X_min_
assert m.X_thresholds_[-1] == m_skl.X_max_

X_pred = [-10, -2, 1, 2.5, 10]
assert_allclose(m.predict(X_pred), m_skl.predict(X_pred))

m_exp = IsotonicRegression(increasing=increasing, functional="expectile").fit(X, y)
assert_allclose(m.X_thresholds_, m_exp.X_thresholds_)
assert_allclose(m.y_thresholds_, m_exp.y_thresholds_)
assert_allclose(m.predict(X_pred), m_exp.predict(X_pred))


def test_isotonic_regression_class_median():
"""Test IsotonicRegression for median regression."""
rng = np.random.default_rng(42)
    # Test case of Busing (2022)
# https://doi.org/10.18637/jss.v102.c01
y = np.array([8, 4, 8, 2, 2, 0, 8], dtype=np.float64)
y_iso = np.array([3, 3, 3, 3, 3, 3, 8])
X = np.arange(len(y))
idx = rng.permutation(X)

m = IsotonicRegression(functional="quantile", level=0.5).fit(X[idx], y[idx])
assert_allclose(m.X_thresholds_, [0, 5, 6])
assert_allclose(m.y_thresholds_, [3, 3, 8])
assert_allclose(m.predict(X), y_iso)
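As a sanity check on those expected thresholds (a sketch, not part of the commit): the first six observations form one block, and `np.median` of that block reproduces the fitted value 3:

```python
import numpy as np

block = np.array([8, 4, 8, 2, 2, 0], dtype=np.float64)
# Sorted block: [0, 2, 2, 4, 8, 8]; the median averages 2 and 4.
assert np.median(block) == 3.0
```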
