Skip to content

Commit

Permalink
Merge pull request #81 from lorentzenchr/quantile_decompose
Browse files Browse the repository at this point in the history
ENH add quantiles and expectile to scoring.decompose
  • Loading branch information
lorentzenchr committed Jul 13, 2023
2 parents 54252f6 + 7bbbd91 commit b8816d8
Show file tree
Hide file tree
Showing 7 changed files with 384 additions and 25 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ dependencies = [
"black>=22.8.0",
"mypy>=0.971",
"numpy>=1.22",
"ruff==0.0.256",
"ruff==0.0.278",
]

[tool.hatch.envs.lint.scripts]
Expand Down
21 changes: 16 additions & 5 deletions src/model_diagnostics/_utils/isotonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,14 +378,24 @@ def expectile_fun(x, w):
x, r = gpava(expectile_fun, x, wx)
elif functional == "quantile":

def quantile_lower(x, wx):
def quantile_lower(x, wx=None):
return np.quantile(x, level, method="inverted_cdf")

def quantile_upper(x, wx):
return np.quantile(x, level, method="higher")
def quantile_upper(x, wx=None):
# np.quantile(x, level, method="higher") is not the same as
return -np.quantile(-x, 1 - level, method="inverted_cdf")

xl, rl = gpava(quantile_lower, x, wx)
xu, ru = gpava(quantile_upper, x, wx)
# Applying gpava on fun=np.quantile(x, level, method="higher") does not work
# for unknown reasons. What works is to calculate the upper quantile on the
# blocks given by rl from the lower quantile.
q = np.fromiter(
(quantile_upper(x[rl[i] : rl[i + 1]]) for i in range(len(rl) - 1)),
dtype=xl.dtype,
)
# Take mininum from the right.
q = np.minimum.accumulate(q[::-1])[::-1]
xu = np.repeat(q, np.diff(rl))
# We are free to use any value in the interval [xl, xu], as long as it is
# increasing. We choose the midpoint.
x = 0.5 * (xl + xu)
Expand Down Expand Up @@ -481,7 +491,8 @@ def fit(
df = pl.DataFrame({"_X": X, "_target_y": y})
if sample_weight is not None:
df = df.hstack([pl.Series(name="_weights", values=sample_weight)])
# We deal with duplicate values in X by also sorting y.
# We deal with duplicate values in X, aka ties, by also sorting y, but in
# reverse order.
df = df.sort(by=["_X", "_target_y"], descending=[False, self.increasing])
yy = df["_target_y"].to_numpy()
wy = df["_weights"].to_numpy() if sample_weight is not None else None
Expand Down
81 changes: 74 additions & 7 deletions src/model_diagnostics/_utils/tests/test_isotonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def median_lower(x, w=None):


def median_upper(x, w=None):
return np.quantile(x, q=0.5, method="higher")
return -np.quantile(-np.asarray(x), q=0.5, method="inverted_cdf")


median_upper.loss = PinballLoss(level=0.5) # type: ignore
Expand All @@ -42,12 +42,26 @@ def quantile80_lower(x, w=None):


def quantile80_upper(x, w=None):
return np.quantile(x, q=0.8, method="higher")
return -np.quantile(-np.asarray(x), q=1 - 0.8, method="inverted_cdf")


quantile80_upper.loss = PinballLoss(level=0.8) # type: ignore


def quantile10_lower(x, w=None):
return np.quantile(x, q=0.1, method="inverted_cdf")


quantile10_lower.loss = PinballLoss(level=0.1) # type: ignore


def quantile10_upper(x, w=None):
return -np.quantile(-np.asarray(x), q=1 - 0.1, method="inverted_cdf")


quantile10_upper.loss = PinballLoss(level=0.1) # type: ignore


def gpava_mean(x, w=None):
return gpava(mean_fun, x, w)

Expand Down Expand Up @@ -135,12 +149,65 @@ def test_pava_weighted(pava):
median_upper,
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8],
),
(
[
-3.444949308,
-2.854784791,
-3.071818855,
-1.164616209,
-3.947223934,
-2.644162705,
-2.609553073,
-0.270605801,
-3.208354459,
-2.012322283,
-1.059124491,
-2.199654113,
-1.906487545,
-2.681030502,
-1.553997732,
-3.489485619,
-1.672190775,
-2.863511214,
-0.491263988,
0.169577379,
],
quantile10_lower,
# from R package isotonic, function
# gpava(1:20, y, solver = weighted.fractile, p=0.1, ties = "secondary")$x
np.repeat(
[-3.947223934, -3.208354459, -2.863511214, -0.491263988, 0.169577379],
[5, 11, 2, 1, 1],
),
quantile10_upper,
np.repeat(
[-3.947223934, -3.208354459, -2.863511214, -0.491263988, 0.169577379],
[5, 11, 2, 1, 1],
),
),
],
)
def test_gpava_simple(y, fun_lower, x_lower, fun_upper, x_upper):
w = np.ones_like(y)
xl, rl = gpava(fun_lower, y, w)
xu, ru = gpava(fun_upper, y, w)
# The distinction of lower and upper only applies to set-valued functionals like
# quantiles.
# Applying gpava on fun=np.quantile(x, level, method="higher") does not work
# for unknown reasons. What works is to calculate the upper quantile on the
# block given by rl.
upper_values = np.fromiter(
(fun_upper(y[rl[i] : rl[i + 1]]) for i in range(len(rl) - 1)), dtype=xl.dtype
)
# Take mininum from the right.
upper_values = np.minimum.accumulate(upper_values[::-1])[::-1]
xu = np.repeat(upper_values, np.diff(rl))

# Check that isotonic_regression gives midpoint.
functional = fun_lower.loss.functional
level = getattr(fun_lower.loss, "level", 0.5)
x_iso, _ = isotonic_regression(y=y, functional=functional, level=level)
assert_allclose(0.5 * (xl + xu), x_iso)

for i in range(len(xl) - 1):
# Eq. 10 of Jordan et al. https://doi.org/10.1007/s10463-021-00808-0
assert xl[i] <= xu[i + 1]
Expand All @@ -166,13 +233,13 @@ def constraint(y_pred, y_obs):
x0=y,
args=(y,),
constraints=[{"type": "ineq", "fun": lambda x: constraint(x, y)}],
tol=1e-10,
tol=1e-12,
)

assert result.fun == pytest.approx(loss_lower)
assert result.fun == pytest.approx(loss_lower, rel=1e-8)

# Check that is is the boundary of solutions.
eps = 1e-10 * np.mean(xl)
# Check that it is the boundary of solutions.
eps = 1e-8 * np.mean(np.abs(xl))
for i in range(len(xl)):
x = np.array(xl, copy=True, dtype=float)
if i == 0 or x[i] > x[i - 1]:
Expand Down
2 changes: 1 addition & 1 deletion src/model_diagnostics/calibration/identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def compute_bias(
"Due to ties, the effective number of bins is 0. "
f"Consider to increase n_bins>={n_bins_tmp}."
)
warnings.warn(msg, UserWarning)
warnings.warn(msg, UserWarning, stacklevel=2)
else:
# Binning
# We use method="inverted_cdf" (same as "lower") instead of the
Expand Down
2 changes: 1 addition & 1 deletion src/model_diagnostics/calibration/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def plot_bias(
"shown for that y_pred/model, despite the fact that with_errorbars was "
"set to True."
)
warnings.warn(msg, UserWarning)
warnings.warn(msg, UserWarning, stacklevel=2)

if "model_" in df.columns:
col_model = "model_"
Expand Down
38 changes: 29 additions & 9 deletions src/model_diagnostics/scoring/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
import numpy.typing as npt
import polars as pl
from scipy import special
from sklearn.isotonic import IsotonicRegression
from scipy.stats import expectile
from sklearn.isotonic import IsotonicRegression as IsotonicRegression_skl

from model_diagnostics._utils._array import (
validate_2_arrays,
validate_same_first_dimension,
)
from model_diagnostics._utils.isotonic import IsotonicRegression
from model_diagnostics.calibration import identification_function


Expand Down Expand Up @@ -776,19 +778,26 @@ def decompose(
)
raise ValueError(msg)
if level is None:
if functional == "mean":
level = 0.5
elif functional in ("expectile", "quantile"):
level = 0.5
if functional in ("expectile", "quantile"):
if hasattr(scoring_function, "level"):
level = scoring_function.level
level = float(scoring_function.level)
else:
msg = (
"You set level=None, but scoring_function has no attribute "
"level."
)
raise ValueError(msg)
if not (functional == "mean" or (functional == "expectile" and level == 0.5)):
msg = f"The given {functional=} and {level=} are not supported (yet)."

allowed_functionals = ("mean", "median", "expectile", "quantile")
if functional not in allowed_functionals:
msg = (
f"The functional must be one of {allowed_functionals}, got "
f"{functional}."
)
raise ValueError(msg)
if functional in ("expectile", "quantile") and (level <= 0 or level >= 1):
msg = f"The level must fulfil 0 < level < 1, got {level}."
raise ValueError(msg)

y: np.ndarray
Expand All @@ -800,9 +809,20 @@ def decompose(
validate_same_first_dimension(weights, y)
w = np.asarray(weights) # needed to satisfy mypy

iso = IsotonicRegression(y_min=None, y_max=None).fit(z, y, sample_weight=w)
if functional == "mean":
iso = IsotonicRegression_skl(y_min=None, y_max=None).fit(z, y, sample_weight=w)
marginal = np.average(y, weights=w)
else:
iso = IsotonicRegression(functional=functional, level=level).fit(
z, y, sample_weight=w
)
if functional == "expectile":
marginal = expectile(y, alpha=level, weights=w)
elif functional == "quantile":
marginal = np.quantile(y, q=level, method="inverted_cdf")

marginal = np.full(shape=y.shape, fill_value=marginal)
recalibrated = np.squeeze(iso.predict(z))
marginal = np.full(shape=y.shape, fill_value=np.average(y, weights=w))

score = scoring_function(y, z, w)
score_recalibrated = scoring_function(y, recalibrated, w)
Expand Down

0 comments on commit b8816d8

Please sign in to comment.