Merge pull request #81 from lorentzenchr/quantile_decompose

ENH add quantiles and expectile to scoring.decompose
lorentzenchr · Jul 13, 2023 · b8816d8 · b8816d8
2 parents 54252f6 + 7bbbd91
commit b8816d8
Show file tree

Hide file tree

Showing 7 changed files with 384 additions and 25 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -189,7 +189,7 @@ dependencies = [
   "black>=22.8.0",
   "mypy>=0.971",
   "numpy>=1.22",
-  "ruff==0.0.256",
+  "ruff==0.0.278",
 ]
 
 [tool.hatch.envs.lint.scripts]

diff --git a/src/model_diagnostics/_utils/isotonic.py b/src/model_diagnostics/_utils/isotonic.py
@@ -378,14 +378,24 @@ def expectile_fun(x, w):
         x, r = gpava(expectile_fun, x, wx)
     elif functional == "quantile":
 
-        def quantile_lower(x, wx):
+        def quantile_lower(x, wx=None):
             return np.quantile(x, level, method="inverted_cdf")
 
-        def quantile_upper(x, wx):
-            return np.quantile(x, level, method="higher")
+        def quantile_upper(x, wx=None):
+            # np.quantile(x, level, method="higher") is not the same as
+            return -np.quantile(-x, 1 - level, method="inverted_cdf")
 
         xl, rl = gpava(quantile_lower, x, wx)
-        xu, ru = gpava(quantile_upper, x, wx)
+        # Applying gpava on fun=np.quantile(x, level, method="higher") does not work
+        # for unknown reasons. What works is to calculate the upper quantile on the
+        # blocks given by rl from the lower quantile.
+        q = np.fromiter(
+            (quantile_upper(x[rl[i] : rl[i + 1]]) for i in range(len(rl) - 1)),
+            dtype=xl.dtype,
+        )
+        # Take mininum from the right.
+        q = np.minimum.accumulate(q[::-1])[::-1]
+        xu = np.repeat(q, np.diff(rl))
         # We are free to use any value in the interval [xl, xu], as long as it is
         # increasing. We choose the midpoint.
         x = 0.5 * (xl + xu)
@@ -481,7 +491,8 @@ def fit(
         df = pl.DataFrame({"_X": X, "_target_y": y})
         if sample_weight is not None:
             df = df.hstack([pl.Series(name="_weights", values=sample_weight)])
-        # We deal with duplicate values in X by also sorting y.
+        # We deal with duplicate values in X, aka ties, by also sorting y, but in
+        # reverse order.
         df = df.sort(by=["_X", "_target_y"], descending=[False, self.increasing])
         yy = df["_target_y"].to_numpy()
         wy = df["_weights"].to_numpy() if sample_weight is not None else None

diff --git a/src/model_diagnostics/_utils/tests/test_isotonic.py b/src/model_diagnostics/_utils/tests/test_isotonic.py
@@ -28,7 +28,7 @@ def median_lower(x, w=None):
 
 
 def median_upper(x, w=None):
-    return np.quantile(x, q=0.5, method="higher")
+    return -np.quantile(-np.asarray(x), q=0.5, method="inverted_cdf")
 
 
 median_upper.loss = PinballLoss(level=0.5)  # type: ignore
@@ -42,12 +42,26 @@ def quantile80_lower(x, w=None):
 
 
 def quantile80_upper(x, w=None):
-    return np.quantile(x, q=0.8, method="higher")
+    return -np.quantile(-np.asarray(x), q=1 - 0.8, method="inverted_cdf")
 
 
 quantile80_upper.loss = PinballLoss(level=0.8)  # type: ignore
 
 
+def quantile10_lower(x, w=None):
+    return np.quantile(x, q=0.1, method="inverted_cdf")
+
+
+quantile10_lower.loss = PinballLoss(level=0.1)  # type: ignore
+
+
+def quantile10_upper(x, w=None):
+    return -np.quantile(-np.asarray(x), q=1 - 0.1, method="inverted_cdf")
+
+
+quantile10_upper.loss = PinballLoss(level=0.1)  # type: ignore
+
+
 def gpava_mean(x, w=None):
     return gpava(mean_fun, x, w)
 
@@ -135,12 +149,65 @@ def test_pava_weighted(pava):
             median_upper,
             [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8],
         ),
+        (
+            [
+                -3.444949308,
+                -2.854784791,
+                -3.071818855,
+                -1.164616209,
+                -3.947223934,
+                -2.644162705,
+                -2.609553073,
+                -0.270605801,
+                -3.208354459,
+                -2.012322283,
+                -1.059124491,
+                -2.199654113,
+                -1.906487545,
+                -2.681030502,
+                -1.553997732,
+                -3.489485619,
+                -1.672190775,
+                -2.863511214,
+                -0.491263988,
+                0.169577379,
+            ],
+            quantile10_lower,
+            # from R package isotonic, function
+            # gpava(1:20, y, solver = weighted.fractile, p=0.1, ties = "secondary")$x
+            np.repeat(
+                [-3.947223934, -3.208354459, -2.863511214, -0.491263988, 0.169577379],
+                [5, 11, 2, 1, 1],
+            ),
+            quantile10_upper,
+            np.repeat(
+                [-3.947223934, -3.208354459, -2.863511214, -0.491263988, 0.169577379],
+                [5, 11, 2, 1, 1],
+            ),
+        ),
     ],
 )
 def test_gpava_simple(y, fun_lower, x_lower, fun_upper, x_upper):
     w = np.ones_like(y)
     xl, rl = gpava(fun_lower, y, w)
-    xu, ru = gpava(fun_upper, y, w)
+    # The distinction of lower and upper only applies to set-valued functionals like
+    # quantiles.
+    # Applying gpava on fun=np.quantile(x, level, method="higher") does not work
+    # for unknown reasons. What works is to calculate the upper quantile on the
+    # block given by rl.
+    upper_values = np.fromiter(
+        (fun_upper(y[rl[i] : rl[i + 1]]) for i in range(len(rl) - 1)), dtype=xl.dtype
+    )
+    # Take mininum from the right.
+    upper_values = np.minimum.accumulate(upper_values[::-1])[::-1]
+    xu = np.repeat(upper_values, np.diff(rl))
+
+    # Check that isotonic_regression gives midpoint.
+    functional = fun_lower.loss.functional
+    level = getattr(fun_lower.loss, "level", 0.5)
+    x_iso, _ = isotonic_regression(y=y, functional=functional, level=level)
+    assert_allclose(0.5 * (xl + xu), x_iso)
+
     for i in range(len(xl) - 1):
         # Eq. 10 of Jordan et al. https://doi.org/10.1007/s10463-021-00808-0
         assert xl[i] <= xu[i + 1]
@@ -166,13 +233,13 @@ def constraint(y_pred, y_obs):
         x0=y,
         args=(y,),
         constraints=[{"type": "ineq", "fun": lambda x: constraint(x, y)}],
-        tol=1e-10,
+        tol=1e-12,
     )
 
-    assert result.fun == pytest.approx(loss_lower)
+    assert result.fun == pytest.approx(loss_lower, rel=1e-8)
 
-    # Check that is is the boundary of solutions.
-    eps = 1e-10 * np.mean(xl)
+    # Check that it is the boundary of solutions.
+    eps = 1e-8 * np.mean(np.abs(xl))
     for i in range(len(xl)):
         x = np.array(xl, copy=True, dtype=float)
         if i == 0 or x[i] > x[i - 1]:

diff --git a/src/model_diagnostics/calibration/identification.py b/src/model_diagnostics/calibration/identification.py
@@ -316,7 +316,7 @@ def compute_bias(
                         "Due to ties, the effective number of bins is 0. "
                         f"Consider to increase n_bins>={n_bins_tmp}."
                     )
-                    warnings.warn(msg, UserWarning)
+                    warnings.warn(msg, UserWarning, stacklevel=2)
             else:
                 # Binning
                 # We use method="inverted_cdf" (same as "lower") instead of the

diff --git a/src/model_diagnostics/calibration/plots.py b/src/model_diagnostics/calibration/plots.py
@@ -320,7 +320,7 @@ def plot_bias(
             "shown for that y_pred/model, despite the fact that with_errorbars was "
             "set to True."
         )
-        warnings.warn(msg, UserWarning)
+        warnings.warn(msg, UserWarning, stacklevel=2)
 
     if "model_" in df.columns:
         col_model = "model_"

diff --git a/src/model_diagnostics/scoring/scoring.py b/src/model_diagnostics/scoring/scoring.py
@@ -9,12 +9,14 @@
 import numpy.typing as npt
 import polars as pl
 from scipy import special
-from sklearn.isotonic import IsotonicRegression
+from scipy.stats import expectile
+from sklearn.isotonic import IsotonicRegression as IsotonicRegression_skl
 
 from model_diagnostics._utils._array import (
     validate_2_arrays,
     validate_same_first_dimension,
 )
+from model_diagnostics._utils.isotonic import IsotonicRegression
 from model_diagnostics.calibration import identification_function
 
 
@@ -776,19 +778,26 @@ def decompose(
             )
             raise ValueError(msg)
     if level is None:
-        if functional == "mean":
-            level = 0.5
-        elif functional in ("expectile", "quantile"):
+        level = 0.5
+        if functional in ("expectile", "quantile"):
             if hasattr(scoring_function, "level"):
-                level = scoring_function.level
+                level = float(scoring_function.level)
             else:
                 msg = (
                     "You set level=None, but scoring_function has no attribute "
                     "level."
                 )
                 raise ValueError(msg)
-    if not (functional == "mean" or (functional == "expectile" and level == 0.5)):
-        msg = f"The given {functional=} and {level=} are not supported (yet)."
+
+    allowed_functionals = ("mean", "median", "expectile", "quantile")
+    if functional not in allowed_functionals:
+        msg = (
+            f"The functional must be one of {allowed_functionals}, got "
+            f"{functional}."
+        )
+        raise ValueError(msg)
+    if functional in ("expectile", "quantile") and (level <= 0 or level >= 1):
+        msg = f"The level must fulfil 0 < level < 1, got {level}."
         raise ValueError(msg)
 
     y: np.ndarray
@@ -800,9 +809,20 @@ def decompose(
         validate_same_first_dimension(weights, y)
         w = np.asarray(weights)  # needed to satisfy mypy
 
-    iso = IsotonicRegression(y_min=None, y_max=None).fit(z, y, sample_weight=w)
+    if functional == "mean":
+        iso = IsotonicRegression_skl(y_min=None, y_max=None).fit(z, y, sample_weight=w)
+        marginal = np.average(y, weights=w)
+    else:
+        iso = IsotonicRegression(functional=functional, level=level).fit(
+            z, y, sample_weight=w
+        )
+        if functional == "expectile":
+            marginal = expectile(y, alpha=level, weights=w)
+        elif functional == "quantile":
+            marginal = np.quantile(y, q=level, method="inverted_cdf")
+
+    marginal = np.full(shape=y.shape, fill_value=marginal)
     recalibrated = np.squeeze(iso.predict(z))
-    marginal = np.full(shape=y.shape, fill_value=np.average(y, weights=w))
 
     score = scoring_function(y, z, w)
     score_recalibrated = scoring_function(y, recalibrated, w)