Experiments (#17)
pawel-czyz committed Mar 6, 2024
1 parent e10a0ad commit acc8677
Showing 34 changed files with 1,774 additions and 111 deletions.
17 changes: 17 additions & 0 deletions .gitignore
@@ -6,6 +6,23 @@
local/
private/

# Data
*.json
*.yml
*.yaml
*.csv
*.npy
*.npz

# Plots
*.eps
*.gif
*.jpg
*.jpeg
*.pdf
*.png
*.svg

# Editors
.idea/
.vscode/
2 changes: 2 additions & 0 deletions labelshift/algorithms/api.py
@@ -8,10 +8,12 @@
from labelshift.algorithms.bbse import BlackBoxShiftEstimator
from labelshift.algorithms.classify_and_count import ClassifyAndCount
from labelshift.algorithms.ratio_estimator import InvariantRatioEstimator
from labelshift.interfaces.point_estimators import SummaryStatistic

__all__ = [
"BlackBoxShiftEstimator",
"ClassifyAndCount",
"DiscreteCategoricalMAPEstimator",
"InvariantRatioEstimator",
"SummaryStatistic",
]
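
A minimal usage sketch of the re-exported interface, using only the names listed in __all__ above and the DiscreteCategoricalMAPEstimator signature introduced later in this commit (the alpha_unlabeled argument comes from the bayesian_discrete.py diff below):

# Sketch: importing the estimators through the single api module.
import labelshift.algorithms.api as algo

estimator = algo.DiscreteCategoricalMAPEstimator(max_eval=10_000, alpha_unlabeled=1.0)
# estimator.estimate_from_summary_statistic(statistic) consumes an
# algo.SummaryStatistic carrying n_y_and_c_labeled and n_c_unlabeled counts.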
43 changes: 30 additions & 13 deletions labelshift/algorithms/bayesian_discrete.py
@@ -3,7 +3,7 @@
Proposed in
TODO(Pawel): Add citation to pre-print after AISTATS reviews.
"""
from typing import cast, NewType, Optional
from typing import cast, NewType, Optional, Union

import arviz as az
import numpy as np
@@ -35,11 +35,31 @@ class SamplingParams(pydantic.BaseModel):
)


def dirichlet_alphas(L: int, alpha: Union[float, ArrayLike]) -> np.ndarray:
"""Convenient initialization of alpha (pseudocounts)
parameters of the Dirichlet prior.
Args:
alpha: either an array of shape (L,) or a float.
If a float, vector (alpha, alpha, ..., alpha)
is created
Returns:
alphas, shape (L,)
"""
if isinstance(alpha, float):
return np.ones(L) * alpha
else:
alpha = np.asarray(alpha)
assert alpha.shape == (L,)
return alpha


def build_model(
n_y_and_c_labeled: ArrayLike,
n_c_unlabeled: ArrayLike,
alpha_p_y_labeled: Optional[ArrayLike] = None,
alpha_p_y_unlabeled: Optional[ArrayLike] = None,
alpha_p_y_labeled: Union[float, ArrayLike] = 1.0,
alpha_p_y_unlabeled: Union[float, ArrayLike] = 1.0,
) -> DiscreteBayesianQuantificationModel:
"""Builds the discrete Bayesian quantification model,
based on the sufficient statistic of the data.
@@ -59,15 +79,8 @@ def build_model(
assert n_y_labeled.shape == (L,)
assert n_c_unlabeled.shape == (K,)

alpha_p_y_labeled = (
np.ones(L) if alpha_p_y_labeled is None else np.asarray(alpha_p_y_labeled)
)
alpha_p_y_unlabeled = (
np.ones(L) if alpha_p_y_unlabeled is None else np.asarray(alpha_p_y_unlabeled)
)

assert alpha_p_y_labeled.shape == (L,)
assert alpha_p_y_unlabeled.shape == (L,)
alpha_p_y_labeled = dirichlet_alphas(L, alpha_p_y_labeled)
alpha_p_y_unlabeled = dirichlet_alphas(L, alpha_p_y_unlabeled)

model = pm.Model()
with model:
@@ -140,13 +153,16 @@ class DiscreteCategoricalMAPEstimator(pe.SummaryStatisticPrevalenceEstimator):
"""A version of Bayesian quantification
which finds the Maximum a Posteriori solution."""

def __init__(self, max_eval: int = 10_000) -> None:
def __init__(
self, max_eval: int = 10_000, alpha_unlabeled: Union[float, ArrayLike] = 1.0
) -> None:
"""
Args:
max_eval: maximal number of evaluations of the posterior
during the optimization to find the MAP
alpha_unlabeled: pseudocounts of the Dirichlet prior on P_unlabeled(Y).
Either a float or a vector with one entry per label.
"""
self._max_eval = max_eval
self._alpha_unlabeled = alpha_unlabeled

def estimate_from_summary_statistic(
self, /, statistic: pe.SummaryStatistic
@@ -155,6 +171,7 @@ def estimate_from_summary_statistic(
model = build_model(
n_c_unlabeled=statistic.n_c_unlabeled,
n_y_and_c_labeled=statistic.n_y_and_c_labeled,
alpha_p_y_unlabeled=self._alpha_unlabeled,
)
with model:
optimal = pymc.find_MAP(maxeval=self._max_eval)
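
A usage sketch of build_model with the new float-valued pseudocounts; the count arrays below are illustrative only, and find_MAP with maxeval mirrors the call used by the estimator above:

# Sketch: building the discrete Bayesian quantification model from summary
# counts and finding the MAP estimate. The count matrices are made up.
import numpy as np
import pymc as pm
from labelshift.algorithms.bayesian_discrete import build_model

n_y_and_c_labeled = np.asarray([[20, 5], [4, 21]])  # shape (L, K): labeled (Y, C) counts
n_c_unlabeled = np.asarray([30, 70])                 # shape (K,): unlabeled predicted-class counts

model = build_model(
    n_y_and_c_labeled=n_y_and_c_labeled,
    n_c_unlabeled=n_c_unlabeled,
    alpha_p_y_unlabeled=1.0,  # equivalent to the previous default np.ones(L)
)
with model:
    map_solution = pm.find_MAP(maxeval=10_000)  # or pm.sample(...) for a full posterior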
17 changes: 10 additions & 7 deletions labelshift/algorithms/expectation_maximization.py
@@ -14,7 +14,7 @@ def expectation_maximization(
*,
initial_prevalences: Optional[ArrayLike] = None,
max_steps: int = 10000,
atol: float = 0.01,
tolerance: float = 0.01,
) -> np.ndarray:
"""Expectation maximization algorithm, as described in
@@ -24,13 +24,13 @@
Args:
predictions: test set probability predictions. Shape (n_samples, n_classes).
prevalences: prevalences in the training data set.
training_prevalences: prevalences in the training data set.
Shape (n_classes,), (n_classes, 1) or (1, n_classes). Will be normalized.
initial_prevalences: starting prevalences for optimization.
If not provided, the training prevalences are used.
Shape (n_classes,), (n_classes, 1) or (1, n_classes). Will be normalized.
max_steps: maximal number of iteration steps
atol: desired accuracy (for early stopping)
tolerance: desired accuracy (for early stopping)
Returns:
test set prevalences, shape (n_classes,).
@@ -48,6 +48,7 @@
test_prevalences = training_prevalences.copy()

# Iteratively improve the estimate of the test set prevalences
converged: bool = False
for _ in range(max_steps):
old_prevalences = test_prevalences.copy()

@@ -59,10 +60,12 @@
) / len(new_predictions)

# Check if converged
if np.allclose(old_prevalences, test_prevalences, atol=atol, rtol=0):
if np.max(np.abs(old_prevalences - test_prevalences)) < tolerance:
converged = True
break

warnings.warn(
RuntimeWarning(f"Required accuracy not reached in {max_steps} steps.")
)
if not converged:
warnings.warn(
RuntimeWarning(f"Required accuracy not reached in {max_steps} steps.")
)
return test_prevalences.ravel()
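
A usage sketch of the EM quantifier with the renamed tolerance argument; the positional parameter order is assumed from the docstring above, and the predictions are illustrative:

# Sketch: running expectation_maximization with the renamed early-stopping argument.
import numpy as np
from labelshift.algorithms.expectation_maximization import expectation_maximization

predictions = np.asarray([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]])  # (n_samples, n_classes)
training_prevalences = np.asarray([0.5, 0.5])

test_prevalences = expectation_maximization(
    predictions,
    training_prevalences,
    tolerance=1e-3,    # stop once the max absolute change falls below this
    max_steps=10_000,  # a warning is raised if this is hit before convergence
)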
11 changes: 6 additions & 5 deletions labelshift/algorithms/gaussian_mixture_model.py
@@ -5,7 +5,7 @@
Note:
This algorithm models the data in 1D.
"""
from typing import Optional, Sequence, Tuple
from typing import Sequence, Tuple, Union

import numpy as np
import pymc as pm
@@ -21,7 +21,7 @@ def build_model(
unlabeled_data: ArrayLike,
mean_params: Tuple[float, float] = (0.0, 1.0),
sigma_param: float = 1.0,
alpha: Optional[ArrayLike] = None,
alpha: Union[float, ArrayLike] = 1.0,
) -> pm.Model:
"""Builds a PyMC model for Bayesian quantification for 1D data
assumed to be sampled from a mixture of normals.
@@ -38,7 +38,8 @@
mean_params: used to initialize the prior on the component means
sigma_param: used to initialize the prior on the component sigmas
alpha: used to initialize the Dirichlet prior on P_unlabeled(Y).
Shape (n_components,)
Can be an array of shape (n_components,)
or a float, so that a uniform vector (alpha, alpha, ...) is used.
Returns:
a PyMC model with the following variables:
@@ -51,8 +52,8 @@
assert unlabeled_data.shape == (len(unlabeled_data),)

n_y = len(labeled_data)
if alpha is None:
alpha = np.ones(n_y)
if isinstance(alpha, float):
alpha = alpha * np.ones(n_y)
else:
alpha = np.asarray(alpha)

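
A sketch of the 1D Gaussian-mixture model with a scalar Dirichlet pseudocount; the leading labeled_data argument (a sequence with one array per class) is an assumption based on the function body, since only the later parameters appear in this hunk:

# Sketch: building the mixture model with alpha given as a float.
# labeled_data as a per-class sequence is assumed, not confirmed by this diff.
import numpy as np
import pymc as pm
from labelshift.algorithms.gaussian_mixture_model import build_model

rng = np.random.default_rng(0)
labeled_data = [rng.normal(-1.0, 1.0, size=50), rng.normal(1.0, 1.0, size=50)]
unlabeled_data = rng.normal(0.3, 1.0, size=200)

model = build_model(
    labeled_data,
    unlabeled_data,
    alpha=0.5,  # expands to the uniform vector (0.5, 0.5, ...), per the docstring
)
with model:
    idata = pm.sample(draws=1000, tune=1000)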
43 changes: 39 additions & 4 deletions labelshift/algorithms/ratio_estimator.py
@@ -41,7 +41,7 @@
``H_hat[l, k] = G_hat[l, k] = E_labeled[ g(X)[k] | Y = l] \\in R^{L x (K-1)}.``
"""
from typing import Tuple
from typing import Optional, Tuple

import numpy as np
from numpy.typing import ArrayLike
@@ -103,9 +103,42 @@ def prevalence_from_vector_and_matrix(
def calculate_vector_and_matrix_from_predictions(
unlabeled_predictions: ArrayLike,
labeled_predictions: ArrayLike,
) -> None:
"""This method has not been implemented yet."""
raise NotImplementedError
labeled_ground_truth: ArrayLike,
L: Optional[int] = None,
enforce_square: bool = True,
restricted: bool = True,
rcond: float = 1e-4,
) -> np.ndarray:
"""TODO(Pawel): Fix this docstring.
Args:
unlabeled_predictions: shape (N', K)
labeled_predictions: shape (N, K)
labeled_ground_truth: shape (N,). Each entry is in {0, ..., L-1}.
"""
unlabeled_predictions = np.asarray(unlabeled_predictions)
labeled_predictions = np.asarray(labeled_predictions)
labeled_ground_truth = np.asarray(labeled_ground_truth, dtype=int)

K = unlabeled_predictions.shape[1]
L: int = K if L is None else L

assert labeled_predictions.shape == (len(labeled_ground_truth), K)

unlabeled_vector = unlabeled_predictions.mean(axis=0)[: K - 1] # Shape (K - 1,)
labeled_matrix = np.zeros((L, K - 1))

for l in range(L): # noqa: E741 ambiguous name variable
index = labeled_ground_truth == l
labeled_matrix[l, :] = labeled_predictions[index, : K - 1].mean(axis=0)

return prevalence_from_vector_and_matrix(
vector=unlabeled_vector,
matrix=labeled_matrix,
restricted=restricted,
enforce_square=enforce_square,
rcond=rcond,
)


def calculate_vector_and_matrix_from_summary_statistics(
@@ -200,4 +233,6 @@ def estimate_from_summary_statistic(
return prevalence_from_summary_statistics(
n_c_unlabeled=statistic.n_c_unlabeled,
n_y_and_c_labeled=statistic.n_y_and_c_labeled,
enforce_square=self._enforce_square,
rcond=self._rcond,
)
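
A usage sketch of the newly implemented helper that estimates prevalences directly from classifier predictions; the arrays are illustrative only, and restricted/enforce_square/rcond are simply forwarded to prevalence_from_vector_and_matrix:

# Sketch: the invariant ratio estimator applied to raw predictions.
import numpy as np
from labelshift.algorithms.ratio_estimator import (
    calculate_vector_and_matrix_from_predictions,
)

unlabeled_predictions = np.asarray([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])            # (N', K)
labeled_predictions = np.asarray([[0.9, 0.1], [0.1, 0.9], [0.7, 0.3], [0.2, 0.8]])  # (N, K)
labeled_ground_truth = np.asarray([0, 1, 0, 1])  # entries in {0, ..., L-1}

prevalences = calculate_vector_and_matrix_from_predictions(
    unlabeled_predictions=unlabeled_predictions,
    labeled_predictions=labeled_predictions,
    labeled_ground_truth=labeled_ground_truth,
    restricted=True,
    rcond=1e-4,
)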
7 changes: 7 additions & 0 deletions labelshift/api.py
@@ -0,0 +1,7 @@
import labelshift.experiments.api as experiments
import labelshift.datasets.api as datasets

__all__ = [
"experiments",
"datasets",
]
20 changes: 20 additions & 0 deletions labelshift/datasets/api.py
@@ -0,0 +1,20 @@
from labelshift.datasets.split import (
IDataset,
n_classes,
SplitDataset,
SplitSpecification,
split_dataset,
)
from labelshift.datasets.discrete_categorical import DiscreteSampler, almost_eye

__all__ = [
# `split` submodule
"IDataset",
"n_classes",
"SplitDataset",
"SplitSpecification",
"split_dataset",
# `discrete_categorical` submodule
"DiscreteSampler",
"almost_eye",
]