Experiments (#17)
pawel-czyz committed Mar 6, 2024
1 parent e10a0ad commit acc8677
Showing 34 changed files with 1,774 additions and 111 deletions.
17 changes: 17 additions & 0 deletions .gitignore
@@ -6,6 +6,23 @@
local/
private/

# Data
*.json
*.yml
*.yaml
*.csv
*.npy
*.npz

# Plots
*.eps
*.gif
*.jpg
*.jpeg
*.pdf
*.png
*.svg

# Editors
.idea/
.vscode/
2 changes: 2 additions & 0 deletions labelshift/algorithms/api.py
@@ -8,10 +8,12 @@
from labelshift.algorithms.bbse import BlackBoxShiftEstimator
from labelshift.algorithms.classify_and_count import ClassifyAndCount
from labelshift.algorithms.ratio_estimator import InvariantRatioEstimator
from labelshift.interfaces.point_estimators import SummaryStatistic

__all__ = [
"BlackBoxShiftEstimator",
"ClassifyAndCount",
"DiscreteCategoricalMAPEstimator",
"InvariantRatioEstimator",
"SummaryStatistic",
]
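
A minimal usage sketch of the re-exported interface, using only the names listed in __all__ above and the DiscreteCategoricalMAPEstimator signature introduced later in this commit (the alpha_unlabeled argument comes from the bayesian_discrete.py diff below):

# Sketch: importing the estimators through the single api module.
import labelshift.algorithms.api as algo

estimator = algo.DiscreteCategoricalMAPEstimator(max_eval=10_000, alpha_unlabeled=1.0)
# estimator.estimate_from_summary_statistic(statistic) consumes an
# algo.SummaryStatistic carrying n_y_and_c_labeled and n_c_unlabeled counts.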
43 changes: 30 additions & 13 deletions labelshift/algorithms/bayesian_discrete.py
@@ -3,7 +3,7 @@
Proposed in
TODO(Pawel): Add citation to pre-print after AISTATS reviews.
"""
from typing import cast, NewType, Optional
from typing import cast, NewType, Optional, Union

import arviz as az
import numpy as np
@@ -35,11 +35,31 @@ class SamplingParams(pydantic.BaseModel):
)


def dirichlet_alphas(L: int, alpha: Union[float, ArrayLike]) -> np.ndarray:
"""Convenient initialization of alpha (pseudocounts)
parameters of the Dirichlet prior.
Args:
alpha: either an array of shape (L,) or a float.
If a float, vector (alpha, alpha, ..., alpha)
is created
Returns:
alphas, shape (L,)
"""
if isinstance(alpha, float):
return np.ones(L) * alpha
else:
alpha = np.asarray(alpha)
assert alpha.shape == (L,)
return alpha


def build_model(
n_y_and_c_labeled: ArrayLike,
n_c_unlabeled: ArrayLike,
alpha_p_y_labeled: Optional[ArrayLike] = None,
alpha_p_y_unlabeled: Optional[ArrayLike] = None,
alpha_p_y_labeled: Union[float, ArrayLike] = 1.0,
alpha_p_y_unlabeled: Union[float, ArrayLike] = 1.0,
) -> DiscreteBayesianQuantificationModel:
"""Builds the discrete Bayesian quantification model,
based on the sufficient statistic of the data.
@@ -59,15 +79,8 @@ def build_model(
assert n_y_labeled.shape == (L,)
assert n_c_unlabeled.shape == (K,)

alpha_p_y_labeled = (
np.ones(L) if alpha_p_y_labeled is None else np.asarray(alpha_p_y_labeled)
)
alpha_p_y_unlabeled = (
np.ones(L) if alpha_p_y_unlabeled is None else np.asarray(alpha_p_y_unlabeled)
)

assert alpha_p_y_labeled.shape == (L,)
assert alpha_p_y_unlabeled.shape == (L,)
alpha_p_y_labeled = dirichlet_alphas(L, alpha_p_y_labeled)
alpha_p_y_unlabeled = dirichlet_alphas(L, alpha_p_y_unlabeled)

model = pm.Model()
with model:
@@ -140,13 +153,16 @@ class DiscreteCategoricalMAPEstimator(pe.SummaryStatisticPrevalenceEstimator):
"""A version of Bayesian quantification
which finds the Maximum a Posteriori solution."""

def __init__(self, max_eval: int = 10_000) -> None:
def __init__(
self, max_eval: int = 10_000, alpha_unlabeled: Union[float, ArrayLike] = 1.0
) -> None:
"""
Args:
max_eval: maximal number of evaluations of the posterior
during the optimization to find the MAP
alpha_unlabeled: pseudocounts of the Dirichlet prior on P_unlabeled(Y).
Either a float or a vector with one entry per label.
"""
self._max_eval = max_eval
self._alpha_unlabeled = alpha_unlabeled

def estimate_from_summary_statistic(
self, /, statistic: pe.SummaryStatistic
@@ -155,6 +171,7 @@ def estimate_from_summary_statistic(
model = build_model(
n_c_unlabeled=statistic.n_c_unlabeled,
n_y_and_c_labeled=statistic.n_y_and_c_labeled,
alpha_p_y_unlabeled=self._alpha_unlabeled,
)
with model:
optimal = pymc.find_MAP(maxeval=self._max_eval)
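
A usage sketch of build_model with the new float-valued pseudocounts; the count arrays below are illustrative only, and find_MAP with maxeval mirrors the call used by the estimator above:

# Sketch: building the discrete Bayesian quantification model from summary
# counts and finding the MAP estimate. The count matrices are made up.
import numpy as np
import pymc as pm
from labelshift.algorithms.bayesian_discrete import build_model

n_y_and_c_labeled = np.asarray([[20, 5], [4, 21]])  # shape (L, K): labeled (Y, C) counts
n_c_unlabeled = np.asarray([30, 70])                 # shape (K,): unlabeled predicted-class counts

model = build_model(
    n_y_and_c_labeled=n_y_and_c_labeled,
    n_c_unlabeled=n_c_unlabeled,
    alpha_p_y_unlabeled=1.0,  # equivalent to the previous default np.ones(L)
)
with model:
    map_solution = pm.find_MAP(maxeval=10_000)  # or pm.sample(...) for a full posterior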
17 changes: 10 additions & 7 deletions labelshift/algorithms/expectation_maximization.py
@@ -14,7 +14,7 @@ def expectation_maximization(
*,
initial_prevalences: Optional[ArrayLike] = None,
max_steps: int = 10000,
atol: float = 0.01,
tolerance: float = 0.01,
) -> np.ndarray:
"""Expectation maximization algorithm, as described in
@@ -24,13 +24,13 @@
Args:
predictions: test set probability predictions. Shape (n_samples, n_classes).
prevalences: prevalences in the training data set.
training_prevalences: prevalences in the training data set.
Shape (n_classes,), (n_classes, 1) or (1, n_classes). Will be normalized.
initial_prevalences: starting prevalences for optimization.
If not provided, the training prevalences are used.
Shape (n_classes,), (n_classes, 1) or (1, n_classes). Will be normalized.
max_steps: maximal number of iteration steps
atol: desired accuracy (for early stopping)
tolerance: desired accuracy (for early stopping)
Returns:
test set prevalences, shape (n_classes,).
@@ -48,6 +48,7 @@
test_prevalences = training_prevalences.copy()

# Iteratively improve the estimate of the test set prevalences
converged: bool = False
for _ in range(max_steps):
old_prevalences = test_prevalences.copy()

@@ -59,10 +60,12 @@
) / len(new_predictions)

# Check if converged
if np.allclose(old_prevalences, test_prevalences, atol=atol, rtol=0):
if np.max(np.abs(old_prevalences - test_prevalences)) < tolerance:
converged = True
break

warnings.warn(
RuntimeWarning(f"Required accuracy not reached in {max_steps} steps.")
)
if not converged:
warnings.warn(
RuntimeWarning(f"Required accuracy not reached in {max_steps} steps.")
)
return test_prevalences.ravel()
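
A usage sketch of the EM quantifier with the renamed tolerance argument; the positional parameter order is assumed from the docstring above, and the predictions are illustrative:

# Sketch: running expectation_maximization with the renamed early-stopping argument.
import numpy as np
from labelshift.algorithms.expectation_maximization import expectation_maximization

predictions = np.asarray([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]])  # (n_samples, n_classes)
training_prevalences = np.asarray([0.5, 0.5])

test_prevalences = expectation_maximization(
    predictions,
    training_prevalences,
    tolerance=1e-3,    # stop once the max absolute change falls below this
    max_steps=10_000,  # a warning is raised if this is hit before convergence
)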
11 changes: 6 additions & 5 deletions labelshift/algorithms/gaussian_mixture_model.py
@@ -5,7 +5,7 @@
Note:
This algorithm models the data in 1D.
"""
from typing import Optional, Sequence, Tuple
from typing import Sequence, Tuple, Union

import numpy as np
import pymc as pm
@@ -21,7 +21,7 @@ def build_model(
unlabeled_data: ArrayLike,
mean_params: Tuple[float, float] = (0.0, 1.0),
sigma_param: float = 1.0,
alpha: Optional[ArrayLike] = None,
alpha: Union[float, ArrayLike] = 1.0,
) -> pm.Model:
"""Builds a PyMC model for Bayesian quantification for 1D data
assumed to be sampled from a mixture of normals.
@@ -38,7 +38,8 @@
mean_params: used to initialize the prior on the component means
sigma_param: used to initialize the prior on the component sigmas
alpha: used to initialize the Dirichlet prior on P_unlabeled(Y).
Shape (n_components,)
Can be an array of shape (n_components,)
or a float, so that a uniform vector (alpha, alpha, ...) is used.
Returns:
a PyMC model with the following variables:
@@ -51,8 +52,8 @@
assert unlabeled_data.shape == (len(unlabeled_data),)

n_y = len(labeled_data)
if alpha is None:
alpha = np.ones(n_y)
if isinstance(alpha, float):
alpha = alpha * np.ones(n_y)
else:
alpha = np.asarray(alpha)

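
A sketch of the 1D Gaussian-mixture model with a scalar Dirichlet pseudocount; the leading labeled_data argument (a sequence with one array per class) is an assumption based on the function body, since only the later parameters appear in this hunk:

# Sketch: building the mixture model with alpha given as a float.
# labeled_data as a per-class sequence is assumed, not confirmed by this diff.
import numpy as np
import pymc as pm
from labelshift.algorithms.gaussian_mixture_model import build_model

rng = np.random.default_rng(0)
labeled_data = [rng.normal(-1.0, 1.0, size=50), rng.normal(1.0, 1.0, size=50)]
unlabeled_data = rng.normal(0.3, 1.0, size=200)

model = build_model(
    labeled_data,
    unlabeled_data,
    alpha=0.5,  # expands to the uniform vector (0.5, 0.5, ...), per the docstring
)
with model:
    idata = pm.sample(draws=1000, tune=1000)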
43 changes: 39 additions & 4 deletions labelshift/algorithms/ratio_estimator.py
@@ -41,7 +41,7 @@
``H_hat[l, k] = G_hat[l, k] = E_labeled[ g(X)[k] | Y = l] \\in R^{L x (K-1)}.``
"""
from typing import Tuple
from typing import Optional, Tuple

import numpy as np
from numpy.typing import ArrayLike
@@ -103,9 +103,42 @@ def prevalence_from_vector_and_matrix(
def calculate_vector_and_matrix_from_predictions(
unlabeled_predictions: ArrayLike,
labeled_predictions: ArrayLike,
) -> None:
"""This method has not been implemented yet."""
raise NotImplementedError
labeled_ground_truth: ArrayLike,
L: Optional[int] = None,
enforce_square: bool = True,
restricted: bool = True,
rcond: float = 1e-4,
) -> np.ndarray:
"""TODO(Pawel): Fix this docstring.
Args:
unlabeled_predictions: shape (N', K)
labeled_predictions: shape (N, K)
labeled_ground_truth: shape (N,). Each entry is in {0, ..., L-1}.
"""
unlabeled_predictions = np.asarray(unlabeled_predictions)
labeled_predictions = np.asarray(labeled_predictions)
labeled_ground_truth = np.asarray(labeled_ground_truth, dtype=int)

K = unlabeled_predictions.shape[1]
L: int = K if L is None else L

assert labeled_predictions.shape == (len(labeled_ground_truth), K)

unlabeled_vector = unlabeled_predictions.mean(axis=0)[: K - 1] # Shape (K - 1,)
labeled_matrix = np.zeros((L, K - 1))

for l in range(L): # noqa: E741 ambiguous name variable
index = labeled_ground_truth == l
labeled_matrix[l, :] = labeled_predictions[index, : K - 1].mean(axis=0)

return prevalence_from_vector_and_matrix(
vector=unlabeled_vector,
matrix=labeled_matrix,
restricted=restricted,
enforce_square=enforce_square,
rcond=rcond,
)


def calculate_vector_and_matrix_from_summary_statistics(
@@ -200,4 +233,6 @@ def estimate_from_summary_statistic(
return prevalence_from_summary_statistics(
n_c_unlabeled=statistic.n_c_unlabeled,
n_y_and_c_labeled=statistic.n_y_and_c_labeled,
enforce_square=self._enforce_square,
rcond=self._rcond,
)
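
A usage sketch of the newly implemented helper that estimates prevalences directly from classifier predictions; the arrays are illustrative only, and restricted/enforce_square/rcond are simply forwarded to prevalence_from_vector_and_matrix:

# Sketch: the invariant ratio estimator applied to raw predictions.
import numpy as np
from labelshift.algorithms.ratio_estimator import (
    calculate_vector_and_matrix_from_predictions,
)

unlabeled_predictions = np.asarray([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])            # (N', K)
labeled_predictions = np.asarray([[0.9, 0.1], [0.1, 0.9], [0.7, 0.3], [0.2, 0.8]])  # (N, K)
labeled_ground_truth = np.asarray([0, 1, 0, 1])  # entries in {0, ..., L-1}

prevalences = calculate_vector_and_matrix_from_predictions(
    unlabeled_predictions=unlabeled_predictions,
    labeled_predictions=labeled_predictions,
    labeled_ground_truth=labeled_ground_truth,
    restricted=True,
    rcond=1e-4,
)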
7 changes: 7 additions & 0 deletions labelshift/api.py
@@ -0,0 +1,7 @@
import labelshift.experiments.api as experiments
import labelshift.datasets.api as datasets

__all__ = [
"experiments",
"datasets",
]
20 changes: 20 additions & 0 deletions labelshift/datasets/api.py
@@ -0,0 +1,20 @@
from labelshift.datasets.split import (
IDataset,
n_classes,
SplitDataset,
SplitSpecification,
split_dataset,
)
from labelshift.datasets.discrete_categorical import DiscreteSampler, almost_eye

__all__ = [
# `split` submodule
"IDataset",
"n_classes",
"SplitDataset",
"SplitSpecification",
"split_dataset",
# `discrete_categorical` submodule
"DiscreteSampler",
"almost_eye",
]