Commit ed635c1

Remove most of advanced setups

nabenabe0928 committed Feb 1, 2024
1 parent dd9711f commit ed635c1

Showing 2 changed files with 59 additions and 145 deletions.
140 changes: 41 additions & 99 deletions optuna/importance/_ped_anova/_evaluator.py
@@ -6,14 +6,14 @@
import numpy as np

from optuna.distributions import BaseDistribution
-from optuna.distributions import CategoricalChoiceType
from optuna.importance._base import _get_distributions
from optuna.importance._base import _get_filtered_trials
from optuna.importance._base import _sort_dict_by_importance
from optuna.importance._base import BaseImportanceEvaluator
from optuna.importance._ped_anova._scott_parzen_estimator import _build_parzen_estimator
from optuna.importance.filters import get_trial_filter
from optuna.study import Study
+from optuna.study import StudyDirection
from optuna.trial import FrozenTrial


@@ -25,11 +25,9 @@ class PedAnovaImportanceEvaluator(BaseImportanceEvaluator):
<https://arxiv.org/abs/2304.10255>`_.
PED-ANOVA fits Parzen estimators of :class:`~optuna.trial.TrialState.COMPLETE` trials better
-than a user-specified baseline. Users can specify the baseline either by a quantile or a value.
+than a user-specified baseline. Users can specify the baseline by a quantile.
The importance can be interpreted as how important each hyperparameter is to get
the performance better than baseline.
-Users can also remove trials worse than `cutoff` so that the interpretation removes the bias
-caused by the initial trials.
For further information about PED-ANOVA algorithm, please refer to the following paper:
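The description above is the heart of the method: fit a Parzen estimator to the trials whose objective values reach the top `baseline_quantile`, then compare it with a baseline density. A minimal usage sketch of the evaluator as it looks after this commit (the module path is the private one from this diff and may change; the objective is purely illustrative):

```python
import optuna
from optuna.importance._ped_anova._evaluator import PedAnovaImportanceEvaluator

def objective(trial: optuna.Trial) -> float:
    x = trial.suggest_float("x", -5.0, 5.0)
    c = trial.suggest_categorical("c", ["a", "b"])
    return x**2 + (0.0 if c == "a" else 1.0)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Importances of achieving the top-10% objective values during this study.
evaluator = PedAnovaImportanceEvaluator(baseline_quantile=0.1, evaluate_on_local=True)
print(evaluator.evaluate(study))
```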
@@ -46,34 +44,11 @@ class PedAnovaImportanceEvaluator(BaseImportanceEvaluator):
Please refer to the original work available at https://github.com/nabenabe0928/local-anova.
Args:
-is_lower_better:
-Whether `target_value` is better when it is lower.
-n_steps:
-The number of grids in continuous domains.
-For example, if one of the parameters has the domain of [`low`, `high`],
-we discretize it as `np.linspace(low, high, n_steps)`.
baseline_quantile:
Compute the importance of achieving top-`baseline_quantile` quantile `target_value`.
For example, `baseline_quantile=0.1` means that the importances give the information
of which parameters were important to achieve the top-10% performance during
the specified `study`.
-min_n_top_trials:
-How many `trials` must be included in `top_trials`.
-consider_prior:
-Whether we use non-informative prior to regularize the Parzen estimators.
-This might be helpful to avoid overfitting.
-prior_weight:
-How much we regularize the Parzen estimator fitting.
-The larger `prior_weight` becomes, the more we regularize the fitting.
-All the observations receive `weight=1.0`, so the default value is `prior_weight=1.0`.
-categorical_distance_func:
-A dictionary of distance functions for categorical parameters. The key is the name of
-the categorical parameter and the value is a distance function that takes two
-:class:`~optuna.distributions.CategoricalChoiceType` s and returns a :obj:`float`
-value. The distance function must return a non-negative value.
-While categorical choices are handled equally by default, this option allows users to
-specify prior knowledge on the structure of categorical parameters.
evaluate_on_local:
Whether we measure the importance in the local or global space.
If `True`, the importances imply how important each parameter is during `study`.
@@ -82,58 +57,49 @@ class PedAnovaImportanceEvaluator(BaseImportanceEvaluator):
space during the specified `study`.
"""

-def __init__(
-self,
-is_lower_better: bool,
-*,
-n_steps: int = 50,
-baseline_quantile: float = 0.1,
-consider_prior: bool = False,
-prior_weight: float = 1.0,
-categorical_distance_func: dict[
-str, Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-]
-| None = None,
-evaluate_on_local: bool = True,
-min_n_top_trials: int = 2,
-):
-if n_steps <= 1:
-raise ValueError(f"`n_steps` must be larger than 1, but got {n_steps}.")
-
-if min_n_top_trials < 2:
-raise ValueError(
-f"min_n_top_trials must be larger than 1, but got {min_n_top_trials}."
-)
-
-self._n_steps = n_steps
-self._categorical_distance_func = (
-categorical_distance_func if categorical_distance_func is not None else {}
-)
-self._consider_prior = consider_prior
-self._prior_weight = prior_weight
-self._is_lower_better = is_lower_better
-self._min_n_top_trials = min_n_top_trials
+def __init__(self, *, baseline_quantile: float = 0.1, evaluate_on_local: bool = True):
self._baseline_quantile = baseline_quantile
self._evaluate_on_local = evaluate_on_local

+# Advanced Setups.
+# Discretize a domain [low, high] as `np.linspace(low, high, n_steps)`.
+self._n_steps: int = 50
+# Prior is used for regularization.
+self._consider_prior = True
+# Control the regularization effect.
+self._prior_weight = 1.0
+# How many `trials` must be included in `top_trials`.
+self._min_n_top_trials = 2

def _get_top_trials(
self,
+study: Study,
trials: list[FrozenTrial],
params: list[str],
target: Callable[[FrozenTrial], float] | None,
) -> list[FrozenTrial]:
+if target is None and study._is_multi_objective():
+raise ValueError(
+"If the `study` is being used for multi-objective optimization, "
+"please specify the `target`. For example, use "
+"`target=lambda t: t.values[0]` for the first objective value."
+)

+is_lower_better = study.directions[0] == StudyDirection.MINIMIZE
+if target is not None:
+warnings.warn(
+f"{self.__class__.__name__} computes the importances of params to achieve "
+"low `target` values. If this is not what you want, please multiply target by -1."
+)
+is_lower_better = True

trial_filter = get_trial_filter(
-quantile=self._baseline_quantile,
-is_lower_better=self._is_lower_better,
-min_n_top_trials=self._min_n_top_trials,
-target=target,
+self._baseline_quantile, is_lower_better, self._min_n_top_trials, target
)
top_trials = trial_filter(trials)

if len(trials) == len(top_trials):
-warnings.warn(
-"All the trials were considered to be in top and it gives equal importances."
-)
+warnings.warn("All trials are in top trials, which gives equal importances.")

return top_trials
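`_get_top_trials` now derives `is_lower_better` from the study direction and delegates the actual selection to `get_trial_filter`. A standalone sketch of the quantile-based selection it performs; `select_top_indices` is a hypothetical helper for illustration, not an Optuna API:

```python
import numpy as np

def select_top_indices(values: list[float], quantile: float, min_n_top: int) -> list[int]:
    # Keep at least `min_n_top` trials, otherwise the best `quantile` fraction.
    order = np.argsort(values)  # Lower is better, as in the minimize direction.
    n_top = max(min_n_top, int(np.ceil(quantile * len(values))))
    return order[:n_top].tolist()

# E.g. 10 values with quantile=0.2 and min_n_top=2 keep the best 2 trials.
```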

@@ -144,37 +110,23 @@ def _compute_pearson_divergence(
top_trials: list[FrozenTrial],
all_trials: list[FrozenTrial],
) -> float:
-cat_dist_func = self._categorical_distance_func.get(param_name, None)
+consider_prior, prior_weight = self._consider_prior, self._prior_weight
pe_top = _build_parzen_estimator(
-param_name=param_name,
-dist=dist,
-trials=top_trials,
-n_steps=self._n_steps,
-consider_prior=self._consider_prior,
-prior_weight=self._prior_weight,
-categorical_distance_func=cat_dist_func,
+param_name, dist, top_trials, self._n_steps, consider_prior, prior_weight
)
-n_grids = pe_top.n_grids
-grids = np.arange(n_grids)
+# NOTE: n_steps can be different from self._n_steps when param is discrete.
+grids = np.arange(pe_top.n_steps)
pdf_top = pe_top.pdf(grids) + 1e-12

if self._evaluate_on_local:
-# Compute the integral on the local space.
-# It gives us the importances of hyperparameters during the search.
+# The importance of param during the study.
pe_local = _build_parzen_estimator(
-param_name=param_name,
-dist=dist,
-trials=all_trials,
-n_steps=self._n_steps,
-consider_prior=self._consider_prior,
-prior_weight=self._prior_weight,
-categorical_distance_func=cat_dist_func,
+param_name, dist, all_trials, self._n_steps, consider_prior, prior_weight
)
pdf_local = pe_local.pdf(grids) + 1e-12
else:
-# Compute the integral on the global space.
-# It gives us the importances of hyperparameters in the search space.
-pdf_local = np.full(n_grids, 1.0 / n_grids)
+# The importance of param in the search space.
+pdf_local = np.full(pe_top.n_steps, 1.0 / pe_top.n_steps)

return float(pdf_local @ ((pdf_top / pdf_local - 1) ** 2))
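The return value above is the discretized Pearson (chi-square) divergence between the top-trial density and the local baseline. A small numeric check of the same expression, with made-up densities:

```python
import numpy as np

pdf_top = np.array([0.7, 0.2, 0.1])  # Density of top trials on a 3-point grid.
pdf_local = np.full(3, 1.0 / 3.0)    # Uniform baseline, as in the global case.
# Equivalent to sum((pdf_top - pdf_local) ** 2 / pdf_local).
divergence = float(pdf_local @ ((pdf_top / pdf_local - 1) ** 2))
print(divergence)  # 0.62
```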

@@ -185,13 +137,6 @@ def evaluate(
*,
target: Callable[[FrozenTrial], float] | None = None,
) -> dict[str, float]:
-if target is None and study._is_multi_objective():
-raise ValueError(
-"If the `study` is being used for multi-objective optimization, "
-"please specify the `target`. For example, use "
-"`target=lambda t: t.values[0]` for the first objective value."
-)
-
distributions = _get_distributions(study, params=params)
if params is None:
params = list(distributions.keys())
@@ -210,15 +155,12 @@
return {}

trials = _get_filtered_trials(study, params=params, target=target)
-top_trials = self._get_top_trials(trials, params, target)
+top_trials = self._get_top_trials(study, trials, params, target)
importance_sum = 0.0
param_importances = {}
for param_name, dist in non_single_distributions.items():
param_importances[param_name] = self._compute_pearson_divergence(
-param_name,
-dist,
-top_trials=top_trials,
-all_trials=trials,
+param_name, dist, top_trials=top_trials, all_trials=trials
)
importance_sum += param_importances[param_name]

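Because the multi-objective check now lives in `_get_top_trials`, passing `target` remains the way to analyze a single objective of a multi-objective study. A sketch reusing `evaluator` from the earlier snippet; note the warning above that `target` is treated as minimized:

```python
# `study` is assumed to be multi-objective here; `t.values[0]` selects the
# first objective, as in the error message above.
importances = evaluator.evaluate(study, target=lambda t: t.values[0])
```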
64 changes: 18 additions & 46 deletions optuna/importance/_ped_anova/_scott_parzen_estimator.py
@@ -1,11 +1,8 @@
from __future__ import annotations

-from collections.abc import Callable
-
import numpy as np

from optuna.distributions import BaseDistribution
-from optuna.distributions import CategoricalChoiceType
from optuna.distributions import CategoricalDistribution
from optuna.distributions import FloatDistribution
from optuna.distributions import IntDistribution
@@ -26,22 +23,17 @@ def __init__(
counts: np.ndarray,
consider_prior: bool,
prior_weight: float,
-categorical_distance_func: Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-| None,
):
if not isinstance(dist, (CategoricalDistribution, IntDistribution)):
raise ValueError(
f"Only IntDistribution and CategoricalDistribution are supported, but got {dist}."
)

-self._n_grids = len(counts)
+self._n_steps = len(counts)
self._param_name = param_name
self._counts = counts.copy()
-cat_dist_func: dict[
-str, Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-] = ({} if categorical_distance_func is None else {param_name: categorical_distance_func})
super().__init__(
-observations={param_name: np.arange(self._n_grids)[counts > 0.0]},
+observations={param_name: np.arange(self._n_steps)[counts > 0.0]},
search_space={param_name: dist},
parameters=_ParzenEstimatorParameters(
consider_prior=consider_prior,
@@ -50,27 +42,26 @@ def __init__(
consider_endpoints=False,
weights=lambda x: np.empty(0),
multivariate=True,
-categorical_distance_func=cat_dist_func,
+categorical_distance_func={},
),
predetermined_weights=counts[counts > 0.0],
)

def _calculate_numerical_distributions(
self,
observations: np.ndarray,
-low: float,
-high: float,
+low: float,  # <-- int (but typing follows the original)
+high: float,  # <-- int (but typing follows the original)
step: float | None,
parameters: _ParzenEstimatorParameters,
) -> _BatchedDistributions:
-# NOTE: low and high are actually `int` in this class.
# NOTE: The Optuna TPE bandwidth selection is too wide for this analysis.
assert step is not None and np.isclose(step, 1.0), "MyPy redefinition."

n_trials = np.sum(self._counts)
counts_non_zero = self._counts[self._counts > 0]
weights = counts_non_zero / n_trials
-mus = np.arange(self.n_grids)[self._counts > 0]
+mus = np.arange(self.n_steps)[self._counts > 0]
mean_est = mus @ weights
sigma_est = np.sqrt((mus - mean_est) ** 2 @ counts_non_zero / max(1, n_trials - 1))
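The lines above compute a weighted Scott-style bandwidth estimate from the count histogram. The same computation in isolation, with made-up counts (the class clips the resulting sigmas further before building the distributions):

```python
import numpy as np

counts = np.array([3.0, 5.0, 2.0])  # Histogram of observations per grid point.
n_trials = counts.sum()
counts_non_zero = counts[counts > 0]
weights = counts_non_zero / n_trials
mus = np.arange(counts.size)[counts > 0]
mean_est = mus @ weights
sigma_est = np.sqrt((mus - mean_est) ** 2 @ counts_non_zero / max(1, n_trials - 1))
```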

@@ -89,16 +80,12 @@ def _calculate_numerical_distributions(
sigmas = np.append(sigmas, [1.0 * (high - low + 1)])

return _BatchedDiscreteTruncNormDistributions(
-mu=mus,
-sigma=sigmas,
-low=0,
-high=self.n_grids - 1,
-step=1,
+mu=mus, sigma=sigmas, low=0, high=self.n_steps - 1, step=1
)

@property
-def n_grids(self) -> int:
-return self._n_grids
+def n_steps(self) -> int:
+return self._n_steps

def pdf(self, samples: np.ndarray) -> np.ndarray:
return np.exp(self.log_pdf({self._param_name: samples}))
@@ -119,15 +106,15 @@ def _get_grids_and_grid_indices_of_trials(
params = np.asarray([t.params[param_name] for t in trials])
elif isinstance(dist, IntDistribution):
if dist.log:
-log_2_n_grids = int(np.ceil(np.log(dist.high - dist.low + 1) / np.log(2)))
-n_steps_in_log_scale = min(log_2_n_grids, n_steps)
+log_2_n_steps = int(np.ceil(np.log(dist.high - dist.low + 1) / np.log(2)))
+n_steps_in_log_scale = min(log_2_n_steps, n_steps)
grids = np.linspace(np.log(dist.low), np.log(dist.high), n_steps_in_log_scale)
params = np.log([t.params[param_name] for t in trials])
else:
-n_grids = (dist.high + 1 - dist.low) // dist.step
+n_steps_in_domain = (dist.high + 1 - dist.low) // dist.step  # Renamed to avoid shadowing `n_steps`.
grids = (
np.arange(dist.low, dist.high + 1)[:: dist.step]
-if n_grids <= n_steps
+if n_steps_in_domain <= n_steps
else np.linspace(dist.low, dist.high, n_steps)
)
params = np.asarray([t.params[param_name] for t in trials])
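For a non-log integer domain the code keeps the exact grid when the domain fits into `n_steps` and subsamples with `np.linspace` otherwise. The discretization in isolation, with standalone values and the `n_steps_in_domain` name used above:

```python
import numpy as np

low, high, step, n_steps = 1, 200, 1, 50
n_steps_in_domain = (high + 1 - low) // step  # 200 grid points in the raw domain.
grids = (
    np.arange(low, high + 1)[::step]
    if n_steps_in_domain <= n_steps
    else np.linspace(low, high, n_steps)  # Too many points: subsample to 50.
)
```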
@@ -147,10 +134,7 @@ def _count_numerical_param_in_grid(
n_steps: int,
) -> np.ndarray:
grids, grid_indices_of_trials = _get_grids_and_grid_indices_of_trials(
-param_name,
-dist,
-trials,
-n_steps,
+param_name, dist, trials, n_steps
)
unique_vals, counts_in_unique = np.unique(grid_indices_of_trials, return_counts=True)
counts = np.zeros(grids.size, dtype=np.int32)
Expand All @@ -159,15 +143,10 @@ def _count_numerical_param_in_grid(


def _count_categorical_param_in_grid(
-param_name: str,
-dist: CategoricalDistribution,
-trials: list[FrozenTrial],
+param_name: str, dist: CategoricalDistribution, trials: list[FrozenTrial]
) -> np.ndarray:
-choice_to_index = {c: i for i, c in enumerate(dist.choices)}
-unique_vals, counts_in_unique = np.unique(
-[choice_to_index[t.params[param_name]] for t in trials],
-return_counts=True,
-)
+cat_indices = [int(dist.to_internal_repr(t.params[param_name])) for t in trials]
+unique_vals, counts_in_unique = np.unique(cat_indices, return_counts=True)
counts = np.zeros(len(dist.choices), dtype=np.int32)
counts[unique_vals] += counts_in_unique
return counts
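`_count_categorical_param_in_grid` now maps choices through `dist.to_internal_repr` instead of building a local lookup table. The counting idiom itself, with plain indices standing in for the mapped trial parameters:

```python
import numpy as np

cat_indices = [0, 2, 2, 1, 2]  # Internal representations of 5 observed choices.
unique_vals, counts_in_unique = np.unique(cat_indices, return_counts=True)
counts = np.zeros(3, dtype=np.int32)  # One bin per categorical choice.
counts[unique_vals] += counts_in_unique
print(counts)  # [1 1 3]
```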
@@ -180,8 +159,6 @@ def _build_parzen_estimator(
n_steps: int,
consider_prior: bool,
prior_weight: float,
-categorical_distance_func: Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-| None,
) -> _ScottParzenEstimator:
rounded_dist: IntDistribution | CategoricalDistribution
if isinstance(dist, (IntDistribution, FloatDistribution)):
@@ -194,10 +171,5 @@
raise ValueError(f"Got an unknown dist with the type {type(dist)}.")

return _ScottParzenEstimator(
-param_name=param_name,
-dist=rounded_dist,
-counts=counts.astype(np.float64),
-consider_prior=consider_prior,
-prior_weight=prior_weight,
-categorical_distance_func=categorical_distance_func,
+param_name, rounded_dist, counts.astype(np.float64), consider_prior, prior_weight
)
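After this commit `_build_parzen_estimator` takes exactly the five positional arguments shown above. A sketch of calling it directly; the import path is private at this commit and may change, and the toy study only exists to produce integer trials:

```python
import numpy as np
import optuna
from optuna.distributions import IntDistribution
from optuna.importance._ped_anova._scott_parzen_estimator import _build_parzen_estimator

study = optuna.create_study()
study.optimize(lambda t: float(t.suggest_int("x", 0, 9)), n_trials=30)
trials = study.get_trials(deepcopy=False)

pe = _build_parzen_estimator("x", IntDistribution(low=0, high=9), trials, 50, True, 1.0)
density = pe.pdf(np.arange(pe.n_steps))  # Discretized density over the grid.
```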
