Commit ed635c1

Remove most of advanced setups

nabenabe0928 committed Feb 1, 2024
1 parent dd9711f commit ed635c1

Showing 2 changed files with 59 additions and 145 deletions.
140 changes: 41 additions & 99 deletions optuna/importance/_ped_anova/_evaluator.py
@@ -6,14 +6,14 @@
import numpy as np

from optuna.distributions import BaseDistribution
-from optuna.distributions import CategoricalChoiceType
from optuna.importance._base import _get_distributions
from optuna.importance._base import _get_filtered_trials
from optuna.importance._base import _sort_dict_by_importance
from optuna.importance._base import BaseImportanceEvaluator
from optuna.importance._ped_anova._scott_parzen_estimator import _build_parzen_estimator
from optuna.importance.filters import get_trial_filter
from optuna.study import Study
+from optuna.study import StudyDirection
from optuna.trial import FrozenTrial


@@ -25,11 +25,9 @@ class PedAnovaImportanceEvaluator(BaseImportanceEvaluator):
<https://arxiv.org/abs/2304.10255>`_.
PED-ANOVA fits Parzen estimators of :class:`~optuna.trial.TrialState.COMPLETE` trials better
-than a user-specified baseline. Users can specify the baseline either by a quantile or a value.
+than a user-specified baseline. Users can specify the baseline by a quantile.
The importance can be interpreted as how important each hyperparameter is to get
the performance better than baseline.
-Users can also remove trials worse than `cutoff` so that the interpretation removes the bias
-caused by the initial trials.
For further information about PED-ANOVA algorithm, please refer to the following paper:
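The description above is the heart of the method: fit a Parzen estimator to the trials whose objective values reach the top `baseline_quantile`, then compare it with a baseline density. A minimal usage sketch of the evaluator as it looks after this commit (the module path is the private one from this diff and may change; the objective is purely illustrative):

```python
import optuna
from optuna.importance._ped_anova._evaluator import PedAnovaImportanceEvaluator

def objective(trial: optuna.Trial) -> float:
    x = trial.suggest_float("x", -5.0, 5.0)
    c = trial.suggest_categorical("c", ["a", "b"])
    return x**2 + (0.0 if c == "a" else 1.0)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Importances of achieving the top-10% objective values during this study.
evaluator = PedAnovaImportanceEvaluator(baseline_quantile=0.1, evaluate_on_local=True)
print(evaluator.evaluate(study))
```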
@@ -46,34 +44,11 @@ class PedAnovaImportanceEvaluator(BaseImportanceEvaluator):
Please refer to the original work available at https://github.com/nabenabe0928/local-anova.
Args:
-is_lower_better:
-Whether `target_value` is better when it is lower.
-n_steps:
-The number of grids in continuous domains.
-For example, if one of the parameters has the domain of [`low`, `high`],
-we discretize it as `np.linspace(low, high, n_steps)`.
baseline_quantile:
Compute the importance of achieving top-`baseline_quantile` quantile `target_value`.
For example, `baseline_quantile=0.1` means that the importances give the information
of which parameters were important to achieve the top-10% performance during
the specified `study`.
-min_n_top_trials:
-How many `trials` must be included in `top_trials`.
-consider_prior:
-Whether we use non-informative prior to regularize the Parzen estimators.
-This might be helpful to avoid overfitting.
-prior_weight:
-How much we regularize the Parzen estimator fitting.
-The larger `prior_weight` becomes, the more we regularize the fitting.
-All the observations receive `weight=1.0`, so the default value is `prior_weight=1.0`.
-categorical_distance_func:
-A dictionary of distance functions for categorical parameters. The key is the name of
-the categorical parameter and the value is a distance function that takes two
-:class:`~optuna.distributions.CategoricalChoiceType` s and returns a :obj:`float`
-value. The distance function must return a non-negative value.
-While categorical choices are handled equally by default, this option allows users to
-specify prior knowledge on the structure of categorical parameters.
evaluate_on_local:
Whether we measure the importance in the local or global space.
If `True`, the importances imply how important each parameter is during `study`.
@@ -82,58 +57,49 @@ class PedAnovaImportanceEvaluator(BaseImportanceEvaluator):
space during the specified `study`.
"""

-def __init__(
-self,
-is_lower_better: bool,
-*,
-n_steps: int = 50,
-baseline_quantile: float = 0.1,
-consider_prior: bool = False,
-prior_weight: float = 1.0,
-categorical_distance_func: dict[
-str, Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-]
-| None = None,
-evaluate_on_local: bool = True,
-min_n_top_trials: int = 2,
-):
-if n_steps <= 1:
-raise ValueError(f"`n_steps` must be larger than 1, but got {n_steps}.")
-
-if min_n_top_trials < 2:
-raise ValueError(
-f"min_n_top_trials must be larger than 1, but got {min_n_top_trials}."
-)
-
-self._n_steps = n_steps
-self._categorical_distance_func = (
-categorical_distance_func if categorical_distance_func is not None else {}
-)
-self._consider_prior = consider_prior
-self._prior_weight = prior_weight
-self._is_lower_better = is_lower_better
-self._min_n_top_trials = min_n_top_trials
+def __init__(self, *, baseline_quantile: float = 0.1, evaluate_on_local: bool = True):
self._baseline_quantile = baseline_quantile
self._evaluate_on_local = evaluate_on_local

+# Advanced Setups.
+# Discretize a domain [low, high] as `np.linspace(low, high, n_steps)`.
+self._n_steps: int = 50
+# Prior is used for regularization.
+self._consider_prior = True
+# Control the regularization effect.
+self._prior_weight = 1.0
+# How many `trials` must be included in `top_trials`.
+self._min_n_top_trials = 2

def _get_top_trials(
self,
+study: Study,
trials: list[FrozenTrial],
params: list[str],
target: Callable[[FrozenTrial], float] | None,
) -> list[FrozenTrial]:
+if target is None and study._is_multi_objective():
+raise ValueError(
+"If the `study` is being used for multi-objective optimization, "
+"please specify the `target`. For example, use "
+"`target=lambda t: t.values[0]` for the first objective value."
+)

+is_lower_better = study.directions[0] == StudyDirection.MINIMIZE
+if target is not None:
+warnings.warn(
+f"{self.__class__.__name__} computes the importances of params to achieve "
+"low `target` values. If this is not what you want, please multiply target by -1."
+)
+is_lower_better = True

trial_filter = get_trial_filter(
-quantile=self._baseline_quantile,
-is_lower_better=self._is_lower_better,
-min_n_top_trials=self._min_n_top_trials,
-target=target,
+self._baseline_quantile, is_lower_better, self._min_n_top_trials, target
)
top_trials = trial_filter(trials)

if len(trials) == len(top_trials):
-warnings.warn(
-"All the trials were considered to be in top and it gives equal importances."
-)
+warnings.warn("All trials are in top trials, which gives equal importances.")

return top_trials
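`_get_top_trials` now derives `is_lower_better` from the study direction and delegates the actual selection to `get_trial_filter`. A standalone sketch of the quantile-based selection it performs; `select_top_indices` is a hypothetical helper for illustration, not an Optuna API:

```python
import numpy as np

def select_top_indices(values: list[float], quantile: float, min_n_top: int) -> list[int]:
    # Keep at least `min_n_top` trials, otherwise the best `quantile` fraction.
    order = np.argsort(values)  # Lower is better, as in the minimize direction.
    n_top = max(min_n_top, int(np.ceil(quantile * len(values))))
    return order[:n_top].tolist()

# E.g. 10 values with quantile=0.2 and min_n_top=2 keep the best 2 trials.
```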

@@ -144,37 +110,23 @@ def _compute_pearson_divergence(
top_trials: list[FrozenTrial],
all_trials: list[FrozenTrial],
) -> float:
-cat_dist_func = self._categorical_distance_func.get(param_name, None)
+consider_prior, prior_weight = self._consider_prior, self._prior_weight
pe_top = _build_parzen_estimator(
-param_name=param_name,
-dist=dist,
-trials=top_trials,
-n_steps=self._n_steps,
-consider_prior=self._consider_prior,
-prior_weight=self._prior_weight,
-categorical_distance_func=cat_dist_func,
+param_name, dist, top_trials, self._n_steps, consider_prior, prior_weight
)
-n_grids = pe_top.n_grids
-grids = np.arange(n_grids)
+# NOTE: n_steps can be different from self._n_steps when param is discrete.
+grids = np.arange(pe_top.n_steps)
pdf_top = pe_top.pdf(grids) + 1e-12

if self._evaluate_on_local:
-# Compute the integral on the local space.
-# It gives us the importances of hyperparameters during the search.
+# The importance of param during the study.
pe_local = _build_parzen_estimator(
-param_name=param_name,
-dist=dist,
-trials=all_trials,
-n_steps=self._n_steps,
-consider_prior=self._consider_prior,
-prior_weight=self._prior_weight,
-categorical_distance_func=cat_dist_func,
+param_name, dist, all_trials, self._n_steps, consider_prior, prior_weight
)
pdf_local = pe_local.pdf(grids) + 1e-12
else:
-# Compute the integral on the global space.
-# It gives us the importances of hyperparameters in the search space.
-pdf_local = np.full(n_grids, 1.0 / n_grids)
+# The importance of param in the search space.
+pdf_local = np.full(pe_top.n_steps, 1.0 / pe_top.n_steps)

return float(pdf_local @ ((pdf_top / pdf_local - 1) ** 2))
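The return value above is the discretized Pearson (chi-square) divergence between the top-trial density and the local baseline. A small numeric check of the same expression, with made-up densities:

```python
import numpy as np

pdf_top = np.array([0.7, 0.2, 0.1])  # Density of top trials on a 3-point grid.
pdf_local = np.full(3, 1.0 / 3.0)    # Uniform baseline, as in the global case.
# Equivalent to sum((pdf_top - pdf_local) ** 2 / pdf_local).
divergence = float(pdf_local @ ((pdf_top / pdf_local - 1) ** 2))
print(divergence)  # 0.62
```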

@@ -185,13 +137,6 @@ def evaluate(
*,
target: Callable[[FrozenTrial], float] | None = None,
) -> dict[str, float]:
-if target is None and study._is_multi_objective():
-raise ValueError(
-"If the `study` is being used for multi-objective optimization, "
-"please specify the `target`. For example, use "
-"`target=lambda t: t.values[0]` for the first objective value."
-)
-
distributions = _get_distributions(study, params=params)
if params is None:
params = list(distributions.keys())
@@ -210,15 +155,12 @@
return {}

trials = _get_filtered_trials(study, params=params, target=target)
-top_trials = self._get_top_trials(trials, params, target)
+top_trials = self._get_top_trials(study, trials, params, target)
importance_sum = 0.0
param_importances = {}
for param_name, dist in non_single_distributions.items():
param_importances[param_name] = self._compute_pearson_divergence(
-param_name,
-dist,
-top_trials=top_trials,
-all_trials=trials,
+param_name, dist, top_trials=top_trials, all_trials=trials
)
importance_sum += param_importances[param_name]

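Because the multi-objective check now lives in `_get_top_trials`, passing `target` remains the way to analyze a single objective of a multi-objective study. A sketch reusing `evaluator` from the earlier snippet; note the warning above that `target` is treated as minimized:

```python
# `study` is assumed to be multi-objective here; `t.values[0]` selects the
# first objective, as in the error message above.
importances = evaluator.evaluate(study, target=lambda t: t.values[0])
```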
64 changes: 18 additions & 46 deletions optuna/importance/_ped_anova/_scott_parzen_estimator.py
@@ -1,11 +1,8 @@
from __future__ import annotations

-from collections.abc import Callable
-
import numpy as np

from optuna.distributions import BaseDistribution
-from optuna.distributions import CategoricalChoiceType
from optuna.distributions import CategoricalDistribution
from optuna.distributions import FloatDistribution
from optuna.distributions import IntDistribution
@@ -26,22 +23,17 @@ def __init__(
counts: np.ndarray,
consider_prior: bool,
prior_weight: float,
-categorical_distance_func: Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-| None,
):
if not isinstance(dist, (CategoricalDistribution, IntDistribution)):
raise ValueError(
f"Only IntDistribution and CategoricalDistribution are supported, but got {dist}."
)

-self._n_grids = len(counts)
+self._n_steps = len(counts)
self._param_name = param_name
self._counts = counts.copy()
-cat_dist_func: dict[
-str, Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-] = ({} if categorical_distance_func is None else {param_name: categorical_distance_func})
super().__init__(
-observations={param_name: np.arange(self._n_grids)[counts > 0.0]},
+observations={param_name: np.arange(self._n_steps)[counts > 0.0]},
search_space={param_name: dist},
parameters=_ParzenEstimatorParameters(
consider_prior=consider_prior,
@@ -50,27 +42,26 @@ def __init__(
consider_endpoints=False,
weights=lambda x: np.empty(0),
multivariate=True,
-categorical_distance_func=cat_dist_func,
+categorical_distance_func={},
),
predetermined_weights=counts[counts > 0.0],
)

def _calculate_numerical_distributions(
self,
observations: np.ndarray,
-low: float,
-high: float,
+low: float,  # <-- int (but typing follows the original)
+high: float,  # <-- int (but typing follows the original)
step: float | None,
parameters: _ParzenEstimatorParameters,
) -> _BatchedDistributions:
-# NOTE: low and high are actually `int` in this class.
# NOTE: The Optuna TPE bandwidth selection is too wide for this analysis.
assert step is not None and np.isclose(step, 1.0), "MyPy redefinition."

n_trials = np.sum(self._counts)
counts_non_zero = self._counts[self._counts > 0]
weights = counts_non_zero / n_trials
-mus = np.arange(self.n_grids)[self._counts > 0]
+mus = np.arange(self.n_steps)[self._counts > 0]
mean_est = mus @ weights
sigma_est = np.sqrt((mus - mean_est) ** 2 @ counts_non_zero / max(1, n_trials - 1))
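The lines above compute a weighted Scott-style bandwidth estimate from the count histogram. The same computation in isolation, with made-up counts (the class clips the resulting sigmas further before building the distributions):

```python
import numpy as np

counts = np.array([3.0, 5.0, 2.0])  # Histogram of observations per grid point.
n_trials = counts.sum()
counts_non_zero = counts[counts > 0]
weights = counts_non_zero / n_trials
mus = np.arange(counts.size)[counts > 0]
mean_est = mus @ weights
sigma_est = np.sqrt((mus - mean_est) ** 2 @ counts_non_zero / max(1, n_trials - 1))
```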

@@ -89,16 +80,12 @@ def _calculate_numerical_distributions(
sigmas = np.append(sigmas, [1.0 * (high - low + 1)])

return _BatchedDiscreteTruncNormDistributions(
-mu=mus,
-sigma=sigmas,
-low=0,
-high=self.n_grids - 1,
-step=1,
+mu=mus, sigma=sigmas, low=0, high=self.n_steps - 1, step=1
)

@property
-def n_grids(self) -> int:
-return self._n_grids
+def n_steps(self) -> int:
+return self._n_steps

def pdf(self, samples: np.ndarray) -> np.ndarray:
return np.exp(self.log_pdf({self._param_name: samples}))
@@ -119,15 +106,15 @@ def _get_grids_and_grid_indices_of_trials(
params = np.asarray([t.params[param_name] for t in trials])
elif isinstance(dist, IntDistribution):
if dist.log:
-log_2_n_grids = int(np.ceil(np.log(dist.high - dist.low + 1) / np.log(2)))
-n_steps_in_log_scale = min(log_2_n_grids, n_steps)
+log_2_n_steps = int(np.ceil(np.log(dist.high - dist.low + 1) / np.log(2)))
+n_steps_in_log_scale = min(log_2_n_steps, n_steps)
grids = np.linspace(np.log(dist.low), np.log(dist.high), n_steps_in_log_scale)
params = np.log([t.params[param_name] for t in trials])
else:
-n_grids = (dist.high + 1 - dist.low) // dist.step
+n_steps_in_domain = (dist.high + 1 - dist.low) // dist.step  # Renamed to avoid shadowing `n_steps`.
grids = (
np.arange(dist.low, dist.high + 1)[:: dist.step]
-if n_grids <= n_steps
+if n_steps_in_domain <= n_steps
else np.linspace(dist.low, dist.high, n_steps)
)
params = np.asarray([t.params[param_name] for t in trials])
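For a non-log integer domain the code keeps the exact grid when the domain fits into `n_steps` and subsamples with `np.linspace` otherwise. The discretization in isolation, with standalone values and the `n_steps_in_domain` name used above:

```python
import numpy as np

low, high, step, n_steps = 1, 200, 1, 50
n_steps_in_domain = (high + 1 - low) // step  # 200 grid points in the raw domain.
grids = (
    np.arange(low, high + 1)[::step]
    if n_steps_in_domain <= n_steps
    else np.linspace(low, high, n_steps)  # Too many points: subsample to 50.
)
```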
@@ -147,10 +134,7 @@ def _count_numerical_param_in_grid(
n_steps: int,
) -> np.ndarray:
grids, grid_indices_of_trials = _get_grids_and_grid_indices_of_trials(
-param_name,
-dist,
-trials,
-n_steps,
+param_name, dist, trials, n_steps
)
unique_vals, counts_in_unique = np.unique(grid_indices_of_trials, return_counts=True)
counts = np.zeros(grids.size, dtype=np.int32)
Expand All @@ -159,15 +143,10 @@ def _count_numerical_param_in_grid(


def _count_categorical_param_in_grid(
-param_name: str,
-dist: CategoricalDistribution,
-trials: list[FrozenTrial],
+param_name: str, dist: CategoricalDistribution, trials: list[FrozenTrial]
) -> np.ndarray:
-choice_to_index = {c: i for i, c in enumerate(dist.choices)}
-unique_vals, counts_in_unique = np.unique(
-[choice_to_index[t.params[param_name]] for t in trials],
-return_counts=True,
-)
+cat_indices = [int(dist.to_internal_repr(t.params[param_name])) for t in trials]
+unique_vals, counts_in_unique = np.unique(cat_indices, return_counts=True)
counts = np.zeros(len(dist.choices), dtype=np.int32)
counts[unique_vals] += counts_in_unique
return counts
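`_count_categorical_param_in_grid` now maps choices through `dist.to_internal_repr` instead of building a local lookup table. The counting idiom itself, with plain indices standing in for the mapped trial parameters:

```python
import numpy as np

cat_indices = [0, 2, 2, 1, 2]  # Internal representations of 5 observed choices.
unique_vals, counts_in_unique = np.unique(cat_indices, return_counts=True)
counts = np.zeros(3, dtype=np.int32)  # One bin per categorical choice.
counts[unique_vals] += counts_in_unique
print(counts)  # [1 1 3]
```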
@@ -180,8 +159,6 @@ def _build_parzen_estimator(
n_steps: int,
consider_prior: bool,
prior_weight: float,
-categorical_distance_func: Callable[[CategoricalChoiceType, CategoricalChoiceType], float]
-| None,
) -> _ScottParzenEstimator:
rounded_dist: IntDistribution | CategoricalDistribution
if isinstance(dist, (IntDistribution, FloatDistribution)):
@@ -194,10 +171,5 @@
raise ValueError(f"Got an unknown dist with the type {type(dist)}.")

return _ScottParzenEstimator(
-param_name=param_name,
-dist=rounded_dist,
-counts=counts.astype(np.float64),
-consider_prior=consider_prior,
-prior_weight=prior_weight,
-categorical_distance_func=categorical_distance_func,
+param_name, rounded_dist, counts.astype(np.float64), consider_prior, prior_weight
)
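After this commit `_build_parzen_estimator` takes exactly the five positional arguments shown above. A sketch of calling it directly; the import path is private at this commit and may change, and the toy study only exists to produce integer trials:

```python
import numpy as np
import optuna
from optuna.distributions import IntDistribution
from optuna.importance._ped_anova._scott_parzen_estimator import _build_parzen_estimator

study = optuna.create_study()
study.optimize(lambda t: float(t.suggest_int("x", 0, 9)), n_trials=30)
trials = study.get_trials(deepcopy=False)

pe = _build_parzen_estimator("x", IntDistribution(low=0, high=9), trials, 50, True, 1.0)
density = pe.pdf(np.arange(pe.n_steps))  # Discretized density over the grid.
```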
