Add option to configure for multi objective optimization #531

Draft: wants to merge 8 commits into master
64 changes: 50 additions & 14 deletions amlb/benchmark.py
@@ -24,9 +24,10 @@
from .datautils import read_csv
from .resources import get as rget, config as rconfig, output_dirs as routput_dirs
from .results import ErrorResult, Scoreboard, TaskResult
from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, json_dump, lazy_property, profile, repr_def, \
run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, system_memory_mb, system_volume_mb, touch

from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, \
json_dump, lazy_property, profile, repr_def, \
run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, \
system_memory_mb, system_volume_mb, touch, Namespace

log = logging.getLogger(__name__)

@@ -371,9 +372,33 @@ def _is_task_enabled(task_def):

class TaskConfig:

def __init__(self, name, fold, metrics, seed,
def __init__(self, *, name, fold, seed,
max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
input_dir, output_dir):
input_dir, output_dir,
metrics: Union[list[str], str, None] = None,
optimization_metrics: Union[list[str], str, None] = None,
evaluation_metrics: Union[list[str], str, None] = None,
):

if metrics:
log.warning(
"WARNING: The `metric` field of the task definition is deprecated"
" and will not work in the future. Please specify the metric(s) to "
"optimize for with `optimization_metrics` and any additional metric(s) "
"used only for evaluation in `evaluation_metrics`."
)
if optimization_metrics:
raise ValueError(
"Detected both `metric` and `optimization_metrics` for task "
f"'{name}'. Aborting because desired setup is unclear."
"Please only use `optimization_metrics`."
)
optimization_metrics = as_list(metrics)[:1]
evaluation_metrics = as_list(metrics)[1:]

self.optimization_metrics = optimization_metrics or []
self._evaluation_metrics = evaluation_metrics or []

self.framework = None
self.framework_params = None
self.framework_version = None
@@ -391,16 +416,25 @@ def __init__(self, name, fold, metrics, seed,
self.output_predictions_file = os.path.join(output_dir, "predictions.csv")
self.ext = ns() # used if frameworks require extra config points

@property
def evaluation_metrics(self) -> list[str]:
return list(set(self.optimization_metrics) | set(self._evaluation_metrics))

def load_default_metrics(self, *, dataset_type: str):
""" Sets `optimization/evaluation_metrics` based on defaults from config.yaml"""
self.optimization_metrics = as_list(rconfig().benchmarks.optimization_metrics[dataset_type])
self._evaluation_metrics = as_list(rconfig().benchmarks.evaluation_metrics[dataset_type])

def __setattr__(self, name, value):
if name == 'metrics':
self.metric = value[0] if isinstance(value, list) else value
elif name == 'max_runtime_seconds':
self.job_timeout_seconds = min(value * 2,
value + rconfig().benchmarks.overhead_time_seconds)
if name == 'max_runtime_seconds':
self.job_timeout_seconds = min(
value * 2,
value + rconfig().benchmarks.overhead_time_seconds
)
super().__setattr__(name, value)

def __json__(self):
return self.__dict__
return self.__dict__ | {"evaluation_metrics": self.evaluation_metrics}

def __repr__(self):
return repr_def(self)
@@ -458,10 +492,13 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
self.benchmark = benchmark
self._task_def = task_def
self.fold = fold

self.task_config = TaskConfig(
name=task_def.name,
fold=fold,
metrics=task_def.metric,
optimization_metrics=Namespace.get(task_def, "optimization_metrics"),
evaluation_metrics=Namespace.get(task_def, "evaluation_metrics"),
seed=rget().seed(fold),
max_runtime_seconds=task_def.max_runtime_seconds,
cores=task_def.cores,
@@ -542,9 +579,8 @@ def run(self):
task_config.output_predictions_file = results._predictions_file
task_config.output_metadata_file = results._metadata_file
touch(os.path.dirname(task_config.output_predictions_file), as_dir=True)
if task_config.metrics is None:
task_config.metrics = as_list(rconfig().benchmarks.metrics[self._dataset.type.name])
task_config.metric = task_config.metrics[0]
if not task_config.optimization_metrics:
task_config.load_default_metrics(dataset_type=self._dataset.type.name)

result = meta_result = None
try:
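Note: the following is an illustrative, self-contained sketch and not part of the diff. The hypothetical helper `resolve_metrics` mirrors the rules introduced in `TaskConfig.__init__` and the `evaluation_metrics` property above: the deprecated `metric` field keeps working with a warning, its first entry becomes the optimization target and the rest become evaluation-only, and `evaluation_metrics` always includes the optimization metrics.

    from typing import Optional

    def resolve_metrics(metrics: Optional[list] = None,
                        optimization_metrics: Optional[list] = None,
                        evaluation_metrics: Optional[list] = None):
        # Deprecated path: the old `metric` field of a task definition.
        if metrics:
            if optimization_metrics:
                raise ValueError("Use only `optimization_metrics`.")
            optimization_metrics = metrics[:1]   # first metric is optimized
            evaluation_metrics = metrics[1:]     # the rest are reported only
        optimization_metrics = optimization_metrics or []
        evaluation_metrics = evaluation_metrics or []
        # Evaluation always includes the optimization metrics (set union, as in the property above).
        return optimization_metrics, sorted(set(optimization_metrics) | set(evaluation_metrics))

    print(resolve_metrics(metrics=["auc", "logloss"]))
    # -> (['auc'], ['auc', 'logloss'])
    print(resolve_metrics(optimization_metrics=["auc"], evaluation_metrics=["logloss"]))
    # -> (['auc'], ['auc', 'logloss'])
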
63 changes: 27 additions & 36 deletions amlb/results.py
@@ -2,15 +2,15 @@
**results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``),
as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``).
"""
from functools import partial
import inspect
import collections
import io
import logging
import math
import os
import re
import statistics
from typing import Union
from typing import Union, Callable

import numpy as np
from numpy import nan, sort
@@ -130,24 +130,20 @@ def __init__(self, scores=None, framework_name=None, benchmark_name=None, task_n

@cached
def as_data_frame(self):
# index = ['task', 'framework', 'fold']
index = []
df = (self.scores if is_data_frame(self.scores)
else to_data_frame([dict(sc) for sc in self.scores]))
if df.empty:
# avoid dtype conversions during reindexing on empty frame
return df
fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version',
fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'optimization_metrics', 'mode', 'version',
'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info']
fixed_cols = [col for col in fixed_cols if col not in index]
metrics_cols = [col for col in df.columns
if (col in dir(ClassificationResult) or col in dir(RegressionResult))
and not col.startswith('_')]
metrics_cols = [
col for col in df.columns
if col in ClassificationResult.metrics() + RegressionResult.metrics()
]
metrics_cols.sort()
dynamic_cols = [col for col in df.columns
if col not in index
and col not in fixed_cols
and col not in metrics_cols]
if col not in fixed_cols + metrics_cols]
dynamic_cols.sort()
df = df.reindex(columns=[]+fixed_cols+metrics_cols+dynamic_cols)
log.debug("Scores columns: %s.", df.columns)
@@ -174,9 +170,12 @@ def as_printable_data_frame(self, verbosity=3):
for col in high_precision_float_cols:
df[col] = df[col].map("{:.6g}".format).astype(float)

unique_metrics = (set(metrics.split(",")) for metrics in df['optimization_metrics'].unique())
optimized_metrics = set.union(*unique_metrics)

cols = ([] if verbosity == 0
else ['task', 'fold', 'framework', 'constraint', 'result', 'metric', 'info'] if verbosity == 1
else ['id', 'task', 'fold', 'framework', 'constraint', 'result', 'metric',
else ['task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics', 'info'] if verbosity == 1
else ['id', 'task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics',
'duration', 'seed', 'info'] if verbosity == 2
else slice(None))
return df.loc[:, cols]
@@ -426,38 +425,22 @@ def compute_score(self, result=None, meta_result=None):
seed=metadata.seed,
app_version=rget().app_version,
utc=datetime_iso(),
metric=metadata.metric,
duration=nan
optimization_metrics=metadata.optimization_metrics,
duration=nan,
)
required_meta_res = ['training_duration', 'predict_duration', 'models_count']
for m in required_meta_res:
entry[m] = meta_result[m] if m in meta_result else nan
result = self.get_result() if result is None else result

scoring_errors = []

def do_score(m):
score = result.evaluate(m)
for metric_ in metadata.evaluation_metrics:
score = result.evaluate(metric_)
if 'message' in score:
scoring_errors.append(score.message)
return score

def set_score(score):
entry.metric = score.metric
entry.result = score.value
if score.higher_is_better is False: # if unknown metric, and higher_is_better is None, then no change
entry.metric = f"neg_{entry.metric}"
entry.result = - entry.result

for metric in metadata.metrics or []:
sc = do_score(metric)
entry[metric] = sc.value
if metric == entry.metric:
set_score(sc)

if 'result' not in entry:
set_score(do_score(entry.metric))
entry[metric_] = score.value

entry.optimization_metrics = ','.join(entry.optimization_metrics)
entry.info = result.info
if scoring_errors:
entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors]))
@@ -501,6 +484,14 @@ def evaluate(self, metric):
eval_res += Namespace(value=nan, higher_is_better=None, message=f"Unsupported metric `{metric}` for {pb_type} problems")
return eval_res

@classmethod
def metrics(cls) -> list[str]:
def has_metric_metadata(fn: Callable) -> bool:
return get_metadata(fn, "higher_is_better") is not None
return [
name for name, _ in inspect.getmembers(cls, predicate=has_metric_metadata)
]


class NoResult(Result):

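For orientation, an approximate sketch of a per-fold score entry produced by `compute_score` after this change (field values are invented): the old `metric`/`result` pair is replaced by one column per evaluation metric plus a comma-joined `optimization_metrics` string.

    # Illustrative only; values are made up, and some fixed columns are omitted.
    entry = {
        "task": "iris",
        "framework": "ExampleFramework",   # hypothetical framework name
        "constraint": "1h8c",
        "fold": 0,
        "optimization_metrics": "auc",     # "auc,logloss" for a multi-objective run
        "auc": 0.987,                      # one column per evaluation metric
        "logloss": 0.11,
        "duration": 12.3,
        "seed": 42,
        "info": None,
    }
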
4 changes: 2 additions & 2 deletions examples/custom/extensions/Stacking/exec.py
@@ -33,8 +33,8 @@ def run(dataset, config):
estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'linear', 'svc', 'final']}

log.info("Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
log.warning("We completely ignore the requirement to stay within the time limit.")
log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))
log.warning("We ignore the requirement to stay within the time limit.")
log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.")


if is_classification:
4 changes: 2 additions & 2 deletions frameworks/AutoGluon/exec.py
@@ -38,10 +38,10 @@ def run(dataset, config):
rmse=metrics.root_mean_squared_error,
)

perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
perf_metric = metrics_mapping.get(config.optimization_metrics[0])
if perf_metric is None:
# TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
log.warning("Performance metric %s not supported.", config.metric)
log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.")

is_classification = config.type == 'classification'
training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
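This pattern repeats across the remaining framework integrations below: a single-objective framework consumes only the first entry of `config.optimization_metrics`, mapped through a per-framework dictionary, and falls back to a warning or an error when the metric is unknown. A condensed, self-contained illustration of the pattern (the helper name and mapping values here are invented, not taken from any specific integration):

    import logging

    log = logging.getLogger(__name__)

    def pick_framework_metric(optimization_metrics: list, metrics_mapping: dict):
        # Single-objective frameworks only honour the first optimization metric.
        requested = optimization_metrics[0] if optimization_metrics else None
        metric = metrics_mapping.get(requested)
        if metric is None:
            log.warning("Performance metric %s not supported.", requested)
        return metric

    print(pick_framework_metric(["auc", "logloss"], {"auc": "roc_auc", "rmse": "rmse"}))
    # -> roc_auc  (the second objective is ignored)
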
4 changes: 2 additions & 2 deletions frameworks/AutoGluon/exec_ts.py
@@ -142,9 +142,9 @@ def get_eval_metric(config):
rmse="RMSE",
)

eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
eval_metric = metrics_mapping.get(config.optimization_metrics[0])
if eval_metric is None:
log.warning("Performance metric %s not supported.", config.metric)
log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.")
return eval_metric


5 changes: 3 additions & 2 deletions frameworks/AutoWEKA/exec.py
@@ -24,9 +24,10 @@ def run(dataset: Dataset, config: TaskConfig):
auc='areaUnderROC',
logloss='kBInformation'
)
metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
metric = metrics_mapping.get(config.optimization_metrics[0])
if metric is None:
raise ValueError("Performance metric {} not supported.".format(config.metric))
msg = f"Performance metric {config.optimization_metrics[0]} not supported."
raise ValueError(msg)

train_file = dataset.train.path
test_file = dataset.test.path
5 changes: 3 additions & 2 deletions frameworks/GAMA/exec.py
@@ -43,9 +43,10 @@ def run(dataset, config):
r2='r2',
rmse='neg_mean_squared_error',
)
scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
scoring_metric = metrics_mapping.get(config.optimization_metrics[0])
if scoring_metric is None:
raise ValueError("Performance metric {} not supported.".format(config.metric))
msg = f"Performance metric '{config.optimization_metrics[0]}' not supported."
raise ValueError(msg)

training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
n_jobs = config.framework_params.get('_n_jobs', config.cores) # useful to disable multicore, regardless of the dataset config
4 changes: 2 additions & 2 deletions frameworks/H2OAutoML/exec.py
@@ -43,10 +43,10 @@ def run(dataset, config):
rmse='rmse',
rmsle='rmsle'
)
sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
sort_metric = metrics_mapping.get(config.optimization_metrics[0])
if sort_metric is None:
# TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)
log.warning(f"Performance metric {config.optimization_metrics[0]} not supported, defaulting to AUTO.")

try:
training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
6 changes: 3 additions & 3 deletions frameworks/MLPlan/exec.py
@@ -30,10 +30,10 @@ def run(dataset, config):
rmsle='ROOT_MEAN_SQUARED_LOGARITHM_ERROR',
mae='MEAN_ABSOLUTE_ERROR'
)

metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
metric = metrics_mapping.get(config.optimization_metrics[0])
if metric is None:
raise ValueError('Performance metric {} is not supported.'.format(config.metric))
msg = f'Performance metric {config.optimization_metrics[0]} is not supported.'
raise ValueError(msg)

train_file = dataset.train.path
test_file = dataset.test.path
2 changes: 1 addition & 1 deletion frameworks/RandomForest/exec.py
@@ -47,7 +47,7 @@ def run(dataset, config):
memory_margin = config.framework_params.get('_memory_margin', 0.9)

log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))
log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.")

estimator = RandomForestClassifier if is_classification else RandomForestRegressor
rf = estimator(n_jobs=n_jobs,
5 changes: 3 additions & 2 deletions frameworks/TPOT/exec.py
@@ -36,9 +36,10 @@ def run(dataset, config):
r2='r2',
rmse='neg_mean_squared_error', # TPOT can score on mse, as app computes rmse independently on predictions
)
scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
scoring_metric = metrics_mapping.get(config.optimization_metrics[0])
if scoring_metric is None:
raise ValueError("Performance metric {} not supported.".format(config.metric))
msg = f"Performance metric {config.optimization_metrics[0]} not supported."
raise ValueError(msg)

X_train = dataset.train.X
y_train = dataset.train.y
6 changes: 5 additions & 1 deletion frameworks/TunedRandomForest/exec.py
@@ -76,7 +76,11 @@ def run(dataset, config):
mse='neg_mean_squared_error',
r2='r2',
rmse='neg_root_mean_squared_error',
)[config.metric]
).get(config.optimization_metrics[0])

if not metric:
msg = f"TunedRandomForest doesn't support {config.optimization_metrics[0]}"
raise ValueError(msg)

n_features = X_train.shape[1]
default_value = max(1, int(math.sqrt(n_features)))
4 changes: 2 additions & 2 deletions frameworks/autosklearn/exec.py
@@ -45,10 +45,10 @@ def run(dataset, config):
rmse=metrics.mean_squared_error if askl_version < version.parse("0.10") else metrics.root_mean_squared_error,
r2=metrics.r2
)
perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
perf_metric = metrics_mapping.get(config.optimization_metrics[0])
if perf_metric is None:
# TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
log.warning("Performance metric %s not supported.", config.metric)
log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.")

# Set resources based on datasize
log.info(
10 changes: 6 additions & 4 deletions frameworks/flaml/exec.py
@@ -32,10 +32,12 @@ def run(dataset, config):
rmse='rmse',
r2='r2',
)
perf_metric = metrics_mapping[
config.metric] if config.metric in metrics_mapping else 'auto'
if perf_metric is None:
log.warning("Performance metric %s not supported.", config.metric)
perf_metric = metrics_mapping.get(config.optimization_metrics[0], 'auto')
if perf_metric == 'auto' and config.optimization_metrics[0] != 'auto':
log.warning(
f"Performance metric '{config.optimization_metrics[0]}' not supported, "
f"using metric='auto' instead.",
)

training_params = {k: v for k, v in config.framework_params.items()
if not k.startswith('_')}
6 changes: 5 additions & 1 deletion frameworks/hyperoptsklearn/exec.py
@@ -36,7 +36,11 @@ def run(dataset, config):
r2=(default, False), # lambda y, pred: 1.0 - r2_score(y, pred)
rmse=(mean_squared_error, False),
)
loss_fn, continuous_loss_fn = metrics_to_loss_mapping[config.metric] if config.metric in metrics_to_loss_mapping else (None, False)

loss_fn, continuous_loss_fn = metrics_to_loss_mapping.get(
config.optimization_metrics[0],
(None, False)
)
if loss_fn is None:
log.warning("Performance metric %s not supported: defaulting to %s.",
config.metric, 'accuracy' if is_classification else 'r2')