
[Feature] Support EarlyStoppingHook #739

Merged
merged 31 commits into main from feature/earlystop on Mar 6, 2023

Commits (31)
753bf2c  [Feature] EarlyStoppingHook (nijkah, Nov 17, 2022)
833d5fc  delete redundant line (nijkah, Nov 17, 2022)
8e2bd76  Assert stop_training and rename tests (nijkah, Nov 18, 2022)
3e07e10  Fix UT (nijkah, Nov 18, 2022)
b20112b  rename `metric` to `monitor` (nijkah, Nov 18, 2022)
67dc666  Fix UT (nijkah, Nov 18, 2022)
1808b79  Fix UT (nijkah, Nov 19, 2022)
1107993  edit docstring on patience (nijkah, Nov 21, 2022)
3e35b41  Draft for new code (nijkah, Nov 27, 2022)
de0ae9f  fix ut (nijkah, Nov 27, 2022)
c991f35  add test case (nijkah, Nov 27, 2022)
e4a20be  add test case (nijkah, Nov 27, 2022)
7caec6f  fix ut (nijkah, Nov 27, 2022)
4251d31  Apply suggestions from code review (nijkah, Nov 28, 2022)
ea87f73  Apply suggestions from code review (nijkah, Nov 30, 2022)
79869c2  Append hook (nijkah, Nov 30, 2022)
52edf9d  Append hook (nijkah, Nov 30, 2022)
caa7187  Apply suggestions (nijkah, Nov 30, 2022)
e1e812c  Merge branch 'feature/earlystop' of https://github.com/nijkah/mmengin… (nijkah, Nov 30, 2022)
fa03d57  Merge remote-tracking branch 'origin/main' into feature/earlystop (nijkah, Feb 2, 2023)
bbd482c  Update suggestions (nijkah, Feb 2, 2023)
17a824c  Merge branch 'main' into feature/earlystop (zhouzaida, Feb 22, 2023)
34a4f41  Update mmengine/hooks/__init__.py (zhouzaida, Feb 22, 2023)
b84bbce  fix min_delta (zhouzaida, Feb 22, 2023)
6482ce6  Apply suggestions from code review (zhouzaida, Feb 23, 2023)
bccd43c  lint (nijkah, Feb 23, 2023)
bb6f31a  Apply suggestions from code review (nijkah, Feb 23, 2023)
4b40655  delete save_last (nijkah, Feb 28, 2023)
1ade543  infer rule more robust (zhouzaida, Mar 5, 2023)
946a8a4  refine unit test (HAOCHENYE, Mar 6, 2023)
046d7be  Update mmengine/hooks/early_stopping_hook.py (zhouzaida, Mar 6, 2023)
1 change: 1 addition & 0 deletions docs/en/api/hooks.rst
@@ -25,3 +25,4 @@ mmengine.hooks
 ProfilerHook
 NPUProfilerHook
 PrepareTTAHook
+EarlyStoppingHook
1 change: 1 addition & 0 deletions docs/zh_cn/api/hooks.rst
@@ -25,3 +25,4 @@ mmengine.hooks
 ProfilerHook
 NPUProfilerHook
 PrepareTTAHook
+EarlyStoppingHook
3 changes: 2 additions & 1 deletion mmengine/hooks/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .checkpoint_hook import CheckpointHook
+from .early_stopping_hook import EarlyStoppingHook
 from .ema_hook import EMAHook
 from .empty_cache_hook import EmptyCacheHook
 from .hook import Hook
@@ -17,5 +18,5 @@
     'Hook', 'IterTimerHook', 'DistSamplerSeedHook', 'ParamSchedulerHook',
     'SyncBuffersHook', 'EmptyCacheHook', 'CheckpointHook', 'LoggerHook',
     'NaiveVisualizationHook', 'EMAHook', 'RuntimeInfoHook', 'ProfilerHook',
-    'NPUProfilerHook', 'PrepareTTAHook'
+    'PrepareTTAHook', 'NPUProfilerHook', 'EarlyStoppingHook'
 ]
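With the export in place, the hook can also be built from the HOOKS registry via a config dict. A minimal sketch (it assumes the registration decorator shown in the new file below):

```python
import mmengine.hooks  # importing the package registers the hook classes
from mmengine.registry import HOOKS

# 'loss' is one of the default 'less' keys, so the comparison rule is inferred.
hook = HOOKS.build(dict(type='EarlyStoppingHook', monitor='loss'))
```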
141 changes: 141 additions & 0 deletions mmengine/hooks/early_stopping_hook.py
@@ -0,0 +1,141 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from math import inf, isfinite
from typing import Optional, Tuple, Union

from mmengine.registry import HOOKS
from .hook import Hook

DATA_BATCH = Optional[Union[dict, tuple, list]]


@HOOKS.register_module()
class EarlyStoppingHook(Hook):
Member: Consider recording the state of early stopping to support resuming training.

Collaborator: MessageHub saves all history metrics during training; maybe we could utilize it to resume training.

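A rough sketch of the state that would need recording for resume support (illustrative only; `state_dict`/`load_state_dict` are hypothetical method names here, not part of the current Hook interface):

```python
# Hypothetical additions to EarlyStoppingHook; method names are illustrative.
def state_dict(self) -> dict:
    # wait_count and best_score are the only mutable state that
    # future stop decisions depend on.
    return dict(wait_count=self.wait_count, best_score=self.best_score)

def load_state_dict(self, state: dict) -> None:
    self.wait_count = state['wait_count']
    self.best_score = state['best_score']
```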
"""Early stop the training when the monitored metric reached a plateau.

Args:
monitor (str): The monitored metric key to decide early stopping.
rule (str, optional): Comparison rule. Options are 'greater',
'less'. Defaults to None.
min_delta (float, optional): Minimum difference to continue the
training. Defaults to 0.01.
strict (bool, optional): Whether to crash the training when `monitor`
is not found in the `metrics`. Defaults to False.
check_finite: Whether to stop training when the monitor becomes NaN or
infinite. Defaults to True.
patience (int, optional): The times of validation with no improvement
after which training will be stopped. Defaults to 5.
stopping_threshold (float, optional): Stop training immediately once
the monitored quantity reaches this threshold. Defaults to None.

Note:
`New in version 0.6.0.`
zhouzaida marked this conversation as resolved.
Show resolved Hide resolved
"""
    priority = 'LOWEST'

    rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y}
    _default_greater_keys = [
        'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU',
        'mAcc', 'aAcc'
    ]
    _default_less_keys = ['loss']

    def __init__(
        self,
        monitor: str,
        rule: Optional[str] = None,
        min_delta: float = 0.1,
        strict: bool = False,
        check_finite: bool = True,
        patience: int = 5,
        stopping_threshold: Optional[float] = None,
    ):

        self.monitor = monitor
        # Only infer the rule when the user did not specify one explicitly.
        if rule is None:
            if monitor in self._default_greater_keys:
                rule = 'greater'
            elif monitor in self._default_less_keys:
                rule = 'less'
        assert rule in ['greater', 'less'], \
            '`rule` should be either "greater" or "less".'
        self.rule = rule
        self.min_delta = min_delta if rule == 'greater' else -1 * min_delta
        self.strict = strict
        self.check_finite = check_finite
        self.patience = patience
        self.stopping_threshold = stopping_threshold

        self.wait_count = 0
        self.best_score = -inf if rule == 'greater' else inf

    def _check_stop_condition(self, current_score: float) -> Tuple[bool, str]:
        compare = self.rule_map[self.rule]
        stop_training = False
        reason_message = ''

        if self.check_finite and not isfinite(current_score):
            stop_training = True
            reason_message = (f'Monitored metric {self.monitor} = '
                              f'{current_score} is not finite. '
                              f'Previous best value was '
                              f'{self.best_score:.3f}.')
        elif self.stopping_threshold is not None and compare(
                current_score, self.stopping_threshold):
            stop_training = True
            self.best_score = current_score
            reason_message = (f'Stopping threshold reached: '
                              f'`{self.monitor}` = {current_score} is '
                              f'{self.rule} than {self.stopping_threshold}.')
        elif compare(self.best_score + self.min_delta, current_score):
            # No sufficient improvement over the best score so far.
            self.wait_count += 1

            if self.wait_count >= self.patience:
                reason_message = (f'The monitored metric did not improve '
                                  f'in the last {self.wait_count} records. '
                                  f'Best score: {self.best_score:.3f}.')
                stop_training = True
        else:
            self.best_score = current_score
            self.wait_count = 0

        return stop_training, reason_message

    def before_run(self, runner) -> None:
        """Check `stop_training` variable in `runner.train_loop`.

        Args:
            runner (Runner): The runner of the training process.
        """
        assert hasattr(runner.train_loop, 'stop_training'), \
            '`train_loop` should contain `stop_training` variable.'

    def after_val_epoch(self, runner, metrics):
        """Decide whether to stop the training process.

        Args:
            runner (Runner): The runner of the training process.
            metrics (dict): Evaluation results of all metrics.
        """
        if self.monitor not in metrics:
            if self.strict:
                raise RuntimeError(
                    'Early stopping conditioned on metric '
                    f'`{self.monitor}` is not available. Please check '
                    f'available metrics {metrics}, or set `strict=False` in '
                    '`EarlyStoppingHook`.')
            warnings.warn(
                'Skip early stopping process since the evaluation '
                f'results ({metrics.keys()}) do not include `monitor` '
                f'({self.monitor}).')
            return

        current_score = metrics[self.monitor]

        stop_training, message = self._check_stop_condition(current_score)
        if stop_training:
            runner.train_loop.stop_training = True
            runner.logger.info(message)
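For context, a minimal configuration sketch showing how the hook could be enabled; the monitor key 'accuracy' is a placeholder and must match a key your evaluator actually reports:

```python
# Sketch of enabling the hook through a config's custom_hooks list.
# 'accuracy' is a hypothetical metric key.
custom_hooks = [
    dict(
        type='EarlyStoppingHook',
        monitor='accuracy',
        rule='greater',
        min_delta=0.01,
        patience=5,
    ),
]
```

Because the hook declares `priority = 'LOWEST'`, its `after_val_epoch` runs after the other hooks that consume the same validation metrics.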
10 changes: 8 additions & 2 deletions mmengine/runner/loops.py
@@ -49,6 +49,9 @@ def __init__(
         self._iter = 0
         self.val_begin = val_begin
         self.val_interval = val_interval
+        # This attribute will be updated by `EarlyStoppingHook`
+        # when it is enabled.
+        self.stop_training = False
         if hasattr(self.dataloader.dataset, 'metainfo'):
             self.runner.visualizer.dataset_meta = \
                 self.dataloader.dataset.metainfo
@@ -86,7 +89,7 @@ def run(self) -> torch.nn.Module:
         """Launch training."""
         self.runner.call_hook('before_train')

-        while self._epoch < self._max_epochs:
+        while self._epoch < self._max_epochs and not self.stop_training:
             self.run_epoch()

             self._decide_current_val_interval()
@@ -216,6 +219,9 @@ def __init__(
         self._iter = 0
         self.val_begin = val_begin
         self.val_interval = val_interval
+        # This attribute will be updated by `EarlyStoppingHook`
+        # when it is enabled.
+        self.stop_training = False
         if hasattr(self.dataloader.dataset, 'metainfo'):
             self.runner.visualizer.dataset_meta = \
                 self.dataloader.dataset.metainfo
@@ -257,7 +263,7 @@ def run(self) -> None:
         # In iteration-based training loop, we treat the whole training process
         # as a big epoch and execute the corresponding hook.
         self.runner.call_hook('before_train_epoch')
-        while self._iter < self._max_iters:
+        while self._iter < self._max_iters and not self.stop_training:
             self.runner.model.train()

             data_batch = next(self.dataloader_iterator)
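To make the patience semantics concrete, here is a small standalone walk-through (a sketch only; it drives the private `_check_stop_condition` helper directly instead of going through a Runner):

```python
from mmengine.hooks import EarlyStoppingHook

# monitor='loss' infers rule='less'; min_delta=0.0 keeps the arithmetic plain.
hook = EarlyStoppingHook(monitor='loss', min_delta=0.0, patience=2)

for score in [1.00, 0.90, 0.91, 0.92]:  # improves twice, then stalls twice
    stop, reason = hook._check_stop_condition(score)
    print(f'{score:.2f} -> stop={stop}')
# 1.00 and 0.90 each update best_score and reset wait_count; 0.91 and 0.92
# both fail to beat best_score=0.90, so the second miss reaches patience=2
# and stop becomes True.
```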