Automatic model checkpointing for pytorch-lightning training #10935

Merged: 54 commits, Feb 14, 2024

Changes from 7 commits

Commits (54)
7d1f3a6
init
WeichenXu123 Jan 29, 2024
7b8aa8a
update
WeichenXu123 Jan 29, 2024
fbd78d0
update
WeichenXu123 Jan 29, 2024
fc4b78c
update
WeichenXu123 Jan 30, 2024
510dce5
update
WeichenXu123 Jan 30, 2024
987c0b8
merge master
WeichenXu123 Jan 30, 2024
1176113
update
WeichenXu123 Jan 30, 2024
e5b3916
update
WeichenXu123 Feb 1, 2024
2d477c5
update test
WeichenXu123 Feb 1, 2024
9f19dc1
update
WeichenXu123 Feb 1, 2024
0586388
update
WeichenXu123 Feb 1, 2024
13ef85f
update
WeichenXu123 Feb 1, 2024
029b978
update
WeichenXu123 Feb 1, 2024
d749f75
fix
WeichenXu123 Feb 2, 2024
5dd187e
comment
WeichenXu123 Feb 2, 2024
e446073
update
WeichenXu123 Feb 2, 2024
62aa928
skipif test
WeichenXu123 Feb 2, 2024
81e30d6
Merge branch 'master' into pl-model-checkpoint
WeichenXu123 Feb 6, 2024
739a350
update
WeichenXu123 Feb 6, 2024
1c63358
update
WeichenXu123 Feb 6, 2024
4e414b3
update
WeichenXu123 Feb 6, 2024
5fd4d08
update doc
WeichenXu123 Feb 6, 2024
557091d
update
WeichenXu123 Feb 6, 2024
0bc2f9a
Merge branch 'master' into pl-model-checkpoint
WeichenXu123 Feb 7, 2024
d8c4ccc
format
WeichenXu123 Feb 7, 2024
d5b77bb
address comments
WeichenXu123 Feb 7, 2024
113be24
split test
WeichenXu123 Feb 7, 2024
d8aa855
fix doc
WeichenXu123 Feb 7, 2024
df07b7b
fix doc
WeichenXu123 Feb 7, 2024
21af0aa
format
WeichenXu123 Feb 7, 2024
250840d
validation in constructor
WeichenXu123 Feb 7, 2024
da14b5b
validate in constructor
WeichenXu123 Feb 7, 2024
9f0e0c4
Merge remote-tracking branch 'base/master' into pl-model-checkpoint
mlflow-automation Feb 7, 2024
fa7a6e1
Autoformat: https://github.com/mlflow/mlflow/actions/runs/7810748460
mlflow-automation Feb 7, 2024
c12d334
update
WeichenXu123 Feb 7, 2024
3c8f186
update tag key
WeichenXu123 Feb 7, 2024
7e23cfc
address comments
WeichenXu123 Feb 7, 2024
b9653e6
address comment
WeichenXu123 Feb 7, 2024
ce752ac
update doc
WeichenXu123 Feb 7, 2024
d562550
remove mock from tests
WeichenXu123 Feb 8, 2024
3305894
nit
WeichenXu123 Feb 8, 2024
54f14e5
format
WeichenXu123 Feb 8, 2024
91ea6a0
Merge remote-tracking branch 'base/master' into pl-model-checkpoint
mlflow-automation Feb 9, 2024
9d1da54
Autoformat: https://github.com/mlflow/mlflow/actions/runs/7838319186
mlflow-automation Feb 9, 2024
d1dfdb6
update
WeichenXu123 Feb 9, 2024
9f27ca6
improve test
WeichenXu123 Feb 9, 2024
0f33c52
update tests
WeichenXu123 Feb 14, 2024
4891c91
address comments
WeichenXu123 Feb 14, 2024
afc7846
format
WeichenXu123 Feb 14, 2024
e0bfbfb
Merge remote-tracking branch 'base/master' into pl-model-checkpoint
mlflow-automation Feb 14, 2024
01c5b69
Autoformat: https://github.com/mlflow/mlflow/actions/runs/7897092879
mlflow-automation Feb 14, 2024
4d5521b
format
Feb 14, 2024
27dc379
format
Feb 14, 2024
7a8527c
format
Feb 14, 2024
61 changes: 61 additions & 0 deletions mlflow/pytorch/__init__.py
@@ -24,6 +24,7 @@

import mlflow
from mlflow import pyfunc
from mlflow.client import MlflowClient
from mlflow.environment_variables import MLFLOW_DEFAULT_PREDICTION_DEVICE
from mlflow.exceptions import MlflowException
from mlflow.ml_package_versions import _ML_PACKAGE_VERSIONS
@@ -901,6 +902,13 @@ def autolog(
silent=False,
registered_model_name=None,
extra_tags=None,
model_checkpoint=True,
model_checkpoint_monitor="val_loss",
model_checkpoint_mode="min",
model_checkpoint_save_best_only=True,
model_checkpoint_save_weights_only=True,
model_checkpoint_every_n_epochs=None,
model_checkpoint_train_time_interval_S=600,

Collaborator:
This variable name is a bit strange. Can we combine model_checkpoint_every_n_epochs and model_checkpoint_train_time_interval_S into save_freq? I find that pretty clean and easy to use:
https://github.com/keras-team/keras/blob/fe2f54aa5bc42fb23a96449cf90434ab9bb6a2cd/keras/callbacks/model_checkpoint.py#L112


Collaborator Author (WeichenXu123):
One question:

The save_freq in the Keras checkpointing callback supports "checkpoint after every N batches (steps)". Do we want to support this, or do we only want to support epoch-based checkpointing?

My suggestion is that we only support epoch-based checkpointing. For batch-based checkpointing, the per-batch metric validation result is less accurate, and in pytorch-lightning per-batch validation is not available unless you log the metric with on_step=True, i.e. LightningModule.log(metric_name, value, on_epoch=True, on_step=True).


Collaborator Author (WeichenXu123):
Demo notebook is attached in the PR description.


Collaborator:
save_freq supports both per-epoch saving and per-N-steps saving. Both scenarios are commonly used based on my experience with the production team.

> in pytorch-lightning, per-batch validation is not available

I think we can just use the callback hook? It should have the training stats available: https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.Callback.html#lightning.pytorch.callbacks.Callback.on_after_backward


Collaborator Author (WeichenXu123), Jan 31, 2024:
Summary:

In ModelCheckpointCallback, if we use per-N-steps saving, the callback needs to check the "monitor" metric, and that metric must be updated on every step; we need something like log(metric_name, value, on_step=True) to enable a per-step updated metric.

We have 2 options:
(1) In the MLflow ModelCheckpointCallback, update the monitor metric in on_after_backward.
(2) Document this and tell the user to add log(metric_name, value, on_step=True) in their module class's training_step method for the metric used as "monitor".

Option (1) has one issue: in on_after_backward we can't get the data for metric computation, so we might have to choose option (2).

Option (2) is the way the current built-in pytorch-lightning ModelCheckpoint callback works.

@chenmoneygithub


Collaborator:
Yea if 2) is lightning's behavior, we can proceed with that one.
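
For readers following this thread, here is a minimal sketch of option (2): the user logs the monitored metric with on_step=True inside their own LightningModule. The module, layer sizes, and metric name below are hypothetical and not part of this PR.

```python
import torch
import pytorch_lightning as pl


class MyModule(pl.LightningModule):  # hypothetical example module
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.layer(x), y)
        # on_step=True exposes a per-step value (needed for step-based checkpointing);
        # on_epoch=True keeps the aggregated value for epoch-based checkpointing.
        self.log("train_loss", loss, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)
```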

): # pylint: disable=unused-argument
"""
Enables (or disables) and configures autologging from `PyTorch Lightning
Expand Down Expand Up @@ -955,6 +963,26 @@ def autolog(
new model version of the registered model with this name. The registered model is
created if it does not already exist.
extra_tags: A dictionary of extra tags to set on each managed run created by autologging.
:param model_checkpoint: Enable automatic model checkpointing. This feature only supports
pytorch-lightning >= 1.4.0.
:param model_checkpoint_monitor: In automatic model checkpointing, the name of the metric to
monitor when `model_checkpoint_save_best_only` is set to True.
:param model_checkpoint_save_best_only: If True, automatic model checkpointing only saves when
the model is considered the "best", and the latest best model according to the monitored
quantity is not overwritten.
:param model_checkpoint_mode: One of {"min", "max"}. In automatic model checkpointing,
if save_best_only=True, the decision to overwrite the current save file is made based on
either the maximization or the minimization of the monitored quantity.
:param model_checkpoint_save_weights_only: In automatic model checkpointing, if True, only
the model's weights are saved. Otherwise, the optimizer states, lr-scheduler states,
etc. are added to the checkpoint as well.
:param model_checkpoint_every_n_epochs: Number of epochs between checkpoints for automatic
model checkpointing.
:param model_checkpoint_train_time_interval_S: Automatic model checkpoints are monitored
at the specified time interval in seconds. For all practical purposes, this cannot be
smaller than the amount of time it takes to process a single training batch. This is
not guaranteed to execute at the exact time specified, but should be close.
This is mutually exclusive with `model_checkpoint_every_n_epochs`.

.. testcode:: python
:caption: Example
@@ -1099,3 +1127,36 @@ def print_auto_logged_info(r):
autolog.__doc__ = autolog.__doc__.replace("MIN_REQ_VERSION", str(MIN_REQ_VERSION)).replace(
"MAX_REQ_VERSION", str(MAX_REQ_VERSION)
)


def load_latest_checkpoint(model_class, run_id=None):

Collaborator Author (WeichenXu123):
Note:

Whether "save_weights_only" is true or false, loading back always requires the "model_class"; it does not need the model object.

This is different from the Keras-side API:
https://github.com/mlflow/mlflow/pull/10955/files#r1471374884


Collaborator:
makes sense to me!

"""
If you enable model_checkpoint in autologging, checkpointed models are logged as MLflow
artifacts during pytorch-lightning model training. Using this API, you can load the
latest checkpointed model.

:param model_class: The class of the training model.
:param run_id: The id of the run to which the model is logged. If not provided, the
current active run is used.
"""
from mlflow.pytorch._lightning_autolog import _LATEST_CHECKPOINT_ARTIFACT_TAG_KEY

client = MlflowClient()

if run_id is None:
run = mlflow.active_run()
if run is None:
raise MlflowException(
"There is no active run, please provide the 'run_id' for "
"'load_best_checkpoint' call."
WeichenXu123 marked this conversation as resolved.
Show resolved Hide resolved
)
run_id = run.info.run_id
else:
run = client.get_run(run_id)

latest_checkpoint_artifact = run.data.tags.get(_LATEST_CHECKPOINT_ARTIFACT_TAG_KEY)
if latest_checkpoint_artifact is None:
raise MlflowException("There is no logged checkpoint artifact in the current run.")

downloaded_checkpoint_filepath = client.download_artifacts(run_id, latest_checkpoint_artifact)
return model_class.load_from_checkpoint(downloaded_checkpoint_filepath)
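
As an editorial aid, here is a minimal end-to-end sketch of the two APIs added in this file: enabling checkpointing through autolog and loading the latest checkpoint back. MyLightningModule and MyDataModule are hypothetical user-defined classes (assumed to log a "val_loss" metric during validation), and the argument names follow this commit of the PR, so they may differ in the merged version.

```python
import mlflow
import pytorch_lightning as pl

mlflow.pytorch.autolog(
    model_checkpoint=True,
    model_checkpoint_monitor="val_loss",
    model_checkpoint_mode="min",
    model_checkpoint_save_best_only=True,
    model_checkpoint_save_weights_only=True,
)

model = MyLightningModule()
trainer = pl.Trainer(max_epochs=5)

with mlflow.start_run() as run:
    trainer.fit(model, datamodule=MyDataModule())

# model_class is always required, regardless of save_weights_only.
loaded_model = mlflow.pytorch.load_latest_checkpoint(
    MyLightningModule, run_id=run.info.run_id
)
```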
147 changes: 147 additions & 0 deletions mlflow/pytorch/_lightning_autolog.py
@@ -1,10 +1,13 @@
import logging
import os
import shutil
import tempfile
import time
import warnings

from packaging.version import Version

from mlflow.utils.file_utils import create_tmp_dir
import mlflow.pytorch
from mlflow.exceptions import MlflowException
from mlflow.ml_package_versions import _ML_PACKAGE_VERSIONS
@@ -287,6 +290,109 @@ def on_test_end(self, trainer, pl_module):
self.metrics_logger.flush()


_LATEST_CHECKPOINT_ARTIFACT_TAG_KEY = "_latest_checkpoint_artifact"


class __MLflowModelCheckpointCallback(pl.Callback, metaclass=ExceptionSafeAbstractClass):

def __init__(
self,
monitor,
mode,
save_best_only,
save_weights_only,
every_n_epochs,
train_time_interval_S,
):
self.monitor = monitor
self.mode = mode
self.save_best_only = save_best_only
self.save_weights_only = save_weights_only
self.every_n_epochs = every_n_epochs
self.train_time_interval_S = train_time_interval_S
self.latest_checkpoint_timestamp = time.time()
self.last_monitor_value = None

def _is_new_checkpoint_better(self, new_monitor_value):
if self.last_monitor_value is None:
return True

if self.mode == "min":
return new_monitor_value <= self.last_monitor_value

if self.mode == "max":
return new_monitor_value >= self.last_monitor_value

assert False, "Illegal __MLflowModelCheckpoint config."

def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
current_epoch = trainer.current_epoch
metric_dict = {k: float(v) for k, v in trainer.callback_metrics.items()}

should_checkpoint = False
if self.every_n_epochs and (current_epoch % self.every_n_epochs == 0):
should_checkpoint = True
elif (
self.train_time_interval_S and
time.time() - self.latest_checkpoint_timestamp > self.train_time_interval_S
):
should_checkpoint = True

if not should_checkpoint:
return

if self.save_best_only:
if self.monitor not in metric_dict:
# "save-best-only" requires comparing the monitor metric value,

Collaborator:
It might be a better UX to raise an explicit error than silently failing.


Collaborator Author (WeichenXu123), Feb 6, 2024:
I prefer to use:

                _logger.error(
                    "If MLflowModelCheckpoint 'save_best_only' config is True, it requires to "
                    "compare the monitored metric value, but the provided monitored metric value "
                    "is not available."
                )

instead of raising an exception, because the default checkpoint configs are:

    checkpoint=True,
    checkpoint_monitor="val_loss",
    checkpoint_mode="min",
    checkpoint_save_best_only=True,

but sometimes the model might not log the "val_loss" metric, so if we raise an exception here it breaks the whole autologging run, and we don't need to break the other parts of autologging (e.g. logging params / metrics).

@chenmoneygithub


Collaborator:
makes sense to me!

On top of that, may we call out "checkpoint logging is skipped" in the error message?

# but the provided monitor metric is not available,
# skip model checkpoint autologging
return

new_monitor_value = metric_dict[self.monitor]
if not self._is_new_checkpoint_better(new_monitor_value):
# Current checkpoint is worse than last saved checkpoint,
# so skip checkpointing.
return

self.last_monitor_value = new_monitor_value

if self.save_best_only:
if self.save_weights_only:
checkpoint_model_filename = "latest_checkpoint_model.weights.pth"
else:
checkpoint_model_filename = "latest_checkpoint_model.pth"
checkpoint_metrics_filename = "latest_checkpoint_metrics.json"
checkpoint_artifact_dir = ""
else:
if self.save_weights_only:
checkpoint_model_filename = f"checkpoint_model_epoch_{current_epoch}.weights.pth"
else:
checkpoint_model_filename = f"checkpoint_model_epoch_{current_epoch}.pth"
checkpoint_metrics_filename = f"checkpoint_metrics_epoch_{current_epoch}.json"
checkpoint_artifact_dir = "checkpoints"

mlflow.set_tag(
_LATEST_CHECKPOINT_ARTIFACT_TAG_KEY,
os.path.join(checkpoint_artifact_dir, checkpoint_model_filename)
)

mlflow.log_dict(
{**metric_dict, "epoch": current_epoch},
os.path.join(checkpoint_artifact_dir, checkpoint_metrics_filename)
)

tmp_dir = create_tmp_dir()
try:
tmp_model_save_path = os.path.join(tmp_dir, checkpoint_model_filename)
trainer.save_checkpoint(tmp_model_save_path, weights_only=self.save_weights_only)

Collaborator Author (WeichenXu123), Jan 30, 2024:
Note:

save_checkpoint can save both model weights and trainer states.

We can restore the "trainer" via https://pytorch-lightning.readthedocs.io/en/0.8.5/weights_loading.html#restoring-training-state

So shall we provide a helper function to restore the trainer (similar to load_latest_checkpoint)?


Collaborator:
Yes, resuming training from a saved checkpoint is a common use case. Also, users are not necessarily loading the latest checkpoint, so I think we can provide a public API mlflow.pytorch.load_checkpoint that loads a pytorch checkpoint, including model weights and optimizer states. This can also be used for vanilla pytorch workflows, not only lightning workflows.


Collaborator Author (WeichenXu123):
Oh, in the newer API this becomes:

    trainer.fit(model, ckpt_path="some/path/to/my_checkpoint.ckpt")

So we can't return a trainer directly without a fit invocation.

https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#resume-training-state


Collaborator:
Got it. My previous comment was that we could also load the model weights specifically, without relying on the trainer instance, which means:

    model = mlflow.pytorch.load_checkpoint(model_class, mlflow_uri)
    trainer = Trainer(model)

For example, if we are sharing this model and someone using vanilla pytorch wants to finetune it with a custom training loop, they can load the checkpoints.
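
Following up on this thread, a minimal sketch (editorial, not part of this PR) of resuming training from a checkpoint artifact logged by this callback. It assumes a newer pytorch-lightning version where Trainer.fit accepts ckpt_path (as cited above), that the checkpoint was saved with save_weights_only=False, and that MyLightningModule / MyDataModule are hypothetical user-defined classes; the artifact name matches the save_best_only branch above.

```python
import pytorch_lightning as pl
from mlflow.client import MlflowClient

run_id = "..."  # ID of the MLflow run that logged the checkpoint

# Download the checkpoint file logged as a run artifact; adjust the artifact
# path for your checkpointing configuration.
ckpt_path = MlflowClient().download_artifacts(run_id, "latest_checkpoint_model.pth")

model = MyLightningModule()
trainer = pl.Trainer(max_epochs=10)

# Restores model weights plus trainer/optimizer state before continuing training.
trainer.fit(model, datamodule=MyDataModule(), ckpt_path=ckpt_path)
```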


mlflow.log_artifact(tmp_model_save_path, checkpoint_artifact_dir)
finally:
shutil.rmtree(tmp_dir, ignore_errors=True)

self.latest_checkpoint_timestamp = time.time()


# PyTorch-Lightning refactored the LoggerConnector class in version 1.4.0 and made metrics
# update on demand. Prior to this, the metrics from the current step were not available to
# callbacks immediately, so the view of metrics was off by one step.
@@ -396,6 +502,47 @@ def patched_fit(original, self, *args, **kwargs):
)
]

model_checkpoint = get_autologging_config(
mlflow.pytorch.FLAVOR_NAME, "model_checkpoint", True
)
if model_checkpoint:
if _pl_version >= Version("1.4.0"):
model_checkpoint_monitor = get_autologging_config(
mlflow.pytorch.FLAVOR_NAME, "model_checkpoint_monitor", "val_loss"
)
model_checkpoint_mode = get_autologging_config(
mlflow.pytorch.FLAVOR_NAME, "model_checkpoint_mode", "min"
)
model_checkpoint_save_best_only = get_autologging_config(
mlflow.pytorch.FLAVOR_NAME, "model_checkpoint_save_best_only", True
)
model_checkpoint_save_weights_only = get_autologging_config(
mlflow.pytorch.FLAVOR_NAME, "model_checkpoint_save_weights_only", True
)
model_checkpoint_every_n_epochs = get_autologging_config(
mlflow.pytorch.FLAVOR_NAME, "model_checkpoint_every_n_epochs", None
)
model_checkpoint_train_time_interval_S = get_autologging_config(
mlflow.pytorch.FLAVOR_NAME, "model_checkpoint_train_time_interval_S", None
)

# __MLflowModelCheckpointCallback only supports pytorch-lightning >= 1.4.0
if not any(isinstance(callback, __MLflowModelCheckpointCallback) for callback in self.callbacks):
self.callbacks += [
__MLflowModelCheckpointCallback(
monitor=model_checkpoint_monitor,
mode=model_checkpoint_mode,
save_best_only=model_checkpoint_save_best_only,
save_weights_only=model_checkpoint_save_weights_only,
every_n_epochs=model_checkpoint_every_n_epochs,
train_time_interval_S=model_checkpoint_train_time_interval_S,
)
]
else:
warnings.warn(
"Automatic model checkpointing is disabled because this feature only "
"supports pytorch-lightning >= 1.4.0.")

client.flush(synchronous=False)

result = original(self, *args, **kwargs)
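
As a usage note grounded in the config key read above: because the callback is only attached when the model_checkpoint autologging config is enabled, a user can opt out of checkpointing while keeping the rest of autologging.

```python
import mlflow

# Disables only automatic checkpointing; params/metrics autologging is unaffected.
mlflow.pytorch.autolog(model_checkpoint=False)
```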