PR comments

microsoft · Sep 2, 2022 · 4ec3f22 · 4ec3f22
1 parent 467fc6c
commit 4ec3f22
Show file tree

Hide file tree

Showing 5 changed files with 13 additions and 9 deletions.
diff --git a/docs/source/runner.md b/docs/source/runner.md
@@ -274,8 +274,7 @@ There are two additional flags that can be used to control the logging behaviour
 - The `--tag` flag sets the display name for the AzureML run. You can use that to give your run a memorable name,
     and later easily find it in the AzureML UI.
 
-Using these command line arguments, the following command will log to the experiment `my_experiment`, in a run that is showing
-as `my_first_run`:
+The following command will log to the experiment `my_experiment`, in a run that is labelled `my_first_run` in the UI:
 
 ```bash
 himl-runner --model=HelloWorld --log_from_vm --experiment=my_experiment --tag=my_first_run

diff --git a/hi-ml/src/health_ml/model_trainer.py b/hi-ml/src/health_ml/model_trainer.py
@@ -65,8 +65,9 @@ def create_lightning_trainer(container: LightningContainer,
     :param container: The container with model and data.
     :param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
     :param num_nodes: The number of nodes to use in distributed training.
-    :param azureml_run_for_logging: An optional AzureML Run object to which all metrics should be logged. If None and
-        the present code is running in AzureML, the current run is used.
+    :param azureml_run_for_logging: An optional AzureML Run object to which all metrics should be logged. Use this
+        argument to log to AzureML when the training is happening outside of AzureML. If `azureml_run_for_logging` is
+        None and the present code is running in AzureML, the current run is used.
     :return: A tuple [Trainer object, diagnostic logger]
     """
     logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")

diff --git a/hi-ml/src/health_ml/utils/logging.py b/hi-ml/src/health_ml/utils/logging.py
@@ -143,7 +143,8 @@ def finalize(self, status: str) -> None:
         if self.enable_logging_outside_azure_ml and not self.is_running_in_azure_ml and self.run is not None:
             if self.has_user_provided_run:
                 # The logger uses a run that was provided by the user: Flush it, but do not complete it.
-                # The user should complete the run after finishing the experiment.
+                # The user should complete the run after finishing the experiment. This is important when running
+                # training outside of AzureML, so that training and inference metrics can be written to the same run.
                 self.run.flush()
             else:
                 # Run.complete should only be called if we created an AzureML run here in the constructor.

diff --git a/hi-ml/testhiml/testhiml/test_run_ml.py b/hi-ml/testhiml/testhiml/test_run_ml.py
@@ -9,6 +9,8 @@
 from typing import Generator
 from unittest.mock import DEFAULT, MagicMock, Mock, patch
 
+from azureml._restclient.constants import RunStatus
+
 from health_ml.configs.hello_world import HelloWorld  # type: ignore
 from health_ml.experiment_config import ExperimentConfig
 from health_ml.lightning_container import LightningContainer
@@ -355,9 +357,9 @@ def test_log_on_vm(log_from_vm: bool) -> None:
         metrics = logger.run.get_metrics()
         assert "test_mse" in metrics
         assert "loss" in metrics
-        # The run must have been correctly marked as completed. However, I did not find a way to check this.
-        # When we get here, the run is still returning "Running" as the status, even though it is completed in the UI.
-        # assert logger.run.status == RunStatus.COMPLETED
+        # The run must have been correctly marked as completed.
+        logger.run.wait_for_completion()
+        assert logger.run.status == RunStatus.COMPLETED
     else:
         assert logger.run is None
 

diff --git a/hi-ml/testhiml/testhiml/utils/test_logging.py b/hi-ml/testhiml/testhiml/utils/test_logging.py
@@ -352,9 +352,10 @@ def test_azureml_logger_finalize() -> None:
     logger = AzureMLLogger(enable_logging_outside_azure_ml=True, run=run_mock)
     assert logger.run is not None
     assert logger.has_user_provided_run
-    logger.finalize("nothing")
     run_mock.flush = MagicMock()
     run_mock.complete = MagicMock()
+    # When providing a run explicitly, the finalize method should not call the run's complete method. Completing
+    # the run is the responsibility of the user.
     logger.finalize(status="nothing")
     run_mock.flush.assert_called_once()
     run_mock.complete.assert_not_called()