ENH: Enable logging to AzureML when running training outside AzureML (#…
ant0nsc committed Sep 13, 2022
1 parent 378fbb3 commit 1f965e4
Showing 13 changed files with 268 additions and 76 deletions.
47 changes: 39 additions & 8 deletions docs/source/runner.md
@@ -34,9 +34,10 @@ for example, `--model health_cpath.PandaImageNetMIL` is effectively telling the
To train in AzureML, use the flag `--cluster` to specify the name of the cluster
in your Workspace that you want to submit the job to. So the whole command would look like:

```
```bash
himl-runner --model=HelloWorld --cluster=my_cluster_name
```

You can also specify `--num_nodes` if you wish to distribute the model training.

You need to start the runner from a directory that contains all the code that your experiment needs:
@@ -47,7 +48,7 @@ AzureML needs to know which Python/Conda environment it should use. For that, th
that contains a Conda environment definition. This file needs to be present either in the current working directory or
one of its parents. To specify a Conda environment that is located elsewhere, you can use

```shell
```bash
himl-runner --model=HelloWorld --cluster=my_cluster_name --conda_env=/my/folder/to/special_environment.yml
```

@@ -216,9 +217,9 @@ and returns a tuple containing the Optimizer and LRScheduler objects
You can run the hi-ml-runner in inference-only mode by switching on the `--run_inference_only` flag and specifying
the model weights via the `--src_checkpoint` argument, which supports three types of checkpoints:

* A local path where the checkpoint is stored `--src_checkpoint=local/path/to/my_checkpoint/model.ckpt`
* A remote URL from where to download the weights `--src_checkpoint=https://my_checkpoint_url.com/model.ckpt`
* An AzureML run id where checkpoints are saved in `outputs/checkpoints`. For this specific use case, you can experiment
- A local path where the checkpoint is stored `--src_checkpoint=local/path/to/my_checkpoint/model.ckpt`
- A remote URL from where to download the weights `--src_checkpoint=https://my_checkpoint_url.com/model.ckpt`
- An AzureML run id where checkpoints are saved in `outputs/checkpoints`. For this specific use case, you can experiment
with different checkpoints by setting `--src_checkpoint` according to the format
`<azureml_run_id>:<optional/custom/path/to/checkpoints/><filename.ckpt>`. If no custom path is provided
(e.g., `--src_checkpoint=AzureML_run_id:best.ckpt`), we assume the checkpoints to be saved in the default
@@ -228,7 +229,7 @@ the model weights by setting `--src_checkpoint` argument that supports three typ
Running the following command line will run inference using the `MyContainer` model, with weights from the checkpoint saved
in the AzureML run `MyContainer_XXXX_yyyy` at the best validation loss epoch (`/outputs/checkpoints/best_val_loss.ckpt`).

```
```bash
himl-runner --model=MyContainer --run_inference_only --src_checkpoint=MyContainer_XXXX_yyyy:best_val_loss.ckpt
```

@@ -238,13 +239,43 @@ Analogously, one can resume training by setting `--src_checkpoint` and `--resume
The PyTorch Lightning trainer will initialize the lightning module from the given checkpoint, here corresponding to the best
validation loss epoch, as set in the following command line.

```
```bash
himl-runner --model=MyContainer --cluster=my_cluster_name --src_checkpoint=MyContainer_XXXX_yyyy:best_val_loss.ckpt --resume_training
```

Warning: When resuming training, make sure that `container.max_epochs` is greater than the epoch at which the
specified checkpoint was saved. Otherwise, a misconfiguration exception will be raised:

```
```text
pytorch_lightning.utilities.exceptions.MisconfigurationException: You restored a checkpoint with current_epoch=19, but you have set Trainer(max_epochs=4).
```
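
If you run into this error, raise the epoch budget on the container before resuming. A minimal sketch, assuming the
`HelloWorld` container from `health_ml.configs.hello_world` and that `max_epochs` can be overridden as a plain
attribute (as `container.max_epochs` above suggests; the exact override mechanism for your own container may differ):

```python
# Hypothetical sketch: ensure max_epochs exceeds the epoch stored in the checkpoint.
from health_ml.configs.hello_world import HelloWorld

container = HelloWorld()
# The checkpoint in the error message above was written at current_epoch=19, so training
# can only resume if the trainer is allowed to run beyond that epoch.
container.max_epochs = 30
```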

## Logging to AzureML when running outside AzureML

The runner can log metrics to AzureML even when the training itself is not running
inside AzureML. This adds an extra level of traceability for runs on GPU VMs, where there would otherwise
be no record of past training runs.

You can trigger this behaviour by specifying the `--log_from_vm` flag. For the `HelloWorld` model, this
will look like:

```bash
himl-runner --model=HelloWorld --log_from_vm
```

For logging to work, you need to have a `config.json` file in the current working directory (or one of its
parent folders) that specifies the AzureML workspace to log to. When starting the runner, you will be asked
to authenticate to AzureML interactively.
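
For orientation, the workspace lookup and the interactive login follow the standard `azureml-core` mechanism. A
minimal sketch of what happens under the hood (the runner performs this step for you, so you normally never write
this code yourself):

```python
# Sketch: how the AzureML workspace is discovered from config.json.
# Workspace.from_config() searches the current directory and its parents for config.json
# and triggers interactive authentication if no cached credentials are available.
from azureml.core import Workspace

workspace = Workspace.from_config()
print(f"Metrics will be logged to workspace: {workspace.name}")
```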

There are two additional flags that can be used to control the logging behaviour:

- The `--experiment` flag sets which AzureML experiment to log to. By default, the experiment name will be
the name of the model class (`HelloWorld` in the above example).
- The `--tag` flag sets the display name for the AzureML run. Use it to give your run a memorable name, so that
  you can easily find it later in the AzureML UI.

The following command will log to the experiment `my_experiment`, in a run that is labelled `my_first_run` in the UI:

```bash
himl-runner --model=HelloWorld --log_from_vm --experiment=my_experiment --tag=my_first_run
```
2 changes: 1 addition & 1 deletion hi-ml-azure/src/health_azure/utils.py
@@ -1898,7 +1898,7 @@ def create_aml_run_object(
exp = Experiment(workspace=actual_workspace, name=experiment_name)
if snapshot_directory is None or snapshot_directory == "":
snapshot_directory = tempfile.mkdtemp()
return exp.start_logging(name=run_name, snapshot_directory=str(snapshot_directory)) # type: ignore
return exp.start_logging(display_name=run_name, snapshot_directory=str(snapshot_directory)) # type: ignore


def aml_workspace_for_unittests() -> Workspace:
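
The change above makes `start_logging` set the run's display name rather than its run name. As a hedged sketch of how
`create_aml_run_object` can be used to log metrics from a machine outside AzureML (the experiment and metric names
below are purely illustrative):

```python
# Sketch: create an AzureML run from a plain VM and log a metric to it.
# create_aml_run_object is the helper changed above; run.log and run.complete are
# standard azureml.core.Run methods.
from health_azure.utils import create_aml_run_object

run = create_aml_run_object(experiment_name="my_experiment", run_name="debug_run")
run.log("val/loss", 0.123)  # shows up in the AzureML portal under the new run
run.complete()              # mark the run as finished, as run_ml.py does in its finally block
```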
2 changes: 1 addition & 1 deletion hi-ml-azure/testazure/testazure/test_azure_util.py
@@ -2112,7 +2112,7 @@ def test_create_run() -> None:
run = util.create_aml_run_object(experiment_name=experiment_name, run_name=run_name,
workspace=DEFAULT_WORKSPACE.workspace)
assert run is not None
assert run.name == run_name
assert run.display_name == run_name
assert run.experiment.name == experiment_name
metric_name = "mymetric"
metric_value = 1.234
1 change: 1 addition & 0 deletions hi-ml/src/health_ml/configs/hello_world.py
@@ -230,6 +230,7 @@ def on_test_epoch_end(self) -> None:
average_mse = torch.mean(torch.stack(self.test_mse))
Path("test_mse.txt").write_text(str(average_mse.item()))
Path("test_mae.txt").write_text(str(self.test_mae.compute().item()))
self.log("test_mse", average_mse, on_epoch=True, on_step=False)


class HelloWorld(LightningContainer):
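
The added `self.log` call is what makes the HelloWorld test metric reach the attached loggers, and hence AzureML when
`--log_from_vm` is set. A minimal sketch of the pattern, assuming standard PyTorch Lightning logging semantics (the
module below is illustrative, not part of the repository):

```python
# Sketch: forwarding an epoch-level test metric to every attached logger.
# self.log(..., on_epoch=True, on_step=False) emits one aggregated value per test epoch.
from typing import List

import torch
from pytorch_lightning import LightningModule


class TinyTestModule(LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.test_mse: List[torch.Tensor] = []

    def test_step(self, batch: torch.Tensor, batch_idx: int) -> None:
        # Collect a per-batch MSE-like value; the real HelloWorld model computes it from predictions.
        self.test_mse.append(torch.mean((batch - batch.mean()) ** 2))

    def on_test_epoch_end(self) -> None:
        average_mse = torch.mean(torch.stack(self.test_mse))
        self.log("test_mse", average_mse, on_epoch=True, on_step=False)
```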
7 changes: 7 additions & 0 deletions hi-ml/src/health_ml/deep_learning_config.py
@@ -177,6 +177,13 @@ class WorkflowParams(param.Parameterized):
"model weights from the specified checkpoint in "
"`src_checkpoint` flag. If False, run training and inference.")
resume_training: bool = param.Boolean(False, doc="If True, resume training from the src_checkpoint.")
tag: str = param.String(doc="A string that will be used as the display name of the run in AzureML.")
experiment: str = param.String(default="", doc="The name of the AzureML experiment to use for this run. If not "
"provided, the name of the model class will be used.")
log_from_vm: bool = param.Boolean(False, doc="If True, a training run outside AzureML will still log its "
"metrics to AzureML. Both intermediate validation metrics and final test results "
"will be recorded. You need to have an AzureML workspace config.json file "
"and will be asked for interactive authentication.")

CROSSVAL_INDEX_ARG_NAME = "crossval_index"
CROSSVAL_COUNT_ARG_NAME = "crossval_count"
1 change: 0 additions & 1 deletion hi-ml/src/health_ml/experiment_config.py
@@ -21,7 +21,6 @@ class ExperimentConfig(param.Parameterized):
"job in AzureML.")
model: str = param.String(doc="The fully qualified name of the model to train/test -e.g."
"mymodule.configs.MyConfig.")
tag: str = param.String(doc="A string that will be used as the display name of the run in AzureML.")
mount_in_azureml: bool = param.Boolean(False,
doc="If False (default), consume datasets in AzureML by downloading at "
"job start. If True, datasets in AzureML are mounted (read on demand "
6 changes: 6 additions & 0 deletions hi-ml/src/health_ml/lightning_container.py
@@ -215,6 +215,12 @@ def has_custom_test_step(self) -> bool:
"""
return type(self.model).test_step != LightningModule.test_step

@property
def effective_experiment_name(self) -> str:
"""Returns the name of the AzureML experiment that should be used. This is taken from the commandline
argument `experiment`, falling back to the model class name if not set."""
return self.experiment or self.model_name


class LightningModuleWithOptimizer(LightningModule):
"""
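
A small illustration of the fallback that `effective_experiment_name` implements, tying the `--experiment` flag to the
default experiment name (assuming the `HelloWorld` container can be instantiated directly):

```python
# Sketch: the AzureML experiment name falls back to the model class name when --experiment is not set.
from health_ml.configs.hello_world import HelloWorld

container = HelloWorld()
assert container.effective_experiment_name == "HelloWorld"    # no --experiment given
container.experiment = "my_experiment"
assert container.effective_experiment_name == "my_experiment"
```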
11 changes: 9 additions & 2 deletions hi-ml/src/health_ml/model_trainer.py
@@ -6,6 +6,7 @@
from pathlib import Path
from typing import Any, List, Optional, Tuple, TypeVar

from azureml.core import Run
from pytorch_lightning import Callback, Trainer
from pytorch_lightning.callbacks import GPUStatsMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
@@ -53,7 +54,8 @@ def get_pl_profiler(pl_profiler: Optional[str], outputs_folder: Path) -> Optiona
def create_lightning_trainer(container: LightningContainer,
resume_from_checkpoint: Optional[Path] = None,
num_nodes: int = 1,
multiple_trainloader_mode: str = "max_size_cycle") -> \
multiple_trainloader_mode: str = "max_size_cycle",
azureml_run_for_logging: Optional[Run] = None) -> \
Tuple[Trainer, StoringLogger]:
"""
Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
@@ -63,6 +65,9 @@ def create_lightning_trainer(container: LightningContainer,
:param container: The container with model and data.
:param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
:param num_nodes: The number of nodes to use in distributed training.
:param azureml_run_for_logging: An optional AzureML Run object to which all metrics should be logged. Use this
argument to log to AzureML when the training is happening outside of AzureML. If `azureml_run_for_logging` is
None and the present code is running in AzureML, the current run is used.
:return: A tuple [Trainer object, diagnostic logger]
"""
logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
@@ -86,7 +91,9 @@ message += "s per node with DDP"
message += "s per node with DDP"
logging.info(f"Using {message}")
tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
loggers = [tensorboard_logger, AzureMLLogger(False)]
azureml_logger = AzureMLLogger(enable_logging_outside_azure_ml=container.log_from_vm,
run=azureml_run_for_logging)
loggers = [tensorboard_logger, azureml_logger]
storing_logger = StoringLogger()
loggers.append(storing_logger)
# Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
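
For orientation, a hedged sketch of how these loggers plug into a PyTorch Lightning `Trainer` for a run outside
AzureML. The constructor arguments follow the diff above, while the `AzureMLLogger` import path is an assumption:

```python
# Sketch: wiring TensorBoard and AzureML logging into a Lightning Trainer on a plain VM.
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

from health_azure.utils import create_aml_run_object
from health_ml.utils import AzureMLLogger  # import path assumed

# One run object shared by training and inference, mirroring what run_ml.py does below.
azureml_run = create_aml_run_object(experiment_name="HelloWorld")
loggers = [
    TensorBoardLogger(save_dir="outputs/logs", name="Lightning", version=""),
    AzureMLLogger(enable_logging_outside_azure_ml=True, run=azureml_run),
]
trainer = Trainer(max_epochs=4, logger=loggers)
```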
85 changes: 55 additions & 30 deletions hi-ml/src/health_ml/run_ml.py
@@ -10,6 +10,7 @@
from pathlib import Path
from typing import Dict, List, Optional

from azureml.core import Run
from pytorch_lightning import Trainer, seed_everything

from health_azure import AzureRunInfo
@@ -18,7 +19,7 @@
is_running_in_azure_ml, PARENT_RUN_CONTEXT, RUN_CONTEXT,
aggregate_hyperdrive_metrics, get_metrics_for_childless_run,
ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK,
is_local_rank_zero, is_global_rank_zero,)
is_local_rank_zero, is_global_rank_zero, create_aml_run_object)

from health_ml.experiment_config import ExperimentConfig
from health_ml.lightning_container import LightningContainer
@@ -78,6 +79,7 @@ def __init__(self,
project_root=self.project_root,
run_context=RUN_CONTEXT)
self.trainer: Optional[Trainer] = None
self.azureml_run_for_logging: Optional[Run] = None

def set_run_tags_from_parent(self) -> None:
"""
@@ -176,9 +178,20 @@ def init_training(self) -> None:
# Set random seeds just before training. Ensure that dataloader workers are also seeded correctly.
seed_everything(self.container.get_effective_random_seed(), workers=True)

# get the container's datamodule
# Get the container's datamodule
self.data_module = self.container.get_data_module()

# Create an AzureML run for logging if running outside AzureML. This run will be used for metrics logging
# during both training and inference. We can't rely on the automatically generated run inside the AzureMLLogger
# class because two of those logger objects will be created, so training and inference metrics would be logged
# in different runs.
if self.container.log_from_vm:
run = create_aml_run_object(experiment_name=self.container.effective_experiment_name)
# Display name should already be set when creating the Run object, but in some scenarios this
# does not happen. Hence, set it again.
run.display_name = self.container.tag if self.container.tag else None
self.azureml_run_for_logging = run

if not self.container.run_inference_only:

checkpoint_path_for_recovery = self.checkpoint_handler.get_recovery_or_checkpoint_path_train()
@@ -191,7 +204,8 @@ def init_training(self) -> None:
container=self.container,
resume_from_checkpoint=checkpoint_path_for_recovery,
num_nodes=self.container.num_nodes,
multiple_trainloader_mode=self.get_multiple_trainloader_mode())
multiple_trainloader_mode=self.get_multiple_trainloader_mode(),
azureml_run_for_logging=self.azureml_run_for_logging)

rank_info = ", ".join(
f"{env}: {os.getenv(env)}" for env in [ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK]
@@ -287,7 +301,10 @@ def run_inference(self) -> None:
self.checkpoint_handler.get_checkpoint_to_test() if self.container.src_checkpoint else None
)
trainer, _ = create_lightning_trainer(
self.container, resume_from_checkpoint=checkpoint_path, num_nodes=1
container=self.container,
resume_from_checkpoint=checkpoint_path,
num_nodes=1,
azureml_run_for_logging=self.azureml_run_for_logging
)

# Change to the outputs folder so that the model can write to current working directory, and still
@@ -341,29 +358,37 @@ def run(self) -> None:
Driver function to run a ML experiment
"""
self.setup()
self.init_training()
if not self.container.run_inference_only:
# Backup the environment variables in case we need to run a second training in the unit tests.
old_environ = dict(os.environ)

# do training
with logging_section("Model training"):
self.run_training()

# load model checkpoint for custom inference or additional validation step
if self.container.has_custom_test_step() or self.container.run_extra_val_epoch:
self.load_model_checkpoint()

# Run extra validation epoch if enabled
if self.container.run_extra_val_epoch:
with logging_section("Model Validation to save plots on validation set"):
self.run_validation()

# Kill all processes besides rank 0
self.after_ddp_cleanup(old_environ)

# Run inference on a single device
with logging_section("Model inference"):
self.run_inference()

self.run_regression_test()
try:
self.init_training()
if not self.container.run_inference_only:
# Backup the environment variables in case we need to run a second training in the unit tests.
old_environ = dict(os.environ)

# do training
with logging_section("Model training"):
self.run_training()

# load model checkpoint for custom inference or additional validation step
if self.container.has_custom_test_step() or self.container.run_extra_val_epoch:
self.load_model_checkpoint()

# Run extra validation epoch if enabled
if self.container.run_extra_val_epoch:
with logging_section("Model Validation to save plots on validation set"):
self.run_validation()

# Kill all processes besides rank 0
self.after_ddp_cleanup(old_environ)

# Run inference on a single device
with logging_section("Model inference"):
self.run_inference()

self.run_regression_test()

finally:
if self.azureml_run_for_logging is not None:
try:
self.azureml_run_for_logging.complete()
except Exception as ex:
logging.error("Failed to complete AzureML run: %s", ex)
12 changes: 6 additions & 6 deletions hi-ml/src/health_ml/runner.py
@@ -170,7 +170,7 @@ def additional_run_tags(self, script_params: List[str]) -> Dict[str, str]:
"""
return {
"commandline_args": " ".join(script_params),
"tag": self.experiment_config.tag
"tag": self.lightning_container.tag
}

def run(self) -> Tuple[LightningContainer, AzureRunInfo]:
@@ -207,8 +207,8 @@ def after_submission_hook(azure_run: Run) -> None:
"""
# Set the default display name to what was provided as the "tag". This will affect single runs
# and Hyperdrive parent runs
if self.experiment_config.tag:
azure_run.display_name = self.experiment_config.tag
if self.lightning_container.tag:
azure_run.display_name = self.lightning_container.tag

root_folder = self.project_root
entry_script = Path(sys.argv[0]).resolve()
@@ -258,7 +258,7 @@ def after_submission_hook(azure_run: Run) -> None:
compute_cluster_name=self.experiment_config.cluster,
environment_variables=environment_variables,
default_datastore=default_datastore,
experiment_name=self.lightning_container.model_name, # create_experiment_name(),
experiment_name=self.lightning_container.effective_experiment_name,
input_datasets=input_datasets, # type: ignore
num_nodes=self.experiment_config.num_nodes,
wait_for_completion=self.experiment_config.wait_for_completion,
@@ -271,12 +271,12 @@ def after_submission_hook(azure_run: Run) -> None:
after_submission=after_submission_hook,
tags=self.additional_run_tags(script_params)
)
if self.experiment_config.tag and azure_run_info.run:
if self.lightning_container.tag and azure_run_info.run:
if self.lightning_container.is_crossvalidation_enabled:
# This code is only reached inside Azure. Set display name again - this will now affect
# Hyperdrive child runs (for other jobs, this has already been done after submission)
cv_index = self.lightning_container.crossval_index
full_display_name = f"{self.experiment_config.tag} {cv_index}"
full_display_name = f"{self.lightning_container.tag} {cv_index}"
azure_run_info.run.display_name = full_display_name

else: