diff --git a/docs/source/runner.md b/docs/source/runner.md index a1b34a26d..bb54c6499 100644 --- a/docs/source/runner.md +++ b/docs/source/runner.md @@ -34,9 +34,10 @@ for example, `--model health_cpath.PandaImageNetMIL` is effectively telling the To train in AzureML, use the flag `--cluster` to specify the name of the cluster in your Workspace that you want to submit the job to. So the whole command would look like: -``` +```bash himl-runner --model=HelloWorld --cluster=my_cluster_name ``` + You can also specify `--num_nodes` if you wish to distribute the model training. When starting the runner, you need to do that from a directory that contains all the code that your experiment needs: @@ -47,7 +48,7 @@ AzureML needs to know which Python/Conda environment it should use. For that, th that contains a Conda environment definition. This file needs to be present either in the current working directory or one of its parents. To specify a Conda environment that is located elsewhere, you can use -```shell +```bash himl-runner --model=HelloWorld --cluster=my_cluster_name --conda_env=/my/folder/to/special_environment.yml ``` @@ -216,9 +217,9 @@ and returns a tuple containing the Optimizer and LRScheduler objects You can use the hi-ml-runner in inference mode only by switching the `--run_inference_only` flag on and specifying the model weights by setting `--src_checkpoint` argument that supports three types of checkpoints: -* A local path where the checkpoint is stored `--src_checkpoint=local/path/to/my_checkpoint/model.ckpt` -* A remote URL from where to download the weights `--src_checkpoint=https://my_checkpoint_url.com/model.ckpt` -* An AzureML run id where checkpoints are saved in `outputs/checkpoints`. For this specific use case, you can experiment +- A local path where the checkpoint is stored `--src_checkpoint=local/path/to/my_checkpoint/model.ckpt` +- A remote URL from where to download the weights `--src_checkpoint=https://my_checkpoint_url.com/model.ckpt` +- An AzureML run id where checkpoints are saved in `outputs/checkpoints`. For this specific use case, you can experiment with different checkpoints by setting `--src_checkpoint` according to the format `:`. If no custom path is provided (e.g., `--src_checkpoint=AzureML_run_id:best.ckpt`), we assume the checkpoints to be saved in the default @@ -228,7 +229,7 @@ the model weights by setting `--src_checkpoint` argument that supports three typ Running the following command line will run inference using `MyContainer` model with weights from the checkpoint saved in the AzureMl run `MyContainer_XXXX_yyyy` at the best validation loss epoch `/outputs/checkpoints/best_val_loss.ckpt`. -``` +```bash himl-runner --model=Mycontainer --run_inference_only --src_checkpoint=MyContainer_XXXX_yyyy:best_val_loss.ckpt ``` @@ -238,13 +239,43 @@ Analogously, one can resume training by setting `--src_checkpoint` to either con The pytorch lightning trainer will initialize the lightning module from the given checkpoint corresponding to the best validation loss epoch as set in the following comandline. -``` +```bash himl-runner --model=Mycontainer --cluster=my_cluster_name --src_checkpoint=MyContainer_XXXX_yyyy:best_val_loss.ckpt ``` Warning: When resuming training, one should make sure to set `container.max_epochs` greater than the last epoch of the specified checkpoint. 
 A misconfiguration exception will be raised otherwise:
-```
+```text
 pytorch_lightning.utilities.exceptions.MisconfigurationException: You restored a checkpoint with current_epoch=19, but you have set Trainer(max_epochs=4).
 ```
+
+## Logging to AzureML when running outside AzureML
+
+The runner offers the ability to log metrics to AzureML, even if the present training is not running
+inside of AzureML. This adds an additional level of traceability for runs on GPU VMs, where there is otherwise
+no record of any past training.
+
+You can trigger this behaviour by specifying the `--log_from_vm` flag. For the `HelloWorld` model, this
+will look like:
+
+```bash
+himl-runner --model=HelloWorld --log_from_vm
+```
+
+For logging to work, you need to have a `config.json` file in the current working directory (or one of its
+parent folders) that specifies the AzureML workspace itself. When starting the runner, you will be asked
+to authenticate to AzureML.
+
+There are two additional flags that can be used to control the logging behaviour:
+
+- The `--experiment` flag sets which AzureML experiment to log to. By default, the experiment name will be
+  the name of the model class (`HelloWorld` in the above example).
+- The `--tag` flag sets the display name for the AzureML run. You can use that to give your run a memorable name,
+  and later easily find it in the AzureML UI.
+
+The following command will log to the experiment `my_experiment`, in a run that is labelled `my_first_run` in the UI:
+
+```bash
+himl-runner --model=HelloWorld --log_from_vm --experiment=my_experiment --tag=my_first_run
+```
diff --git a/hi-ml-azure/src/health_azure/utils.py b/hi-ml-azure/src/health_azure/utils.py
index a2e77d9a8..2875e067b 100644
--- a/hi-ml-azure/src/health_azure/utils.py
+++ b/hi-ml-azure/src/health_azure/utils.py
@@ -1898,7 +1898,7 @@ def create_aml_run_object(
     exp = Experiment(workspace=actual_workspace, name=experiment_name)
     if snapshot_directory is None or snapshot_directory == "":
         snapshot_directory = tempfile.mkdtemp()
-    return exp.start_logging(name=run_name, snapshot_directory=str(snapshot_directory))  # type: ignore
+    return exp.start_logging(display_name=run_name, snapshot_directory=str(snapshot_directory))  # type: ignore


 def aml_workspace_for_unittests() -> Workspace:
diff --git a/hi-ml-azure/testazure/testazure/test_azure_util.py b/hi-ml-azure/testazure/testazure/test_azure_util.py
index 4021eadb0..ab38b92c4 100644
--- a/hi-ml-azure/testazure/testazure/test_azure_util.py
+++ b/hi-ml-azure/testazure/testazure/test_azure_util.py
@@ -2112,7 +2112,7 @@ def test_create_run() -> None:
         run = util.create_aml_run_object(experiment_name=experiment_name, run_name=run_name,
                                          workspace=DEFAULT_WORKSPACE.workspace)
         assert run is not None
-        assert run.name == run_name
+        assert run.display_name == run_name
         assert run.experiment.name == experiment_name
         metric_name = "mymetric"
         metric_value = 1.234
diff --git a/hi-ml/src/health_ml/configs/hello_world.py b/hi-ml/src/health_ml/configs/hello_world.py
index 9a6c57e9c..9b5823873 100644
--- a/hi-ml/src/health_ml/configs/hello_world.py
+++ b/hi-ml/src/health_ml/configs/hello_world.py
@@ -230,6 +230,7 @@ def on_test_epoch_end(self) -> None:
         average_mse = torch.mean(torch.stack(self.test_mse))
         Path("test_mse.txt").write_text(str(average_mse.item()))
         Path("test_mae.txt").write_text(str(self.test_mae.compute().item()))
+        self.log("test_mse", average_mse, on_epoch=True, on_step=False)


 class HelloWorld(LightningContainer):
diff --git a/hi-ml/src/health_ml/deep_learning_config.py b/hi-ml/src/health_ml/deep_learning_config.py
index 8247e09f0..3d93665d8 100644
--- a/hi-ml/src/health_ml/deep_learning_config.py
+++ b/hi-ml/src/health_ml/deep_learning_config.py
@@ -173,6 +173,13 @@ class WorkflowParams(param.Parameterized):
     run_inference_only: bool = param.Boolean(False, doc="If True, run only inference and skip training after loading"
                                                         "model weights from the specified checkpoint in "
                                                         "`src_checkpoint` flag. If False, run training and inference.")
+    tag: str = param.String(doc="A string that will be used as the display name of the run in AzureML.")
+    experiment: str = param.String(default="", doc="The name of the AzureML experiment to use for this run. If not "
+                                                   "provided, the name of the model class will be used.")
+    log_from_vm: bool = param.Boolean(False, doc="If True, a training run outside AzureML will still log its "
+                                                 "metrics to AzureML. Both intermediate validation metrics and final test results "
+                                                 "will be recorded. You need to have an AzureML workspace config.json file "
+                                                 "and will be asked for interactive authentication.")

     CROSSVAL_INDEX_ARG_NAME = "crossval_index"
     CROSSVAL_COUNT_ARG_NAME = "crossval_count"
diff --git a/hi-ml/src/health_ml/experiment_config.py b/hi-ml/src/health_ml/experiment_config.py
index ab305248e..fb8511603 100644
--- a/hi-ml/src/health_ml/experiment_config.py
+++ b/hi-ml/src/health_ml/experiment_config.py
@@ -11,7 +11,6 @@ class ExperimentConfig(param.Parameterized):
                                              "job in AzureML.")
     model: str = param.String(doc="The fully qualified name of the model to train/test -e.g."
                                   "mymodule.configs.MyConfig.")
-    tag: str = param.String(doc="A string that will be used as the display name of the run in AzureML.")
     mount_in_azureml: bool = param.Boolean(False,
                                            doc="If False (default), consume datasets in AzureML by downloading at "
                                                "job start. If True, datasets in AzureML are mounted (read on demand "
diff --git a/hi-ml/src/health_ml/lightning_container.py b/hi-ml/src/health_ml/lightning_container.py
index dbda2ef86..e40f4e123 100644
--- a/hi-ml/src/health_ml/lightning_container.py
+++ b/hi-ml/src/health_ml/lightning_container.py
@@ -214,6 +214,12 @@ def has_custom_test_step(self) -> bool:
         """
         return type(self.model).test_step != LightningModule.test_step

+    @property
+    def effective_experiment_name(self) -> str:
+        """Returns the name of the AzureML experiment that should be used.
This is taken from the commandline + argument `experiment`, falling back to the model class name if not set.""" + return self.experiment or self.model_name + class LightningModuleWithOptimizer(LightningModule): """ diff --git a/hi-ml/src/health_ml/model_trainer.py b/hi-ml/src/health_ml/model_trainer.py index 28380f19a..bdece9090 100644 --- a/hi-ml/src/health_ml/model_trainer.py +++ b/hi-ml/src/health_ml/model_trainer.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Any, List, Optional, Tuple, TypeVar +from azureml.core import Run from pytorch_lightning import Callback, Trainer from pytorch_lightning.callbacks import GPUStatsMonitor, ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger @@ -53,7 +54,8 @@ def get_pl_profiler(pl_profiler: Optional[str], outputs_folder: Path) -> Optiona def create_lightning_trainer(container: LightningContainer, resume_from_checkpoint: Optional[Path] = None, num_nodes: int = 1, - multiple_trainloader_mode: str = "max_size_cycle") -> \ + multiple_trainloader_mode: str = "max_size_cycle", + azureml_run_for_logging: Optional[Run] = None) -> \ Tuple[Trainer, StoringLogger]: """ Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers @@ -63,6 +65,9 @@ def create_lightning_trainer(container: LightningContainer, :param container: The container with model and data. :param resume_from_checkpoint: If provided, training resumes from this checkpoint point. :param num_nodes: The number of nodes to use in distributed training. + :param azureml_run_for_logging: An optional AzureML Run object to which all metrics should be logged. Use this + argument to log to AzureML when the training is happening outside of AzureML. If `azureml_run_for_logging` is + None and the present code is running in AzureML, the current run is used. :return: A tuple [Trainer object, diagnostic logger] """ logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}") @@ -86,7 +91,9 @@ def create_lightning_trainer(container: LightningContainer, message += "s per node with DDP" logging.info(f"Using {message}") tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="") - loggers = [tensorboard_logger, AzureMLLogger(False)] + azureml_logger = AzureMLLogger(enable_logging_outside_azure_ml=container.log_from_vm, + run=azureml_run_for_logging) + loggers = [tensorboard_logger, azureml_logger] storing_logger = StoringLogger() loggers.append(storing_logger) # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag. 
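The new `azureml_run_for_logging` argument above is what lets training and inference share one AzureML run when executing outside AzureML. As a rough sketch of the intended usage pattern (not part of this diff; the experiment name is a placeholder, and a workspace `config.json` must be discoverable for authentication):

```python
from pytorch_lightning import Trainer

from health_azure.utils import create_aml_run_object
from health_ml.utils.logging import AzureMLLogger

# Create one AzureML run up front so that training and inference metrics land in the same run.
# "my_experiment" is a placeholder experiment name.
run = create_aml_run_object(experiment_name="my_experiment")
logger = AzureMLLogger(enable_logging_outside_azure_ml=True, run=run)
trainer = Trainer(max_epochs=1, logger=[logger])
# ... trainer.fit(...) and trainer.test(...) would both log to `run` ...
# The logger only flushes a run it did not create itself, so the caller completes it explicitly.
run.complete()
```

This mirrors what `MLRunner` does in the changes below: the run is created once in `init_training`, passed to both trainer instances, and completed in a `finally` block after inference.
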
diff --git a/hi-ml/src/health_ml/run_ml.py b/hi-ml/src/health_ml/run_ml.py index acd2fe498..01a92a70d 100644 --- a/hi-ml/src/health_ml/run_ml.py +++ b/hi-ml/src/health_ml/run_ml.py @@ -10,6 +10,7 @@ from pathlib import Path from typing import Dict, List, Optional +from azureml.core import Run from pytorch_lightning import Trainer, seed_everything from health_azure import AzureRunInfo @@ -18,7 +19,7 @@ is_running_in_azure_ml, PARENT_RUN_CONTEXT, RUN_CONTEXT, aggregate_hyperdrive_metrics, get_metrics_for_childless_run, ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK, - is_local_rank_zero, is_global_rank_zero,) + is_local_rank_zero, is_global_rank_zero, create_aml_run_object) from health_ml.experiment_config import ExperimentConfig from health_ml.lightning_container import LightningContainer @@ -78,6 +79,7 @@ def __init__(self, project_root=self.project_root, run_context=RUN_CONTEXT) self.trainer: Optional[Trainer] = None + self.azureml_run_for_logging: Optional[Run] = None def set_run_tags_from_parent(self) -> None: """ @@ -176,9 +178,20 @@ def init_training(self) -> None: # Set random seeds just before training. Ensure that dataloader workers are also seeded correctly. seed_everything(self.container.get_effective_random_seed(), workers=True) - # get the container's datamodule + # Get the container's datamodule self.data_module = self.container.get_data_module() + # Create an AzureML run for logging if running outside AzureML. This run will be used for metrics logging + # during both training and inference. We can't rely on the automatically generated run inside the AzureMLLogger + # class because two of those logger objects will be created, so training and inference metrics would be logged + # in different runs. + if self.container.log_from_vm: + run = create_aml_run_object(experiment_name=self.container.effective_experiment_name) + # Display name should already be set when creating the Run object, but in some scenarios this + # does not happen. Hence, set it again. + run.display_name = self.container.tag if self.container.tag else None + self.azureml_run_for_logging = run + if not self.container.run_inference_only: checkpoint_path_for_recovery = self.checkpoint_handler.get_recovery_or_checkpoint_path_train() @@ -191,7 +204,8 @@ def init_training(self) -> None: container=self.container, resume_from_checkpoint=checkpoint_path_for_recovery, num_nodes=self.container.num_nodes, - multiple_trainloader_mode=self.get_multiple_trainloader_mode()) + multiple_trainloader_mode=self.get_multiple_trainloader_mode(), + azureml_run_for_logging=self.azureml_run_for_logging) rank_info = ", ".join( f"{env}: {os.getenv(env)}" for env in [ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK] @@ -285,7 +299,10 @@ def run_inference(self) -> None: self.checkpoint_handler.get_checkpoint_to_test() if self.container.src_checkpoint else None ) trainer, _ = create_lightning_trainer( - self.container, resume_from_checkpoint=checkpoint_path, num_nodes=1 + container=self.container, + resume_from_checkpoint=checkpoint_path, + num_nodes=1, + azureml_run_for_logging=self.azureml_run_for_logging ) # Change to the outputs folder so that the model can write to current working directory, and still @@ -339,29 +356,37 @@ def run(self) -> None: Driver function to run a ML experiment """ self.setup() - self.init_training() - if not self.container.run_inference_only: - # Backup the environment variables in case we need to run a second training in the unit tests. 
- old_environ = dict(os.environ) - - # do training - with logging_section("Model training"): - self.run_training() - - # load model checkpoint for custom inference or additional validation step - if self.container.has_custom_test_step() or self.container.run_extra_val_epoch: - self.load_model_checkpoint() - - # Run extra validation epoch if enabled - if self.container.run_extra_val_epoch: - with logging_section("Model Validation to save plots on validation set"): - self.run_validation() - - # Kill all processes besides rank 0 - self.after_ddp_cleanup(old_environ) - - # Run inference on a single device - with logging_section("Model inference"): - self.run_inference() - - self.run_regression_test() + try: + self.init_training() + if not self.container.run_inference_only: + # Backup the environment variables in case we need to run a second training in the unit tests. + old_environ = dict(os.environ) + + # do training + with logging_section("Model training"): + self.run_training() + + # load model checkpoint for custom inference or additional validation step + if self.container.has_custom_test_step() or self.container.run_extra_val_epoch: + self.load_model_checkpoint() + + # Run extra validation epoch if enabled + if self.container.run_extra_val_epoch: + with logging_section("Model Validation to save plots on validation set"): + self.run_validation() + + # Kill all processes besides rank 0 + self.after_ddp_cleanup(old_environ) + + # Run inference on a single device + with logging_section("Model inference"): + self.run_inference() + + self.run_regression_test() + + finally: + if self.azureml_run_for_logging is not None: + try: + self.azureml_run_for_logging.complete() + except Exception as ex: + logging.error("Failed to complete AzureML run: %s", ex) diff --git a/hi-ml/src/health_ml/runner.py b/hi-ml/src/health_ml/runner.py index 999aa02b6..230b7057f 100644 --- a/hi-ml/src/health_ml/runner.py +++ b/hi-ml/src/health_ml/runner.py @@ -170,7 +170,7 @@ def additional_run_tags(self, script_params: List[str]) -> Dict[str, str]: """ return { "commandline_args": " ".join(script_params), - "tag": self.experiment_config.tag + "tag": self.lightning_container.tag } def run(self) -> Tuple[LightningContainer, AzureRunInfo]: @@ -207,8 +207,8 @@ def after_submission_hook(azure_run: Run) -> None: """ # Set the default display name to what was provided as the "tag". 
This will affect single runs # and Hyperdrive parent runs - if self.experiment_config.tag: - azure_run.display_name = self.experiment_config.tag + if self.lightning_container.tag: + azure_run.display_name = self.lightning_container.tag root_folder = self.project_root entry_script = Path(sys.argv[0]).resolve() @@ -257,7 +257,7 @@ def after_submission_hook(azure_run: Run) -> None: compute_cluster_name=self.experiment_config.cluster, environment_variables=environment_variables, default_datastore=default_datastore, - experiment_name=self.lightning_container.model_name, # create_experiment_name(), + experiment_name=self.lightning_container.effective_experiment_name, input_datasets=input_datasets, # type: ignore num_nodes=self.experiment_config.num_nodes, wait_for_completion=self.experiment_config.wait_for_completion, @@ -270,12 +270,12 @@ def after_submission_hook(azure_run: Run) -> None: after_submission=after_submission_hook, tags=self.additional_run_tags(script_params) ) - if self.experiment_config.tag and azure_run_info.run: + if self.lightning_container.tag and azure_run_info.run: if self.lightning_container.is_crossvalidation_enabled: # This code is only reached inside Azure. Set display name again - this will now affect # Hypdrive child runs (for other jobs, this has already been done after submission) cv_index = self.lightning_container.crossval_index - full_display_name = f"{self.experiment_config.tag} {cv_index}" + full_display_name = f"{self.lightning_container.tag} {cv_index}" azure_run_info.run.display_name = full_display_name else: diff --git a/hi-ml/src/health_ml/utils/logging.py b/hi-ml/src/health_ml/utils/logging.py index 0b98e3a94..5790efbf0 100644 --- a/hi-ml/src/health_ml/utils/logging.py +++ b/hi-ml/src/health_ml/utils/logging.py @@ -40,6 +40,7 @@ class AzureMLLogger(LightningLoggerBase): def __init__(self, enable_logging_outside_azure_ml: Optional[bool] = False, experiment_name: str = "azureml_logger", + run: Optional[Run] = None, run_name: Optional[str] = None, workspace: Optional[Workspace] = None, workspace_config_path: Optional[Path] = None, @@ -47,38 +48,48 @@ def __init__(self, ) -> None: """ :param enable_logging_outside_azure_ml: If True, the AzureML logger will write metrics to AzureML even if - executed outside of an AzureML run (for example, when working on a separate virtual machine). If False, - the logger will only write metrics to AzureML if the code is actually running inside of AzureML. Default False, - do not log outside of AzureML. + executed outside of an AzureML run (for example, when working on a separate virtual machine). If False, + the logger will only write metrics to AzureML if the code is actually running inside of AzureML. Default + False, do not log outside of AzureML. :param experiment_name: The AzureML experiment that should hold the run when executed outside of AzureML. + :param run: The AzureML run to log to when the ``enable_logging_outside_azure_ml`` flag is True. If None, + a new run will be created. When finished, the run should be completed by calling ``run.complete()``. The + logger itself only calls ``run.flush()`` in its ``finalize()`` method. :param run_name: An optional name for the run (this will be used as the display name in the AzureML UI). This - argument only matters when running outside of AzureML. + argument only matters when running outside of AzureML. :param workspace: If provided, use this workspace to create the run in. :param workspace_config_path: Use this path to read workspace configuration json file. 
If not provided, - use the workspace specified by the `config.json` file in the current working directory or its parents. + use the workspace specified by the `config.json` file in the current working directory or its parents. :param snapshot_directory: The folder that should be included as the code snapshot. By default, no snapshot - is created. Set this to the folder that contains all the code your experiment uses. You can use a file - .amlignore to skip specific files or folders, akin to .gitignore.. + is created. Set this to the folder that contains all the code your experiment uses. You can use a file + .amlignore to skip specific files or folders, akin to .gitignore.. """ super().__init__() self.is_running_in_azure_ml = is_running_in_azure_ml() self.run: Optional[Run] = None - self.has_custom_run = False + self.has_user_provided_run = False + self.enable_logging_outside_azure_ml = enable_logging_outside_azure_ml if self.is_running_in_azure_ml: self.run = RUN_CONTEXT elif enable_logging_outside_azure_ml: - try: - self.run = create_aml_run_object(experiment_name=experiment_name, - run_name=run_name, - workspace=workspace, - workspace_config_path=workspace_config_path, - snapshot_directory=snapshot_directory) - print(f"Writing metrics to run {self.run.id} in experiment {self.run.experiment.name}.") - print(f"To check progress, visit this URL: {self.run.get_portal_url()}") - self.has_custom_run = True - except Exception: - logging.error("Unable to create an AzureML run to store the results.") - raise + if run is not None: + self.run = run + self.has_user_provided_run = True + else: + try: + self.run = create_aml_run_object(experiment_name=experiment_name, + run_name=run_name, + workspace=workspace, + workspace_config_path=workspace_config_path, + snapshot_directory=snapshot_directory) + # Display name should already be set when creating the run object, but this does not happen. + # In unit tests, the run has the expected display name, but not here. Hence, set it again. + self.run.display_name = run_name + except Exception as ex: + logging.error(f"Unable to create an AzureML run to store the results because of {ex}.") + raise + print(f"Writing metrics to run {self.run.id} in experiment {self.run.experiment.name}.") + print(f"To check progress, visit this URL: {self.run.get_portal_url()}") else: print("AzureMLLogger will not write any logs because it is running outside AzureML, and the " "'enable_logging_outside_azure_ml' flag is set to False") @@ -129,9 +140,15 @@ def version(self) -> int: return 0 def finalize(self, status: str) -> None: - if self.run is not None and self.has_custom_run: - # Run.complete should only be called if we created an AzureML run here in the constructor. - self.run.complete() + if self.enable_logging_outside_azure_ml and not self.is_running_in_azure_ml and self.run is not None: + if self.has_user_provided_run: + # The logger uses a run that was provided by the user: Flush it, but do not complete it. + # The user should complete the run after finishing the experiment. This is important when running + # training outside of AzureML, so that training and inference metrics can be written to the same run. + self.run.flush() + else: + # Run.complete should only be called if we created an AzureML run here in the constructor. 
+ self.run.complete() def _preprocess_hyperparams(self, params: Any) -> Dict[str, str]: """ diff --git a/hi-ml/testhiml/testhiml/test_run_ml.py b/hi-ml/testhiml/testhiml/test_run_ml.py index 51900cf91..fa4ed8d3a 100644 --- a/hi-ml/testhiml/testhiml/test_run_ml.py +++ b/hi-ml/testhiml/testhiml/test_run_ml.py @@ -9,12 +9,15 @@ from typing import Generator from unittest.mock import DEFAULT, MagicMock, Mock, patch +from azureml._restclient.constants import RunStatus + from health_ml.configs.hello_world import HelloWorld # type: ignore from health_ml.experiment_config import ExperimentConfig from health_ml.lightning_container import LightningContainer from health_ml.run_ml import MLRunner from health_ml.utils.common_utils import is_gpu_available from health_azure.utils import is_global_rank_zero +from health_ml.utils.logging import AzureMLLogger from testazure.utils_testazure import DEFAULT_WORKSPACE from testhiml.utils.fixed_paths_for_tests import mock_run_id @@ -314,4 +317,62 @@ def test_runner_end_to_end() -> None: runner.setup() runner.init_training() runner.run_training() - assert True + + +@pytest.mark.parametrize("log_from_vm", [True, False]) +def test_log_on_vm(log_from_vm: bool) -> None: + """Test if the AzureML logger is called when the experiment is run outside AzureML.""" + experiment_config = ExperimentConfig(model="HelloWorld") + container = HelloWorld() + container.max_epochs = 1 + # Mimic an experiment name given on the command line. + experiment_name = "unittest" + container.experiment = experiment_name + # The tag is used to identify the run, similar to the behaviour when submitting a run to AzureML. + tag = f"test_log_on_vm [{log_from_vm}]" + container.tag = tag + container.log_from_vm = log_from_vm + runner = MLRunner(experiment_config=experiment_config, container=container) + # When logging to AzureML, need to provide the unit test AML workspace. + # When not logging to AzureML, no workspace (and no authentication) should be needed. + if log_from_vm: + with patch("health_azure.utils.get_workspace", return_value=DEFAULT_WORKSPACE.workspace): + runner.run() + else: + runner.run() + # The PL trainer object is created in the init_training method. + # Check that the AzureML logger is set up correctly. + assert runner.trainer is not None + assert runner.trainer.loggers is not None + assert len(runner.trainer.loggers) > 1 + logger = runner.trainer.loggers[1] + assert isinstance(logger, AzureMLLogger) + if log_from_vm: + assert logger.run is not None + # Check that all user supplied data (experiment and display name) are respected. + assert logger.run.experiment is not None + assert logger.run.experiment.name == experiment_name + assert logger.run.display_name == tag + # Both trainig and inference metrics must be logged in the same Run object. + metrics = logger.run.get_metrics() + assert "test_mse" in metrics + assert "loss" in metrics + # The run must have been correctly marked as completed. 
+ logger.run.wait_for_completion() + assert logger.run.status == RunStatus.COMPLETED + else: + assert logger.run is None + + +def test_experiment_name() -> None: + """Test that the experiment name is set correctly, choosing either the experiment name given on the commandline + or the model name""" + container = HelloWorld() + # No experiment name given on the commandline: use the model name + model_name = "some_model" + container._model_name = model_name + assert container.effective_experiment_name == model_name + # Experiment name given on the commandline: use the experiment name + experiment_name = "unittest" + container.experiment = experiment_name + assert container.effective_experiment_name == experiment_name diff --git a/hi-ml/testhiml/testhiml/utils/test_logging.py b/hi-ml/testhiml/testhiml/utils/test_logging.py index 21a95f819..e321c8ea3 100644 --- a/hi-ml/testhiml/testhiml/utils/test_logging.py +++ b/hi-ml/testhiml/testhiml/utils/test_logging.py @@ -143,7 +143,7 @@ def test_azureml_logger() -> None: logger = create_mock_logger() # On all build agents, this should not be detected as an AzureML run. assert not logger.is_running_in_azure_ml - assert logger.has_custom_run + assert not logger.has_user_provided_run logger.log_metrics({"foo": 1.0}) assert logger.run is not None logger.run.log.assert_called_once_with("foo", 1.0, step=None) @@ -271,7 +271,8 @@ def test_azureml_logger_init1() -> None: with mock.patch("health_ml.utils.logging.RUN_CONTEXT", "foo"): logger = AzureMLLogger(enable_logging_outside_azure_ml=True) assert logger.is_running_in_azure_ml - assert not logger.has_custom_run + assert logger.enable_logging_outside_azure_ml + assert not logger.has_user_provided_run assert logger.run == "foo" # We should be able to call finalize without any effect (logger.run == "foo", which has no # "Complete" method). When running in AzureML, the logger should not @@ -300,7 +301,7 @@ def test_azureml_logger_actual_run() -> None: assert logger.run != RUN_CONTEXT assert isinstance(logger.run, Run) assert logger.run.experiment.name == "azureml_logger" - assert logger.has_custom_run + assert not logger.has_user_provided_run expected_metrics = {"foo": 1.0, "bar": 2.0} logger.log_metrics(expected_metrics) logger.run.flush() @@ -328,13 +329,36 @@ def test_azureml_logger_init4() -> None: snapshot_directory="snapshot", workspace="workspace", # type: ignore workspace_config_path=Path("config_path")) - assert logger.has_custom_run + assert not logger.has_user_provided_run assert logger.run == run_mock mock_create.assert_called_once_with(experiment_name="exp", run_name="run", snapshot_directory="snapshot", workspace="workspace", workspace_config_path=Path("config_path")) + # The run created in the constructor is under the control of the AzureML logger, and should be completed. + # Check that the finalize method calls the run's complete method, but not the run's flush method. + run_mock.flush = MagicMock() + run_mock.complete = MagicMock() + logger.finalize(status="nothing") + run_mock.flush.assert_not_called() + run_mock.complete.assert_called_once() + + +def test_azureml_logger_finalize() -> None: + """Test if the finalize method correctly updates the run status. 
It should only operate on runs that are + outside of AzureML.""" + run_mock = MagicMock() + logger = AzureMLLogger(enable_logging_outside_azure_ml=True, run=run_mock) + assert logger.run is not None + assert logger.has_user_provided_run + run_mock.flush = MagicMock() + run_mock.complete = MagicMock() + # When providing a run explicitly, the finalize method should not call the run's complete method. Completing + # the run is the responsibility of the user. + logger.finalize(status="nothing") + run_mock.flush.assert_called_once() + run_mock.complete.assert_not_called() def test_progress_bar_enable() -> None:
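
The two new tests above pin down the ownership contract that `finalize` now implements: a run the logger created itself is completed, while a run supplied by the caller is only flushed and must be completed by that caller. A condensed, mock-based restatement of that contract (a sketch, assuming `create_aml_run_object` is patched at its import site in `health_ml.utils.logging`):

```python
from unittest.mock import MagicMock, patch

from health_ml.utils.logging import AzureMLLogger

# Caller-supplied run: finalize() flushes it, completion stays with the caller.
provided_run = MagicMock()
logger = AzureMLLogger(enable_logging_outside_azure_ml=True, run=provided_run)
logger.finalize(status="FINISHED")
provided_run.flush.assert_called_once()
provided_run.complete.assert_not_called()

# Logger-created run: finalize() completes it, because nothing else owns it.
created_run = MagicMock()
with patch("health_ml.utils.logging.create_aml_run_object", return_value=created_run):
    logger = AzureMLLogger(enable_logging_outside_azure_ml=True)
logger.finalize(status="FINISHED")
created_run.complete.assert_called_once()
```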