From 54bc8ed8a0bb60962745f3b49c70cbf921c0bf91 Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer
Date: Wed, 30 Mar 2022 09:48:29 +0100
Subject: [PATCH] ENH: Using default workspace datastore when datastore name
 is missing (#275)

---
 hi-ml-azure/src/health_azure/datasets.py      | 22 ++++++++++---------
 .../testazure/testazure/test_datasets.py      |  8 +++----
 hi-ml-azure/testazure/testazure/test_himl.py  | 18 +++++++--------
 .../testazure/testazure/utils_testazure.py    |  5 +++--
 .../scripts/mount_azure_dataset.py            |  3 ++-
 hi-ml/src/health_ml/run_ml.py                 | 13 ++++++-----
 6 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/hi-ml-azure/src/health_azure/datasets.py b/hi-ml-azure/src/health_azure/datasets.py
index 4d9d14b19..6b62531ac 100644
--- a/hi-ml-azure/src/health_azure/datasets.py
+++ b/hi-ml-azure/src/health_azure/datasets.py
@@ -27,10 +27,13 @@ def get_datastore(workspace: Workspace, datastore_name: str) -> Datastore:
     datastores = workspace.datastores
     existing_stores = list(datastores.keys())
     if not datastore_name:
+        # First check if there is only one datastore, which is then obviously unique.
+        # Only then try to use the default datastore, because there may not be a default set.
         if len(existing_stores) == 1:
             return datastores[existing_stores[0]]
-        raise ValueError("No datastore name provided. This is only possible if the workspace has a single datastore. "
-                         f"However, the workspace has {len(existing_stores)} datastores: {existing_stores}")
+        datastore = workspace.get_default_datastore()
+        logging.info(f"Using the workspace default datastore {datastore.name} to access datasets.")
+        return datastore
     if datastore_name in datastores:
         return datastores[datastore_name]
     raise ValueError(f"Datastore \"{datastore_name}\" was not found in the \"{workspace.name}\" workspace. "
@@ -115,7 +118,7 @@ def __init__(self,
             raise ValueError("Can't mount or download a dataset to the current working directory.")
         self.local_folder = Path(local_folder) if local_folder else None
 
-    def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Optional[Path], Optional[MountContext]]:
+    def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Path, Optional[MountContext]]:
         """
         Return a local path to the dataset when outside of an AzureML run.
         If local_folder is supplied, then this is assumed to be a local dataset, and this is returned.
@@ -123,9 +126,9 @@ def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Option
         returned.
 
         :param workspace: The AzureML workspace to read from.
-        :return: Pair of optional path to dataset and optional mountcontext.
+        :return: Tuple of (path to dataset, optional mountcontext)
         """
-        status = f"Dataset {self.name} will be "
+        status = f"Dataset '{self.name}' will be "
         if self.local_folder is not None:
             status += f"obtained from local folder {str(self.local_folder)}"
@@ -133,9 +136,8 @@ def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Option
             return self.local_folder, None
 
         if workspace is None:
-            status += "'None' - neither local_folder nor workspace available"
-            print(status)
-            return None, None
+            raise ValueError(f"Unable to make dataset '{self.name}' available for a local run because no AzureML "
+                             "workspace has been provided. Provide a workspace, or set a folder for local execution.")
 
         azureml_dataset = get_or_create_dataset(workspace=workspace,
                                                 dataset_name=self.name,
@@ -308,8 +310,8 @@ def find_workspace_for_local_datasets(aml_workspace: Optional[Workspace],
     try:
         workspace = get_workspace(aml_workspace, workspace_config_path)
         logging.info(f"Found workspace for datasets: {workspace.name}")
-    except Exception:
-        logging.info("Could not find workspace for datasets.")
+    except Exception as ex:
+        logging.info(f"Could not find workspace for datasets. Exception: {ex}")
     return workspace
 
 
diff --git a/hi-ml-azure/testazure/testazure/test_datasets.py b/hi-ml-azure/testazure/testazure/test_datasets.py
index b3731ccf5..096cc0034 100644
--- a/hi-ml-azure/testazure/testazure/test_datasets.py
+++ b/hi-ml-azure/testazure/testazure/test_datasets.py
@@ -41,11 +41,11 @@ def test_get_datastore() -> None:
         get_datastore(workspace=workspace, datastore_name=does_not_exist)
     assert f"Datastore \"{does_not_exist}\" was not found" in str(ex)
 
-    # Trying to get a datastore without name should only work if there is a single datastore
+    # Trying to get a datastore when no name is specified should return the workspace default datastore
     assert len(workspace.datastores) > 1
-    with pytest.raises(ValueError) as ex:
-        get_datastore(workspace=workspace, datastore_name="")
-    assert "No datastore name provided" in str(ex)
+    default_datastore = get_datastore(workspace=workspace, datastore_name="")
+    assert default_datastore is not None
+    assert default_datastore.name == workspace.get_default_datastore().name
 
     # Retrieve a datastore by name
     name = DEFAULT_DATASTORE
diff --git a/hi-ml-azure/testazure/testazure/test_himl.py b/hi-ml-azure/testazure/testazure/test_himl.py
index 233669636..4c02f79d2 100644
--- a/hi-ml-azure/testazure/testazure/test_himl.py
+++ b/hi-ml-azure/testazure/testazure/test_himl.py
@@ -690,15 +690,16 @@ def spawn() -> Tuple[int, List[str]]:
                               cwd=path,
                               env=env)
         return code, stdout
-
+    print(f"Starting the script in {path}")
     if suppress_config_creation:
         code, stdout = spawn()
     else:
         with check_config_json(path):
             code, stdout = spawn()
+    captured = "\n".join(stdout)
+    print(f"Script console output:\n{captured}")
     assert code == 0 if expected_pass else 1, f"Expected the script to {'pass' if expected_pass else 'fail'}, but " \
                                               f"got a return code {code}"
-    captured = "\n".join(stdout)
 
     if run_target == RunTarget.LOCAL or not expected_pass:
         assert EXPECTED_QUEUED not in captured
@@ -943,14 +944,12 @@ class TestOutputDataset:
     folder_name: Path
 
 
-@pytest.mark.parametrize(["run_target", "local_folder", "suppress_config_creation"],
-                         [(RunTarget.LOCAL, False, False),
-                          (RunTarget.LOCAL, True, False),
-                          (RunTarget.LOCAL, True, True),
-                          (RunTarget.AZUREML, False, False)])
+@pytest.mark.parametrize(["run_target", "local_folder"],
+                         [(RunTarget.LOCAL, False),
+                          (RunTarget.LOCAL, True),
+                          (RunTarget.AZUREML, False)])
 def test_invoking_hello_world_datasets(run_target: RunTarget,
                                        local_folder: bool,
-                                       suppress_config_creation: bool,
                                        tmp_path: Path) -> None:
     """
     Test that invoking rendered 'simple' / 'hello_world_template.txt' elevates itself to AzureML with config.json,
 
     :param run_target: Where to run the script.
     :param local_folder: True to use data in local folder when running locally, False to mount/download data.
-    :param suppress_config_creation: Do not create a config.json file if none exists
     :param tmp_path: PyTest test fixture for temporary path.
""" input_count = 5 @@ -1079,7 +1077,7 @@ def test_invoking_hello_world_datasets(run_target: RunTarget, """ } extra_args: List[str] = [] - output = render_and_run_test_script(tmp_path, run_target, extra_options, extra_args, True, suppress_config_creation) + output = render_and_run_test_script(tmp_path, run_target, extra_options, extra_args, True) for input_dataset in input_datasets: for output_dataset in output_datasets: diff --git a/hi-ml-azure/testazure/testazure/utils_testazure.py b/hi-ml-azure/testazure/testazure/utils_testazure.py index 2e3e2551c..aad3582cf 100644 --- a/hi-ml-azure/testazure/testazure/utils_testazure.py +++ b/hi-ml-azure/testazure/testazure/utils_testazure.py @@ -81,8 +81,9 @@ def check_config_json(script_folder: Path) -> Generator: """ shared_config_json = get_shared_config_json() target_config_json = script_folder / WORKSPACE_CONFIG_JSON - if shared_config_json.exists(): - logging.info(f"Copying {WORKSPACE_CONFIG_JSON} from repository root to folder {script_folder}") + logging.info(f"Checking if configuration file {shared_config_json} exists") + if shared_config_json.is_file(): + logging.info(f"Copying configuration file to folder {script_folder}") shutil.copy(shared_config_json, target_config_json) else: logging.info(f"Creating {str(target_config_json)} from environment variables.") diff --git a/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py b/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py index c663ea204..c5550b603 100644 --- a/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py +++ b/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py @@ -19,7 +19,8 @@ def mount_dataset(dataset_id: str) -> str: if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() - parser.add_argument('dataset_id', type=str, + # Run this script as "python mount_azure_dataset.py --dataset_id TCGA-CRCk" + parser.add_argument('--dataset_id', type=str, help='Name of the Azure dataset e.g. PANDA or TCGA-CRCk') args = parser.parse_args() mount_dataset(args.dataset_id) diff --git a/hi-ml/src/health_ml/run_ml.py b/hi-ml/src/health_ml/run_ml.py index 1d30b02b3..6b3e756d0 100644 --- a/hi-ml/src/health_ml/run_ml.py +++ b/hi-ml/src/health_ml/run_ml.py @@ -76,12 +76,13 @@ def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None: # Set up the paths to the datasets. azure_run_info already has all necessary information, using either # the provided local datasets for VM runs, or the AzureML mount points when running in AML. # This must happen before container setup because that could already read datasets. - if len(azure_run_info.input_datasets) > 0: - input_datasets = azure_run_info.input_datasets - assert len(input_datasets) > 0 - local_datasets = [ - check_dataset_folder_exists(input_dataset) for input_dataset in input_datasets # type: ignore - ] + input_datasets = azure_run_info.input_datasets + if len(input_datasets) > 0: + local_datasets: List[Path] = [] + for i, dataset in enumerate(input_datasets): + if dataset is None: + raise ValueError(f"Invalid setup: The dataset at index {i} is None") + local_datasets.append(check_dataset_folder_exists(dataset)) self.container.local_datasets = local_datasets # type: ignore # Ensure that we use fixed seeds before initializing the PyTorch models seed_everything(self.container.get_effective_random_seed())