From 54bc8ed8a0bb60962745f3b49c70cbf921c0bf91 Mon Sep 17 00:00:00 2001
From: Anton Schwaighofer
Date: Wed, 30 Mar 2022 09:48:29 +0100
Subject: [PATCH] ENH: Using default workspace datastore when datastore name
 is missing (#275)

---
 hi-ml-azure/src/health_azure/datasets.py      | 22 ++++++++++---------
 .../testazure/testazure/test_datasets.py      |  8 +++----
 hi-ml-azure/testazure/testazure/test_himl.py  | 18 +++++++--------
 .../testazure/testazure/utils_testazure.py    |  5 +++--
 .../scripts/mount_azure_dataset.py            |  3 ++-
 hi-ml/src/health_ml/run_ml.py                 | 13 ++++++-----
 6 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/hi-ml-azure/src/health_azure/datasets.py b/hi-ml-azure/src/health_azure/datasets.py
index 4d9d14b19..6b62531ac 100644
--- a/hi-ml-azure/src/health_azure/datasets.py
+++ b/hi-ml-azure/src/health_azure/datasets.py
@@ -27,10 +27,13 @@ def get_datastore(workspace: Workspace, datastore_name: str) -> Datastore:
     datastores = workspace.datastores
     existing_stores = list(datastores.keys())
     if not datastore_name:
+        # First check if there is only one datastore, which is then obviously unique.
+        # Only then try to use the default datastore, because there may not be a default set.
         if len(existing_stores) == 1:
             return datastores[existing_stores[0]]
-        raise ValueError("No datastore name provided. This is only possible if the workspace has a single datastore. "
-                         f"However, the workspace has {len(existing_stores)} datastores: {existing_stores}")
+        datastore = workspace.get_default_datastore()
+        logging.info(f"Using the workspace default datastore {datastore.name} to access datasets.")
+        return datastore
     if datastore_name in datastores:
         return datastores[datastore_name]
     raise ValueError(f"Datastore \"{datastore_name}\" was not found in the \"{workspace.name}\" workspace. "
@@ -115,7 +118,7 @@ def __init__(self,
             raise ValueError("Can't mount or download a dataset to the current working directory.")
         self.local_folder = Path(local_folder) if local_folder else None
 
-    def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Optional[Path], Optional[MountContext]]:
+    def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Path, Optional[MountContext]]:
         """
         Return a local path to the dataset when outside of an AzureML run.
         If local_folder is supplied, then this is assumed to be a local dataset, and this is returned.
@@ -123,9 +126,9 @@ def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Option
         returned.
 
         :param workspace: The AzureML workspace to read from.
-        :return: Pair of optional path to dataset and optional mountcontext.
+        :return: Tuple of (path to dataset, optional mountcontext)
         """
-        status = f"Dataset {self.name} will be "
+        status = f"Dataset '{self.name}' will be "
         if self.local_folder is not None:
             status += f"obtained from local folder {str(self.local_folder)}"
@@ -133,9 +136,8 @@ def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Option
             return self.local_folder, None
 
         if workspace is None:
-            status += "'None' - neither local_folder nor workspace available"
-            print(status)
-            return None, None
+            raise ValueError(f"Unable to make dataset '{self.name}' available for a local run because no AzureML "
+                             "workspace has been provided. Provide a workspace, or set a folder for local execution.")
 
         azureml_dataset = get_or_create_dataset(workspace=workspace,
                                                 dataset_name=self.name,
@@ -308,8 +310,8 @@ def find_workspace_for_local_datasets(aml_workspace: Optional[Workspace],
     try:
         workspace = get_workspace(aml_workspace, workspace_config_path)
         logging.info(f"Found workspace for datasets: {workspace.name}")
-    except Exception:
-        logging.info("Could not find workspace for datasets.")
+    except Exception as ex:
+        logging.info(f"Could not find workspace for datasets. Exception: {ex}")
     return workspace
 
 
diff --git a/hi-ml-azure/testazure/testazure/test_datasets.py b/hi-ml-azure/testazure/testazure/test_datasets.py
index b3731ccf5..096cc0034 100644
--- a/hi-ml-azure/testazure/testazure/test_datasets.py
+++ b/hi-ml-azure/testazure/testazure/test_datasets.py
@@ -41,11 +41,11 @@ def test_get_datastore() -> None:
         get_datastore(workspace=workspace, datastore_name=does_not_exist)
     assert f"Datastore \"{does_not_exist}\" was not found" in str(ex)
 
-    # Trying to get a datastore without name should only work if there is a single datastore
+    # Trying to get a datastore when no name is specified should return the workspace default datastore
     assert len(workspace.datastores) > 1
-    with pytest.raises(ValueError) as ex:
-        get_datastore(workspace=workspace, datastore_name="")
-    assert "No datastore name provided" in str(ex)
+    default_datastore = get_datastore(workspace=workspace, datastore_name="")
+    assert default_datastore is not None
+    assert default_datastore.name == workspace.get_default_datastore().name
 
     # Retrieve a datastore by name
     name = DEFAULT_DATASTORE
diff --git a/hi-ml-azure/testazure/testazure/test_himl.py b/hi-ml-azure/testazure/testazure/test_himl.py
index 233669636..4c02f79d2 100644
--- a/hi-ml-azure/testazure/testazure/test_himl.py
+++ b/hi-ml-azure/testazure/testazure/test_himl.py
@@ -690,15 +690,16 @@ def spawn() -> Tuple[int, List[str]]:
                               cwd=path,
                               env=env)
         return code, stdout
-
+    print(f"Starting the script in {path}")
     if suppress_config_creation:
         code, stdout = spawn()
     else:
         with check_config_json(path):
             code, stdout = spawn()
+    captured = "\n".join(stdout)
+    print(f"Script console output:\n{captured}")
     assert code == 0 if expected_pass else 1, f"Expected the script to {'pass' if expected_pass else 'fail'}, but " \
                                               f"got a return code {code}"
-    captured = "\n".join(stdout)
 
     if run_target == RunTarget.LOCAL or not expected_pass:
         assert EXPECTED_QUEUED not in captured
@@ -943,14 +944,12 @@ class TestOutputDataset:
     folder_name: Path
 
 
-@pytest.mark.parametrize(["run_target", "local_folder", "suppress_config_creation"],
-                         [(RunTarget.LOCAL, False, False),
-                          (RunTarget.LOCAL, True, False),
-                          (RunTarget.LOCAL, True, True),
-                          (RunTarget.AZUREML, False, False)])
+@pytest.mark.parametrize(["run_target", "local_folder"],
+                         [(RunTarget.LOCAL, False),
+                          (RunTarget.LOCAL, True),
+                          (RunTarget.AZUREML, False)])
 def test_invoking_hello_world_datasets(run_target: RunTarget,
                                        local_folder: bool,
-                                       suppress_config_creation: bool,
                                        tmp_path: Path) -> None:
     """
     Test that invoking rendered 'simple' / 'hello_world_template.txt' elevates itself to AzureML with config.json,
 
     :param run_target: Where to run the script.
     :param local_folder: True to use data in local folder when running locally, False to mount/download data.
-    :param suppress_config_creation: Do not create a config.json file if none exists
     :param tmp_path: PyTest test fixture for temporary path.
""" input_count = 5 @@ -1079,7 +1077,7 @@ def test_invoking_hello_world_datasets(run_target: RunTarget, """ } extra_args: List[str] = [] - output = render_and_run_test_script(tmp_path, run_target, extra_options, extra_args, True, suppress_config_creation) + output = render_and_run_test_script(tmp_path, run_target, extra_options, extra_args, True) for input_dataset in input_datasets: for output_dataset in output_datasets: diff --git a/hi-ml-azure/testazure/testazure/utils_testazure.py b/hi-ml-azure/testazure/testazure/utils_testazure.py index 2e3e2551c..aad3582cf 100644 --- a/hi-ml-azure/testazure/testazure/utils_testazure.py +++ b/hi-ml-azure/testazure/testazure/utils_testazure.py @@ -81,8 +81,9 @@ def check_config_json(script_folder: Path) -> Generator: """ shared_config_json = get_shared_config_json() target_config_json = script_folder / WORKSPACE_CONFIG_JSON - if shared_config_json.exists(): - logging.info(f"Copying {WORKSPACE_CONFIG_JSON} from repository root to folder {script_folder}") + logging.info(f"Checking if configuration file {shared_config_json} exists") + if shared_config_json.is_file(): + logging.info(f"Copying configuration file to folder {script_folder}") shutil.copy(shared_config_json, target_config_json) else: logging.info(f"Creating {str(target_config_json)} from environment variables.") diff --git a/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py b/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py index c663ea204..c5550b603 100644 --- a/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py +++ b/hi-ml-histopathology/src/histopathology/scripts/mount_azure_dataset.py @@ -19,7 +19,8 @@ def mount_dataset(dataset_id: str) -> str: if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() - parser.add_argument('dataset_id', type=str, + # Run this script as "python mount_azure_dataset.py --dataset_id TCGA-CRCk" + parser.add_argument('--dataset_id', type=str, help='Name of the Azure dataset e.g. PANDA or TCGA-CRCk') args = parser.parse_args() mount_dataset(args.dataset_id) diff --git a/hi-ml/src/health_ml/run_ml.py b/hi-ml/src/health_ml/run_ml.py index 1d30b02b3..6b3e756d0 100644 --- a/hi-ml/src/health_ml/run_ml.py +++ b/hi-ml/src/health_ml/run_ml.py @@ -76,12 +76,13 @@ def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None: # Set up the paths to the datasets. azure_run_info already has all necessary information, using either # the provided local datasets for VM runs, or the AzureML mount points when running in AML. # This must happen before container setup because that could already read datasets. - if len(azure_run_info.input_datasets) > 0: - input_datasets = azure_run_info.input_datasets - assert len(input_datasets) > 0 - local_datasets = [ - check_dataset_folder_exists(input_dataset) for input_dataset in input_datasets # type: ignore - ] + input_datasets = azure_run_info.input_datasets + if len(input_datasets) > 0: + local_datasets: List[Path] = [] + for i, dataset in enumerate(input_datasets): + if dataset is None: + raise ValueError(f"Invalid setup: The dataset at index {i} is None") + local_datasets.append(check_dataset_folder_exists(dataset)) self.container.local_datasets = local_datasets # type: ignore # Ensure that we use fixed seeds before initializing the PyTorch models seed_everything(self.container.get_effective_random_seed())