Commit 12dfebf

Merge 'origin/main' into antonsc/gpu_test

ant0nsc committed Mar 30, 2022
2 parents f568b86 + 54bc8ed

Showing 5 changed files with 33 additions and 31 deletions.
22 changes: 12 additions & 10 deletions hi-ml-azure/src/health_azure/datasets.py
@@ -27,10 +27,13 @@ def get_datastore(workspace: Workspace, datastore_name: str) -> Datastore:
     datastores = workspace.datastores
     existing_stores = list(datastores.keys())
     if not datastore_name:
+        # First check if there is only one datastore, which is then obviously unique.
+        # Only then try to use the default datastore, because there may not be a default set.
         if len(existing_stores) == 1:
             return datastores[existing_stores[0]]
-        raise ValueError("No datastore name provided. This is only possible if the workspace has a single datastore. "
-                         f"However, the workspace has {len(existing_stores)} datastores: {existing_stores}")
+        datastore = workspace.get_default_datastore()
+        logging.info(f"Using the workspace default datastore {datastore.name} to access datasets.")
+        return datastore
     if datastore_name in datastores:
         return datastores[datastore_name]
     raise ValueError(f"Datastore \"{datastore_name}\" was not found in the \"{workspace.name}\" workspace. "
@@ -115,27 +118,26 @@ def __init__(self,
             raise ValueError("Can't mount or download a dataset to the current working directory.")
         self.local_folder = Path(local_folder) if local_folder else None

-    def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Optional[Path], Optional[MountContext]]:
+    def to_input_dataset_local(self, workspace: Optional[Workspace]) -> Tuple[Path, Optional[MountContext]]:
         """
         Return a local path to the dataset when outside of an AzureML run.
         If local_folder is supplied, then this is assumed to be a local dataset, and this is returned.
         Otherwise the dataset is mounted or downloaded to either the target folder or a temporary folder and that is
         returned.

         :param workspace: The AzureML workspace to read from.
-        :return: Pair of optional path to dataset and optional mountcontext.
+        :return: Tuple of (path to dataset, optional mountcontext)
         """
-        status = f"Dataset {self.name} will be "
+        status = f"Dataset '{self.name}' will be "

         if self.local_folder is not None:
             status += f"obtained from local folder {str(self.local_folder)}"
             print(status)
             return self.local_folder, None

         if workspace is None:
-            status += "'None' - neither local_folder nor workspace available"
-            print(status)
-            return None, None
+            raise ValueError(f"Unable to make dataset '{self.name}' available for a local run because no AzureML "
+                             "workspace has been provided. Provide a workspace, or set a folder for local execution.")

         azureml_dataset = get_or_create_dataset(workspace=workspace,
                                                 dataset_name=self.name,
@@ -308,8 +310,8 @@ def find_workspace_for_local_datasets(aml_workspace: Optional[Workspace],
     try:
         workspace = get_workspace(aml_workspace, workspace_config_path)
         logging.info(f"Found workspace for datasets: {workspace.name}")
-    except Exception:
-        logging.info("Could not find workspace for datasets.")
+    except Exception as ex:
+        logging.info(f"Could not find workspace for datasets. Exception: {ex}")
     return workspace

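For context, a minimal sketch of how the reworked get_datastore lookup behaves from the caller's side. This is an illustration only; the workspace setup and the datastore name "my_datastore" are hypothetical:

    from azureml.core import Workspace

    from health_azure.datasets import get_datastore

    # Assumes a config.json pointing to a workspace with several datastores
    workspace = Workspace.from_config()

    # Empty name: now falls back to the workspace default datastore instead of raising
    default_store = get_datastore(workspace=workspace, datastore_name="")

    # Explicit name: returns that datastore, or raises ValueError if it does not exist
    named_store = get_datastore(workspace=workspace, datastore_name="my_datastore")
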
8 changes: 4 additions & 4 deletions hi-ml-azure/testazure/testazure/test_datasets.py
@@ -41,11 +41,11 @@ def test_get_datastore() -> None:
         get_datastore(workspace=workspace, datastore_name=does_not_exist)
     assert f"Datastore \"{does_not_exist}\" was not found" in str(ex)

-    # Trying to get a datastore without name should only work if there is a single datastore
+    # Trying to get a datastore when no name is specified should return the workspace default datastore
     assert len(workspace.datastores) > 1
-    with pytest.raises(ValueError) as ex:
-        get_datastore(workspace=workspace, datastore_name="")
-    assert "No datastore name provided" in str(ex)
+    default_datastore = get_datastore(workspace=workspace, datastore_name="")
+    assert default_datastore is not None
+    assert default_datastore.name == workspace.get_default_datastore().name

     # Retrieve a datastore by name
     name = DEFAULT_DATASTORE

18 changes: 8 additions & 10 deletions hi-ml-azure/testazure/testazure/test_himl.py
@@ -691,15 +691,16 @@ def spawn() -> Tuple[int, List[str]]:
                                            cwd=path,
                                            env=env)
         return code, stdout

+    print(f"Starting the script in {path}")
     if suppress_config_creation:
         code, stdout = spawn()
     else:
         with check_config_json(path, shared_config_json=get_shared_config_json()):
             code, stdout = spawn()
+    captured = "\n".join(stdout)
+    print(f"Script console output:\n{captured}")
     assert code == (0 if expected_pass else 1), f"Expected the script to {'pass' if expected_pass else 'fail'}, but " \
                                                 f"got a return code {code}"
-    captured = "\n".join(stdout)
-
     if run_target == RunTarget.LOCAL or not expected_pass:
         assert EXPECTED_QUEUED not in captured
@@ -944,22 +945,19 @@ class TestOutputDataset:
     folder_name: Path


-@pytest.mark.parametrize(["run_target", "local_folder", "suppress_config_creation"],
-                         [(RunTarget.LOCAL, False, False),
-                          (RunTarget.LOCAL, True, False),
-                          (RunTarget.LOCAL, True, True),
-                          (RunTarget.AZUREML, False, False)])
+@pytest.mark.parametrize(["run_target", "local_folder"],
+                         [(RunTarget.LOCAL, False),
+                          (RunTarget.LOCAL, True),
+                          (RunTarget.AZUREML, False)])
 def test_invoking_hello_world_datasets(run_target: RunTarget,
                                        local_folder: bool,
-                                       suppress_config_creation: bool,
                                        tmp_path: Path) -> None:
     """
     Test that invoking rendered 'simple' / 'hello_world_template.txt' elevates itself to AzureML with config.json,
     and that datasets are mounted in all combinations.

     :param run_target: Where to run the script.
     :param local_folder: True to use data in local folder when running locally, False to mount/download data.
-    :param suppress_config_creation: Do not create a config.json file if none exists
     :param tmp_path: PyTest test fixture for temporary path.
     """
     input_count = 5
@@ -1080,7 +1078,7 @@ def test_invoking_hello_world_datasets(run_target: RunTarget,
         """
     }
     extra_args: List[str] = []
-    output = render_and_run_test_script(tmp_path, run_target, extra_options, extra_args, True, suppress_config_creation)
+    output = render_and_run_test_script(tmp_path, run_target, extra_options, extra_args, True)

     for input_dataset in input_datasets:
         for output_dataset in output_datasets:

3 changes: 2 additions & 1 deletion mount_azure_dataset.py
@@ -19,7 +19,8 @@ def mount_dataset(dataset_id: str) -> str:
 if __name__ == '__main__':
     import argparse
     parser = argparse.ArgumentParser()
-    parser.add_argument('dataset_id', type=str,
+    # Run this script as "python mount_azure_dataset.py --dataset_id TCGA-CRCk"
+    parser.add_argument('--dataset_id', type=str,
                         help='Name of the Azure dataset e.g. PANDA or TCGA-CRCk')
     args = parser.parse_args()
     mount_dataset(args.dataset_id)

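A side effect of switching from a positional dataset_id to a --dataset_id flag is that the argument becomes optional and must be passed by name. A small standalone sketch of that argparse behavior (a hypothetical parser, independent of the script above):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_id', type=str,
                        help='Name of the Azure dataset e.g. PANDA or TCGA-CRCk')

    # With a flag, the value is passed by name: "--dataset_id TCGA-CRCk"
    args = parser.parse_args(['--dataset_id', 'TCGA-CRCk'])
    assert args.dataset_id == 'TCGA-CRCk'

    # When the flag is omitted, argparse defaults it to None instead of exiting
    # with an error, as it would for a missing required positional argument
    args = parser.parse_args([])
    assert args.dataset_id is None
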
13 changes: 7 additions & 6 deletions hi-ml/src/health_ml/run_ml.py
@@ -76,12 +76,13 @@ def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None:
         # Set up the paths to the datasets. azure_run_info already has all necessary information, using either
         # the provided local datasets for VM runs, or the AzureML mount points when running in AML.
         # This must happen before container setup because that could already read datasets.
-        if len(azure_run_info.input_datasets) > 0:
-            input_datasets = azure_run_info.input_datasets
-            assert len(input_datasets) > 0
-            local_datasets = [
-                check_dataset_folder_exists(input_dataset) for input_dataset in input_datasets  # type: ignore
-            ]
+        input_datasets = azure_run_info.input_datasets
+        if len(input_datasets) > 0:
+            local_datasets: List[Path] = []
+            for i, dataset in enumerate(input_datasets):
+                if dataset is None:
+                    raise ValueError(f"Invalid setup: The dataset at index {i} is None")
+                local_datasets.append(check_dataset_folder_exists(dataset))
             self.container.local_datasets = local_datasets  # type: ignore
         # Ensure that we use fixed seeds before initializing the PyTorch models
         seed_everything(self.container.get_effective_random_seed())

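The rewritten setup trades the earlier list comprehension (which needed a type: ignore because the list elements are Optional) for an explicit loop that rejects None entries with a clear error message. A minimal standalone sketch of that validate-then-collect pattern, using hypothetical paths:

    from pathlib import Path
    from typing import List, Optional

    def collect_dataset_folders(input_datasets: List[Optional[Path]]) -> List[Path]:
        # Validate every entry before use, so a missing dataset fails fast
        local_datasets: List[Path] = []
        for i, dataset in enumerate(input_datasets):
            if dataset is None:
                raise ValueError(f"Invalid setup: The dataset at index {i} is None")
            local_datasets.append(dataset)
        return local_datasets

    try:
        collect_dataset_folders([Path("/data/foo"), None])  # hypothetical inputs
    except ValueError as ex:
        print(ex)  # Invalid setup: The dataset at index 1 is None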
