Skip to content

Commit

Permalink
BUG: Jobs on Kubernetes compute don't see GPUs (#769)
Browse files Browse the repository at this point in the history
Solved by submitting single-node jobs with MpiDistribution(process_count_per_instance=1)
  • Loading branch information
ant0nsc committed Jan 25, 2023
1 parent ec8b795 commit 6fca6b7
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 15 deletions.
2 changes: 1 addition & 1 deletion hi-ml-azure/run_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
azure-ai-ml>=0.1.0b6
azure-ai-ml>=1.1.1
azureml-core>=1.42.0
azureml-dataset-runtime[fuse]>=1.42.0
azureml-mlflow>=1.42.0
Expand Down
12 changes: 5 additions & 7 deletions hi-ml-azure/src/health_azure/himl.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,12 +547,10 @@ def submit_run_v2(workspace: Optional[Workspace],

def create_command_job(cmd: str) -> Command:
if pytorch_processes_per_node is None:
if num_nodes > 1:
distribution: Any = MpiDistribution(process_count_per_instance=1)
else:
# An empty dictionary for single node jobs would be in line with the type annotations on the
# 'command' function, but this is not recognized by the SDK. So we need to pass None instead.
distribution = None
# On AML managed compute, distribution can be set to None for single-node jobs.
# However, on Kubernetes compute, single-node jobs submitted without a distribution don't see any GPUs.
# GPUs are visible to MpiDistribution jobs, so we use MpiDistribution even for single-node jobs.
distribution: Union[MpiDistribution, PyTorchDistribution] = MpiDistribution(process_count_per_instance=1)
else:
distribution = PyTorchDistribution(process_count_per_instance=pytorch_processes_per_node)
return command(
Expand All @@ -567,7 +565,7 @@ def create_command_job(cmd: str) -> Command:
shm_size=docker_shm_size,
display_name=display_name,
instance_count=num_nodes,
distribution=distribution, # type: ignore
distribution=distribution,
)

if hyperparam_args:
Expand Down
9 changes: 4 additions & 5 deletions hi-ml-azure/testazure/testazure/test_himl.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,7 +798,7 @@ def _mock_sweep(*args: Any, **kwargs: Any) -> MagicMock:
tags=dummy_tags,
shm_size=dummy_docker_shm_size,
display_name=dummy_display_name,
distribution=None,
distribution=MpiDistribution(process_count_per_instance=1),
instance_count=1
)

Expand Down Expand Up @@ -858,7 +858,7 @@ def _mock_sweep(*args: Any, **kwargs: Any) -> MagicMock:
tags=dummy_tags,
shm_size=dummy_docker_shm_size,
display_name=dummy_display_name,
distribution=None,
distribution=MpiDistribution(process_count_per_instance=1),
instance_count=1
)

Expand Down Expand Up @@ -1867,7 +1867,7 @@ def test_submit_to_azure_v2_distributed() -> None:
mock_command.assert_called_once()
_, call_kwargs = mock_command.call_args
assert call_kwargs.get("instance_count") == 1
assert call_kwargs.get("distribution") is None
assert call_kwargs.get("distribution") == MpiDistribution(process_count_per_instance=1)

with pytest.raises(ValueError, match="num_nodes must be >= 1"):
_ = himl.submit_to_azure_if_needed(
Expand Down Expand Up @@ -1925,5 +1925,4 @@ def test_submit_to_azure_v2_distributed() -> None:
_, call_kwargs = mock_command.call_args
assert call_kwargs.get("instance_count") == num_nodes
distribution = call_kwargs.get("distribution")
assert isinstance(distribution, MpiDistribution)
assert distribution.process_count_per_instance == 1
assert distribution == MpiDistribution(process_count_per_instance=1)
2 changes: 1 addition & 1 deletion hi-ml-cpath/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ dependencies:
- astroid==2.12.13
- async-timeout==4.0.2
- attrs==21.4.0
- azure-ai-ml==1.1.2
- azure-ai-ml==1.3.0
- azure-common==1.1.28
- azure-core==1.26.1
- azure-graphrbac==0.61.1
Expand Down
2 changes: 1 addition & 1 deletion hi-ml/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ dependencies:
- async-timeout==4.0.2
- asynctest==0.13.0
- attrs==22.1.0
- azure-ai-ml==1.1.1
- azure-ai-ml==1.3.0
- azure-common==1.1.28
- azure-core==1.26.1
- azure-graphrbac==0.61.1
Expand Down

0 comments on commit 6fca6b7

Please sign in to comment.