Skip to content

Commit

Permalink
ASP-5048 Fix bug when extracting the lead host from the nodelist
Browse files Browse the repository at this point in the history
  • Loading branch information
julianaklulo committed Apr 30, 2024
1 parent 47212d5 commit 3bfd0be
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 20 deletions.
2 changes: 1 addition & 1 deletion lm-agent/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
This file keeps track of all notable changes to `License Manager Agent`.

## Unreleased

* Fix bug in extraction of the lead host when the job runs in multiple nodes [ASP-5048]

## 3.2.0 -- 2024-04-29
* Improve FlexLM, LM-X and OLicense parsers to parse multiple versions of the license server output [ASP-4670]
Expand Down
38 changes: 31 additions & 7 deletions lm-agent/lm_agent/workload_managers/slurm/cmd_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@

from lm_agent.backend_utils.models import LicenseBooking
from lm_agent.logs import logger
from lm_agent.workload_managers.slurm.common import (
CMD_TIMEOUT,
ENCODING,
SACCTMGR_PATH,
SCONTROL_PATH,
SQUEUE_PATH,
)


SCONTROL_PATH = "/usr/bin/scontrol"
SACCTMGR_PATH = "/usr/bin/sacctmgr"
SQUEUE_PATH = "/usr/bin/squeue"
CMD_TIMEOUT = 5
ENCODING = "UTF8"


class SqueueParserUnexpectedInputError(ValueError):
Expand Down Expand Up @@ -145,6 +145,30 @@ async def scontrol_show_lic():
return output


async def get_lead_host(nodelist):
"""
Get the job's lead host from the nodelist.
The lead host is the first node in the nodelist.
The nodelist can contain multiple lists of nodes inside square brackets.
"""
cmd = [
SCONTROL_PATH,
"show",
"hostnames",
nodelist,
]

proc = await asyncio.create_subprocess_shell(
shlex.join(cmd), stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
)

stdout, _ = await asyncio.wait_for(proc.communicate(), CMD_TIMEOUT)
output = str(stdout, encoding=ENCODING)

return output.split("\n")[0]


async def get_cluster_name() -> str:
cmd = [
SACCTMGR_PATH,
Expand Down
16 changes: 7 additions & 9 deletions lm-agent/lm_agent/workload_managers/slurm/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,23 @@
import os

from lm_agent.logs import logger
from lm_agent.workload_managers.slurm.cmd_utils import get_lead_host

SCONTROL_PATH = "/usr/bin/scontrol"
SACCTMGR_PATH = "/usr/bin/sacctmgr"
SQUEUE_PATH = "/usr/bin/squeue"
CMD_TIMEOUT = 5
ENCODING = "UTF8"


def get_job_context():
"""Get and return variables from the job environment."""
async def get_job_context():
"""
Get and return variables from the job environment.
"""
ctxt = dict()
try:
ctxt = {
"cluster_name": os.environ["SLURM_CLUSTER_NAME"],
"job_id": os.environ["SLURM_JOB_ID"],
"lead_host": os.environ["SLURM_JOB_NODELIST"].split(",")[0],
"lead_host": await get_lead_host(os.environ["SLURM_JOB_NODELIST"]),
"user_name": os.environ["SLURM_JOB_USER"],
"job_licenses": os.environ.get("SLURM_JOB_LICENSES", ""),
}

except KeyError as e:
# If not all keys could be assigned, then return non 0 exit status
logger.error(
Expand Down
2 changes: 1 addition & 1 deletion lm-agent/lm_agent/workload_managers/slurm/reservations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from lm_agent.exceptions import CommandFailedToExecute
from lm_agent.logs import logger
from lm_agent.utils import run_command
from lm_agent.workload_managers.slurm.common import SCONTROL_PATH
from lm_agent.workload_managers.slurm.cmd_utils import SCONTROL_PATH


async def scontrol_create_reservation(licenses: str, duration: str) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
async def epilog():
# Initialize the logger
init_logging("slurmctld-epilog")
job_context = get_job_context()
job_context = await get_job_context()
job_id = job_context["job_id"]
job_licenses = job_context["job_licenses"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ async def prolog():
# Initialize the logger
init_logging("slurmctld-prolog")
# Acqure the job context
job_context = get_job_context()
job_context = await get_job_context()
job_id = job_context.get("job_id", "")
user_name = job_context.get("user_name")
lead_host = job_context.get("lead_host")
Expand Down
27 changes: 27 additions & 0 deletions lm-agent/tests/workload_managers/slurm/test_cmd_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
get_all_product_features_from_cluster,
get_required_licenses_for_job,
squeue_parser,
get_lead_host,
)


Expand Down Expand Up @@ -233,3 +234,29 @@ async def test_get_all_features_cluster_values(
):
show_lic_mock.return_value = show_lic_output
assert used_features == await get_all_features_cluster_values()


@mark.parametrize(
"nodelist,scontrol_output,actual_lead_host",
[
("host[1,2,3,4]", b"host1\nhost2\nhost3\nhost4\n", "host1"),
("host1", b"host1", "host1"),
(
"host-1-[1-3,5],host-2-[1,4]",
b"host-1-1\nhost-1-2\nhost-1-3\nhost-1-5\nhost-2-1\nhost-2-4\n",
"host-1-1",
),
],
)
@mark.asyncio
@mock.patch("lm_agent.workload_managers.slurm.cmd_utils.asyncio.create_subprocess_shell")
async def test_get_lead_host(
subprocess_mock: mock.MagicMock, nodelist: str, scontrol_output: bytes, actual_lead_host: str
):
"""
Do I return the correct lead host from the scontrol show hostnames command?
"""
subprocess_mock.return_value.communicate.return_value = (scontrol_output, None)

parsed_lead_host = await get_lead_host(nodelist)
assert parsed_lead_host == actual_lead_host

0 comments on commit 3bfd0be

Please sign in to comment.