From cb17cce41ff656e2971922cc9677786e4fd23d3f Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 30 Oct 2025 10:38:10 -0600 Subject: [PATCH 1/3] Let seq and rankfile mappers compute their own num-procs If we are using the seq or rankfile mapper and have multiple apps on the cmd line, then allow the mappers to compute their own num procs if one or more are not given. Signed-off-by: Ralph Castain --- src/mca/rmaps/base/rmaps_base_map_job.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index 0452f84bf9..4a9cd3f2a7 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -504,6 +504,13 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) options.nprocs += app->num_procs; continue; } + + if (PRTE_MAPPING_SEQ == PRTE_GET_MAPPING_POLICY(jdata->map->mapping) || + PRTE_MAPPING_BYUSER == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + // these mappers compute their #procs as they go + continue; + } + if (1 < jdata->num_apps && 0 == app->num_procs) { pmix_show_help("help-prte-rmaps-base.txt", "multi-apps-and-zero-np", true, @@ -562,6 +569,7 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) HWLOC_OBJ_PU); } } + } else { if (NULL != options.cpuset) { ck = PMIX_ARGV_SPLIT_COMPAT(options.cpuset, ','); From 58130c688f27bc5e539c8811b39565e071da0d92 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 30 Oct 2025 12:08:41 -0600 Subject: [PATCH 2/3] Fix relative node processing The empty nodes were not properly being added to the list of names to be used by the mapper. 
Signed-off-by: Ralph Castain --- src/util/dash_host/dash_host.c | 40 ++++++++++++++++++++++++--- src/util/dash_host/help-dash-host.txt | 4 ++- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/util/dash_host/dash_host.c b/src/util/dash_host/dash_host.c index 079eb4de41..577bc83477 100644 --- a/src/util/dash_host/dash_host.c +++ b/src/util/dash_host/dash_host.c @@ -450,14 +450,19 @@ int prte_util_add_dash_host_nodes(pmix_list_t *nodes, char *hosts, bool allocati */ static int parse_dash_host(char ***mapped_nodes, char *hosts) { - int32_t j, k; + int32_t j, k, start, idx; int rc = PRTE_SUCCESS; char **mini_map = NULL, *cptr; - int nodeidx; + int nodeidx, nnodes, p; prte_node_t *node; char **host_argv = NULL; host_argv = PMIX_ARGV_SPLIT_COMPAT(hosts, ','); + if (prte_hnp_is_allocated) { + start = 0; + } else { + start = 1; + } /* Accumulate all of the host name mappings */ for (j = 0; j < PMIX_ARGV_COUNT_COMPAT(host_argv); ++j) { @@ -472,8 +477,35 @@ static int parse_dash_host(char ***mapped_nodes, char *hosts) */ if (NULL != (cptr = strchr(mini_map[k], ':'))) { /* the colon indicates a specific # are requested */ - *cptr = '*'; - PMIX_ARGV_APPEND_NOSIZE_COMPAT(mapped_nodes, cptr); + ++cptr; + if ('\0' == *cptr) { + // missing number of nodes being requested + pmix_show_help("help-dash-host.txt", + "dash-host:invalid-relative-node-syntax", true, + mini_map[k]); + rc = PRTE_ERR_SILENT; + goto cleanup; + } + nnodes = strtol(cptr, NULL, 10); + for (idx=start, p=0; idx < (int32_t)prte_node_pool->size && p < nnodes; idx++) { + node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, idx); + if (NULL == node) { + continue; + } + // if the node is empty, capture it + if (0 == node->num_procs) { + PMIX_ARGV_APPEND_NOSIZE_COMPAT(mapped_nodes, node->name); + ++p; + } + } + if (p < nnodes) { + // not enough empty nodes + pmix_show_help("help-dash-host.txt", + "dash-host:not-enough-empty", true, + nnodes-p); + rc = PRTE_ERR_SILENT; + goto cleanup; + } } else { /* add 
a marker to the list */ PMIX_ARGV_APPEND_NOSIZE_COMPAT(mapped_nodes, "*"); diff --git a/src/util/dash_host/help-dash-host.txt b/src/util/dash_host/help-dash-host.txt index a9b43afcef..902cdffd36 100644 --- a/src/util/dash_host/help-dash-host.txt +++ b/src/util/dash_host/help-dash-host.txt @@ -48,7 +48,9 @@ A relative host was improperly specified — the value provided was. --host: %s You may have forgotten to preface a node with "N" or "n", or used the -"e" or "E" to indicate empty nodes. +"e" or "E" to indicate empty nodes, or you ended the value with a +colon but forgot to include the number of empty nodes you were +requesting. Re-run this command with "--help hosts" for further information. # From 2ff7d6b355a2d42c2a3e97a08b75d22c45128c0b Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 31 Oct 2025 09:37:46 -0600 Subject: [PATCH 3/3] Replace sprintf with snprintf Per note in the OMPI project, at least one compiler family is removing the "sprintf" function. Replace all uses of that function with the safer "snprintf" version. 
Signed-off-by: Ralph Castain --- examples/debugger/direct-multi.c | 2 +- examples/debugger/direct.c | 4 +- src/mca/rmaps/rank_file/rmaps_rank_file.c | 12 +-- src/rml/oob/oob_tcp.c | 2 +- test/double-get.c | 6 +- test/get-nofence.c | 2 +- test/ptrace/ptrace_spawn_stopped.cxx | 119 ---------------------- 7 files changed, 14 insertions(+), 133 deletions(-) delete mode 100644 test/ptrace/ptrace_spawn_stopped.cxx diff --git a/examples/debugger/direct-multi.c b/examples/debugger/direct-multi.c index 99719b90d4..e7f11d18ea 100644 --- a/examples/debugger/direct-multi.c +++ b/examples/debugger/direct-multi.c @@ -564,7 +564,7 @@ static pmix_status_t spawn_app(void) } else { PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL); // All procs stop in PMIx_Init } - sprintf(map_str, "ppr:%d:node", app_npernode); + snprintf(map_str, 30, "ppr:%d:node", app_npernode); PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_MAPBY, map_str, PMIX_STRING); // app procs/node PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_RANKBY, "slot", PMIX_STRING); // match baseline PMIX_INFO_LIST_ADD(rc, tinfo, PMIX_FWD_STDOUT, NULL, PMIX_BOOL); // forward stdout to me diff --git a/examples/debugger/direct.c b/examples/debugger/direct.c index 8edffc96fb..c36f0412ad 100644 --- a/examples/debugger/direct.c +++ b/examples/debugger/direct.c @@ -338,7 +338,7 @@ static int cospawn_launch(myrel_t *myrel) /* Process that is spawning processes is a tool process */ PMIX_INFO_LIST_ADD(rc, dirs, PMIX_REQUESTOR_IS_TOOL, NULL, PMIX_BOOL); /* Map spawned processes by slot */ - sprintf(map_str, "ppr:%d:node", app_npernode); + snprintf(map_str, 128, "ppr:%d:node", app_npernode); PMIX_INFO_LIST_ADD(rc, dirs, PMIX_MAPBY, map_str, PMIX_STRING); PMIX_INFO_LIST_CONVERT(rc, dirs, &darray); PMIX_INFO_LIST_RELEASE(dirs); @@ -824,7 +824,7 @@ int main(int argc, char **argv) // procs are to pause in PMIx_Init for debugger attach PMIX_INFO_LIST_ADD(rc, dirs, PMIX_DEBUG_STOP_IN_INIT, NULL, PMIX_BOOL); } - sprintf(map_str, "ppr:%d:node", 
app_npernode); + snprintf(map_str, 128, "ppr:%d:node", app_npernode); PMIX_INFO_LIST_ADD(rc, dirs, PMIX_MAPBY, map_str, PMIX_STRING); // 1 per node PMIX_INFO_LIST_ADD(rc, dirs, PMIX_FWD_STDOUT, NULL, PMIX_BOOL); // forward stdout to me PMIX_INFO_LIST_ADD(rc, dirs, PMIX_FWD_STDERR, NULL, PMIX_BOOL); // forward stderr to me diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file.c b/src/mca/rmaps/rank_file/rmaps_rank_file.c index 42cd5da03d..e228fdbded 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file.c @@ -562,7 +562,7 @@ static int prte_rmaps_rank_file_parse(const char *rankfile) case PRTE_RANKFILE_INT: case PRTE_RANKFILE_RELATIVE: if (PRTE_RANKFILE_INT == token) { - sprintf(buff, "%d", prte_rmaps_rank_file_value.ival); + snprintf(buff,RMAPS_RANK_FILE_MAX_SLOTS, "%d", prte_rmaps_rank_file_value.ival); value = buff; } else { value = prte_rmaps_rank_file_value.sval; @@ -627,7 +627,7 @@ static int prte_rmaps_rank_file_parse(const char *rankfile) goto unlock; } else { /* prepare rank assignment string for the help message in case of a bad-assign */ - sprintf(tmp_rank_assignment, "%s slot=%s", node_name, value); + snprintf(tmp_rank_assignment, RMAPS_RANK_FILE_MAX_SLOTS, "%s slot=%s", node_name, value); pmix_pointer_array_set_item(assigned_ranks_array, 0, tmp_rank_assignment); } @@ -671,7 +671,7 @@ static char *prte_rmaps_rank_file_parse_string_or_int(void) case PRTE_RANKFILE_STRING: return strdup(prte_rmaps_rank_file_value.sval); case PRTE_RANKFILE_INT: - sprintf(tmp_str, "%d", prte_rmaps_rank_file_value.ival); + snprintf(tmp_str, RMAPS_RANK_FILE_MAX_SLOTS, "%d", prte_rmaps_rank_file_value.ival); return strdup(tmp_str); default: return NULL; @@ -787,7 +787,7 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c // session dir + / (1) + lsf_rf. 
(7) + XXXXXX (6) + \0 (1) len = strlen(prte_process_info.top_session_dir) + 1 + 7 + 6 + 1; (*aff_rankfile) = (char*) malloc(sizeof(char) * len); - sprintf(*aff_rankfile, "%s/lsf_rf.XXXXXX", prte_process_info.top_session_dir); + snprintf(*aff_rankfile, len, "%s/lsf_rf.XXXXXX", prte_process_info.top_session_dir); /* open the file */ fp = fopen(affinity_file, "r"); @@ -882,7 +882,7 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c free(cpus[i]); // 10 max number of digits in an int cpus[i] = (char*)malloc(sizeof(char) * 10); - sprintf(cpus[i], "%d", obj->logical_index); + snprintf(cpus[i], 10, "%d", obj->logical_index); } sep = PMIX_ARGV_JOIN_COMPAT(cpus, ','); PMIX_ARGV_FREE_COMPAT(cpus); @@ -893,7 +893,7 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c // "rank " (5) + id (max 10) + = (1) + host (?) + " slot=" (6) + ids (?) + '\0' (1) len = 5 + 10 + 1 + strlen(hstname) + 6 + strlen(sep) + 1; tmp_str = (char *)malloc(sizeof(char) * len); - sprintf(tmp_str, "rank %d=%s slot=%s\n", cur_rank, hstname, sep); + snprintf(tmp_str, len, "rank %d=%s slot=%s\n", cur_rank, hstname, sep); pmix_fd_write(fp_rank, strlen(tmp_str), tmp_str); free(tmp_str); ++cur_rank; diff --git a/src/rml/oob/oob_tcp.c b/src/rml/oob/oob_tcp.c index db3cd7824b..00793ffd9f 100644 --- a/src/rml/oob/oob_tcp.c +++ b/src/rml/oob/oob_tcp.c @@ -287,7 +287,7 @@ int prte_oob_open(void) sizeof(copied_interface->if_mac)); copied_interface->ifmtu = selected_interface->ifmtu; /* Add the if_mask to the list */ - sprintf(string, "%d", selected_interface->if_mask); + snprintf(string, 50, "%d", selected_interface->if_mask); PMIX_ARGV_APPEND_NOSIZE_COMPAT(&prte_oob_base.if_masks, string); pmix_list_append(&prte_oob_base.local_ifs, &(copied_interface->super)); } diff --git a/test/double-get.c b/test/double-get.c index a2f1fc3f0e..9aeca0713f 100644 --- a/test/double-get.c +++ b/test/double-get.c @@ -154,8 +154,8 @@ int main(int argc, char *argv[]) 
PMIX_VALUE_RELEASE(pvalue); /* the below two lines break the subsequent PMIx_Get query on a key set later */ - sprintf(data, "FIRST TIME rank %d", myproc.rank); - pmi_set_string("test-key-1", data, 256); + snprintf(data, 256, "FIRST TIME rank %d", myproc.rank); + pmi_set_string("test-key-1", data, 256); pmix_exchange(true); if (1 == myproc.rank) { @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) } } - sprintf(data, "SECOND TIME rank %d", myproc.rank); + snprintf(data, 256, "SECOND TIME rank %d", myproc.rank); if (0 == myproc.rank) { pmi_set_string("test-key-2", data, 256); } else { diff --git a/test/get-nofence.c b/test/get-nofence.c index ff74310e68..28ec44c634 100644 --- a/test/get-nofence.c +++ b/test/get-nofence.c @@ -163,7 +163,7 @@ int main(int argc, char *argv[]) PMIX_VALUE_RELEASE(pvalue); /* the below two lines break the subsequent PMIx_Get query on a key set later */ - sprintf(data, "FIRST TIME rank %d", myproc.rank); + snprintf(data, 256, "FIRST TIME rank %d", myproc.rank); pmi_set_string("test-key-1", data, 256); if (timeout) { diff --git a/test/ptrace/ptrace_spawn_stopped.cxx b/test/ptrace/ptrace_spawn_stopped.cxx deleted file mode 100644 index ef06e97f34..0000000000 --- a/test/ptrace/ptrace_spawn_stopped.cxx +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -void child (int argc, char **argv) -{ - /* Non-portable call to ptrace(). YMMV. 
*/ - long rvl = ptrace (PTRACE_TRACEME, 0, 0, 0); - if (rvl == -1) - { - perror ("child: ptrace(PTRACE_TRACEME) failed"); - exit (1); - } - int rvi = execvp (argv[0], argv); - perror ("child: execvp() failed"); - exit (1); -} - -void parent (pid_t pid, - const char *program) -{ - fprintf (stderr, - "parent: child pid is %d\n", - int (pid)); - int status; - pid_t rvp = waitpid (pid, &status, WUNTRACED); - if (rvp == -1) - { - perror ("parent: waitpid(pid) failed"); - exit (1); - } - if (WIFSTOPPED (status)) - { - fprintf (stderr, - "parent: child process stopped on signal %d\n", - WSTOPSIG (status)); - int rvi = kill (pid, SIGSTOP); - if (rvi == -1) - { - perror ("parent: kill(SIGSTOP) failed"); - exit (1); - } - /* Non-portable call to ptrace(). YMMV. */ - long rvl = ptrace (PTRACE_DETACH, pid, 0, (void*) SIGSTOP); - if (rvl == -1) - { - perror ("child: ptrace(PTRACE_DETACH) failed"); - exit (1); - } - } - else if (WIFEXITED (status)) - { - fprintf (stderr, - "parent: child process exited with status %d\n", - WEXITSTATUS (status)); - exit (1); - } - else if (WIFSIGNALED (status)) - { - fprintf (stderr, - "parent: child process kill by signal %d\n", - WTERMSIG (status)); - exit (1); - } - else - { - fprintf (stderr, - "parent: child process not stopped, exited, or signaled (status==%d)\n", - status); - exit (1); - } - - fprintf (stderr, - "parent: here's what 'ps' says about the process:\n"); - char cmd[100]; - sprintf (cmd, "ps -u -p %d", int(pid)); - system (cmd); - - fprintf (stderr, - "parent: try attaching using one of the following:\n" - "1) gdb %s %d\n" - "2) totalview -pid %d %s\n", - program, int (pid), - int (pid), program); - - fprintf (stderr, - "parent: waiting for the child to exit\n"); - rvp = waitpid (pid, &status, 0); - - fprintf (stderr, - "parent: child exited/terminated, wait status 0x%x\n", - status); -} - -int main (int argc, char **argv) -{ - if (argc < 2) - { - fprintf (stderr, - "Usage: %s []\n", - argv[0]); - exit (1); - } - pid_t pid = 
fork(); - if (pid == -1) - { - perror ("parent: fork() failed"); - exit (1); - } - else if (pid == 0) - child (argc - 1, argv + 1); - else - parent (pid, argv[1]); - return 0; -}