Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/mca/ess/base/ess_base_bootstrap.c
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,9 @@ int prte_ess_base_bootstrap(void)
if (NULL != cluster) {
free(cluster);
}
if (NULL != ctrlhost) {
free(ctrlhost);
}
if (NULL != dvmnodes) {
free(dvmnodes);
}
Expand Down
5 changes: 3 additions & 2 deletions src/mca/ess/base/ess_base_std_prted.c
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,9 @@ int prte_ess_base_prted_setup(void)
error:
pmix_show_help("help-prte-runtime.txt", "prte_init:startup:internal-failure", true,
error, PRTE_ERROR_NAME(ret), ret);
/* remove our use of the session directory tree */
PMIX_RELEASE(jdata);
if (NULL != jdata) {
PMIX_RELEASE(jdata);
}
return PRTE_ERR_SILENT;
}

Expand Down
76 changes: 64 additions & 12 deletions src/mca/grpcomm/direct/grpcomm_direct_group.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -226,19 +226,43 @@ static void group(int sd, short args, void *cbdata)
PMIx_Info_list_convert(grpinfo, &darray);
info = (pmix_info_t*)darray.array;
ninfo = darray.size;
PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
rc = PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(relay);
PMIX_DESTRUCT(&sig);
goto error;
}
if (0 < ninfo) {
PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
rc = PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(relay);
PMIX_DESTRUCT(&sig);
goto error;
}
}
PMIX_DATA_ARRAY_DESTRUCT(&darray);

// pack any endpts
PMIx_Info_list_convert(endpts, &darray);
info = (pmix_info_t*)darray.array;
ninfo = darray.size;
PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
rc = PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(relay);
PMIX_DESTRUCT(&sig);
goto error;
}
if (0 < ninfo) {
PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
rc = PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(relay);
PMIX_DESTRUCT(&sig);
goto error;
}
}
PMIX_DATA_ARRAY_DESTRUCT(&darray);
}
Expand Down Expand Up @@ -298,7 +322,7 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
prte_namelist_t *nm;
pmix_data_array_t darray;
pmix_status_t st;
pmix_info_t *info = NULL, *endpts, *grpinfo;
pmix_info_t *info = NULL, *endpts, *grpinfo = NULL;
prte_grpcomm_direct_group_signature_t *sig = NULL;
pmix_data_buffer_t *reply;
prte_grpcomm_group_t *coll;
Expand Down Expand Up @@ -386,7 +410,9 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
rc = PMIx_Data_unpack(NULL, buffer, &nendpts, &cnt, PMIX_SIZE);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_INFO_FREE(grpinfo, ngrpinfo);
if (NULL != grpinfo) {
PMIX_INFO_FREE(grpinfo, ngrpinfo);
}
PMIX_RELEASE(sig);
return;
}
Expand All @@ -396,7 +422,9 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
rc = PMIx_Data_unpack(NULL, buffer, endpts, &cnt, PMIX_INFO);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_INFO_FREE(grpinfo, ngrpinfo);
if (NULL != grpinfo) {
PMIX_INFO_FREE(grpinfo, ngrpinfo);
}
PMIX_INFO_FREE(endpts, nendpts);
PMIX_RELEASE(sig);
return;
Expand Down Expand Up @@ -619,19 +647,43 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
PMIx_Info_list_convert(coll->grpinfo, &darray);
info = (pmix_info_t*)darray.array;
ninfo = darray.size;
PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
rc = PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(reply);
PMIX_RELEASE(sig);
return;
}
if (0 < ninfo) {
PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
rc = PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(reply);
PMIX_RELEASE(sig);
return;
}
}
PMIX_DATA_ARRAY_DESTRUCT(&darray);

// pack any endpts
PMIx_Info_list_convert(coll->endpts, &darray);
info = (pmix_info_t*)darray.array;
ninfo = darray.size;
PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
rc = PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(reply);
PMIX_RELEASE(sig);
return;
}
if (0 < ninfo) {
PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
rc =PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DATA_BUFFER_RELEASE(reply);
PMIX_RELEASE(sig);
return;
}
}
PMIX_DATA_ARRAY_DESTRUCT(&darray);
}
Expand Down
3 changes: 1 addition & 2 deletions src/mca/odls/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
* Copyright (c) 2017-2019 Intel, Inc. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -56,7 +56,6 @@ typedef struct {
char **ev_threads; // event progress thread names
int next_base; // counter to load-level thread use
bool signal_direct_children_only;
pmix_lock_t lock;
char *exec_agent;
} prte_odls_globals_t;

Expand Down
15 changes: 4 additions & 11 deletions src/mca/odls/base/odls_base_bind.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,11 +275,10 @@ void prte_odls_base_set(prte_odls_spawn_caddy_t *cd, int write_fd)
hwloc_bitmap_free(cpuset);
/* if we got an error and this wasn't a default binding policy, then report it */
if (rc < 0 && PRTE_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
char *tmp = NULL;
if (errno == ENOSYS) {
msg = "hwloc indicates cpu binding not supported";
msg = strdup("hwloc indicates cpu binding not supported");
} else if (errno == EXDEV) {
msg = "hwloc indicates cpu binding cannot be enforced";
msg = strdup("hwloc indicates cpu binding cannot be enforced");
} else {
pmix_asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"",
prte_strerror(rc), child->cpuset);
Expand All @@ -291,19 +290,13 @@ void prte_odls_base_set(prte_odls_spawn_caddy_t *cd, int write_fd)
"binding generic error",
prte_process_info.nodename, context->app, msg,
__FILE__, __LINE__);
free(msg); // silence static analyzer warning
} else {
send_warn_show_help(write_fd, "help-prte-odls-default.txt",
"not bound", prte_process_info.nodename,
context->app, msg, __FILE__, __LINE__);
if (NULL != tmp) {
free(tmp);
free(msg);
}
return;
}
if (NULL != tmp) {
free(tmp);
free(msg);
return;
}
}

Expand Down
28 changes: 26 additions & 2 deletions src/mca/odls/base/odls_base_default_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -1252,14 +1252,15 @@ void prte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
int j, idx;
int total_num_local_procs = 0;
prte_odls_launch_local_t *caddy = (prte_odls_launch_local_t *) cbdata;
prte_job_t *jobdat;
prte_job_t *jobdat, *parent;
pmix_nspace_t job;
prte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
bool index_argv;
bool index_argv, inherit;
char *msg, **xfer;
prte_odls_spawn_caddy_t *cd;
prte_event_base_t *evb;
prte_schizo_base_module_t *schizo;
pmix_proc_t *nptr;
PRTE_HIDE_UNUSED_PARAMS(fd, sd);

PMIX_ACQUIRE_OBJECT(caddy);
Expand Down Expand Up @@ -1352,6 +1353,20 @@ void prte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
}
}

// see if we have a parent in case of inheritance
nptr = NULL;
prte_get_attribute(&jobdat->attributes, PRTE_JOB_LAUNCH_PROXY, (void **) &nptr, PMIX_PROC);
if (NULL != nptr) {
parent = prte_get_job_data_object(nptr->nspace);
if (NULL != parent) {
inherit = prte_get_attribute(&parent->attributes, PRTE_JOB_INHERIT, NULL, PMIX_BOOL);
} else {
inherit = false;
}
} else {
inherit = false;
}

for (j = 0; j < jobdat->apps->size; j++) {
app = (prte_app_context_t *) pmix_pointer_array_get_item(jobdat->apps, j);
if (NULL == app) {
Expand Down Expand Up @@ -1395,6 +1410,10 @@ void prte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
}

// process any provided env directives
if (inherit) {
// start with the parent's directives
process_envars(parent, app);
}
process_envars(jobdat, app);


Expand Down Expand Up @@ -2151,6 +2170,11 @@ int prte_odls_base_default_restart_proc(prte_proc_t *child,
child->rml_uri = NULL;
}
app = (prte_app_context_t *) pmix_pointer_array_get_item(jobdat->apps, child->app_idx);
if (NULL == app) {
PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
rc = PRTE_ERR_NOT_FOUND;
goto CLEANUP;
}

/* setup the path */
if (PRTE_SUCCESS != (rc = setup_path(app, &wdir))) {
Expand Down
20 changes: 3 additions & 17 deletions src/mca/odls/base/odls_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ prte_odls_globals_t prte_odls_globals = {
.ev_threads = NULL,
.next_base = 0,
.signal_direct_children_only = false,
.lock = PMIX_LOCK_STATIC_INIT,
.exec_agent = NULL
};

Expand Down Expand Up @@ -126,7 +125,6 @@ void prte_odls_base_harvest_threads(void)
{
int i;

PMIX_ACQUIRE_THREAD(&prte_odls_globals.lock);
if (0 < prte_odls_globals.num_threads) {
/* stop the progress threads */
if (NULL != prte_odls_globals.ev_threads) {
Expand All @@ -144,18 +142,15 @@ void prte_odls_base_harvest_threads(void)
prte_odls_globals.ev_threads = NULL;
}
}
PMIX_RELEASE_THREAD(&prte_odls_globals.lock);
}

void prte_odls_base_start_threads(prte_job_t *jdata)
{
int i;
char *tmp;

PMIX_ACQUIRE_THREAD(&prte_odls_globals.lock);
/* only do this once */
if (NULL != prte_odls_globals.ev_threads) {
PMIX_RELEASE_THREAD(&prte_odls_globals.lock);
return;
}

Expand Down Expand Up @@ -205,7 +200,6 @@ void prte_odls_base_start_threads(prte_job_t *jdata)
free(tmp);
}
}
PMIX_RELEASE_THREAD(&prte_odls_globals.lock);
}

static int prte_odls_base_close(void)
Expand All @@ -230,8 +224,6 @@ static int prte_odls_base_close(void)

prte_odls_base_harvest_threads();

PMIX_DESTRUCT_LOCK(&prte_odls_globals.lock);

return pmix_mca_base_framework_components_close(&prte_odls_base_framework, NULL);
}

Expand All @@ -247,9 +239,6 @@ static int prte_odls_base_open(pmix_mca_base_open_flag_t flags)
bool xterm_hold;
sigset_t unblock;

PMIX_CONSTRUCT_LOCK(&prte_odls_globals.lock);
prte_odls_globals.lock.active = false; // start with nobody having the thread

/* initialize the global array of local children */
prte_local_children = PMIX_NEW(pmix_pointer_array_t);
if (PRTE_SUCCESS
Expand All @@ -263,12 +252,9 @@ static int prte_odls_base_open(pmix_mca_base_open_flag_t flags)
prte_odls_globals.xtermcmd = NULL;

/* ensure that SIGCHLD is unblocked as we need to capture it */
if (0 != sigemptyset(&unblock)) {
return PRTE_ERROR;
}
if (0 != sigaddset(&unblock, SIGCHLD)) {
return PRTE_ERROR;
}
sigemptyset(&unblock);
sigaddset(&unblock, SIGCHLD);

if (0 != sigprocmask(SIG_UNBLOCK, &unblock, NULL)) {
return PRTE_ERR_NOT_SUPPORTED;
}
Expand Down
13 changes: 12 additions & 1 deletion src/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,16 @@ void prte_plm_base_allocation_complete(int fd, short args, void *cbdata)
* to map so we can see where the procs would have
* gone - so skip to the mapping state */
if (prte_get_attribute(&caddy->jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) {
PRTE_ACTIVATE_JOB_STATE(caddy->jdata, PRTE_JOB_STATE_DAEMONS_REPORTED);
node = (prte_node_t*)pmix_pointer_array_get_item(prte_node_pool, 0);
if (NULL == node) {
// should never happen
PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
PRTE_ACTIVATE_JOB_STATE(caddy->jdata, PRTE_JOB_STATE_FAILED_TO_START);
PMIX_RELEASE(caddy);
return;
}
prte_rmaps_base.require_hwtcpus = !prte_hwloc_base_core_cpus(node->topology->topo);
PRTE_ACTIVATE_JOB_STATE(caddy->jdata, PRTE_JOB_STATE_DAEMONS_REPORTED);
} else {
/* move the state machine along */
caddy->jdata->state = PRTE_JOB_STATE_ALLOCATION_COMPLETE;
Expand Down Expand Up @@ -1043,6 +1050,10 @@ void prte_plm_base_post_launch(int fd, short args, void *cbdata)
continue;
}
app = (prte_app_context_t*)pmix_pointer_array_get_item(jdata->apps, proc->app_idx);
if (NULL == app) {
// should never happen
continue;
}
fprintf(fp, "(rank, host, exe, pid) = (%u, %s, %s, %d)\n",
proc->name.rank, proc->node->name, app->app, proc->pid);
}
Expand Down
1 change: 0 additions & 1 deletion src/mca/ras/base/ras_base_allocate.c
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,6 @@ void prte_ras_base_display_cpus(prte_job_t *jdata, char *nodelist)
}
if (0 == strcmp(nptr->name, nodes[j])) {
display_cpus(nptr->topology, jdata, nodes[j]);
moveon = true;
break;
}
if (NULL == nptr->aliases) {
Expand Down
2 changes: 2 additions & 0 deletions src/mca/ras/pbs/ras_pbs_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ static int discover(pmix_list_t *nodelist, char *pbs_jobid)
if (prte_mca_ras_pbs_component.smp_mode) {
/* this cannot happen in smp mode */
pmix_show_help("help-ras-pbs.txt", "smp-multi", true);
fclose(fp);
free(hostname);
return PRTE_ERR_BAD_PARAM;
}
++node->slots;
Expand Down
Loading
Loading