Skip to content

Commit

Permalink
Merge pull request #96 from rhc54/topic/dbgr
Browse files: browse the repository at this point in the history
Restore debugger launch of apps and then daemons
  • Loading branch information
Ralph Castain committed Sep 26, 2018
2 parents f0c1262 + cb3926a commit 4301061
Show file tree
Hide file tree
Showing 8 changed files with 54 additions and 21 deletions.
7 changes: 7 additions & 0 deletions config/opal_check_pmi.m4
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,13 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
LDFLAGS="-L$pmix_ext_install_libdir $LDFLAGS"])
LIBS="$LIBS -lpmix"
CPPFLAGS="$CPPFLAGS -I$pmix_ext_install_dir/include"
OPAL_WRAPPER_FLAGS_ADD([CPPFLAGS], [-I$pmix_ext_install_dir/include])
LDFLAGS="$LDFLAGS -L$pmix_ext_install_libdir"
OPAL_WRAPPER_FLAGS_ADD([LDFLAGS], [-L$pmix_ext_install_libdir])
opal_external_pmix_happy=yes
AC_DEFINE_UNQUOTED([OPAL_PMIX_VERSION], [$opal_external_pmix_version_found],
Expand Down
20 changes: 10 additions & 10 deletions examples/debugger.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ static void cbfunc(pmix_status_t status,
PMIX_INFO_CREATE(mq->info, ninfo);
mq->ninfo = ninfo;
for (n=0; n < ninfo; n++) {
fprintf(stderr, "Key %s Type %s(%d)\n", info[n].key, PMIx_Data_type_string(info[n].value.type), info[n].value.type);
PMIX_INFO_XFER(&mq->info[n], &info[n]);
}
}
Expand Down Expand Up @@ -272,7 +273,6 @@ static pmix_status_t spawn_debugger(char *appspace, myrel_t *myrel)
PMIX_APP_FREE(debugger, 1);
return rc;
}
fprintf(stderr, "SPAWNED DEBUGGERD\n");
/* cleanup */
PMIX_INFO_FREE(dinfo, dninfo);
PMIX_APP_FREE(debugger, 1);
Expand Down Expand Up @@ -438,7 +438,7 @@ int main(int argc, char **argv)
app[0].cwd = strdup(cwd);
app[0].maxprocs = 1;
/* provide job-level directives so the apps do what the user requested */
ninfo = 5;
ninfo = 6;
PMIX_INFO_CREATE(info, ninfo);
PMIX_INFO_LOAD(&info[0], PMIX_MAPBY, "slot", PMIX_STRING); // map by slot
asprintf(&tmp, "%s:%d", myproc.nspace, myproc.rank);
Expand All @@ -450,6 +450,7 @@ int main(int argc, char **argv)
PMIX_INFO_LOAD(&info[2], PMIX_FWD_STDOUT, &cospawn, PMIX_BOOL); // forward stdout to me
PMIX_INFO_LOAD(&info[3], PMIX_FWD_STDERR, &cospawn, PMIX_BOOL); // forward stderr to me
PMIX_INFO_LOAD(&info[4], PMIX_NOTIFY_COMPLETION, NULL, PMIX_BOOL); // notify us when the job completes
PMIX_INFO_LOAD(&info[5], PMIX_LAUNCHER_RENDEZVOUS_FILE, "dbgr.rndz.txt", PMIX_STRING); // have it output a specific rndz file

/* spawn the job - the function will return when the launcher
* has been launched. Note that this doesn't tell us anything
Expand All @@ -463,7 +464,6 @@ int main(int argc, char **argv)
fprintf(stderr, "Application failed to launch with error: %s(%d)\n", PMIx_Error_string(rc), rc);
goto done;
}
fprintf(stderr, "Spawn complete\n\n");

/* wait here for the launcher to declare itself ready */
DEBUG_WAIT_THREAD(&launcher_ready.lock);
Expand All @@ -481,7 +481,7 @@ int main(int argc, char **argv)
fprintf(stderr, "Failed to connect to %s server: %s(%d)\n", argv[1], PMIx_Error_string(rc), rc);
goto done;
}
fprintf(stderr, "Connection completed\n");

/* send the launch directives */
ninfo = 3;
PMIX_INFO_CREATE(info, ninfo);
Expand Down Expand Up @@ -527,7 +527,6 @@ int main(int argc, char **argv)
myquery_data.info = NULL;
myquery_data.ninfo = 0;
/* execute the query */
fprintf(stderr, "Debugger: querying capabilities\n");
if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)&myquery_data))) {
fprintf(stderr, "PMIx_Query_info failed: %d\n", rc);
goto done;
Expand Down Expand Up @@ -574,7 +573,7 @@ int main(int argc, char **argv)
}
}
}

cospawn = false;
/* if cospawn is true, then we can launch both the app and the debugger
* daemons at the same time */
if (cospawn) {
Expand Down Expand Up @@ -630,22 +629,24 @@ int main(int argc, char **argv)
/* wait to get a response */
DEBUG_WAIT_THREAD(&myquery_data.lock);
DEBUG_DESTRUCT_LOCK(&myquery_data.lock);

/* we should have gotten a response */
if (PMIX_SUCCESS != myquery_data.status) {
fprintf(stderr, "Debugger[%s:%d] Proctable query failed: %s\n",
myproc.nspace, myproc.rank, PMIx_Error_string(myquery_data.status));
goto done;
}
/* there should hvae been data */
/* there should have been data */
if (NULL == myquery_data.info || 0 == myquery_data.ninfo) {
fprintf(stderr, "Debugger[%s:%d] Proctable query return no results\n",
myproc.nspace, myproc.rank);
goto done;
}
/* the query should have returned a data_array */
if (PMIX_DATA_ARRAY != myquery_data.info[0].value.type) {
fprintf(stderr, "Debugger[%s:%d] Query returned incorrect data type: %s\n", PMIx_Data_type_string(myquery_data.info[0].value.type));
fprintf(stderr, "Debugger[%s:%d] Query returned incorrect data type: %s(%d)\n",
myproc.nspace, myproc.rank,
PMIx_Data_type_string(myquery_data.info[0].value.type),
(int)myquery_data.info[0].value.type);
return -1;
}
if (NULL == myquery_data.info[0].value.data.darray->array) {
Expand All @@ -664,7 +665,6 @@ int main(int argc, char **argv)
* int exit_code;
* pmix_proc_state_t state;
*/
fprintf(stderr, "Received %d array elements\n", (int)myquery_data.info[0].value.data.darray->size);

/* now launch the debugger daemons */
if (PMIX_SUCCESS != (rc = spawn_debugger(clientspace, &dbrel))) {
Expand Down
6 changes: 5 additions & 1 deletion examples/debuggerd.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,13 @@ int main(int argc, char **argv)
DEBUG_DESTRUCT_LOCK(&mylock);

/* get the nspace of the job we are to debug - it will be in our JOB info */
#ifdef PMIX_LOAD_PROCID
PMIX_LOAD_PROCID(&proc, myproc.nspace, PMIX_RANK_WILDCARD);
#else
PMIX_PROC_CONSTRUCT(&proc);
(void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_KEYLEN);
proc.rank = PMIX_RANK_WILDCARD;
#endif
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_DEBUG_JOB, NULL, 0, &val))) {
fprintf(stderr, "[%s:%d:%lu] Failed to get job being debugged - error %s\n",
myproc.nspace, myproc.rank,
Expand Down Expand Up @@ -384,7 +388,7 @@ int main(int argc, char **argv)
PMIX_INFO_LOAD(&info[1], PMIX_EVENT_NON_DEFAULT, NULL, PMIX_BOOL); // deliver to the target nspace
fprintf(stderr, "[%s:%u:%lu] Sending release\n", myproc.nspace, myproc.rank, (unsigned long)pid);
rc = PMIx_Notify_event(PMIX_ERR_DEBUGGER_RELEASE,
NULL, PMIX_RANGE_LOCAL,
NULL, PMIX_RANGE_CUSTOM,
info, ninfo, NULL, NULL);
if (PMIX_SUCCESS != rc) {
fprintf(stderr, "%s[%s:%u:%lu] Sending release failed with error %s(%d)\n",
Expand Down
3 changes: 2 additions & 1 deletion orte/mca/rmaps/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@ ORTE_DECLSPEC int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
opal_list_t *nodes,
bool remove);

ORTE_DECLSPEC int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
ORTE_DECLSPEC int orte_rmaps_base_set_mapping_policy(orte_job_t *jdata,
orte_mapping_policy_t *policy,
char **device, char *spec);
ORTE_DECLSPEC int orte_rmaps_base_set_ranking_policy(orte_ranking_policy_t *policy,
orte_mapping_policy_t mapping,
Expand Down
17 changes: 13 additions & 4 deletions orte/mca/rmaps/base/rmaps_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
"rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA");
}

if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping,
if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(NULL, &orte_rmaps_base.mapping,
&orte_rmaps_base.device,
rmaps_base_mapping_policy))) {
return rc;
Expand Down Expand Up @@ -593,7 +593,8 @@ static int check_modifiers(char *ck, orte_mapping_policy_t *tmp)
return ORTE_ERR_TAKE_NEXT_OPTION;
}

int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
int orte_rmaps_base_set_mapping_policy(orte_job_t *jdata,
orte_mapping_policy_t *policy,
char **device, char *inspec)
{
char *ck;
Expand Down Expand Up @@ -681,7 +682,11 @@ int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
}
}
/* now save the pattern */
orte_rmaps_base.ppr = strdup(ck);
if (NULL == jdata || NULL == jdata->map) {
orte_rmaps_base.ppr = strdup(ck);
} else {
jdata->map->ppr = strdup(ck);
}
ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_PPR);
ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
free(spec);
Expand Down Expand Up @@ -747,7 +752,11 @@ int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
}

setpolicy:
*policy = tmp;
if (NULL == jdata || NULL == jdata->map) {
*policy = tmp;
} else {
jdata->map->mapping = tmp;
}

return ORTE_SUCCESS;
}
Expand Down
2 changes: 1 addition & 1 deletion orte/orted/pmix/pmix_server_dyn.c
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ static void interim(int sd, short args, void *cbdata)
rc = ORTE_ERR_BAD_PARAM;
goto complete;
}
rc = orte_rmaps_base_set_mapping_policy(&jdata->map->mapping,
rc = orte_rmaps_base_set_mapping_policy(jdata, &jdata->map->mapping,
NULL, info->value.data.string);
if (ORTE_SUCCESS != rc) {
goto complete;
Expand Down
12 changes: 8 additions & 4 deletions orte/orted/pmix/pmix_server_gen.c
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ static void _query(int sd, short args, void *cbdata)
pmix_data_array_t *darray;
pmix_proc_info_t *procinfo;
pmix_info_t *info;
pmix_status_t ret;
pmix_status_t ret = PMIX_SUCCESS;
opal_ds_info_t *kv;
orte_jobid_t jobid;
orte_job_t *jdata;
Expand Down Expand Up @@ -785,14 +785,14 @@ static void _query(int sd, short args, void *cbdata)
}
}
if (ORTE_JOBID_INVALID == jobid) {
rc = ORTE_ERR_BAD_PARAM;
ret = PMIX_ERR_NOT_FOUND;
goto done;
}
/* construct a list of values with opal_proc_info_t
* entries for each proc in the indicated job */
jdata = orte_get_job_data_object(jobid);
if (NULL == jdata) {
rc = ORTE_ERR_NOT_FOUND;
ret = PMIX_ERR_NOT_FOUND;
goto done;
}
/* setup the reply */
Expand Down Expand Up @@ -897,9 +897,13 @@ static void _query(int sd, short args, void *cbdata)
/* convert the list of results to an info array */
rcd->ninfo = opal_list_get_size(results);
PMIX_INFO_CREATE(rcd->info, rcd->ninfo);
n=0;
OPAL_LIST_FOREACH(kv, results, opal_ds_info_t) {
PMIX_INFO_XFER(&rcd->info[n], kv->info);
n++;
}
}
}

cd->infocbfunc(ret, rcd->info, rcd->ninfo, cd->cbdata, qrel, rcd);
}

Expand Down
8 changes: 8 additions & 0 deletions orte/tools/prun/prun.c
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,14 @@ int prun(int argc, char *argv[])
opal_list_append(&tinfo, &ds->super);
}

/* check for request to drop a rendezvous file */
if (NULL != (param = getenv("PMIX_LAUNCHER_RENDEZVOUS_FILE"))) {
ds = OBJ_NEW(opal_ds_info_t);
PMIX_INFO_CREATE(ds->info, 1);
PMIX_INFO_LOAD(ds->info, PMIX_LAUNCHER_RENDEZVOUS_FILE, param, PMIX_STRING);
opal_list_append(&tinfo, &ds->super);
}

/* convert to array of info */
ninfo = opal_list_get_size(&tinfo);
PMIX_INFO_CREATE(iptr, ninfo);
Expand Down

0 comments on commit 4301061

Please sign in to comment.