diff --git a/.gitignore b/.gitignore index 1049aad9b31..b45ab10f922 100644 --- a/.gitignore +++ b/.gitignore @@ -111,6 +111,7 @@ contrib/platform/intel/bend/*orcm* contrib/scaling/orte_no_op contrib/scaling/mpi_no_op contrib/scaling/mpi_barrier +contrib/scaling/mpi_memprobe examples/hello_c examples/hello_cxx diff --git a/contrib/scaling/Makefile b/contrib/scaling/Makefile index c0e871c608d..69fb6233d4d 100644 --- a/contrib/scaling/Makefile +++ b/contrib/scaling/Makefile @@ -1,4 +1,4 @@ -PROGS = orte_no_op mpi_no_op +PROGS = orte_no_op mpi_no_op mpi_memprobe all: $(PROGS) @@ -10,5 +10,8 @@ orte_no_op: mpi_no_op: mpicc -o mpi_no_op mpi_no_op.c +mpi_memprobe: + mpicc -o mpi_memprobe mpi_memprobe.c -lopen-pal + clean: rm -f $(PROGS) *~ diff --git a/contrib/scaling/mpi_memprobe.c b/contrib/scaling/mpi_memprobe.c new file mode 100644 index 00000000000..b3458ece9f4 --- /dev/null +++ b/contrib/scaling/mpi_memprobe.c @@ -0,0 +1,98 @@ +/* -*- C -*- + * + * $HEADER$ + * + * The most basic of MPI applications + */ + +#include "orte_config.h" + +#include +#include "mpi.h" +#include "opal/mca/pmix/pmix.h" +#include "orte/runtime/runtime.h" +#include "orte/util/proc_info.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" + +static volatile int active; +static volatile bool wait_for_release = true; +#define MEMPROBE_RELEASE 12345 + +static void _release_fn(int status, + const opal_process_name_t *source, + opal_list_t *info, opal_list_t *results, + opal_pmix_notification_complete_fn_t cbfunc, + void *cbdata) +{ + /* must let the notifier know we are done */ + if (NULL != cbfunc) { + cbfunc(0, NULL, NULL, NULL, cbdata); + } + /* flag that the debugger is complete so we can exit */ + wait_for_release = false; +} + +static void _register_fn(int status, + size_t evhandler_ref, + void *cbdata) +{ + volatile int *active = (volatile int*)cbdata; + + if (0 != status) { + fprintf(stderr, "Client EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n", + status, (unsigned long)evhandler_ref); + } + *active = status; +} + +int main(int argc, char* argv[]) +{ + int rank, size; + opal_list_t *codes; + opal_value_t *kv; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + if (0 == rank) { + fprintf(stderr, "Sampling memory usage after MPI_Init\n"); + } + + codes = OBJ_NEW(opal_list_t); + kv = OBJ_NEW(opal_value_t); + kv->key = strdup("errorcode"); + kv->type = OPAL_INT; + kv->data.integer = MEMPROBE_RELEASE; + opal_list_append(codes, &kv->super); + + active = -1; + opal_pmix.register_evhandler(codes, NULL, _release_fn, _register_fn, (void*)&active); + while (-1 == active) { + usleep(10); + } + + /* now wait for notification */ + while (wait_for_release) { + usleep(10); + } + wait_for_release = true; + + /* perform a barrier so some communication will occur, thus + * requiring exchange of endpoint info */ + MPI_Barrier(MPI_COMM_WORLD); + + if (0 == rank) { + fprintf(stderr, "\n\nSampling memory usage after MPI_Barrier\n"); + } + + /* wait again while memory is sampled */ + while (wait_for_release) { + usleep(10); + } + + MPI_Finalize(); + return 0; +} diff --git a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h index 4f5884de1e1..5c309ddc140 100644 --- a/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h +++ b/opal/mca/pmix/pmix2x/pmix/include/pmix_common.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. @@ -157,6 +157,7 @@ typedef uint32_t pmix_rank_t; #define PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories /* information about relative ranks as assigned by the RM */ +#define PMIX_PROCID "pmix.procid" // (pmix_proc_t) process identifier #define PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job #define PMIX_JOBID "pmix.jobid" // (char*) jobid assigned by scheduler #define PMIX_APPNUM "pmix.appnum" // (uint32_t) app number within the job @@ -282,6 +283,8 @@ typedef uint32_t pmix_rank_t; #define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform #define PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // return a comma-delimited list of supported spawn attributes #define PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // return a comma-delimited list of supported debug attributes +#define PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // return info on memory usage for the procs indicated in the qualifiers +#define PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only /* log attributes */ #define PMIX_LOG_STDERR "pmix.log.stderr" // (bool) log data to stderr diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c index 9d5539f0a6d..e70d5782c45 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Artem Y. Polyakov . @@ -553,11 +553,18 @@ static void _putfn(int sd, short args, void *cbdata) { pmix_cb_t *cb = (pmix_cb_t*)cbdata; pmix_status_t rc; - pmix_kval_t *kv; + pmix_kval_t *kv = NULL; pmix_nspace_t *ns; uint8_t *tmp; size_t len; + /* no need to push info that starts with "pmix" as that is + * info we would have been provided at startup */ + if (0 == strncmp(cb->key, "pmix", 4)) { + rc = PMIX_SUCCESS; + goto done; + } + /* setup to xfer the data */ kv = PMIX_NEW(pmix_kval_t); kv->key = strdup(cb->key); // need to copy as the input belongs to the user @@ -622,7 +629,9 @@ static void _putfn(int sd, short args, void *cbdata) } done: - PMIX_RELEASE(kv); // maintain accounting + if (NULL != kv) { + PMIX_RELEASE(kv); // maintain accounting + } cb->pstatus = rc; cb->active = false; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c index 98df8827464..5ad6c6014f2 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client_get.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Artem Y. Polyakov . @@ -763,7 +763,7 @@ static void _getnbfn(int fd, short flags, void *cbdata) * us to attempt to retrieve it from the server */ for (n=0; n < cb->ninfo; n++) { if (0 == strcmp(cb->info[n].key, PMIX_OPTIONAL) && - cb->info[n].value.data.flag) { + (PMIX_UNDEF == cb->info[n].value.type || cb->info[n].value.data.flag)) { /* they don't want us to try and retrieve it */ pmix_output_verbose(2, pmix_globals.debug_output, "PMIx_Get key=%s for rank = %d, namespace = %s was not found - request was optional", diff --git a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c index 437106187be..3cb3923bed6 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c +++ b/opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_registration.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -356,6 +356,7 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) sing->code = cd->codes[0]; index = pmix_globals.events.nhdlrs; sing->index = index; + sing->evhdlr = cd->evhdlr; ++pmix_globals.events.nhdlrs; sing->cbobject = cbobject; rc = _add_hdlr(&pmix_globals.events.single_events, &sing->super, @@ -365,17 +366,17 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) PMIX_ERR_WOULD_BLOCK != rc) { /* unable to register */ --pmix_globals.events.nhdlrs; - rc = PMIX_ERR_EVENT_REGISTRATION; - index = UINT_MAX; + rc = PMIX_ERR_EVENT_REGISTRATION; + index = UINT_MAX; + goto ack; + } + if (PMIX_ERR_WOULD_BLOCK == rc) { + /* the callback will provide our response */ + PMIX_RELEASE(cd); + return; + } goto ack; } - if (PMIX_ERR_WOULD_BLOCK == rc) { - /* the callback will provide our response */ - PMIX_RELEASE(cd); - return; - } - goto ack; - } /* must be a multi-code registration */ multi = PMIX_NEW(pmix_multi_event_t); @@ -387,6 +388,7 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) memcpy(multi->codes, cd->codes, cd->ncodes * sizeof(pmix_status_t)); index = pmix_globals.events.nhdlrs; multi->index = index; + multi->evhdlr = cd->evhdlr; ++pmix_globals.events.nhdlrs; multi->cbobject = cbobject; rc = _add_hdlr(&pmix_globals.events.multi_events, &multi->super, @@ -396,9 +398,9 @@ static void reg_event_hdlr(int sd, short args, void *cbdata) PMIX_ERR_WOULD_BLOCK != rc) { /* unable to register */ --pmix_globals.events.nhdlrs; - rc = PMIX_ERR_EVENT_REGISTRATION; - index = UINT_MAX; - goto ack; + rc = PMIX_ERR_EVENT_REGISTRATION; + index = UINT_MAX; + goto ack; } if (PMIX_ERR_WOULD_BLOCK == rc) { /* the callback will provide our response */ diff --git a/opal/mca/pmix/pmix2x/pmix2x.c b/opal/mca/pmix/pmix2x/pmix2x.c index 395681e4a67..84743c28e91 100644 --- a/opal/mca/pmix/pmix2x/pmix2x.c +++ b/opal/mca/pmix/pmix2x/pmix2x.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Mellanox Technologies, Inc. @@ -145,121 +145,52 @@ static void pmix2x_register_jobid(opal_jobid_t jobid, const char *nspace) opal_list_append(&mca_pmix_pmix2x_component.jobids, &jptr->super); } -static void completion_handler(int status, void *cbdata) +static void event_hdlr_complete(pmix_status_t status, void *cbdata) { - opal_pmix2x_event_chain_t *chain = (opal_pmix2x_event_chain_t*)cbdata; - if (NULL != chain->info) { - OPAL_LIST_RELEASE(chain->info); - } + pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata; + + OBJ_RELEASE(op); } -static void progress_local_event_hdlr(int status, - opal_list_t *results, - opal_pmix_op_cbfunc_t cbfunc, void *thiscbdata, - void *notification_cbdata) +static void return_local_event_hdlr(int status, opal_list_t *results, + opal_pmix_op_cbfunc_t cbfunc, void *thiscbdata, + void *notification_cbdata) { - opal_pmix2x_event_chain_t *chain = (opal_pmix2x_event_chain_t*)notification_cbdata; + pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)notification_cbdata; + pmix2x_opcaddy_t *op; + opal_value_t *kv; + pmix_status_t pstatus; size_t n; - opal_list_item_t *nxt; - opal_pmix2x_single_event_t *sing; - opal_pmix2x_multi_event_t *multi; - opal_pmix2x_default_event_t *def; - - /* if the caller indicates that the chain is completed, then stop here */ - if (OPAL_ERR_HANDLERS_COMPLETE == status) { - goto complete; - } - /* if any results were provided, then add them here */ - if (NULL != results) { - while (NULL != (nxt = opal_list_remove_first(results))) { - opal_list_append(results, nxt); - } - } - - /* see if we need to continue, starting with the single code events */ - if (NULL != chain->sing) { - /* the last handler was for a single code - see if there are - * any others that match this event */ - while (opal_list_get_end(&mca_pmix_pmix2x_component.single_events) != (nxt = opal_list_get_next(&chain->sing->super))) { - sing = (opal_pmix2x_single_event_t*)nxt; - if (sing->code == chain->status) { - OBJ_RETAIN(chain); - chain->sing = sing; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s PROGRESS CALLING SINGLE EVHDLR", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - sing->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - goto complete; - } - } - /* if we get here, then there are no more single code - * events that match */ - chain->sing = NULL; - /* pickup the beginning of the multi-code event list */ - if (0 < opal_list_get_size(&mca_pmix_pmix2x_component.multi_events)) { - chain->multi = (opal_pmix2x_multi_event_t*)opal_list_get_begin(&mca_pmix_pmix2x_component.multi_events); - } - } - - /* see if we need to continue with the multi code events */ - if (NULL != chain->multi) { - while (opal_list_get_end(&mca_pmix_pmix2x_component.multi_events) != (nxt = opal_list_get_next(&chain->multi->super))) { - multi = (opal_pmix2x_multi_event_t*)nxt; - for (n=0; n < multi->ncodes; n++) { - if (multi->codes[n] == chain->status) { - /* found it - invoke the handler, pointing its - * callback function to our progression function */ - OBJ_RETAIN(chain); - chain->multi = multi; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s PROGRESS CALLING MULTI EVHDLR", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - multi->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - goto complete; + if (NULL != cd->pmixcbfunc) { + op = OBJ_NEW(pmix2x_opcaddy_t); + + if (NULL != results) { + /* convert the list of results to an array of info */ + op->ninfo = opal_list_get_size(results); + if (0 < op->ninfo) { + PMIX_INFO_CREATE(op->info, op->ninfo); + n=0; + OPAL_LIST_FOREACH(kv, cd->info, opal_value_t) { + (void)strncpy(op->info[n].key, kv->key, PMIX_MAX_KEYLEN); + pmix2x_value_load(&op->info[n].value, kv); + ++n; } } } - /* if we get here, then there are no more multi-mode - * events that match */ - chain->multi = NULL; - /* pickup the beginning of the default event list */ - if (0 < opal_list_get_size(&mca_pmix_pmix2x_component.default_events)) { - chain->def = (opal_pmix2x_default_event_t*)opal_list_get_begin(&mca_pmix_pmix2x_component.default_events); - } + /* convert the status */ + pstatus = pmix2x_convert_opalrc(status); + /* call the library's callback function */ + cd->pmixcbfunc(pstatus, op->info, op->ninfo, event_hdlr_complete, op, cd->cbdata); } - /* if they didn't want it to go to a default handler, then we are done */ - if (chain->nondefault) { - goto complete; - } - - if (NULL != chain->def) { - if (opal_list_get_end(&mca_pmix_pmix2x_component.default_events) != (nxt = opal_list_get_next(&chain->def->super))) { - def = (opal_pmix2x_default_event_t*)nxt; - OBJ_RETAIN(chain); - chain->def = def; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s PROGRESS CALLING DEFAULT EVHDLR", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - def->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - } + /* release the threadshift object */ + if (NULL != cd->info) { + OPAL_LIST_RELEASE(cd->info); } + OBJ_RELEASE(cd); - complete: - /* we still have to call their final callback */ - if (NULL != chain->final_cbfunc) { - chain->final_cbfunc(OPAL_SUCCESS, chain->final_cbdata); - } - /* maintain acctng */ - OBJ_RELEASE(chain); - /* let the caller know that we are done with their callback */ + /* release the caller */ if (NULL != cbfunc) { cbfunc(OPAL_SUCCESS, thiscbdata); } @@ -268,92 +199,29 @@ static void progress_local_event_hdlr(int status, static void _event_hdlr(int sd, short args, void *cbdata) { pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata; - size_t n; - opal_pmix2x_event_chain_t *chain; - opal_pmix2x_single_event_t *sing; - opal_pmix2x_multi_event_t *multi; - opal_pmix2x_default_event_t *def; + opal_pmix2x_event_t *event; opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s _EVENT_HDLR RECEIVED NOTIFICATION OF STATUS %d", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), cd->status); - - chain = OBJ_NEW(opal_pmix2x_event_chain_t); - /* point it at our final callback */ - chain->final_cbfunc = completion_handler; - chain->final_cbdata = chain; - - /* carry across provided info */ - chain->status = cd->status; - chain->source = cd->pname; - chain->info = cd->info; - chain->nondefault = cd->nondefault; - - /* cycle thru the single-event registrations first */ - OPAL_LIST_FOREACH(sing, &mca_pmix_pmix2x_component.single_events, opal_pmix2x_single_event_t) { - if (sing->code == chain->status) { + "%s _EVENT_HDLR RECEIVED NOTIFICATION FOR HANDLER %d OF STATUS %d", + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), (int)cd->id, cd->status); + + /* cycle thru the registrations */ + OPAL_LIST_FOREACH(event, &mca_pmix_pmix2x_component.events, opal_pmix2x_event_t) { + if (cd->id == event->index) { /* found it - invoke the handler, pointing its - * callback function to our progression function */ - OBJ_RETAIN(chain); - chain->sing = sing; + * callback function to our callback function */ opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s _EVENT_HDLR CALLING SINGLE EVHDLR", + "%s _EVENT_HDLR CALLING EVHDLR", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - sing->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); + event->handler(cd->status, &cd->pname, + cd->info, &cd->results, + return_local_event_hdlr, (void*)cd); return; } } - - /* if we didn't find any match in the single-event registrations, - * then cycle thru the multi-event registrations next */ - OPAL_LIST_FOREACH(multi, &mca_pmix_pmix2x_component.multi_events, opal_pmix2x_multi_event_t) { - for (n=0; n < multi->ncodes; n++) { - if (multi->codes[n] == chain->status) { - /* found it - invoke the handler, pointing its - * callback function to our progression function */ - OBJ_RETAIN(chain); - chain->multi = multi; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s _EVENT_HDLR CALLING MULTI EVHDLR", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - multi->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - return; - } - } - } - - /* if they didn't want it to go to a default handler, then we are done */ - if (chain->nondefault) { - /* if we get here, then we need to cache this event in case they - * register for it later - we cannot lose individual events */ - opal_list_append(&mca_pmix_pmix2x_component.cache, &chain->super); - return; - } - - /* we are done with the threadshift caddy */ - OBJ_RELEASE(cd); - - /* finally, pass it to any default handlers */ - if (0 < opal_list_get_size(&mca_pmix_pmix2x_component.default_events)) { - def = (opal_pmix2x_default_event_t*)opal_list_get_first(&mca_pmix_pmix2x_component.default_events); - OBJ_RETAIN(chain); - chain->def = def; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s _EVENT_HDLR CALLING DEFAULT EVHDLR", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - def->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - return; - } - - /* we still have to call their final callback */ - if (NULL != chain->final_cbfunc) { - chain->final_cbfunc(PMIX_SUCCESS, chain->final_cbdata); + /* if we didn't find a match, we still have to call their final callback */ + if (NULL != cd->pmixcbfunc) { + cd->pmixcbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cd->cbdata); } return; } @@ -385,6 +253,9 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), status); cd = OBJ_NEW(pmix2x_threadshift_t); + cd->id = evhdlr_registration_id; + cd->pmixcbfunc = cbfunc; + cd->cbdata = cbdata; /* convert the incoming status */ cd->status = pmix2x_convert_rc(status); @@ -409,9 +280,6 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id, if (NULL != info) { cd->info = OBJ_NEW(opal_list_t); for (n=0; n < ninfo; n++) { - if (0 == strncmp(info[n].key, PMIX_EVENT_NON_DEFAULT, PMIX_MAX_KEYLEN)) { - cd->nondefault = true; - } iptr = OBJ_NEW(opal_value_t); iptr->key = strdup(info[n].key); if (OPAL_SUCCESS != (rc = pmix2x_value_unload(iptr, &info[n].value))) { @@ -422,17 +290,25 @@ void pmix2x_event_hdlr(size_t evhdlr_registration_id, opal_list_append(cd->info, &iptr->super); } } + + /* convert the array of prior results */ + if (NULL != results) { + for (n=0; n < nresults; n++) { + iptr = OBJ_NEW(opal_value_t); + iptr->key = strdup(results[n].key); + if (OPAL_SUCCESS != (rc = pmix2x_value_unload(iptr, &results[n].value))) { + OPAL_ERROR_LOG(rc); + OBJ_RELEASE(iptr); + continue; + } + opal_list_append(&cd->results, &iptr->super); + } + } + /* now push it into the local thread */ event_assign(&cd->ev, opal_pmix_base.evbase, -1, EV_WRITE, _event_hdlr, cd); event_active(&cd->ev, EV_WRITE, 1); - - /* we don't need any of the data they provided, - * so let them go - also tell them that we will handle - * everything from this point forward */ - if (NULL != cbfunc) { - cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata); - } } opal_vpid_t pmix2x_convert_rank(pmix_rank_t rank) @@ -536,7 +412,7 @@ pmix_status_t pmix2x_convert_opalrc(int rc) case OPAL_SUCCESS: return PMIX_SUCCESS; default: - return PMIX_ERROR; + return rc; } } @@ -620,7 +496,7 @@ int pmix2x_convert_rc(pmix_status_t rc) case PMIX_SUCCESS: return OPAL_SUCCESS; default: - return OPAL_ERROR; + return rc; } } @@ -735,6 +611,10 @@ void pmix2x_value_load(pmix_value_t *v, { opal_pmix2x_jobid_trkr_t *job; bool found; + opal_list_t *list; + opal_value_t *val; + pmix_info_t *info; + size_t n; switch(kv->type) { case OPAL_UNDEF: @@ -876,8 +756,22 @@ void pmix2x_value_load(pmix_value_t *v, memcpy(&v->data.state, &kv->data.uint8, sizeof(uint8_t)); break; case OPAL_PTR: - v->type = PMIX_POINTER; - v->data.ptr = kv->data.ptr; + /* if someone returned a pointer, it must be to a list of + * opal_value_t's that we need to convert to a pmix_data_array + * of pmix_info_t structures */ + list = (opal_list_t*)kv->data.ptr; + v->type = PMIX_DATA_ARRAY; + v->data.darray = (pmix_data_array_t*)malloc(sizeof(pmix_data_array_t)); + v->data.darray->type = PMIX_INFO; + v->data.darray->size = opal_list_get_size(list); + PMIX_INFO_CREATE(info, v->data.darray->size); + v->data.darray->array = info; + n=0; + OPAL_LIST_FOREACH(val, list, opal_value_t) { + (void)strncpy(info[n].key, val->key, PMIX_MAX_KEYLEN); + pmix2x_value_load(&info[n].value, val); + ++n; + } break; default: /* silence warnings */ @@ -1041,16 +935,27 @@ int pmix2x_value_unload(opal_value_t *kv, return rc; } +static void errreg_cbfunc (pmix_status_t status, + size_t errhandler_ref, + void *cbdata) +{ + pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata; + + op->event->index = errhandler_ref; + opal_output_verbose(5, opal_pmix_base_framework.framework_output, + "PMIX2x errreg_cbfunc - error handler registered status=%d, reference=%lu", + status, (unsigned long)errhandler_ref); + if (NULL != op->evregcbfunc) { + op->evregcbfunc(pmix2x_convert_rc(status), errhandler_ref, op->cbdata); + } + OBJ_RELEASE(op); +} + static void _reg_hdlr(int sd, short args, void *cbdata) { pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata; - opal_pmix2x_event_chain_t *chain; - opal_pmix2x_single_event_t *sing = NULL; - opal_pmix2x_multi_event_t *multi = NULL; - opal_pmix2x_default_event_t *def = NULL; + pmix2x_opcaddy_t *op; opal_value_t *kv; - int i; - bool prepend = false; size_t n; opal_output_verbose(2, opal_pmix_base_framework.framework_output, @@ -1058,116 +963,46 @@ static void _reg_hdlr(int sd, short args, void *cbdata) OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), (NULL == cd->event_codes) ? "NULL" : "NON-NULL"); - if (NULL != cd->info) { - OPAL_LIST_FOREACH(kv, cd->info, opal_value_t) { - if (0 == strcmp(kv->key, OPAL_PMIX_EVENT_ORDER_PREPEND)) { - prepend = true; - break; - } - } - } + op = OBJ_NEW(pmix2x_opcaddy_t); + op->evregcbfunc = cd->cbfunc; + op->cbdata = cd->cbdata; - if (NULL == cd->event_codes) { - /* this is a default handler */ - def = OBJ_NEW(opal_pmix2x_default_event_t); - def->handler = cd->evhandler; - def->index = mca_pmix_pmix2x_component.evindex; - if (prepend) { - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s PREPENDING TO DEFAULT EVENTS", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - opal_list_prepend(&mca_pmix_pmix2x_component.default_events, &def->super); - } else { - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s APPENDING TO DEFAULT EVENTS", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - opal_list_append(&mca_pmix_pmix2x_component.default_events, &def->super); - } - } else if (1 == opal_list_get_size(cd->event_codes)) { - /* single handler */ - sing = OBJ_NEW(opal_pmix2x_single_event_t); - kv = (opal_value_t*)opal_list_get_first(cd->event_codes); - sing->code = kv->data.integer; - sing->index = mca_pmix_pmix2x_component.evindex; - sing->handler = cd->evhandler; - if (prepend) { - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s PREPENDING TO SINGLE EVENTS WITH CODE %d", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), sing->code); - opal_list_prepend(&mca_pmix_pmix2x_component.single_events, &sing->super); - } else { - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s APPENDING TO SINGLE EVENTS WITH CODE %d", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), sing->code); - opal_list_append(&mca_pmix_pmix2x_component.single_events, &sing->super); - } - } else { - multi = OBJ_NEW(opal_pmix2x_multi_event_t); - multi->ncodes = opal_list_get_size(cd->event_codes); - multi->codes = (int*)malloc(multi->ncodes * sizeof(int)); - i=0; + /* convert the event codes */ + if (NULL != cd->event_codes) { + op->ncodes = opal_list_get_size(cd->event_codes); + op->pcodes = (pmix_status_t*)malloc(op->ncodes * sizeof(pmix_status_t)); + n=0; OPAL_LIST_FOREACH(kv, cd->event_codes, opal_value_t) { - multi->codes[i] = kv->data.integer; - ++i; - } - multi->index = mca_pmix_pmix2x_component.evindex; - multi->handler = cd->evhandler; - if (prepend) { - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s PREPENDING TO MULTI EVENTS", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - opal_list_prepend(&mca_pmix_pmix2x_component.multi_events, &multi->super); - } else { - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "%s APPENDING TO MULTI EVENTS", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); - opal_list_append(&mca_pmix_pmix2x_component.multi_events, &multi->super); + op->pcodes[n] = pmix2x_convert_opalrc(kv->data.integer); } } - /* release the caller */ - if (NULL != cd->cbfunc) { - cd->cbfunc(OPAL_SUCCESS, mca_pmix_pmix2x_component.evindex, cd->cbdata); - } - mca_pmix_pmix2x_component.evindex++; - - /* check if any matching notifications have been cached - only nondefault - * events will have been cached*/ - if (NULL == def) { - /* check single code registrations */ - if (NULL != sing) { - OPAL_LIST_FOREACH(chain, &mca_pmix_pmix2x_component.cache, opal_pmix2x_event_chain_t) { - if (sing->code == chain->status) { - opal_list_remove_item(&mca_pmix_pmix2x_component.cache, &chain->super); - chain->sing = sing; - sing->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - OBJ_RELEASE(cd); - return; - } - } - } else if (NULL != multi) { - /* check for multi code registrations */ - OPAL_LIST_FOREACH(chain, &mca_pmix_pmix2x_component.cache, opal_pmix2x_event_chain_t) { - for (n=0; n < multi->ncodes; n++) { - if (multi->codes[n] == chain->status) { - opal_list_remove_item(&mca_pmix_pmix2x_component.cache, &chain->super); - chain->multi = multi; - multi->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - OBJ_RELEASE(cd); - return; - } - } + /* convert the list of info to an array of pmix_info_t */ + if (NULL != cd->info) { + op->ninfo = opal_list_get_size(cd->info); + if (0 < op->ninfo) { + PMIX_INFO_CREATE(op->info, op->ninfo); + n=0; + OPAL_LIST_FOREACH(kv, cd->info, opal_value_t) { + (void)strncpy(op->info[n].key, kv->key, PMIX_MAX_KEYLEN); + pmix2x_value_load(&op->info[n].value, kv); + ++n; } } } + /* register the event */ + op->event = OBJ_NEW(opal_pmix2x_event_t); + op->event->handler = cd->evhandler; + opal_list_append(&mca_pmix_pmix2x_component.events, &op->event->super); + PMIx_Register_event_handler(op->pcodes, op->ncodes, + op->info, op->ninfo, + pmix2x_event_hdlr, errreg_cbfunc, op); + OBJ_RELEASE(cd); return; } + static void register_handler(opal_list_t *event_codes, opal_list_t *info, opal_pmix_notification_fn_t evhandler, @@ -1184,36 +1019,20 @@ static void register_handler(opal_list_t *event_codes, static void _dereg_hdlr(int sd, short args, void *cbdata) { pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata; - opal_pmix2x_single_event_t *sing; - opal_pmix2x_multi_event_t *multi; - opal_pmix2x_default_event_t *def; - - /* check the single events first */ - OPAL_LIST_FOREACH(sing, &mca_pmix_pmix2x_component.single_events, opal_pmix2x_single_event_t) { - if (cd->handler == sing->index) { - opal_list_remove_item(&mca_pmix_pmix2x_component.single_events, &sing->super); - OBJ_RELEASE(sing); - goto release; - } - } - /* check multi events */ - OPAL_LIST_FOREACH(multi, &mca_pmix_pmix2x_component.multi_events, opal_pmix2x_multi_event_t) { - if (cd->handler == multi->index) { - opal_list_remove_item(&mca_pmix_pmix2x_component.multi_events, &multi->super); - OBJ_RELEASE(multi); - goto release; - } - } - /* check default events */ - OPAL_LIST_FOREACH(def, &mca_pmix_pmix2x_component.default_events, opal_pmix2x_default_event_t) { - if (cd->handler == def->index) { - opal_list_remove_item(&mca_pmix_pmix2x_component.default_events, &def->super); - OBJ_RELEASE(def); + opal_pmix2x_event_t *event; + + /* look for this event */ + OPAL_LIST_FOREACH(event, &mca_pmix_pmix2x_component.events, opal_pmix2x_event_t) { + if (cd->handler == event->index) { + opal_list_remove_item(&mca_pmix_pmix2x_component.events, &event->super); + OBJ_RELEASE(event); break; } } + /* tell the library to deregister this handler */ + PMIx_Deregister_event_handler(cd->handler, NULL, NULL); - release: + /* release the caller */ if (NULL != cd->opcbfunc) { cd->opcbfunc(OPAL_SUCCESS, cd->cbdata); } @@ -1230,90 +1049,81 @@ static void deregister_handler(size_t evhandler, return; } -static void _notify_event(int sd, short args, void *cbdata) +static void notify_complete(pmix_status_t status, void *cbdata) { - pmix2x_threadshift_t *cd = (pmix2x_threadshift_t*)cbdata; - size_t i; - opal_pmix2x_single_event_t *sing; - opal_pmix2x_multi_event_t *multi; - opal_pmix2x_default_event_t *def; - opal_pmix2x_event_chain_t *chain; - - /* check the single events first */ - OPAL_LIST_FOREACH(sing, &mca_pmix_pmix2x_component.single_events, opal_pmix2x_single_event_t) { - if (cd->status == sing->code) { - /* found it - invoke the handler, pointing its - * callback function to our progression function */ - chain = OBJ_NEW(opal_pmix2x_event_chain_t); - chain->status = cd->status; - chain->range = pmix2x_convert_opalrange(cd->range); - chain->source = *(cd->source); - chain->info = cd->info; - chain->final_cbfunc = cd->opcbfunc; - chain->final_cbdata = cd->cbdata; - chain->sing = sing; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "[%s] CALLING SINGLE EVHDLR FOR STATUS %d", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), chain->status); - sing->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - OBJ_RELEASE(cd); - return; - } + pmix2x_opcaddy_t *op = (pmix2x_opcaddy_t*)cbdata; + if (NULL != op->opcbfunc) { + op->opcbfunc(pmix2x_convert_rc(status), op->cbdata); } - /* check multi events */ - OPAL_LIST_FOREACH(multi, &mca_pmix_pmix2x_component.multi_events, opal_pmix2x_multi_event_t) { - for (i=0; i < multi->ncodes; i++) { - if (cd->status == multi->codes[i]) { - /* found it - invoke the handler, pointing its - * callback function to our progression function */ - chain = OBJ_NEW(opal_pmix2x_event_chain_t); - chain->status = cd->status; - chain->range = pmix2x_convert_opalrange(cd->range); - chain->source = *(cd->source); - chain->info = cd->info; - chain->final_cbfunc = cd->opcbfunc; - chain->final_cbdata = cd->cbdata; - chain->multi = multi; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "[%s] CALLING MULTI EVHDLR FOR STATUS %d", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), chain->status); - multi->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - OBJ_RELEASE(cd); - return; + OBJ_RELEASE(op); +} + +static void _notify(int sd, short args, void *cbdata) +{ + pmix2x_threadshift_t *cd = (pmix2x_threadshift_t *)cbdata; + pmix2x_opcaddy_t *op; + opal_value_t *kv; + pmix_proc_t p, *pptr; + pmix_status_t pstatus; + size_t n; + int rc=OPAL_SUCCESS; + pmix_data_range_t prange; + opal_pmix2x_jobid_trkr_t *job, *jptr; + + op = OBJ_NEW(pmix2x_opcaddy_t); + + /* convert the status */ + pstatus = pmix2x_convert_opalrc(cd->status); + + /* convert the source */ + if (NULL == cd->source) { + pptr = NULL; + } else { + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) { + if (jptr->jobid == cd->source->jobid) { + job = jptr; + break; } } + if (NULL == job) { + rc = OPAL_ERR_NOT_FOUND; + goto release; + } + (void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN); + p.rank = pmix2x_convert_opalrank(cd->source->vpid); + pptr = &p; } - /* check default events */ - if (0 < opal_list_get_size(&mca_pmix_pmix2x_component.default_events)) { - def = (opal_pmix2x_default_event_t*)opal_list_get_first(&mca_pmix_pmix2x_component.default_events); - chain = OBJ_NEW(opal_pmix2x_event_chain_t); - chain->status = cd->status; - chain->range = pmix2x_convert_opalrange(cd->range); - chain->source = *(cd->source); - chain->info = cd->info; - chain->final_cbfunc = cd->opcbfunc; - chain->final_cbdata = cd->cbdata; - chain->def = def; - opal_output_verbose(2, opal_pmix_base_framework.framework_output, - "[%s] CALLING DEFAULT EVHDLR FOR STATUS %d", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), chain->status); - def->handler(chain->status, &chain->source, - chain->info, &chain->results, - progress_local_event_hdlr, (void*)chain); - OBJ_RELEASE(cd); - return; + + /* convert the range */ + prange = pmix2x_convert_opalrange(cd->range); + + /* convert the list of info */ + if (NULL != cd->info) { + op->ninfo = opal_list_get_size(cd->info); + if (0 < op->ninfo) { + PMIX_INFO_CREATE(op->info, op->ninfo); + n=0; + OPAL_LIST_FOREACH(kv, cd->info, opal_value_t) { + (void)strncpy(op->info[n].key, kv->key, PMIX_MAX_KEYLEN); + pmix2x_value_load(&op->info[n].value, kv); + ++n; + } + } } - /* if we get here, then there are no registered event handlers */ + /* ask the library to notify our clients */ + pstatus = PMIx_Notify_event(pstatus, pptr, prange, op->info, op->ninfo, notify_complete, op); + rc = pmix2x_convert_rc(pstatus); + + release: + /* release the caller */ if (NULL != cd->opcbfunc) { - cd->opcbfunc(OPAL_ERR_NOT_FOUND, cd->cbdata); + cd->opcbfunc(rc, cd->cbdata); } OBJ_RELEASE(cd); - return; } static int notify_event(int status, @@ -1324,7 +1134,7 @@ static int notify_event(int status, { /* we must threadshift this request as we might not be in an event * and we are going to access framework-global lists/objects */ - OPAL_PMIX_NOTIFY_THREADSHIFT(status, source, range, info, _notify_event, cbfunc, cbdata); + OPAL_PMIX_NOTIFY_THREADSHIFT(status, source, range, info, _notify, cbfunc, cbdata); return OPAL_SUCCESS; } @@ -1401,47 +1211,14 @@ OBJ_CLASS_INSTANCE(opal_pmix2x_jobid_trkr_t, opal_list_item_t, NULL, NULL); -OBJ_CLASS_INSTANCE(opal_pmix2x_single_event_t, - opal_list_item_t, - NULL, NULL); - -static void mtevcon(opal_pmix2x_multi_event_t *p) -{ - p->codes = NULL; - p->ncodes = 0; -} -static void mtevdes(opal_pmix2x_multi_event_t *p) -{ - if (NULL != p->codes) { - free(p->codes); - } -} -OBJ_CLASS_INSTANCE(opal_pmix2x_multi_event_t, - opal_list_item_t, - mtevcon, mtevdes); - -OBJ_CLASS_INSTANCE(opal_pmix2x_default_event_t, - opal_list_item_t, - NULL, NULL); - -static void chcon(opal_pmix2x_event_chain_t *p) -{ - p->nondefault = false; - p->info = NULL; - OBJ_CONSTRUCT(&p->results, opal_list_t); - p->sing = NULL; - p->multi = NULL; - p->def = NULL; - p->final_cbfunc = NULL; - p->final_cbdata = NULL; -} -static void chdes(opal_pmix2x_event_chain_t *p) +static void evcon(opal_pmix2x_event_t *p) { - OPAL_LIST_DESTRUCT(&p->results); + p->handler = NULL; + p->cbdata = NULL; } -OBJ_CLASS_INSTANCE(opal_pmix2x_event_chain_t, +OBJ_CLASS_INSTANCE(opal_pmix2x_event_t, opal_list_item_t, - chcon, chdes); + evcon, NULL); static void opcon(pmix2x_opcaddy_t *p) { @@ -1455,11 +1232,15 @@ static void opcon(pmix2x_opcaddy_t *p) p->apps = NULL; p->sz = 0; p->active = false; + p->codes = NULL; + p->pcodes = NULL; + p->event = NULL; p->opcbfunc = NULL; p->mdxcbfunc = NULL; p->valcbfunc = NULL; p->lkcbfunc = NULL; p->spcbfunc = NULL; + p->evregcbfunc = NULL; p->cbdata = NULL; } static void opdes(pmix2x_opcaddy_t *p) @@ -1476,6 +1257,9 @@ static void opdes(pmix2x_opcaddy_t *p) if (NULL != p->apps) { PMIX_APP_FREE(p->apps, p->sz); } + if (NULL != p->pcodes) { + free(p->pcodes); + } } OBJ_CLASS_INSTANCE(pmix2x_opcaddy_t, opal_object_t, @@ -1513,12 +1297,17 @@ static void tscon(pmix2x_threadshift_t *p) p->source = NULL; p->event_codes = NULL; p->info = NULL; + OBJ_CONSTRUCT(&p->results, opal_list_t); p->evhandler = NULL; p->nondefault = false; p->cbfunc = NULL; p->opcbfunc = NULL; p->cbdata = NULL; } +static void tsdes(pmix2x_threadshift_t *p) +{ + OPAL_LIST_DESTRUCT(&p->results); +} OBJ_CLASS_INSTANCE(pmix2x_threadshift_t, opal_object_t, - tscon, NULL); + tscon, tsdes); diff --git a/opal/mca/pmix/pmix2x/pmix2x.h b/opal/mca/pmix/pmix2x/pmix2x.h index 3abb4508660..5e137322736 100644 --- a/opal/mca/pmix/pmix2x/pmix2x.h +++ b/opal/mca/pmix/pmix2x/pmix2x.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2016 Research Organization for Information Science @@ -39,9 +39,7 @@ typedef struct { opal_list_t jobids; bool native_launch; size_t evindex; - opal_list_t single_events; - opal_list_t multi_events; - opal_list_t default_events; + opal_list_t events; int cache_size; opal_list_t cache; } mca_pmix_pmix2x_component_t; @@ -61,42 +59,10 @@ OBJ_CLASS_DECLARATION(opal_pmix2x_jobid_trkr_t); typedef struct { opal_list_item_t super; size_t index; - int code; opal_pmix_notification_fn_t handler; -} opal_pmix2x_single_event_t; -OBJ_CLASS_DECLARATION(opal_pmix2x_single_event_t); - -typedef struct { - opal_list_item_t super; - size_t index; - int *codes; - size_t ncodes; - opal_pmix_notification_fn_t handler; -} opal_pmix2x_multi_event_t; -OBJ_CLASS_DECLARATION(opal_pmix2x_multi_event_t); - -typedef struct { - opal_list_item_t super; - size_t index; - opal_pmix_notification_fn_t handler; -} opal_pmix2x_default_event_t; -OBJ_CLASS_DECLARATION(opal_pmix2x_default_event_t); - -typedef struct { - opal_list_item_t super; - int status; - bool nondefault; - opal_process_name_t source; - pmix_data_range_t range; - opal_list_t *info; - opal_list_t results; - opal_pmix2x_single_event_t *sing; - opal_pmix2x_multi_event_t *multi; - opal_pmix2x_default_event_t *def; - opal_pmix_op_cbfunc_t final_cbfunc; - void *final_cbdata; -} opal_pmix2x_event_chain_t; -OBJ_CLASS_DECLARATION(opal_pmix2x_event_chain_t); + void *cbdata; +} opal_pmix2x_event_t; +OBJ_CLASS_DECLARATION(opal_pmix2x_event_t); typedef struct { opal_object_t super; @@ -111,11 +77,16 @@ typedef struct { pmix_app_t *apps; size_t sz; volatile bool active; + opal_list_t *codes; + pmix_status_t *pcodes; + size_t ncodes; + opal_pmix2x_event_t *event; opal_pmix_op_cbfunc_t opcbfunc; opal_pmix_modex_cbfunc_t mdxcbfunc; opal_pmix_value_cbfunc_t valcbfunc; opal_pmix_lookup_cbfunc_t lkcbfunc; opal_pmix_spawn_cbfunc_t spcbfunc; + opal_pmix_evhandler_reg_cbfunc_t evregcbfunc; void *cbdata; } pmix2x_opcaddy_t; OBJ_CLASS_DECLARATION(pmix2x_opcaddy_t); @@ -152,28 +123,15 @@ typedef struct { size_t handler; opal_list_t *event_codes; opal_list_t *info; + opal_list_t results; opal_pmix_notification_fn_t evhandler; opal_pmix_evhandler_reg_cbfunc_t cbfunc; opal_pmix_op_cbfunc_t opcbfunc; + pmix_event_notification_cbfunc_fn_t pmixcbfunc; void *cbdata; } pmix2x_threadshift_t; OBJ_CLASS_DECLARATION(pmix2x_threadshift_t); -#define OPAL_PMIX_OPCD_THREADSHIFT(i, s, sr, if, nif, fn, cb, cd) \ - do { \ - pmix2x_opalcaddy_t *_cd; \ - _cd = OBJ_NEW(pmix2x_opalcaddy_t); \ - _cd->id = (i); \ - _cd->status = (s); \ - _cd->source = (sr); \ - _cd->info = (i); \ - _cd->evcbfunc = (cb); \ - _cd->cbdata = (cd); \ - event_assign(&((_cd)->ev), opal_pmix_base.evbase, \ - -1, EV_WRITE, (fn), (_cd)); \ - event_active(&((_cd)->ev), EV_WRITE, 1); \ - } while(0) - #define OPAL_PMIX_OP_THREADSHIFT(e, fn, cb, cd) \ do { \ pmix2x_threadshift_t *_cd; \ diff --git a/opal/mca/pmix/pmix2x/pmix2x_client.c b/opal/mca/pmix/pmix2x/pmix2x_client.c index c2728a68e49..bb3e891855a 100644 --- a/opal/mca/pmix/pmix2x/pmix2x_client.c +++ b/opal/mca/pmix/pmix2x/pmix2x_client.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Mellanox Technologies, Inc. @@ -36,7 +36,6 @@ static pmix_proc_t my_proc; static char *dbgvalue=NULL; -static size_t errhdler_ref = 0; #define PMIX_WAIT_FOR_COMPLETION(a) \ do { \ @@ -50,7 +49,9 @@ static void errreg_cbfunc (pmix_status_t status, size_t errhandler_ref, void *cbdata) { - errhdler_ref = errhandler_ref; + opal_pmix2x_event_t *event = (opal_pmix2x_event_t*)cbdata; + + event->index = errhandler_ref; opal_output_verbose(5, opal_pmix_base_framework.framework_output, "PMIX client errreg_cbfunc - error handler registered status=%d, reference=%lu", status, (unsigned long)errhandler_ref); @@ -62,6 +63,7 @@ int pmix2x_client_init(void) pmix_status_t rc; int dbg; opal_pmix2x_jobid_trkr_t *job; + opal_pmix2x_event_t *event; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "PMIx_client init"); @@ -98,7 +100,9 @@ int pmix2x_client_init(void) opal_proc_set_name(&pname); /* register the default event handler */ - PMIx_Register_event_handler(NULL, 0, NULL, 0, pmix2x_event_hdlr, errreg_cbfunc, NULL); + event = OBJ_NEW(opal_pmix2x_event_t); + opal_list_append(&mca_pmix_pmix2x_component.events, &event->super); + PMIx_Register_event_handler(NULL, 0, NULL, 0, pmix2x_event_hdlr, errreg_cbfunc, event); return OPAL_SUCCESS; } @@ -106,12 +110,16 @@ int pmix2x_client_init(void) int pmix2x_client_finalize(void) { pmix_status_t rc; + opal_pmix2x_event_t *event; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "PMIx_client finalize"); - /* deregister the default event handler */ - PMIx_Deregister_event_handler(errhdler_ref, NULL, NULL); + /* deregister all event handlers */ + OPAL_LIST_FOREACH(event, &mca_pmix_pmix2x_component.events, opal_pmix2x_event_t) { + PMIx_Deregister_event_handler(event->index, NULL, NULL); + } + /* the list will be destructed when the component is finalized */ rc = PMIx_Finalize(NULL, 0); return pmix2x_convert_rc(rc); diff --git a/opal/mca/pmix/pmix2x/pmix2x_component.c b/opal/mca/pmix/pmix2x/pmix2x_component.c index 3079cbc3b75..bd8b74fc163 100644 --- a/opal/mca/pmix/pmix2x/pmix2x_component.c +++ b/opal/mca/pmix/pmix2x/pmix2x_component.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. @@ -36,7 +36,6 @@ const char *opal_pmix_pmix2x_component_version_string = static int external_open(void); static int external_close(void); static int external_component_query(mca_base_module_t **module, int *priority); -static int external_register(void); /* @@ -66,7 +65,6 @@ mca_pmix_pmix2x_component_t mca_pmix_pmix2x_component = { .mca_open_component = external_open, .mca_close_component = external_close, .mca_query_component = external_component_query, - .mca_register_component_params = external_register, }, /* Next the MCA v1.0.0 component meta data */ .base_data = { @@ -77,27 +75,11 @@ mca_pmix_pmix2x_component_t mca_pmix_pmix2x_component = { .native_launch = false }; -static int external_register(void) -{ - mca_pmix_pmix2x_component.cache_size = 256; - mca_base_component_var_register(&mca_pmix_pmix2x_component.super.base_version, - "cache_size", "Size of the ring buffer cache for events", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &mca_pmix_pmix2x_component.cache_size); - - return OPAL_SUCCESS; -} - - static int external_open(void) { mca_pmix_pmix2x_component.evindex = 0; OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.jobids, opal_list_t); - OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.single_events, opal_list_t); - OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.multi_events, opal_list_t); - OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.default_events, opal_list_t); - OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.cache, opal_list_t); + OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.events, opal_list_t); return OPAL_SUCCESS; } @@ -105,10 +87,7 @@ static int external_open(void) static int external_close(void) { OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.jobids); - OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.single_events); - OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.multi_events); - OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.default_events); - OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.cache); + OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.events); return OPAL_SUCCESS; } diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index 4e6f9ef0b9d..f6809cb8e3b 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -88,6 +88,7 @@ BEGIN_C_DECLS #define OPAL_PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories /* information about relative ranks as assigned by the RM */ +#define OPAL_PMIX_PROCID "pmix.procid" // (opal_process_name_t) process identifier #define OPAL_PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job #define OPAL_PMIX_JOBID "pmix.jobid" // (uint32_t) jobid assigned by scheduler #define OPAL_PMIX_APPNUM "pmix.appnum" // (uint32_t) app number within the job @@ -117,6 +118,8 @@ BEGIN_C_DECLS #define OPAL_PMIX_LOCAL_CPUSETS "pmix.lcpus" // (char*) colon-delimited cpusets of local peers within the specified nspace #define OPAL_PMIX_PROC_URI "opal.puri" // (char*) URI containing contact info for proc - NOTE: this is published by procs and // thus cannot be prefixed with "pmix" +#define OPAL_PMIX_DAEMON_MEMORY "pmix.dmn.mem" // (float) Mbytes of memory currently used by daemon +#define OPAL_PMIX_CLIENT_AVG_MEMORY "pmix.cl.mem.avg" // (float) Average Mbytes of memory used by client processes /* size info */ #define OPAL_PMIX_UNIV_SIZE "pmix.univ.size" // (uint32_t) #procs in this nspace @@ -220,6 +223,8 @@ BEGIN_C_DECLS #define OPAL_PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform" #define OPAL_PMIX_QUERY_SPAWN_SUPPORT "pmix.qry.spawn" // return a comma-delimited list of supported spawn attributes #define OPAL_PMIX_QUERY_DEBUG_SUPPORT "pmix.qry.debug" // return a comma-delimited list of supported debug attributes +#define OPAL_PMIX_QUERY_MEMORY_USAGE "pmix.qry.mem" // return info on memory usage for the procs indicated in the qualifiers +#define OPAL_PMIX_QUERY_LOCAL_ONLY "pmix.qry.local" // constrain the query to local information only /* log attributes */ #define OPAL_PMIX_LOG_STDERR "pmix.log.stderr" // (bool) log data to stderr diff --git a/opal/util/proc.c b/opal/util/proc.c index bf76399c40a..2481408a17f 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c @@ -4,7 +4,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights @@ -200,10 +200,9 @@ char* opal_get_proc_hostname(const opal_proc_t *proc) } /* if we don't already have it, then try to get it */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_HOSTNAME, &proc->proc_name, - (char**)&(proc->proc_hostname), OPAL_STRING); + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_HOSTNAME, &proc->proc_name, + (char**)&(proc->proc_hostname), OPAL_STRING); if (OPAL_SUCCESS != ret) { - OPAL_ERROR_LOG(ret); return "unknown"; // return something so the caller doesn't segfault } diff --git a/orte/mca/dfs/app/dfs_app_component.c b/orte/mca/dfs/app/dfs_app_component.c index 395c98da022..1479007ac0e 100644 --- a/orte/mca/dfs/app/dfs_app_component.c +++ b/orte/mca/dfs/app/dfs_app_component.c @@ -3,6 +3,7 @@ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * + * Copyright (c) 2016 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -71,7 +72,7 @@ static int dfs_app_close(void) static int dfs_app_component_query(mca_base_module_t **module, int *priority) { - if (ORTE_PROC_IS_APP && orte_staged_execution) { + if (ORTE_PROC_IS_APP) { /* set our priority high as we are the default for apps */ *priority = 1000; *module = (mca_base_module_t *)&orte_dfs_app_module; diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 170804775cc..7997c282b14 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -12,7 +12,7 @@ * Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -89,7 +89,6 @@ static int rte_init(void) char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; - char *rmluri; opal_value_t *kv; char *val; int u32, *u32ptr; @@ -399,19 +398,9 @@ static int rte_init(void) orte_process_info.max_procs = orte_process_info.num_procs; } - /*** PUSH DATA FOR OTHERS TO FIND ***/ - - /* push our RML URI in case others need to talk directly to us */ - rmluri = orte_rml.get_contact_info(); - /* push it out for others to use */ - OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_PROC_URI, rmluri, OPAL_STRING); - if (ORTE_SUCCESS != ret) { - error = "pmix put uri"; - goto error; - } - free(rmluri); - - /* push our hostname so others can find us, if they need to */ + /* push our hostname so others can find us, if they need to - the + * native PMIx component will ignore this request as the hostname + * is provided by the system */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); if (ORTE_SUCCESS != ret) { error = "db store hostname"; diff --git a/orte/mca/ess/singleton/ess_singleton_module.c b/orte/mca/ess/singleton/ess_singleton_module.c index 7e523219cd5..2f2e5376ac8 100644 --- a/orte/mca/ess/singleton/ess_singleton_module.c +++ b/orte/mca/ess/singleton/ess_singleton_module.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2016 Research Organization for Information Science @@ -336,13 +336,6 @@ static int rte_init(void) return rc; } - /* push our hostname so others can find us, if they need to */ - OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); - if (ORTE_SUCCESS != ret) { - error = "db store hostname"; - goto error; - } - return ORTE_SUCCESS; error: diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 5a228a80635..e9bad1cf347 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1341,9 +1341,6 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, opal_argv_append(argc, argv, "1"); } - if (orte_map_reduce) { - opal_argv_append(argc, argv, "--mapreduce"); - } if (orte_map_stddiag_to_stderr) { opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID); opal_argv_append(argc, argv, "orte_map_stddiag_to_stderr"); diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index b35a798ce79..6e561dacb0b 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -1101,11 +1101,6 @@ static int setup_child(orte_job_t *jdata, opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env); } - /* if we are using staged execution, tell it */ - if (orte_staged_execution) { - opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env); - } - /* if the proc isn't going to forward IO, then we need to flag that * it has "completed" iof termination as otherwise it will never fire */ diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 0c4d928c3b9..c8a2907c16f 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -126,7 +126,6 @@ static struct { int uri_pipe; int singleton_died_pipe; bool abort; - bool mapreduce; bool tree_spawn; char *hnp_topo_sig; bool test_suicide; @@ -217,10 +216,6 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { &orted_globals.hnp_topo_sig, OPAL_CMD_LINE_TYPE_STRING, "Topology signature of HNP" }, - { NULL, '\0', "mapreduce", "mapreduce", 0, - &orted_globals.mapreduce, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to report process bindings to stderr" }, - /* End of list */ { NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } @@ -335,11 +330,6 @@ int orte_daemon(int argc, char *argv[]) free(tmp_env_var); #endif - /* if mapreduce set, flag it */ - if (orted_globals.mapreduce) { - orte_map_reduce = true; - } - /* detach from controlling terminal * otherwise, remain attached so output can get to us */ diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index 2b2dae6fdb1..848ce8e3a67 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -14,7 +14,7 @@ * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -137,6 +137,9 @@ static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jar static int parse_appfile(orte_job_t *jdata, char *filename, char ***env); static void orte_timeout_wakeup(int sd, short args, void *cbdata); static void orte_profile_wakeup(int sd, short args, void *cbdata); +static void profile_recv(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, orte_rml_tag_t tag, + void* cbdata); static void launch_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata); @@ -896,20 +899,6 @@ int orte_submit_job(char *argv[], int *index, } } - /* check for debugger test envars and forward them if necessary */ - if (NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) { - char *evar; - evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"); - for (i=0; i < (orte_app_idx_t)jdata->num_apps; i++) { - if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { - opal_setenv("ORTE_TEST_DEBUGGER_ATTACH", "1", true, &app->env); - if (NULL != evar) { - opal_setenv("ORTE_TEST_DEBUGGER_SLEEP", evar, true, &app->env); - } - } - } - } - /* check for suicide test directives */ if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") || NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { @@ -956,6 +945,8 @@ int orte_submit_job(char *argv[], int *index, ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE); //goto DONE; } + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MEMPROFILE, + ORTE_RML_PERSISTENT, profile_recv, NULL); orte_memprofile_timeout->tv.tv_sec = timeout_seconds; orte_memprofile_timeout->tv.tv_usec = 0; opal_event_evtimer_set(orte_event_base, orte_memprofile_timeout->ev, @@ -2212,10 +2203,9 @@ static void orte_debugger_init_before_spawn(orte_job_t *jdata) static bool mpir_breakpoint_fired = false; -static void _send_notification(void) +static void _send_notification(int status) { opal_buffer_t buf; - int status = OPAL_ERR_DEBUGGER_RELEASE; orte_grpcomm_signature_t sig; int rc; opal_value_t kv, *kvptr; @@ -2448,7 +2438,7 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) "%s NOTIFYING DEBUGGER RELEASE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* notify all procs that the debugger is ready */ - _send_notification(); + _send_notification(OPAL_ERR_DEBUGGER_RELEASE); } } return; @@ -2547,7 +2537,7 @@ void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) "%s NOTIFYING DEBUGGER RELEASE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* notify all procs that the debugger is ready */ - _send_notification(); + _send_notification(OPAL_ERR_DEBUGGER_RELEASE); } else if (!orte_debugger_test_attach) { /* if I am launching debugger daemons, then I need to do so now * that the job has been started and I know which nodes have @@ -3133,6 +3123,16 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata) static int nreports = 0; static orte_timer_t profile_timer; +static int nchecks = 0; + +static void profile_timeout(int sd, short args, void *cbdata) +{ + /* abort the job */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); + /* set the global abnormal exit flag */ + orte_abnormal_term_ordered = true; +} + static void profile_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, @@ -3167,26 +3167,33 @@ static void profile_recv(int status, orte_process_name_t* sender, done: --nreports; if (nreports == 0) { + ++nchecks; /* cancel the timeout */ OBJ_DESTRUCT(&profile_timer); - /* abort the job */ - ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); - /* set the global abnormal exit flag */ - orte_abnormal_term_ordered = true; + /* notify to release */ + _send_notification(12345); + /* if this was the first measurement, then we need to + * let the probe move along */ + if (2 > nchecks) { + /* reset the event */ + opal_event_evtimer_set(orte_event_base, orte_memprofile_timeout->ev, + orte_profile_wakeup, NULL); + opal_event_set_priority(orte_memprofile_timeout->ev, ORTE_ERROR_PRI); + opal_event_evtimer_add(orte_memprofile_timeout->ev, &orte_memprofile_timeout->tv); + /* reset the timer */ + OBJ_CONSTRUCT(&profile_timer, orte_timer_t); + opal_event_evtimer_set(orte_event_base, + profile_timer.ev, profile_timeout, NULL); + opal_event_set_priority(profile_timer.ev, ORTE_ERROR_PRI); + profile_timer.tv.tv_sec = 30; + opal_event_evtimer_add(profile_timer.ev, &profile_timer.tv); + return; + } } } -static void profile_timeout(int sd, short args, void *cbdata) -{ - /* abort the job */ - ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); - /* set the global abnormal exit flag */ - orte_abnormal_term_ordered = true; -} - void orte_profile_wakeup(int sd, short args, void *cbdata) { - orte_job_t *jdata = (orte_job_t*)cbdata; orte_job_t *dmns; orte_proc_t *dmn; int i; @@ -3202,8 +3209,6 @@ void orte_profile_wakeup(int sd, short args, void *cbdata) /* set the recv */ nreports = 1; // always get a report from ourselves - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_MEMPROFILE, - ORTE_RML_PERSISTENT, profile_recv, NULL); /* setup the buffer */ buffer = OBJ_NEW(opal_buffer_t); @@ -3213,13 +3218,6 @@ void orte_profile_wakeup(int sd, short args, void *cbdata) OBJ_RELEASE(buffer); goto giveup; } - /* pack the jobid in question */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &jdata->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - goto giveup; - } - /* goes to just the first daemon beyond ourselves - no need to get it from everyone */ dmns = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (NULL != (dmn = (orte_proc_t*)opal_pointer_array_get_item(dmns->procs, 1))) { diff --git a/orte/orted/pmix/pmix_server_gen.c b/orte/orted/pmix/pmix_server_gen.c index 69a83825589..ff36dd5b3b8 100644 --- a/orte/orted/pmix/pmix_server_gen.c +++ b/orte/orted/pmix/pmix_server_gen.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014 Research Organization for Information Science @@ -35,6 +35,7 @@ #include "opal/util/argv.h" #include "opal/util/output.h" #include "opal/dss/dss.h" +#include "opal/mca/pstat/pstat.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/rmaps_types.h" @@ -449,13 +450,18 @@ static void _query(int sd, short args, void *cbdata) opal_pmix_query_t *q; opal_value_t *kv; orte_job_t *jdata; - int rc; - opal_list_t *results; + orte_proc_t *proct; + int rc, i, num_replies; + opal_list_t *results, targets, *array; size_t n; uint32_t key; void *nptr; char **nspaces=NULL, nspace[512]; char **ans = NULL; + bool local_only; + orte_namelist_t *nm; + opal_pstats_t pstat; + float pss; opal_output_verbose(2, orte_pmix_server_globals.output, "%s processing query", @@ -522,6 +528,84 @@ static void _query(int sd, short args, void *cbdata) opal_list_append(results, &kv->super); opal_argv_free(ans); ans = NULL; + } else if (0 == strcmp(q->keys[n], OPAL_PMIX_QUERY_MEMORY_USAGE)) { + /* check the qualifiers to find the procs they want to + * know about - if qualifiers are NULL, then get it for + * the daemons + all active jobs */ + if (0 == opal_list_get_size(&q->qualifiers)) { + /* create a request tracker */ + /* xcast a request for all memory usage */ + /* return success - the callback will be done + * once we get the results */ + return; + } + + /* scan the qualifiers */ + OPAL_LIST_FOREACH(kv, &q->qualifiers, opal_value_t) { + if (0 == strcmp(kv->key, OPAL_PMIX_QUERY_LOCAL_ONLY)) { + if (OPAL_UNDEF == kv->type || kv->data.flag) { + local_only = true; + } else { + local_only = false; + } + } else if (0 == strcmp(kv->key, OPAL_PMIX_PROCID)) { + /* save this directive on our list of targets */ + nm = OBJ_NEW(orte_namelist_t); + } + } + + /* if they have asked for only our local procs or daemon, + * then we can just get the data directly */ + if (local_only) { + if (0 == opal_list_get_size(&targets)) { + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_QUERY_MEMORY_USAGE); + kv->type = OPAL_PTR; + array = OBJ_NEW(opal_list_t); + kv->data.ptr = array; + opal_list_append(results, &kv->super); + /* collect my memory usage */ + OBJ_CONSTRUCT(&pstat, opal_pstats_t); + opal_pstat.query(orte_process_info.pid, &pstat, NULL); + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_DAEMON_MEMORY); + kv->type = OPAL_FLOAT; + kv->data.fval = pstat.pss; + opal_list_append(array, &kv->super); + OBJ_DESTRUCT(&pstat); + /* collect the memory usage of all my children */ + pss = 0.0; + num_replies = 0; + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) { + /* collect the stats on this proc */ + OBJ_CONSTRUCT(&pstat, opal_pstats_t); + if (OPAL_SUCCESS == opal_pstat.query(proct->pid, &pstat, NULL)) { + pss += pstat.pss; + ++num_replies; + } + OBJ_DESTRUCT(&pstat); + } + } + /* compute the average value */ + if (0 < num_replies) { + pss /= (float)num_replies; + } + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_CLIENT_AVG_MEMORY); + kv->type = OPAL_FLOAT; + kv->data.fval = pss; + opal_list_append(array, &kv->super); + } else { + + } + } else { + /* if they want it for remote procs, see who is hosting them + * and ask directly for the info - if rank=wildcard, then + * we need to xcast the request and collect the results */ + } + } } } diff --git a/orte/orted/pmix/pmix_server_register_fns.c b/orte/orted/pmix/pmix_server_register_fns.c index 4e8c67b3db0..d007b76c28b 100644 --- a/orte/orted/pmix/pmix_server_register_fns.c +++ b/orte/orted/pmix/pmix_server_register_fns.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science @@ -407,6 +407,14 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->type = OPAL_UINT32; kv->data.uint32 = pptr->node->index; opal_list_append(pmap, &kv->super); + + if (map->num_nodes < orte_hostname_cutoff) { + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_HOSTNAME); + kv->type = OPAL_STRING; + kv->data.string = strdup(pptr->node->name); + opal_list_append(pmap, &kv->super); + } } } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index f1b47f6933a..2c39349b0bd 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -13,7 +13,7 @@ * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -86,6 +86,7 @@ bool orte_have_fqdn_allocation = false; bool orte_show_resolved_nodenames = false; bool orte_retain_aliases = false; int orte_use_hostname_alias = -1; +int orte_hostname_cutoff = 1000; int orted_debug_failure = -1; int orted_debug_failure_delay = -1; @@ -183,10 +184,6 @@ int orte_stat_history_size = -1; /* envars to forward */ char **orte_forwarded_envars = NULL; -/* map-reduce mode */ -bool orte_map_reduce = false; -bool orte_staged_execution = false; - /* map stddiag output to stderr so it isn't forwarded to mpirun */ bool orte_map_stddiag_to_stderr = false; @@ -196,9 +193,6 @@ int orte_max_vm_size = -1; /* user debugger */ char *orte_base_user_debugger = NULL; -/* modex cutoff */ -uint32_t orte_direct_modex_cutoff = UINT32_MAX; - int orte_debug_output = -1; bool orte_debug_daemons_flag = false; bool orte_xml_output = false; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index bb3534a20be..78d4638b5a8 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -13,7 +13,7 @@ * Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -466,6 +466,7 @@ ORTE_DECLSPEC extern bool orte_have_fqdn_allocation; ORTE_DECLSPEC extern bool orte_show_resolved_nodenames; ORTE_DECLSPEC extern bool orte_retain_aliases; ORTE_DECLSPEC extern int orte_use_hostname_alias; +ORTE_DECLSPEC extern int orte_hostname_cutoff; /* debug flags */ ORTE_DECLSPEC extern int orted_debug_failure; @@ -564,10 +565,6 @@ ORTE_DECLSPEC extern int orte_stat_history_size; /* envars to forward */ ORTE_DECLSPEC extern char **orte_forwarded_envars; -/* map-reduce mode */ -ORTE_DECLSPEC extern bool orte_map_reduce; -ORTE_DECLSPEC extern bool orte_staged_execution; - /* map stddiag output to stderr so it isn't forwarded to mpirun */ ORTE_DECLSPEC extern bool orte_map_stddiag_to_stderr; @@ -582,9 +579,6 @@ ORTE_DECLSPEC extern char *orte_base_user_debugger; */ ORTE_DECLSPEC extern char *orte_daemon_cores; -/* cutoff for collective modex */ -ORTE_DECLSPEC extern uint32_t orte_direct_modex_cutoff; - END_C_DECLS #endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */ diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 33dc7a05935..21b3354678f 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -13,7 +13,7 @@ * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 Los Alamos National Security, LLC. * All rights reserved - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -429,6 +429,13 @@ int orte_register_params(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_retain_aliases); + orte_hostname_cutoff = 1000; + (void) mca_base_var_register ("orte", "orte", NULL, "hostname_cutoff", + "Pass hostnames to all procs when #nodes is less than cutoff [default:1000]", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_READONLY, + &orte_hostname_cutoff); + /* which alias to use in MPIR_proctab */ orte_use_hostname_alias = 1; (void) mca_base_var_register ("orte", "orte", NULL, "hostname_alias_index", @@ -659,13 +666,6 @@ int orte_register_params(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_allowed_exit_without_sync); - orte_staged_execution = false; - (void) mca_base_var_register ("orte", "orte", NULL, "staged_execution", - "Staged execution is being used", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &orte_staged_execution); - orte_report_child_jobs_separately = false; (void) mca_base_var_register ("orte", "orte", NULL, "report_child_jobs_separately", "Return the exit status of the primary job only", @@ -754,17 +754,6 @@ int orte_register_params(void) OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &orte_daemon_cores); - /* cutoff for full modex */ - orte_direct_modex_cutoff = UINT32_MAX; - id = mca_base_var_register ("orte", "orte", NULL, "direct_modex_cutoff", - "If the number of processes in the application exceeds the provided value," - "modex will be done upon demand [default: UINT32_MAX]", - MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &orte_direct_modex_cutoff); - /* register a synonym for old name */ - mca_base_var_register_synonym (id, "ompi", "ompi", "hostname", "cutoff", MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - /* get the conduit params */ orte_coll_transport = "fabric,ethernet"; (void) mca_base_var_register("orte", "orte", "coll", "transports",