From a239b4c3c3fafade1b732d522248c0bb71d1b366 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 2 Aug 2017 19:46:38 -0600 Subject: [PATCH] Per discussion on the PMIx side, do a better job of detecting mismatches between location directives for OPAL and PMIx. Provide a more helpful error message and error out if we find a mismatch. If any OPAL values are set and the PMIx equivalent is not, then transfer it. Do not clear PMIX_INSTALL_PREFIX from the daemon's launch environment Fixes #3980 Closes #4007 Refs #3985 Signed-off-by: Ralph Castain --- opal/mca/pmix/pmix2x/Makefile.am | 2 + opal/mca/pmix/pmix2x/help-pmix-pmix2x.txt | 32 ++++++ opal/mca/pmix/pmix2x/pmix2x.c | 108 +++++++++++++++++++++ opal/mca/pmix/pmix2x/pmix2x.h | 3 + opal/mca/pmix/pmix2x/pmix2x_client.c | 8 +- opal/mca/pmix/pmix2x/pmix2x_component.c | 17 ++++ opal/mca/pmix/pmix2x/pmix2x_server_south.c | 7 +- orte/orted/orted_main.c | 3 +- orte/orted/orted_submit.c | 9 +- 9 files changed, 175 insertions(+), 14 deletions(-) create mode 100644 opal/mca/pmix/pmix2x/help-pmix-pmix2x.txt diff --git a/opal/mca/pmix/pmix2x/Makefile.am b/opal/mca/pmix/pmix2x/Makefile.am index 049238c6a40..bd9304e5d13 100644 --- a/opal/mca/pmix/pmix2x/Makefile.am +++ b/opal/mca/pmix/pmix2x/Makefile.am @@ -12,6 +12,8 @@ EXTRA_DIST = autogen.subdirs +dist_opaldata_DATA = help-pmix-pmix2x.txt + SUBDIRS = pmix sources = \ diff --git a/opal/mca/pmix/pmix2x/help-pmix-pmix2x.txt b/opal/mca/pmix/pmix2x/help-pmix-pmix2x.txt new file mode 100644 index 00000000000..07327e11636 --- /dev/null +++ b/opal/mca/pmix/pmix2x/help-pmix-pmix2x.txt @@ -0,0 +1,32 @@ +# -*- text -*- +# +# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. 
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+# University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+# All rights reserved.
+# Copyright (c) 2017 Intel, Inc. All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This is the US/English help file for Open MPI MCA error messages.
+#
+[evars]
+We found conflicting directives regarding the location of OPAL vs PMIx
+installation directories:
+
+%s
+
+This usually indicates that OMPI was configured to use its internal copy
+of PMIx, but another installation of PMIx is also in use on this system
+and could potentially cause confusion between the two sets of plugins.
+Please either unset the indicated environment variables, or configure
+OMPI to use the external PMIx installation.
diff --git a/opal/mca/pmix/pmix2x/pmix2x.c b/opal/mca/pmix/pmix2x/pmix2x.c
index 0630c6691e5..33e55806e32 100644
--- a/opal/mca/pmix/pmix2x/pmix2x.c
+++ b/opal/mca/pmix/pmix2x/pmix2x.c
@@ -34,6 +34,7 @@
 #include "opal/threads/threads.h"
 #include "opal/util/argv.h"
 #include "opal/util/error.h"
+#include "opal/util/opal_environ.h"
 #include "opal/util/output.h"
 #include "opal/util/proc.h"
 #include "opal/util/show_help.h"
@@ -1391,6 +1392,153 @@ opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir)
     }
 }
 
+/* One OPAL/PMIx environment-variable pair plus the values found in
+ * the current environment. The name pointers alias entries of
+ * known_values and the value pointers alias getenv() results, so
+ * nothing is owned here (no destructor is registered for the class). */
+typedef struct {
+    opal_list_item_t super;
+    const char *opalname;
+    char *opalvalue;
+    const char *pmixname;
+    char *pmixvalue;
+    bool mismatched;
+} opal_pmix_evar_t;
+/* constructor: no names, no values, not mismatched */
+static void econ(opal_pmix_evar_t *p)
+{
+    p->opalname = NULL;
+    p->opalvalue = NULL;
+    p->pmixname = NULL;
+    p->pmixvalue = NULL;
+    p->mismatched = false;
+}
+static OBJ_CLASS_INSTANCE(opal_pmix_evar_t,
+                          opal_list_item_t,
+                          econ, NULL);
+/* name of an OPAL location envar and of its PMIx counterpart */
+struct known_value {
+    const char *opalname;
+    const char *pmixname;
+};
+
+/* the OPAL/PMIx envar pairs examined by opal_pmix_pmix2x_check_evars */
+static const struct known_value known_values[] = {
+    {"OPAL_PREFIX", "PMIX_INSTALL_PREFIX"},
+    {"OPAL_EXEC_PREFIX", "PMIX_EXEC_PREFIX"},
+    {"OPAL_BINDIR", "PMIX_BINDIR"},
+    {"OPAL_SBINDIR", "PMIX_SBINDIR"},
+    {"OPAL_LIBEXECDIR", "PMIX_LIBEXECDIR"},
+    {"OPAL_DATAROOTDIR", "PMIX_DATAROOTDIR"},
+    {"OPAL_DATADIR", "PMIX_DATADIR"},
+    {"OPAL_SYSCONFDIR", "PMIX_SYSCONFDIR"},
+    {"OPAL_SHAREDSTATEDIR", "PMIX_SHAREDSTATEDIR"},
+    {"OPAL_LOCALSTATEDIR", "PMIX_LOCALSTATEDIR"},
+    {"OPAL_LIBDIR", "PMIX_LIBDIR"},
+    {"OPAL_INCLUDEDIR", "PMIX_INCLUDEDIR"},
+    {"OPAL_INFODIR", "PMIX_INFODIR"},
+    {"OPAL_MANDIR", "PMIX_MANDIR"},
+    {"OPAL_PKGDATADIR", "PMIX_PKGDATADIR"},
+    {"OPAL_PKGLIBDIR", "PMIX_PKGLIBDIR"},
+    {"OPAL_PKGINCLUDEDIR", "PMIX_PKGINCLUDEDIR"}
+};
+
+
+/*
+ * Check the OPAL install-location envars against their PMIx
+ * equivalents (the pairs listed in known_values).
+ *
+ * A pair conflicts when the PMIx variable is set and the OPAL one is
+ * not, or when both are set to different strings. If no pair
+ * conflicts, every OPAL value whose PMIx equivalent is unset is copied
+ * into the PMIx variable in environ. Otherwise the conflicting pairs
+ * are reported through the "evars" help topic and nothing is changed.
+ *
+ * Returns OPAL_SUCCESS when there is no conflict, OPAL_ERR_SILENT
+ * once a conflict has been reported, and OPAL_ERR_OUT_OF_RESOURCE if
+ * memory for the bookkeeping or the report could not be obtained.
+ */
+int opal_pmix_pmix2x_check_evars(void)
+{
+    opal_list_t values;
+    int nvals, i;
+    int rc = OPAL_ERR_SILENT;
+    opal_pmix_evar_t *evar;
+    bool mismatched = false;
+    char *tmp=NULL, *tmp2;
+
+    OBJ_CONSTRUCT(&values, opal_list_t);
+    nvals = sizeof(known_values) / sizeof(struct known_value);
+    for (i=0; i < nvals; i++) {
+        evar = OBJ_NEW(opal_pmix_evar_t);
+        if (NULL == evar) {
+            OPAL_LIST_DESTRUCT(&values);
+            return OPAL_ERR_OUT_OF_RESOURCE;
+        }
+        evar->opalname = known_values[i].opalname;
+        evar->opalvalue = getenv(evar->opalname);
+        evar->pmixname = known_values[i].pmixname;
+        evar->pmixvalue = getenv(evar->pmixname);
+        /* if the OPAL value is not set and the PMIx value is,
+         * then that is a problem. Likewise, if both are set
+         * and are different, then that is also a problem. Note that
+         * it is okay for the OPAL value to be set and the PMIx
+         * value to not be set */
+        if ((NULL == evar->opalvalue && NULL != evar->pmixvalue) ||
+            (NULL != evar->opalvalue && NULL != evar->pmixvalue &&
+             0 != strcmp(evar->opalvalue, evar->pmixvalue))) {
+            evar->mismatched = true;
+            mismatched = true;
+        }
+        opal_list_append(&values, &evar->super);
+    }
+    if (!mismatched) {
+        /* transfer any OPAL values that were set - we already verified
+         * that the equivalent PMIx value, if present, matches, so
+         * don't overwrite it */
+        OPAL_LIST_FOREACH(evar, &values, opal_pmix_evar_t) {
+            if (NULL != evar->opalvalue && NULL == evar->pmixvalue) {
+                opal_setenv(evar->pmixname, evar->opalvalue, true, &environ);
+            }
+        }
+        OPAL_LIST_DESTRUCT(&values);
+        return OPAL_SUCCESS;
+    }
+    /* we have at least one mismatch somewhere, so print out the table */
+    OPAL_LIST_FOREACH(evar, &values, opal_pmix_evar_t) {
+        if (!evar->mismatched) {
+            continue;
+        }
+        if (NULL == tmp) {
+            if (0 > asprintf(&tmp, " %s: %s\n %s: %s",
+                             evar->opalname, (NULL == evar->opalvalue) ? "NULL" : evar->opalvalue,
+                             evar->pmixname, (NULL == evar->pmixvalue) ? "NULL" : evar->pmixvalue)) {
+                /* the buffer pointer is undefined after a failed asprintf */
+                tmp = NULL;
+                rc = OPAL_ERR_OUT_OF_RESOURCE;
+                break;
+            }
+        } else {
+            if (0 > asprintf(&tmp2, "%s\n\n %s: %s\n %s: %s", tmp,
+                             evar->opalname, (NULL == evar->opalvalue) ? "NULL" : evar->opalvalue,
+                             evar->pmixname, (NULL == evar->pmixvalue) ? "NULL" : evar->pmixvalue)) {
+                rc = OPAL_ERR_OUT_OF_RESOURCE;
+                break;
+            }
+            free(tmp);
+            tmp = tmp2;
+        }
+    }
+    /* only show the table if it was fully assembled */
+    if (OPAL_ERR_SILENT == rc) {
+        opal_show_help("help-pmix-pmix2x.txt", "evars", true, tmp);
+    }
+    free(tmp);
+    /* the tracking list must be released on the error path as well */
+    OPAL_LIST_DESTRUCT(&values);
+    return rc;
+}
+
 /**** INSTANTIATE INTERNAL CLASSES ****/
 OBJ_CLASS_INSTANCE(opal_pmix2x_jobid_trkr_t,
                    opal_list_item_t,
diff --git a/opal/mca/pmix/pmix2x/pmix2x.h b/opal/mca/pmix/pmix2x/pmix2x.h
index f8d93e55f81..9f39e6fe2bc 100644
--- a/opal/mca/pmix/pmix2x/pmix2x.h
+++ b/opal/mca/pmix/pmix2x/pmix2x.h
@@ -46,6 +46,7 @@ typedef struct {
     int cache_size;
     opal_list_t cache;
     opal_list_t dmdx;
+    bool silence_warning;
 } mca_pmix_pmix2x_component_t;
 
 OPAL_DECLSPEC extern mca_pmix_pmix2x_component_t mca_pmix_pmix2x_component;
@@ -290,6 +291,8 @@ OPAL_MODULE_DECLSPEC int pmix2x_server_notify_event(int status,
 
 /**** COMPONENT UTILITY FUNCTIONS ****/
 
+OPAL_MODULE_DECLSPEC int opal_pmix_pmix2x_check_evars(void);
+
 OPAL_MODULE_DECLSPEC void pmix2x_event_hdlr(size_t evhdlr_registration_id,
                                             pmix_status_t status, const pmix_proc_t *source,
                                             pmix_info_t info[], size_t ninfo,
diff --git a/opal/mca/pmix/pmix2x/pmix2x_client.c b/opal/mca/pmix/pmix2x/pmix2x_client.c
index 52d77fab99e..540204c331a 100644
--- a/opal/mca/pmix/pmix2x/pmix2x_client.c
+++ b/opal/mca/pmix/pmix2x/pmix2x_client.c
@@ -31,6 +31,7 @@
 #include "opal/util/argv.h"
 #include "opal/util/opal_environ.h"
 #include "opal/util/proc.h"
+#include "opal/util/show_help.h"
 
 #include "opal/mca/pmix/base/base.h"
 #include "pmix2x.h"
@@ -66,7 +67,6 @@ int pmix2x_client_init(opal_list_t *ilist)
     pmix_info_t *pinfo;
     size_t ninfo, n;
     opal_value_t *ival;
-    char *evar;
 
     opal_output_verbose(1, opal_pmix_base_framework.framework_output,
                         "PMIx_client init");
@@ -78,9 +78,9 @@ int pmix2x_client_init(opal_list_t *ilist)
             asprintf(&dbgvalue, "PMIX_DEBUG=%d", dbg);
             putenv(dbgvalue);
         }
-        if ((NULL != (evar = getenv("OPAL_PREFIX"))) &&
-            (NULL == getenv("PMIX_INSTALL_PREFIX"))) {
-            opal_setenv("PMIX_INSTALL_PREFIX", evar, false, &environ);
+        /* check the evars for a mismatch */
+        if (OPAL_SUCCESS !=
(dbg = opal_pmix_pmix2x_check_evars())) {
+            return dbg;
         }
     }
 
diff --git a/opal/mca/pmix/pmix2x/pmix2x_component.c b/opal/mca/pmix/pmix2x/pmix2x_component.c
index 21785a7edf7..03246c11801 100644
--- a/opal/mca/pmix/pmix2x/pmix2x_component.c
+++ b/opal/mca/pmix/pmix2x/pmix2x_component.c
@@ -33,6 +33,7 @@ const char *opal_pmix_pmix2x_component_version_string =
 /*
  * Local function
  */
+static int external_register(void);
 static int external_open(void);
 static int external_close(void);
 static int external_component_query(mca_base_module_t **module, int *priority);
@@ -65,6 +66,7 @@ mca_pmix_pmix2x_component_t mca_pmix_pmix2x_component = {
             .mca_open_component = external_open,
             .mca_close_component = external_close,
             .mca_query_component = external_component_query,
+            .mca_register_component_params = external_register
         },
         /* Next the MCA v1.0.0 component meta data */
         .base_data = {
@@ -75,6 +77,29 @@ mca_pmix_pmix2x_component_t mca_pmix_pmix2x_component = {
     .native_launch = false
 };
 
+/*
+ * Register the MCA variables of the pmix2x component. The only one is
+ * the boolean "silence_warning", stored in the component structure.
+ * The result of the registration call is not checked, so this always
+ * returns OPAL_SUCCESS.
+ */
+static int external_register(void)
+{
+    mca_pmix_pmix2x_component_t *pmix2x = &mca_pmix_pmix2x_component;
+
+    /* start from "not silenced"; this field backs the variable below */
+    pmix2x->silence_warning = false;
+
+    (void) mca_base_component_var_register(&pmix2x->super.base_version,
+                                           "silence_warning",
+                                           "Silence warning about PMIX_INSTALL_PREFIX",
+                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                           OPAL_INFO_LVL_4,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &pmix2x->silence_warning);
+    return OPAL_SUCCESS;
+}
+
 static int external_open(void)
 {
     mca_pmix_pmix2x_component.evindex = 0;
diff --git a/opal/mca/pmix/pmix2x/pmix2x_server_south.c b/opal/mca/pmix/pmix2x/pmix2x_server_south.c
index 41abb8b4d58..e33e6718b24 100644
--- a/opal/mca/pmix/pmix2x/pmix2x_server_south.c
+++ b/opal/mca/pmix/pmix2x/pmix2x_server_south.c
@@ -100,7 +100,6 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
     opal_pmix2x_event_t *event;
     opal_pmix2x_jobid_trkr_t *job;
     opal_pmix_lock_t lk;
-    char *evar;
 
     OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
 
@@ -109,9 +108,9 @@ int
pmix2x_server_init(opal_pmix_server_module_t *module, asprintf(&dbgvalue, "PMIX_DEBUG=%d", dbg); putenv(dbgvalue); } - if ((NULL != (evar = getenv("OPAL_PREFIX"))) && - (NULL == getenv("PMIX_INSTALL_PREFIX"))) { - opal_setenv("PMIX_INSTALL_PREFIX", evar, false, &environ); + /* check the evars for a mismatch */ + if (OPAL_SUCCESS != (dbg = opal_pmix_pmix2x_check_evars())) { + return dbg; } } ++opal_pmix_base.initialized; diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 0bd57a25a92..ff6291d4df4 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -285,8 +285,9 @@ int orte_daemon(int argc, char *argv[]) */ orte_launch_environ = opal_argv_copy(environ); - /* purge any ess flag set in the environ when we were launched */ + /* purge any ess/pmix flags set in the environ when we were launched */ opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ); + opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ); /* if orte_daemon_debug is set, let someone know we are alive right * away just in case we have a problem along the way diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index b471acf13ba..99cc99acac3 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -537,11 +537,6 @@ int orte_submit_init(int argc, char *argv[], */ opal_finalize(); - /* clear params from the environment so our children - * don't pick them up */ - opal_unsetenv(OPAL_MCA_PREFIX"ess", &environ); - opal_unsetenv(OPAL_MCA_PREFIX"pmix", &environ); - if (ORTE_PROC_IS_TOOL) { opal_value_t val; /* extract the name */ @@ -589,6 +584,10 @@ int orte_submit_init(int argc, char *argv[], * orterun */ orte_launch_environ = opal_argv_copy(environ); + /* clear params from the environment so our children + * don't pick them up */ + opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ); + opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ); } return ORTE_SUCCESS;