Skip to content

Commit

Permalink
Merge pull request #3778 from rhc54/topic/warn
Browse files Browse the repository at this point in the history
Attempt to detect when we are direct-launched without the necessary P…
  • Loading branch information
Ralph Castain committed Jun 29, 2017
2 parents cb19296 + bd4a6fe commit 7cbea77
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 7 deletions.
2 changes: 1 addition & 1 deletion ompi/runtime/ompi_mpi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
error:
if (ret != OMPI_SUCCESS) {
/* Only print a message if one was not already printed */
if (NULL != error) {
if (NULL != error && OMPI_ERR_SILENT != ret) {
const char *err_msg = opal_strerror(ret);
opal_show_help("help-mpi-runtime.txt",
"mpi_init:startup:internal-failure", true,
Expand Down
40 changes: 40 additions & 0 deletions orte/mca/ess/base/help-ess-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,43 @@ MCA parameter:
param: %s

This is not a recognized signal value. Please fix or remove it.
#
[slurm-error]
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM's PMI support and therefore cannot
execute. There are several options for building PMI support under
SLURM, depending upon the SLURM version you are using:

Version 16.05 or later: you can use SLURM's PMIx support. This
requires that you configure and build SLURM --with-pmix.

Versions earlier than 16.05: you must use either SLURM's PMI-1 or
PMI-2 support. SLURM builds PMI-1 by default, or you can manually
install PMI-2. You must then build Open MPI using --with-pmi pointing
to the SLURM PMI library location.

Please configure as appropriate and try again.
#
[slurm-error2]
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM support. This usually happens
when OMPI was not configured --with-slurm and we weren't able
to discover a SLURM installation in the usual places.

Please configure as appropriate and try again.
#
[alps-error]
The application appears to have been direct launched using "aprun",
but OMPI was not built with ALPS PMI support and therefore cannot
execute. You must build Open MPI using --with-pmi pointing
to the ALPS PMI library location.

Please configure as appropriate and try again.
#
[alps-error2]
The application appears to have been direct launched using "aprun",
but OMPI was not built with ALPS support. This usually happens
when OMPI was not configured --with-alps and we weren't able
to discover an ALPS installation in the usual places.

Please configure as appropriate and try again.
20 changes: 19 additions & 1 deletion orte/mca/ess/pmi/ess_pmi_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/schizo/schizo.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
Expand Down Expand Up @@ -125,7 +126,24 @@ static int rte_init(void)
opal_pmix_base_set_evbase(orte_event_base);
/* initialize the selected module */
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
/* we cannot run */
/* we cannot run - this could be due to being direct launched
* without the required PMI support being built. Try to detect
* that scenario and warn the user */
if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() &&
NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
if (0 == strcmp(envar, "SLURM")) {
/* yes to both - so emit a hopefully helpful
* error message and abort */
orte_show_help_finalize();
orte_show_help("help-ess-base.txt", "slurm-error", true);
return ORTE_ERR_SILENT;
} else if (0 == strcmp(envar, "ALPS")) {
/* we were direct launched by ALPS */
orte_show_help_finalize();
orte_show_help("help-ess-base.txt", "alps-error", true);
return ORTE_ERR_SILENT;
}
}
error = "pmix init";
goto error;
}
Expand Down
30 changes: 28 additions & 2 deletions orte/mca/ess/singleton/ess_singleton_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -33,6 +33,7 @@
#include "opal/mca/pmix/base/base.h"

#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/mca/schizo/schizo.h"

#include "orte/mca/ess/ess.h"
Expand Down Expand Up @@ -131,6 +132,32 @@ static int component_query(mca_base_module_t **module, int *priority)
return ORTE_ERROR;
}

/* we may be incorrectly trying to run as a singleton - e.g.,
* someone direct-launched us under SLURM, but ORTE was neither
* built --with-slurm nor built in a SLURM environment (so we
* didn't autodetect SLURM). Try to detect that here. Sadly, we
* cannot just use the schizo framework to help us here as
* the corresponding schizo component may not have even
* been built. So we have to do things a little uglier */

if (ORTE_SCHIZO_UNMANAGED_SINGLETON == ret) {
/* see if we are in a SLURM allocation */
if (NULL != getenv("SLURM_NODELIST")) {
/* emit a hopefully helpful error message and abort */
orte_show_help("help-ess-base.txt", "slurm-error2", true);
*module = NULL;
*priority = 0;
return ORTE_ERR_SILENT;
}
/* see if we are under ALPS */
if (NULL != getenv("ALPS_APP_ID")) {
orte_show_help("help-ess-base.txt", "alps-error2", true);
*module = NULL;
*priority = 0;
return ORTE_ERR_SILENT;
}
}

/* okay, we want to be selected as we must be a singleton */
*priority = 100;
*module = (mca_base_module_t *)&orte_ess_singleton_module;
Expand All @@ -142,4 +169,3 @@ static int component_close(void)
{
return ORTE_SUCCESS;
}

9 changes: 8 additions & 1 deletion orte/mca/schizo/alps/schizo_alps.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -65,9 +65,16 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
* launch performance penalty for hwloc at high ppn on knl */
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch");
opal_argv_append_nosize(&pushed_vals, "true");
/* mark that we are native */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "NATIVE");
goto setup;
}

/* mark that we are on ALPS */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "ALPS");

/* see if we are running in a Cray PAGG container */
fd = fopen(proc_job_file, "r");
if (NULL == fd) {
Expand Down
8 changes: 6 additions & 2 deletions orte/mca/schizo/orte/schizo_orte.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -53,7 +53,7 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
* so no need to further check that here. Instead,
* see if we were direct launched vs launched via mpirun */
if (NULL != orte_process_info.my_daemon_uri) {
/* nope */
/* yes we were */
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "pmi");
Expand All @@ -65,6 +65,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "singleton");
/* mark that we are in ORTE */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "ORTE");


setup:
opal_output_verbose(1, orte_schizo_base_framework.framework_output,
Expand Down
7 changes: 7 additions & 0 deletions orte/mca/schizo/slurm/schizo_slurm.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "pmi");
/* mark that we are native */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "NATIVE");
goto setup;
}

Expand All @@ -72,6 +75,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
return myenv;
}

/* mark that we are in SLURM */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "SLURM");

/* we are in an allocation, but were we direct launched
* or are we a singleton? */
if (NULL == getenv("SLURM_STEP_ID")) {
Expand Down

0 comments on commit 7cbea77

Please sign in to comment.