Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Attempt to detect when we are direct-launched without the necessary P… #3778

Merged
merged 1 commit into from
Jun 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion ompi/runtime/ompi_mpi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
error:
if (ret != OMPI_SUCCESS) {
/* Only print a message if one was not already printed */
if (NULL != error) {
if (NULL != error && OMPI_ERR_SILENT != ret) {
const char *err_msg = opal_strerror(ret);
opal_show_help("help-mpi-runtime.txt",
"mpi_init:startup:internal-failure", true,
Expand Down
40 changes: 40 additions & 0 deletions orte/mca/ess/base/help-ess-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,43 @@ MCA parameter:
param: %s

This is not a recognized signal value. Please fix or remove it.
#
[slurm-error]
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM's PMI support and therefore cannot
execute. There are several options for building PMI support under
SLURM, depending upon the SLURM version you are using:

Version 16.05 or later: you can use SLURM's PMIx support. This
requires that you configure and build SLURM --with-pmix.

Versions earlier than 16.05: you must use either SLURM's PMI-1 or
PMI-2 support. SLURM builds PMI-1 by default, or you can manually
install PMI-2. You must then build Open MPI using --with-pmi pointing
to the SLURM PMI library location.

Please configure as appropriate and try again.
#
[slurm-error2]
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM support. This usually happens
when OMPI was not configured --with-slurm and we weren't able
to discover a SLURM installation in the usual places.

Please configure as appropriate and try again.
#
[alps-error]
The application appears to have been direct launched using "aprun",
but OMPI was not built with ALPS PMI support and therefore cannot
execute. You must build Open MPI using --with-pmi pointing
to the ALPS PMI library location.

Please configure as appropriate and try again.
#
[alps-error2]
The application appears to have been direct launched using "aprun",
but OMPI was not built with ALPS support. This usually happens
when OMPI was not configured --with-alps and we weren't able
to discover an ALPS installation in the usual places.

Please configure as appropriate and try again.
20 changes: 19 additions & 1 deletion orte/mca/ess/pmi/ess_pmi_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/schizo/schizo.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
Expand Down Expand Up @@ -125,7 +126,24 @@ static int rte_init(void)
opal_pmix_base_set_evbase(orte_event_base);
/* initialize the selected module */
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
/* we cannot run */
/* we cannot run - this could be due to being direct launched
* without the required PMI support being built. Try to detect
* that scenario and warn the user */
if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() &&
NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
if (0 == strcmp(envar, "SLURM")) {
/* yes to both - so emit a hopefully helpful
* error message and abort */
orte_show_help_finalize();
orte_show_help("help-ess-base.txt", "slurm-error", true);
return ORTE_ERR_SILENT;
} else if (0 == strcmp(envar, "ALPS")) {
/* we were direct launched by ALPS */
orte_show_help_finalize();
orte_show_help("help-ess-base.txt", "alps-error", true);
return ORTE_ERR_SILENT;
}
}
error = "pmix init";
goto error;
}
Expand Down
30 changes: 28 additions & 2 deletions orte/mca/ess/singleton/ess_singleton_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -33,6 +33,7 @@
#include "opal/mca/pmix/base/base.h"

#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/mca/schizo/schizo.h"

#include "orte/mca/ess/ess.h"
Expand Down Expand Up @@ -131,6 +132,32 @@ static int component_query(mca_base_module_t **module, int *priority)
return ORTE_ERROR;
}

/* we may be incorrectly trying to run as a singleton - e.g.,
* someone direct-launched us under SLURM without building
* ORTE --with-slurm or in a slurm environment (so we didn't
* autodetect slurm). Try to detect that here. Sadly, we
* cannot just use the schizo framework to help us here as
* the corresponding schizo component may not have even
* been built. So we have to do things a little uglier */

if (ORTE_SCHIZO_UNMANAGED_SINGLETON == ret) {
/* see if we are in a SLURM allocation */
if (NULL != getenv("SLURM_NODELIST")) {
/* emit a hopefully helpful error message and abort */
orte_show_help("help-ess-base.txt", "slurm-error2", true);
*module = NULL;
*priority = 0;
return ORTE_ERR_SILENT;
}
/* see if we are under ALPS */
if (NULL != getenv("ALPS_APP_ID")) {
orte_show_help("help-ess-base.txt", "alps-error2", true);
*module = NULL;
*priority = 0;
return ORTE_ERR_SILENT;
}
}

/* okay, we want to be selected as we must be a singleton */
*priority = 100;
*module = (mca_base_module_t *)&orte_ess_singleton_module;
Expand All @@ -142,4 +169,3 @@ static int component_close(void)
{
return ORTE_SUCCESS;
}

9 changes: 8 additions & 1 deletion orte/mca/schizo/alps/schizo_alps.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -65,9 +65,16 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
* launch performance penalty for hwloc at high ppn on knl */
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch");
opal_argv_append_nosize(&pushed_vals, "true");
/* mark that we are native */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "NATIVE");
goto setup;
}

/* mark that we are on ALPS */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "ALPS");

/* see if we are running in a Cray PAGG container */
fd = fopen(proc_job_file, "r");
if (NULL == fd) {
Expand Down
8 changes: 6 additions & 2 deletions orte/mca/schizo/orte/schizo_orte.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -53,7 +53,7 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
* so no need to further check that here. Instead,
* see if we were direct launched vs launched via mpirun */
if (NULL != orte_process_info.my_daemon_uri) {
/* nope */
/* yes we were */
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "pmi");
Expand All @@ -65,6 +65,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "singleton");
/* mark that we are in ORTE */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "ORTE");


setup:
opal_output_verbose(1, orte_schizo_base_framework.framework_output,
Expand Down
7 changes: 7 additions & 0 deletions orte/mca/schizo/slurm/schizo_slurm.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "pmi");
/* mark that we are native */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "NATIVE");
goto setup;
}

Expand All @@ -72,6 +75,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
return myenv;
}

/* mark that we are in SLURM */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "SLURM");

/* we are in an allocation, but were we direct launched
* or are we a singleton? */
if (NULL == getenv("SLURM_STEP_ID")) {
Expand Down