Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[submodule "prrte"]
path = 3rd-party/prrte
url = ../../open-mpi/prrte
branch = master
branch = ompi_main
[submodule "openpmix"]
path = 3rd-party/openpmix
url = ../../openpmix/openpmix.git
Expand Down
2 changes: 1 addition & 1 deletion 3rd-party/openpmix
Submodule openpmix updated 407 files
2 changes: 1 addition & 1 deletion 3rd-party/prrte
Submodule prrte updated 470 files
16 changes: 16 additions & 0 deletions config/ompi_setup_prrte.m4
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ dnl Copyright (c) 2021 Nanook Consulting. All rights reserved.
dnl Copyright (c) 2021-2022 IBM Corporation. All rights reserved.
dnl Copyright (c) 2023-2024 Jeffrey M. Squyres. All rights reserved.
dnl Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
dnl Copyright (c) 2025 Triad National Security, LLC. All rights
dnl reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
Expand Down Expand Up @@ -118,10 +120,19 @@ OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy targe
[$OMPI_USING_INTERNAL_PRRTE],
[Whether or not we are using the internal PRRTE])

AM_CONDITIONAL(OMPI_USING_INTERNAL_PRRTE, [test $OMPI_USING_INTERNAL_PRRTE -eq 1])

AC_SUBST(OMPI_PRRTE_RST_CONTENT_DIR)
AC_SUBST(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)
AM_CONDITIONAL(OMPI_HAVE_PRRTE_RST, [test $OMPI_HAVE_PRRTE_RST -eq 1])

dnl
dnl If using external prrte that supports prte_launch or using internal prrte then
dnl set OMPI_HAVE_PRTE_LAUNCH
dnl
dnl The external check in _OMPI_SETUP_PRRTE_EXTERNAL records a successful
dnl prte_launch detection in setup_prrte_external_has_prte_launch (note
dnl the spelling: "prrte"), so that is the variable that must be tested.
AS_IF([test "$setup_prrte_external_has_prte_launch" = "1" -o "$prrte_setup_internal_happy" = "1"],
[AC_DEFINE_UNQUOTED([OMPI_HAVE_PRTE_LAUNCH], [1], [Whether prte_launch support available])])

OPAL_SUMMARY_ADD([Miscellaneous], [PRRTE], [], [$opal_prrte_mode])

OPAL_VAR_SCOPE_POP
Expand Down Expand Up @@ -297,6 +308,11 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [
AS_IF([test "${ompi_setup_prrte_cv_version_happy}" = "no"],
[setup_prrte_external_happy="no"])])

dnl Only probe for prte_launch when the external PRRTE is usable.  Both
dnl outcomes are recorded in the same shell variable.
AS_IF([test "${setup_prrte_external_happy}" = "yes"],
[AC_CHECK_DECL([prte_launch],
[setup_prrte_external_has_prte_launch=1], [setup_prrte_external_has_prte_launch=0],
[#include "prte.h"])],[])

CPPFLAGS="$opal_prrte_CPPFLAGS_save"

# If an external build and the user told us where to find PRRTE,
Expand Down
5 changes: 4 additions & 1 deletion ompi/dpm/dpm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1975,7 +1975,10 @@ static char *find_prte(void)
#if OMPI_USING_INTERNAL_PRRTE
/* 2) If using internal PRRTE, use our bindir. Note that this
* will obey OPAL_PREFIX and OPAL_DESTDIR */
opal_asprintf(&filename, "%s%sprte", opal_install_dirs.bindir, OPAL_PATH_SEP);
/*
* TODO: HPP replace hard-wired prrte prefix with something configurable
*/
opal_asprintf(&filename, "%s%sompi-prte", opal_install_dirs.bindir, OPAL_PATH_SEP);
return filename;
#else

Expand Down
13 changes: 13 additions & 0 deletions ompi/tools/mpirun/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,28 @@

if OMPI_WANT_PRRTE

#
# to help VPATH'd builds find prrte_version.h
#
AM_CFLAGS = \
-I$(top_srcdir)/3rd-party/prrte/include -I$(includedir)

bin_PROGRAMS = mpirun

EXTRA_DIST = help-mpirun.txt

mpirun_SOURCES = \
main.c

#
# TODO: HPP replace hard-wired prrte prefix with something configurable
#
mpirun_LDADD = \
$(top_builddir)/opal/libopen-pal_core.la
if OMPI_USING_INTERNAL_PRRTE
mpirun_LDADD += \
$(top_builddir)/3rd-party/prrte/src/libompi-prrte.la
endif

mpirun_CPPFLAGS = \
-DMCA_oshmem_FRAMEWORKS="\"$(MCA_oshmem_FRAMEWORKS)\"" \
Expand Down
6 changes: 6 additions & 0 deletions ompi/tools/mpirun/help-mpirun.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
# This is the US/English help file for Open MPI wrapper compiler error
# messages.
#
[prte-launch-failed]
Open MPI's mpirun command was unable to launch the user's application.
This may indicate an issue with the environment or incorrect configuration.

Error Message: %s
#
[no-prterun-found]
Open MPI's mpirun command was unable to find an underlying prterun
command to execute. Consider setting the OMPI_PRTERUN environment
Expand Down
114 changes: 58 additions & 56 deletions ompi/tools/mpirun/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,47 +28,10 @@
#include "opal/util/printf.h"
#include "opal/util/show_help.h"
#include "ompi/constants.h"

static char *find_prterun(void)
{
char *filename = NULL;
#if !OMPI_USING_INTERNAL_PRRTE
char *prrte_prefix = NULL;
#if OMPI_HAVE_PRTE_LAUNCH
#include "prte.h"
#endif

/* 1) Did the user tell us exactly where to find prterun? */
filename = getenv("OMPI_PRTERUN");
if (NULL != filename) {
return filename;
}

#if OMPI_USING_INTERNAL_PRRTE
/* 2) If using internal PRRTE, use our bindir. Note that this
* will obey OPAL_PREFIX and OPAL_DESTDIR */
opal_asprintf(&filename, "%s%sprterun", opal_install_dirs.bindir, OPAL_PATH_SEP);
return filename;
#else

/* 3) Look in ${PRTE_PREFIX}/bin */
prrte_prefix = getenv("PRTE_PREFIX");
if (NULL != prrte_prefix) {
opal_asprintf(&filename, "%s%sbin%sprterun", prrte_prefix, OPAL_PATH_SEP, OPAL_PATH_SEP);
return filename;
}

/* 4) See if configure told us where to look, if set */
#if defined(OMPI_PRTERUN_PATH)
return strdup(OMPI_PRTERUN_PATH);
#else

/* 5) Use path search */
filename = opal_find_absolute_path("prterun");

return filename;
#endif
#endif
}

static void append_prefixes(char ***out, const char *in)
{
if (NULL == in) {
Expand Down Expand Up @@ -115,14 +78,43 @@ static void setup_mca_prefixes(void)
opal_argv_free(tmp);
}

/**
 * Locate the prterun executable for the fork/exec launch path.
 *
 * Search order:
 *   1) the OMPI_PRTERUN environment variable (used verbatim),
 *   2) ${PRTE_PREFIX}/bin/prterun,
 *   3) the configure-provided OMPI_PRTERUN_PATH, when it is defined;
 *      otherwise a lookup via opal_find_absolute_path("prterun").
 *
 * The function is tagged unused because main() only calls it when
 * OMPI_HAVE_PRTE_LAUNCH is not set.
 *
 * @return The path to prterun, or NULL if none could be determined.
 *         The environment-variable and configure-time results are
 *         strdup() copies, so no path returns storage owned by the
 *         environment.  NOTE(review): opal_asprintf() and
 *         opal_find_absolute_path() presumably also hand back heap
 *         memory -- confirm before adding a free() in the caller.
 */
__opal_attribute_unused__
static char *find_prterun(void)
{
    char *filename = NULL;
    const char *env_value = NULL;

    /* 1) Did the user tell us exactly where to find prterun?  Return a
     * copy rather than the getenv() storage so the result has the same
     * ownership as the other paths below. */
    env_value = getenv("OMPI_PRTERUN");
    if (NULL != env_value) {
        return strdup(env_value);
    }

    /* 2) Look in ${PRTE_PREFIX}/bin */
    env_value = getenv("PRTE_PREFIX");
    if (NULL != env_value) {
        opal_asprintf(&filename, "%s%sbin%sprterun", env_value, OPAL_PATH_SEP, OPAL_PATH_SEP);
        return filename;
    }

#if defined(OMPI_PRTERUN_PATH)
    /* 3) Configure told us where to look */
    return strdup(OMPI_PRTERUN_PATH);
#else
    /* 3) Configure gave no location: fall back to a path search */
    return opal_find_absolute_path("prterun");
#endif
}

int main(int argc, char *argv[])
{
char *opal_prefix = getenv("OPAL_PREFIX");
char *full_prterun_path = NULL;
char **prterun_args = NULL;
char __opal_attribute_unused__ *full_prterun_path = NULL;
char __opal_attribute_unused__ **prterun_args = NULL;
int ret;
size_t i;

ret = opal_init_util(&argc, &argv);
if (OMPI_SUCCESS != ret) {
Expand Down Expand Up @@ -154,23 +146,33 @@ int main(int argc, char *argv[])
#endif
}

full_prterun_path = find_prterun();
if (NULL == full_prterun_path) {
opal_show_help("help-mpirun.txt", "no-prterun-found", 1);
exit(1);
}

/*
* set environment variable for our install location
* used within the OMPI prrte schizo component
*/

setenv("OMPI_LIBDIR_LOC", opal_install_dirs.libdir, 1);

// Set environment variable to tell PRTE what MCA prefixes belong
// to Open MPI.
setup_mca_prefixes();

#if OMPI_HAVE_PRTE_LAUNCH

ret = prte_launch(argc, argv);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I said elsewhere, why not just use "prun_common"? Already exists, public symbol, does pretty much everything you have in your "prte_launch" code. The little bit that is left can easily be put here, just like we did with our "prte", and then call "prun_common". Avoid a bunch of code duplication that adds nothing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rhc54 Excellent suggestion! I spent a bit of time looking into this. I see in prun.c that it sets up schizo before invoking prun_common(). How should we do that from OMPI? I couldn't figure out a way to get into the PRTE (and PMIX) internals safely to do that from the outside.

This is similar to the types of reasons we wrote prte_launch() to just take argc/argv -- i.e., emulate exactly as if we have fork/exec'd prun and could only pass information via the argv command line.

I'm open to suggestion here...?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, you definitely don't want to emulate prun - you need to emulate prterun. prun is missing a bunch of stuff that you need for mpirun.

I haven't looked at prte_launch but can take a gander if you can point me at it. My guess is that we simply need to add a brief init function to PRRTE to fill in the gap. My concern is that you'll be adding maintenance burden for every time a new cmd line option gets added, or some other change internal to PRRTE requires modifying prun_common. Avoiding code duplication would probably be a good thing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prte_launch is on the ompi_main branch in our PRTE fork: https://github.com/open-mpi/prrte/blob/ompi_main/src/runtime/prte_launcher.c#L250

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, will take a look this week and see what can be done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you sir!

I'm pretty brain-dead tonight; lemme look at this tomorrow...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rhc54 I hacked up a local copy of your patch in my dev environment and it seems to be exactly what we need. Thanks!

@hppritcha Could we get Ralph's upstream PR merged, and then update both our submodule and remove our prte_launch() code to use his? His prte_launch() is a drop-in replacement for ours; we just don't need to carry that code now. The only minor complication might be getting a prototype for prte_launch() in a PRTE-installed header file, but if that's a problem, we can just prototype it ourselves (it's just int, char *[]).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had thought to use include/prrte.h as the header, but you guys are already using it for something (not entirely sure what). It has a prototype for a "prte" function that doesn't exist. So we could rename prte_launch to be just prte and use that header, or modify that header to prototype prte_launch, though that might mess up whatever you currently do with it.

I'm okay either way - maybe a slight preference to change prte_launch to just prte and use the existing header as-is.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we're doing anything else with that header; that old prte() function listed in there might be stale...?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I can see, you were just using it in configure to detect that PRRTE was installed and available. The function never existed, so OMPI was only looking to see that the file existed.

I went ahead and used that header file as-is, renaming the "prte_launch" function to just be "prte". Keeps things cleaner as that is what we had done before - e.g., with "prun". I went ahead and committed it, so you can update here whenever you are ready.

if (OMPI_SUCCESS != ret) {
opal_show_help("help-mpirun.txt", "prte-launch-failed", 1, strerror(errno));
exit(1);
}

return 0;
#else

full_prterun_path = find_prterun();
if (NULL == full_prterun_path) {
opal_show_help("help-mpirun.txt", "no-prterun-found", 1);
exit(1);
}

/* calling mpirun (and now prterun) with a full path has a special
* meaning in terms of -prefix behavior, so copy that behavior
* into prterun */
Expand All @@ -182,16 +184,16 @@ int main(int argc, char *argv[])

/* Copy all the mpirun arguments to prterun.
* TODO: Need to handle --prefix rationally here. */
for (i = 1; NULL != argv[i]; i++) {
for (size_t i = 1; NULL != argv[i]; i++) {
opal_argv_append_nosize(&prterun_args, argv[i]);
}
ret = execv(full_prterun_path, prterun_args);
opal_show_help("help-mpirun.txt", "prterun-exec-failed",
1, full_prterun_path, strerror(errno));
exit(1);
}

/*
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you want to remove these copyrights.

I think they're at the bottom of the file because MPI-enabled debuggers would default to opening the source code for mpirun.c upon attach, so we wanted to show the specific banner at the top of this file (vs. all the copyrights). But we still need the copyrights.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh i see. didn't realize that.

exit(1);
#endif /* OMPI_HAVE_PRTE_LAUNCH*/
}
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
Expand All @@ -206,9 +208,9 @@ int main(int argc, char *argv[])
* Copyright (c) 2020-2022 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2022 Triad National Security, LLC. All rights
* Copyright (c) 2022-2025 Triad National Security, LLC. All rights
* reserved.

* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down
Loading