-
Notifications
You must be signed in to change notification settings - Fork 920
uofl: changes to Open MPI and move prrte/pmix shas #13299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,47 +28,10 @@ | |
#include "opal/util/printf.h" | ||
#include "opal/util/show_help.h" | ||
#include "ompi/constants.h" | ||
|
||
static char *find_prterun(void) | ||
{ | ||
char *filename = NULL; | ||
#if !OMPI_USING_INTERNAL_PRRTE | ||
char *prrte_prefix = NULL; | ||
#if OMPI_HAVE_PRTE_LAUNCH | ||
#include "prte.h" | ||
#endif | ||
|
||
/* 1) Did the user tell us exactly where to find prterun? */ | ||
filename = getenv("OMPI_PRTERUN"); | ||
if (NULL != filename) { | ||
return filename; | ||
} | ||
|
||
#if OMPI_USING_INTERNAL_PRRTE | ||
/* 2) If using internal PRRTE, use our bindir. Note that this | ||
* will obey OPAL_PREFIX and OPAL_DESTDIR */ | ||
opal_asprintf(&filename, "%s%sprterun", opal_install_dirs.bindir, OPAL_PATH_SEP); | ||
return filename; | ||
#else | ||
|
||
/* 3) Look in ${PRTE_PREFIX}/bin */ | ||
prrte_prefix = getenv("PRTE_PREFIX"); | ||
if (NULL != prrte_prefix) { | ||
opal_asprintf(&filename, "%s%sbin%sprterun", prrte_prefix, OPAL_PATH_SEP, OPAL_PATH_SEP); | ||
return filename; | ||
} | ||
|
||
/* 4) See if configure told us where to look, if set */ | ||
#if defined(OMPI_PRTERUN_PATH) | ||
return strdup(OMPI_PRTERUN_PATH); | ||
#else | ||
|
||
/* 5) Use path search */ | ||
filename = opal_find_absolute_path("prterun"); | ||
|
||
return filename; | ||
#endif | ||
#endif | ||
} | ||
|
||
static void append_prefixes(char ***out, const char *in) | ||
{ | ||
if (NULL == in) { | ||
|
@@ -115,14 +78,43 @@ static void setup_mca_prefixes(void) | |
opal_argv_free(tmp); | ||
} | ||
|
||
__opal_attribute_unused__ | ||
static char *find_prterun(void) | ||
{ | ||
char *filename = NULL; | ||
char *prrte_prefix = NULL; | ||
|
||
/* 1) Did the user tell us exactly where to find prterun? */ | ||
filename = getenv("OMPI_PRTERUN"); | ||
if (NULL != filename) { | ||
return filename; | ||
} | ||
|
||
/* 2) Look in ${PRTE_PREFIX}/bin */ | ||
prrte_prefix = getenv("PRTE_PREFIX"); | ||
if (NULL != prrte_prefix) { | ||
opal_asprintf(&filename, "%s%sbin%sprterun", prrte_prefix, OPAL_PATH_SEP, OPAL_PATH_SEP); | ||
return filename; | ||
} | ||
|
||
/* 4) See if configure told us where to look, if set */ | ||
#if defined(OMPI_PRTERUN_PATH) | ||
return strdup(OMPI_PRTERUN_PATH); | ||
#else | ||
|
||
/* 5) Use path search */ | ||
filename = opal_find_absolute_path("prterun"); | ||
|
||
return filename; | ||
#endif | ||
} | ||
|
||
int main(int argc, char *argv[]) | ||
{ | ||
char *opal_prefix = getenv("OPAL_PREFIX"); | ||
char *full_prterun_path = NULL; | ||
char **prterun_args = NULL; | ||
char __opal_attribute_unused__ *full_prterun_path = NULL; | ||
char __opal_attribute_unused__ **prterun_args = NULL; | ||
int ret; | ||
size_t i; | ||
|
||
ret = opal_init_util(&argc, &argv); | ||
if (OMPI_SUCCESS != ret) { | ||
|
@@ -154,23 +146,33 @@ int main(int argc, char *argv[]) | |
#endif | ||
} | ||
|
||
full_prterun_path = find_prterun(); | ||
if (NULL == full_prterun_path) { | ||
opal_show_help("help-mpirun.txt", "no-prterun-found", 1); | ||
exit(1); | ||
} | ||
|
||
/* | ||
* set environment variable for our install location | ||
* used within the OMPI prrte schizo component | ||
*/ | ||
|
||
setenv("OMPI_LIBDIR_LOC", opal_install_dirs.libdir, 1); | ||
|
||
// Set environment variable to tell PRTE what MCA prefixes belong | ||
// to Open MPI. | ||
setup_mca_prefixes(); | ||
|
||
#if OMPI_HAVE_PRTE_LAUNCH | ||
|
||
ret = prte_launch(argc, argv); | ||
if (OMPI_SUCCESS != ret) { | ||
opal_show_help("help-mpirun.txt", "prte-launch-failed", 1, strerror(errno)); | ||
exit(1); | ||
} | ||
|
||
return 0; | ||
#else | ||
|
||
full_prterun_path = find_prterun(); | ||
if (NULL == full_prterun_path) { | ||
opal_show_help("help-mpirun.txt", "no-prterun-found", 1); | ||
exit(1); | ||
} | ||
|
||
/* calling mpirun (and now prterun) with a full path has a special | ||
* meaning in terms of -prefix behavior, so copy that behavior | ||
* into prterun */ | ||
|
@@ -182,16 +184,16 @@ int main(int argc, char *argv[]) | |
|
||
/* Copy all the mpirun arguments to prterun. | ||
* TODO: Need to handle --prefix rationally here. */ | ||
for (i = 1; NULL != argv[i]; i++) { | ||
for (size_t i = 1; NULL != argv[i]; i++) { | ||
opal_argv_append_nosize(&prterun_args, argv[i]); | ||
} | ||
ret = execv(full_prterun_path, prterun_args); | ||
opal_show_help("help-mpirun.txt", "prterun-exec-failed", | ||
1, full_prterun_path, strerror(errno)); | ||
exit(1); | ||
} | ||
|
||
/* | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think you want to remove these copyrghts. I think they're at the bottom of the file because MPI-enabled debuggers would default to opening the source code for mpirun.c upon attach, so we wanted to show the specific banner at the top of this file (vs. all the copyrights). But we still need the copyrights. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh i see. didn't realize that. |
||
exit(1); | ||
#endif /* OMPI_HAVE_PRTE_LAUNCH*/ | ||
} | ||
/* | ||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana | ||
* University Research and Technology | ||
* Corporation. All rights reserved. | ||
|
@@ -206,9 +208,9 @@ int main(int argc, char *argv[]) | |
* Copyright (c) 2020-2022 Cisco Systems, Inc. All rights reserved | ||
* Copyright (c) 2021 Nanook Consulting. All rights reserved. | ||
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. | ||
* Copyright (c) 2022 Triad National Security, LLC. All rights | ||
* Copyright (c) 2022-2025 Triad National Security, LLC. All rights | ||
* reserved. | ||
|
||
* $COPYRIGHT$ | ||
* | ||
* Additional copyrights may follow | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I said elsewhere, why not just use "prun_common"? Already exists, public symbol, does pretty much everything you have in your "prte_launch" code. The little bit that is left can easily be put here, just like we did with our "prte", and then call "prun_common". Avoid a bunch of code duplication that adds nothing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rhc54 Excellent suggestion! I spent a bit of time looking into this. I see in
prun.c
that it sets up schizo before invokingprun_common()
. How should we do that from OMPI? I couldn't figure out a way to get into the PRTE (and PMIX) internals safely to do that from the outside.This is similar to the types of reasons we wrote
prte_launch()
to just take argc/argv -- i.e., emulate exactly as if we have fork/exec'dprun
and could only pass information via the argv command line.I'm open to suggestion here...?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, you definitely don't want to emulate
prun
- you need to emulateprterun
.prun
is missing a bunch of stuff that you need formpirun
.I haven't looked at
prte_launch
but can take a gander if you can point me at it. My guess is that we simply need to add a brief init function to PRRTE to fill in the gap. My concern is that you'll be adding maintenance burden for every time a new cmd line option gets added, or some other change internal to PRRTE requires modifyingprun_common
. Avoiding code duplication would probably be a good thing.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
prte_launch
is on theompi_main
branch in our PRTE fork: https://github.com/open-mpi/prrte/blob/ompi_main/src/runtime/prte_launcher.c#L250There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, will take a look this week and see what can be done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you sir!
I'm pretty brain-dead tonight; lemme look at this tomorrow...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rhc54 I hacked up a local copy of your patch in my dev environment and it seems to be exactly what we need. Thanks!
@hppritcha Could we get Ralph's upstream PR merged, and then update both our submodule and remove our
prte_launch()
code to use his? Hisprte_launch()
is a drop-in replacement for ours; we just don't need to carry that code now. The only minor complication might be getting a prototype forprte_launch()
in a PRTE-installed header file, but if that's a problem, we can just prototype it ourselves (it's justint, char *[]
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had thought to use
include/prrte.h
as the header, but you guys are already using it for something (not entirely sure what). It has a prototype for a "prte" function that doesn't exist. So we could renameprte_launch
to be justprte
and use that header, or modify that header to prototypeprte_launch
, though that might mess up whatever you currently do with it.I'm okay either way - maybe a slight preference to change
prte_launch
to justprte
and use the existing header as-is.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we're doing anything else with that header; that old
prte()
function listed in there might be stale...?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From what I can see, you were just using it in configure to detect that PRRTE was installed and available. The function never existed, so OMPI was only looking to see that the file existed.
I went ahead and used that header file as-is, renaming the "prte_launch" function to just be "prte". Keeps things cleaner as that is what we had done before - e.g., with "prun". I went ahead and committed it, so you can update here whenever you are ready.