From defea5f18103e4de2b3c34d90bf1f5124a3c8340 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Fri, 25 Apr 2025 11:01:13 -0400
Subject: [PATCH 01/24] bugfix: Setting OMPI_MPI_THREAD_LEVEL to a value
 different than `requested` in `MPI_Init_thread` would invoke the error
 handler, even though it is a useful override in some threaded library use
 cases.

Signed-off-by: Aurelien Bouteiller
(cherry picked from commit 27332fcc80a03e996bf8d1ed231690ee6eacdd4e)
---
 ompi/mpi/c/init.c        |  8 +++++++-
 ompi/mpi/c/init_thread.c | 32 ++++++++++++++++++++++++--------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/ompi/mpi/c/init.c b/ompi/mpi/c/init.c
index eb5a50a7643..26cee04ff4f 100644
--- a/ompi/mpi/c/init.c
+++ b/ompi/mpi/c/init.c
@@ -13,6 +13,7 @@
  * Copyright (c) 2007-2008 Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2025      Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -54,7 +55,12 @@ int MPI_Init(int *argc, char ***argv)

     if (NULL != (env = getenv("OMPI_MPI_THREAD_LEVEL"))) {
         required = atoi(env);
-        if (required < MPI_THREAD_SINGLE || required > MPI_THREAD_MULTIPLE) {
+        /* In the future we may have to contend with non-sequential (MPI ABI) values
+         * If you are implementing MPI ABI changes please refer to
+         * https://github.com/open-mpi/ompi/pull/13211#discussion_r2085086844
+         */
+        if (required != MPI_THREAD_SINGLE && required != MPI_THREAD_FUNNELED &&
+            required != MPI_THREAD_SERIALIZED && required != MPI_THREAD_MULTIPLE) {
             required = MPI_THREAD_MULTIPLE;
         }
     }
diff --git a/ompi/mpi/c/init_thread.c b/ompi/mpi/c/init_thread.c
index 95ca9df25e2..581edf9ff45 100644
--- a/ompi/mpi/c/init_thread.c
+++ b/ompi/mpi/c/init_thread.c
@@ -16,6 +16,7 @@
  * Copyright (c) 2015-2018 Cisco Systems, Inc.  All rights reserved
  * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2025      Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -48,6 +49,7 @@ int MPI_Init_thread(int *argc, char ***argv, int required,
 {
     int err, safe_required = MPI_THREAD_SERIALIZED;
+    bool err_arg_required = false;
     char *env;

     ompi_hook_base_mpi_init_thread_top(argc, argv, required, provided);
@@ -55,14 +57,28 @@ int MPI_Init_thread(int *argc, char ***argv, int required,
     /* Detect an incorrect thread support level, but dont report until we have the minimum
      * infrastructure setup.
      */
-    if( (MPI_THREAD_SINGLE == required) || (MPI_THREAD_SERIALIZED == required) ||
-        (MPI_THREAD_FUNNELED == required) || (MPI_THREAD_MULTIPLE == required) ) {
+    err_arg_required = (required != MPI_THREAD_SINGLE && required != MPI_THREAD_FUNNELED &&
+                        required != MPI_THREAD_SERIALIZED && required != MPI_THREAD_MULTIPLE);
+    if (!err_arg_required) {
+        safe_required = required;
+    }

-        if (NULL != (env = getenv("OMPI_MPI_THREAD_LEVEL"))) {
-            safe_required = atoi(env);
-        }
-        else {
-            safe_required = required;
+    /* check for environment overrides for required thread level. If
+     * there is, check to see that it is a valid/supported thread level.
+     * If valid, the environment variable always overrides the provided thread
+     * level (even if lower than argument `required`). A user program can
+     * check `provided != required` to check if `required` has been overruled.
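+     * (Hypothetical illustration: calling MPI_Init_thread with
+     * required=MPI_THREAD_SINGLE while OMPI_MPI_THREAD_LEVEL=3 is set in
+     * the environment yields provided == MPI_THREAD_MULTIPLE.)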
+ */ + if (NULL != (env = getenv("OMPI_MPI_THREAD_LEVEL"))) { + int env_required = atoi(env); + /* In the future we may have to contend with non-sequential (MPI ABI) values + * If you are implementing MPI ABI changes please refer to + * https://github.com/open-mpi/ompi/pull/13211#discussion_r2085086844 + */ + err_arg_required |= (env_required != MPI_THREAD_SINGLE && env_required != MPI_THREAD_FUNNELED && + env_required != MPI_THREAD_SERIALIZED && env_required != MPI_THREAD_MULTIPLE); + if (!err_arg_required) { + safe_required = env_required; } } @@ -78,7 +94,7 @@ int MPI_Init_thread(int *argc, char ***argv, int required, err = ompi_mpi_init(0, NULL, safe_required, provided, false); } - if( safe_required != required ) { + if (err_arg_required) { /* Trigger the error handler for the incorrect argument. Keep it separate from the * check on the ompi_mpi_init return and report a nice, meaningful error message to * the user. */ From 6ad6cc8745462458b6f072aa7d2c102f432682a7 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 12 Jun 2025 23:42:06 -0400 Subject: [PATCH 02/24] OMPI_MPI_THREAD_LEVEL can now take 'multiple' 'MPI_THREAD_MULTIPLE' (single,etc) in addition to numeric 0-3 values Signed-off-by: Aurelien Bouteiller (cherry picked from commit 3de248902a38a7ebe8299f8a35e677eb8196b255) --- ompi/mpi/c/init.c | 14 ++------------ ompi/mpi/c/init_thread.c | 16 +++------------- ompi/runtime/mpiruntime.h | 22 +++++++++++++++++++++ ompi/runtime/ompi_mpi_init.c | 37 ++++++++++++++++++++++++++++++++++++ 4 files changed, 64 insertions(+), 25 deletions(-) diff --git a/ompi/mpi/c/init.c b/ompi/mpi/c/init.c index 26cee04ff4f..290a28efd5b 100644 --- a/ompi/mpi/c/init.c +++ b/ompi/mpi/c/init.c @@ -46,23 +46,13 @@ int MPI_Init(int *argc, char ***argv) { int err; int provided; - char *env; int required = MPI_THREAD_SINGLE; /* check for environment overrides for required thread level. If there is, check to see that it is a valid/supported thread level. If not, default to MPI_THREAD_MULTIPLE. */ - - if (NULL != (env = getenv("OMPI_MPI_THREAD_LEVEL"))) { - required = atoi(env); - /* In the future we may have to contend with non-sequential (MPI ABI) values - * If you are implementing MPI ABI changes please refer to - * https://github.com/open-mpi/ompi/pull/13211#discussion_r2085086844 - */ - if (required != MPI_THREAD_SINGLE && required != MPI_THREAD_FUNNELED && - required != MPI_THREAD_SERIALIZED && required != MPI_THREAD_MULTIPLE) { - required = MPI_THREAD_MULTIPLE; - } + if (OMPI_SUCCESS > ompi_getenv_mpi_thread_level(&required)) { + required = MPI_THREAD_MULTIPLE; } /* Call the back-end initialization function (we need to put as diff --git a/ompi/mpi/c/init_thread.c b/ompi/mpi/c/init_thread.c index 581edf9ff45..a12743ffb3b 100644 --- a/ompi/mpi/c/init_thread.c +++ b/ompi/mpi/c/init_thread.c @@ -50,12 +50,13 @@ int MPI_Init_thread(int *argc, char ***argv, int required, { int err, safe_required = MPI_THREAD_SERIALIZED; bool err_arg_required = false; - char *env; ompi_hook_base_mpi_init_thread_top(argc, argv, required, provided); /* Detect an incorrect thread support level, but dont report until we have the minimum * infrastructure setup. + * In the future integer MPI_ABI values for MPI_THREAD_SINGLE-MULTIPLE + * may have gaps between them, so just checking the range is not enough. 
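+     * (Both the explicit per-constant comparisons below and the string
+     * parsing in ompi_getenv_mpi_thread_level() stay correct under such
+     * non-contiguous constant values.)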
*/ err_arg_required = (required != MPI_THREAD_SINGLE && required != MPI_THREAD_FUNNELED && required != MPI_THREAD_SERIALIZED && required != MPI_THREAD_MULTIPLE); @@ -69,18 +70,7 @@ int MPI_Init_thread(int *argc, char ***argv, int required, * level (even if lower than argument `required`). A user program can * check `provided != required` to check if `required` has been overruled. */ - if (NULL != (env = getenv("OMPI_MPI_THREAD_LEVEL"))) { - int env_required = atoi(env); - /* In the future we may have to contend with non-sequential (MPI ABI) values - * If you are implementing MPI ABI changes please refer to - * https://github.com/open-mpi/ompi/pull/13211#discussion_r2085086844 - */ - err_arg_required |= (env_required != MPI_THREAD_SINGLE && env_required != MPI_THREAD_FUNNELED && - env_required != MPI_THREAD_SERIALIZED && env_required != MPI_THREAD_MULTIPLE); - if (!err_arg_required) { - safe_required = env_required; - } - } + err_arg_required |= (OMPI_SUCCESS > ompi_getenv_mpi_thread_level(&safe_required)); *provided = safe_required; diff --git a/ompi/runtime/mpiruntime.h b/ompi/runtime/mpiruntime.h index 5ce1ce53539..6fde46fb190 100644 --- a/ompi/runtime/mpiruntime.h +++ b/ompi/runtime/mpiruntime.h @@ -16,6 +16,7 @@ * Copyright (c) 2009 University of Houston. All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -163,6 +164,27 @@ extern opal_hash_table_t ompi_mpi_f90_complex_hashtable; /** version string of ompi */ OMPI_DECLSPEC extern const char ompi_version_string[]; +/** + * Obtain the required thread level from environment (if any) + * + * @param requested Thread support that is requested (OUT) + * + * @returns Error code if environment exist but has an invalid value + * + * The function reads the environment variable OMPI_MPI_THREAD_LEVEL + * and set parameter requested accordingly. If the environment is not + * set, or has an invalid value, requested is left unchanged. + */ +int ompi_getenv_mpi_thread_level(int *requested); + +/** + * Determine the thread level + * + * @param requested Thread support that is requested (IN) + * @param provided Thread support that is provided (OUT) + */ +void ompi_mpi_thread_level(int requested, int *provided); + /** * Determine the thread level * diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 19c0999d163..787e1e10249 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -29,6 +29,7 @@ * Copyright (c) 2021 Nanook Consulting. All rights reserved. * Copyright (c) 2021-2022 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -268,6 +269,42 @@ MPI_Fint *MPI_F08_STATUSES_IGNORE = NULL; #include "mpif-c-constants.h" +int ompi_getenv_mpi_thread_level(int *requested) +{ + char* env; + if (NULL != (env = getenv("OMPI_MPI_THREAD_LEVEL"))) { + /* deal with string values, int values (no atoi, it doesn't error check) */ + /* In the future integer MPI_ABI values for MPI_THREAD_SINGLE-MULTIPLE + * may be non-sequential (but ordered) integer values. 
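+     * (Illustrative, assumed numbers only: an ABI could define SINGLE=1,
+     * FUNNELED=2, SERIALIZED=4, MULTIPLE=8, values that stay ordered but
+     * leave gaps, which is why the matching below is per-value rather
+     * than a range check.)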
+ * If you are implementing MPI ABI changes please refer to + * https://github.com/open-mpi/ompi/pull/13211#discussion_r2085086844 + */ + if (0 == strcasecmp(env, "multiple") || + 0 == strcasecmp(env, "MPI_THREAD_MULTIPLE") || + 0 == strcmp(env, "3")) { + return *requested = MPI_THREAD_MULTIPLE; + } + if (0 == strcasecmp(env, "serialized") || + 0 == strcasecmp(env, "MPI_THREAD_SERIALIZED") || + 0 == strcmp(env, "2")) { + return *requested = MPI_THREAD_SERIALIZED; + } + if (0 == strcasecmp(env, "funneled") || + 0 == strcasecmp(env, "MPI_THREAD_FUNNELED") || + 0 == strcmp(env, "1")) { + return *requested = MPI_THREAD_FUNNELED; + } + if (0 == strcasecmp(env, "single") || + 0 == strcasecmp(env, "MPI_THREAD_SINGLE") || + 0 == strcmp(env, "0")) { + return *requested = MPI_THREAD_SINGLE; + } + /* the env value is invalid... */ + return OMPI_ERR_BAD_PARAM; + } + return OMPI_SUCCESS; +} + void ompi_mpi_thread_level(int requested, int *provided) { /** From 2d5505cf08c6abc9228efa03c70877ad1380a49d Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 21 Jun 2025 15:34:04 -0400 Subject: [PATCH 03/24] docs: numerous updates to MPI_Init*/MPI_Finalize*/MPI_Session_* man pages Including, but not limited to: * Added much more description of and distinction between the MPI world model and the MPI session model. Updated a lot of old, pre-MPI-world-model/pre-MPI-session-model text that was now stale / outdated, especially in the following pages: * MPI_Init(3), MPI_Init_thread(3) * MPI_Initialized(3) * MPI_Finalize(3) * MPI_Finalized(3) * MPI_Session_init(3) * MPI_Session_finalize(3) * Numerous formatting updates * Slightly improve the C code examples * Describe the mathematical relationship between the various MPI_THREAD_* constants in MPI_Init_thread(3) * Note that the mathematical relationships render nicely in HTML, but don't render entirely properly in nroff. This commit author is of the opinion that the nroff rendering is currently "good enough", and some Sphinx maintainer will fix it someday. * Add descriptions about the $OMPI_MPI_THREAD_LEVEL env variable and how it is used in MPI_Init_thread(3) * Added more seealso links Signed-off-by: Jeff Squyres (cherry picked from commit aff3afde47bdc30643c0b5e97833793910c54999) --- docs/man-openmpi/man3/MPI_Finalize.3.rst | 114 ++++++++----- docs/man-openmpi/man3/MPI_Finalized.3.rst | 25 ++- docs/man-openmpi/man3/MPI_Init.3.rst | 56 ++++-- docs/man-openmpi/man3/MPI_Init_thread.3.rst | 160 +++++++++++++----- docs/man-openmpi/man3/MPI_Initialized.3.rst | 17 +- .../man3/MPI_Session_finalize.3.rst | 16 +- docs/man-openmpi/man3/MPI_Session_init.3.rst | 46 ++++- 7 files changed, 309 insertions(+), 125 deletions(-) diff --git a/docs/man-openmpi/man3/MPI_Finalize.3.rst b/docs/man-openmpi/man3/MPI_Finalize.3.rst index 174b0c77ebb..032eb7fd40f 100644 --- a/docs/man-openmpi/man3/MPI_Finalize.3.rst +++ b/docs/man-openmpi/man3/MPI_Finalize.3.rst @@ -5,7 +5,7 @@ MPI_Finalize .. include_body -:ref:`MPI_Finalize` |mdash| Terminates MPI execution environment. +:ref:`MPI_Finalize` |mdash| Terminates MPI world model. SYNTAX ------ @@ -48,56 +48,82 @@ OUTPUT PARAMETER DESCRIPTION ----------- -This routine cleans up all MPI states. Once this routine is called, no -MPI routine (not even MPI_Init) may be called, except for -:ref:`MPI_Get_version`, :ref:`MPI_Initialized`, and :ref:`MPI_Finalized`. Unless there has -been a call to :ref:`MPI_Abort`, you must ensure that all pending -communications involving a process are complete before the process calls -:ref:`MPI_Finalize`. 
If the call returns, each process may either continue -local computations or exit without participating in further -communication with other processes. At the moment when the last process -calls :ref:`MPI_Finalize`, all pending sends must be matched by a receive, and -all pending receives must be matched by a send. - -:ref:`MPI_Finalize` is collective over all connected processes. If no processes -were spawned, accepted, or connected, then this means it is collective -over MPI_COMM_WORLD. Otherwise, it is collective over the union of all -processes that have been and continue to be connected. +This routine finalizes the MPI world model. If the MPI world model +has been initialized in an MPI process, it *must* be finalized exactly +once by invoking this routine during the lifetime of that MPI process. +This is different than the MPI session model, which can be initialized +and finalized multiple times in an MPI process. See +:ref:`MPI_Session_init` and :ref:`MPI_Session_finalize`. + +Unless there has been a call to :ref:`MPI_Abort`, you must +ensure that all pending communications in the MPI world model +involving a process are complete before the process calls +:ref:`MPI_Finalize`. If the call returns, each process may either +continue local computations or exit without participating in further +communication with other processes in the MPI world model. At the +moment when the last process calls :ref:`MPI_Finalize`, all pending +sends in the MPI world model must be matched by a receive, and all +pending receives in the MPI world model must be matched by a send. + +See `MPI-5.0:11.4.1 `_ for a list of MPI +functionality that is available (e.g., even when the MPI +world model has not yet initialized or has already been finalized). + +:ref:`MPI_Finalize` is collective over all connected processes. If no +processes were spawned, accepted, or connected, then this means it is +collective over ``MPI_COMM_WORLD``. Otherwise, it is collective over +the union of all processes that have been and continue to be +connected. NOTES ----- -All processes must call this routine before exiting. All processes will -still exist but may not make any further MPI calls. :ref:`MPI_Finalize` -guarantees that all local actions required by communications the user -has completed will, in fact, occur before it returns. However, -:ref:`MPI_Finalize` guarantees nothing about pending communications that have -not been completed; completion is ensured only by :ref:`MPI_Wait`, :ref:`MPI_Test`, or -:ref:`MPI_Request_free` combined with some other verification of completion. - -For example, a successful return from a blocking communication operation -or from :ref:`MPI_Wait` or :ref:`MPI_Test` means that the communication is completed -by the user and the buffer can be reused, but does not guarantee that -the local process has no more work to do. Similarly, a successful return -from :ref:`MPI_Request_free` with a request handle generated by an :ref:`MPI_Isend` -nullifies the handle but does not guarantee that the operation has -completed. The :ref:`MPI_Isend` is complete only when a matching receive has -completed. - -If you would like to cause actions to happen when a process finishes, -attach an attribute to MPI_COMM_SELF with a callback function. Then, -when :ref:`MPI_Finalize` is called, it will first execute the equivalent of an -:ref:`MPI_Comm_free` on MPI_COMM_SELF. This will cause the delete callback -function to be executed on all keys associated with MPI_COMM_SELF in an -arbitrary order. 
If no key has been attached to MPI_COMM_SELF, then no -callback is invoked. This freeing of MPI_COMM_SELF happens before any -other parts of MPI are affected. Calling :ref:`MPI_Finalized` will thus return -"false" in any of these callback functions. Once you have done this with -MPI_COMM_SELF, the results of :ref:`MPI_Finalize` are not specified. +The MPI session model is different than the MPI world model, and has +different scopes of availability for MPI functionality. See +:ref:`MPI_Session_init` and :ref:`MPI_Session_finalize`. + +All processes that initialized the MPI world model must call this +routine before exiting. All processes will still exist but may not +make any further MPI calls in the MPI world model. :ref:`MPI_Finalize` +guarantees that all local actions required by communications in the +MPI world model that the user has completed will, in fact, occur +before it returns. However, :ref:`MPI_Finalize` guarantees nothing +about pending communications in the MPI world model that have not been +completed; completion is ensured only by the :ref:`MPI_Wait` and +:ref:`MPI_Test` variants, or :ref:`MPI_Request_free` combined with +some other verification of completion. + +For example, a successful return from a blocking communication +operation or from one of the :ref:`MPI_Wait` or :ref:`MPI_Test` +varients means that the communication is completed by the user and the +buffer can be reused, but does not guarantee that the local process +has no more work to do. Similarly, a successful return from +:ref:`MPI_Request_free` with a request handle generated by an +:ref:`MPI_Isend` nullifies the handle but does not guarantee that the +operation has completed. The :ref:`MPI_Isend` is complete only when a +matching receive has completed. + +If you would like to cause actions to happen when a process finalizes the MPI +world model, attach an attribute to ``MPI_COMM_SELF`` with a callback +function. Then, when :ref:`MPI_Finalize` is called, it will first +execute the equivalent of an :ref:`MPI_Comm_free` on +``MPI_COMM_SELF``. This will cause the delete callback function to be +executed on all keys associated with ``MPI_COMM_SELF`` in an arbitrary +order. If no key has been attached to ``MPI_COMM_SELF``, then no +callback is invoked. This freeing of ``MPI_COMM_SELF`` happens before +any other parts of the MPI world model are affected. Calling +:ref:`MPI_Finalized` will thus return ``false`` in any of these +callback functions. Once you have done this with ``MPI_COMM_SELF``, +the results of :ref:`MPI_Finalize` are not specified. ERRORS ------ .. include:: ./ERRORS.rst -.. seealso:: :ref:`MPI_Init` +.. seealso:: + * :ref:`MPI_Finalized` + * :ref:`MPI_Init` + * :ref:`MPI_Initialized` + * :ref:`MPI_Session_init` + * :ref:`MPI_Session_finalize` diff --git a/docs/man-openmpi/man3/MPI_Finalized.3.rst b/docs/man-openmpi/man3/MPI_Finalized.3.rst index 84c3e71d7a1..4d1cc0117b0 100644 --- a/docs/man-openmpi/man3/MPI_Finalized.3.rst +++ b/docs/man-openmpi/man3/MPI_Finalized.3.rst @@ -5,7 +5,7 @@ MPI_Finalized .. include_body -:ref:`MPI_Finalized` |mdash| Checks whether MPI has been finalized +:ref:`MPI_Finalized` |mdash| Checks whether the MPI world model has been finalized SYNTAX ------ @@ -45,20 +45,31 @@ Fortran 2008 Syntax OUTPUT PARAMETER ---------------- -* ``flag`` : True if MPI was finalized, and false otherwise (logical). +* ``flag`` : True if the MPI world model was finalized, and false + otherwise (logical). * ``ierror`` : Fortran only: Error status (integer). 
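
As a cross-reference back to the :ref:`MPI_Finalize` notes above, a
minimal self-contained sketch of the ``MPI_COMM_SELF`` delete-callback
pattern they describe (the callback name and message are illustrative
only):

.. code-block:: c

   #include <stdio.h>
   #include <mpi.h>

   /* Invoked while MPI_Finalize() frees MPI_COMM_SELF, i.e., before
      the rest of the world model is torn down. */
   static int cleanup_cb(MPI_Comm comm, int keyval,
                         void *attr_val, void *extra_state) {
       printf("finalize-time cleanup\n");
       return MPI_SUCCESS;
   }

   int main(int argc, char *argv[]) {
       int keyval;
       MPI_Init(&argc, &argv);
       MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, cleanup_cb,
                              &keyval, NULL);
       MPI_Comm_set_attr(MPI_COMM_SELF, keyval, NULL);
       MPI_Finalize();   /* runs cleanup_cb first */
       return 0;
   }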
DESCRIPTION
-----------

-This routine may be used to determine whether MPI has been finalized. It
-is one of a small number of routines that may be called before MPI is
-initialized and after MPI has been finalized (:ref:`MPI_Initialized` is
-another).
+This routine may be used to determine whether the MPI world model has
+been finalized. A different routine |mdash| :ref:`MPI_Initialized`
+|mdash| is used to indicate whether the MPI world model has been
+initialized.
+
+See `MPI-5.0:11.4.1 `_ for a list of MPI
+functionality that is available (e.g., even when the MPI
+world model has not yet initialized or has already been finalized).

 ERRORS
 ------

 .. include:: ./ERRORS.rst

-.. seealso:: :ref:`MPI_Init`
+.. seealso::
+   * :ref:`MPI_Init`
+   * :ref:`MPI_Init_thread`
+   * :ref:`MPI_Finalize`
+   * :ref:`MPI_Finalized`
+   * :ref:`MPI_Session_init`
+   * :ref:`MPI_Session_finalize`
diff --git a/docs/man-openmpi/man3/MPI_Init.3.rst b/docs/man-openmpi/man3/MPI_Init.3.rst
index 6477ec495c9..08af1fa7d01 100644
--- a/docs/man-openmpi/man3/MPI_Init.3.rst
+++ b/docs/man-openmpi/man3/MPI_Init.3.rst
@@ -6,7 +6,7 @@ MPI_Init
 .. include_body

-:ref:`MPI_Init` |mdash| Initializes the MPI execution environment
+:ref:`MPI_Init` |mdash| Initializes the MPI world model


 SYNTAX
@@ -56,23 +56,40 @@ OUTPUT PARAMETER
 DESCRIPTION
 -----------

-This routine, or :ref:`MPI_Init_thread`, must be called before most other MPI
-routines are called. There are a small number of errors, such as
-:ref:`MPI_Initialized` and :ref:`MPI_Finalized`. MPI can be initialized at most once;
-subsequent calls to :ref:`MPI_Init` or :ref:`MPI_Init_thread` are erroneous.
+This routine, or :ref:`MPI_Init_thread`, initializes the MPI world
+model. Either of these routines must be called before MPI
+communication routines are called within the MPI world model. The MPI
+world model can be initialized at most exactly once in the lifetime of
+an MPI process. This is different than the MPI session model, which
+can be initialized and finalized multiple times in an MPI process.
+See :ref:`MPI_Session_init` and :ref:`MPI_Session_finalize`.

-All MPI programs must contain a call to :ref:`MPI_Init` or :ref:`MPI_Init_thread`.
-Open MPI accepts the C *argc* and *argv* arguments to main, but neither
-modifies, interprets, nor distributes them:
+See `MPI-5.0:11.4.1 `_ for a list of MPI
+functionality that is available (e.g., even when the MPI
+world model has not yet initialized or has already been finalized).
+
+Open MPI's :ref:`MPI_Init` and :ref:`MPI_Init_thread` both accept the
+C *argc* and *argv* arguments to main, but neither modifies,
+interprets, nor distributes them:

 .. code-block:: c

-   /* declare variables */
-   MPI_Init(&argc, &argv);
-   /* parse arguments */
-   /* main program */
-   MPI_Finalize();
+   #include <mpi.h>
+
+   int main(int argc, char *argv[]) {
+       MPI_Init(&argc, &argv);
+       /* ...body of main MPI program... */
+       MPI_Finalize();
+       return 0;
+   }
+
+By default, :ref:`MPI_Init` is effectively equivalent to invoking
+:ref:`MPI_Init_thread` with a *required* value of
+``MPI_THREAD_SINGLE``. However, if the ``OMPI_MPI_THREAD_LEVEL``
+environment variable is set to a valid value when :ref:`MPI_Init` is
+invoked, it is equivalent to invoking :ref:`MPI_Init_thread` with
+*required* set to the corresponding value of the ``OMPI_MPI_THREAD_LEVEL``
+environment variable. See :ref:`MPI_Init_thread` for more details.

 NOTES
 -----
@@ -80,11 +97,12 @@ NOTES
 The Fortran version does not have provisions for *argc* and *argv* and
 takes only IERROR.
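
To make the environment-variable mechanism above concrete, a hedged
sketch (the launch command and numeric value are examples; see
:ref:`MPI_Init_thread` for the accepted values):

.. code-block:: c

   #include <stdio.h>
   #include <mpi.h>

   /* Run as, e.g.:  OMPI_MPI_THREAD_LEVEL=3 ./a.out
      MPI_Query_thread() reports the level the world model was
      actually initialized with. */
   int main(int argc, char *argv[]) {
       int provided;
       MPI_Init(&argc, &argv);
       MPI_Query_thread(&provided);
       printf("provided thread level: %d\n", provided);
       MPI_Finalize();
       return 0;
   }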
-The MPI Standard does not say what a program can do before an :ref:`MPI_Init` -or after an :ref:`MPI_Finalize`. In the Open MPI implementation, it should do -as little as possible. In particular, avoid anything that changes the -external state of the program, such as opening files, reading standard -input, or writing to standard output. +The MPI Standard does not specify what a program using the MPI world +model can do before invoking :ref:`MPI_Init` or :ref:`MPI_Init_thread` +or after invoking :ref:`MPI_Finalize`. In the Open MPI implementation, +it should do as little as possible. In particular, avoid anything that +changes the external state of the program, such as opening files, +reading standard input, or writing to standard output. ERRORS @@ -97,3 +115,5 @@ ERRORS * :ref:`MPI_Initialized` * :ref:`MPI_Finalize` * :ref:`MPI_Finalized` + * :ref:`MPI_Session_finalize` + * :ref:`MPI_Session_init` diff --git a/docs/man-openmpi/man3/MPI_Init_thread.3.rst b/docs/man-openmpi/man3/MPI_Init_thread.3.rst index e5173ff90a4..aca6307a347 100644 --- a/docs/man-openmpi/man3/MPI_Init_thread.3.rst +++ b/docs/man-openmpi/man3/MPI_Init_thread.3.rst @@ -6,7 +6,7 @@ MPI_Init_thread .. include_body -:ref:`MPI_Init_thread` |mdash| Initializes the MPI execution environment +:ref:`MPI_Init_thread` |mdash| Initializes the MPI world model SYNTAX @@ -61,64 +61,142 @@ OUTPUT PARAMETERS DESCRIPTION ----------- -This routine, or :ref:`MPI_Init`, must be called before most other MPI routines -are called. There are a small number of exceptions, such as -:ref:`MPI_Initialized` and :ref:`MPI_Finalized`. MPI can be initialized at most once; -subsequent calls to :ref:`MPI_Init` or :ref:`MPI_Init_thread` are erroneous. +This routine, or :ref:`MPI_Init`, initializes the MPI world +model. Either of these routines must be called before MPI +communication routines are called within the MPI world model. The MPI +world model can be initialized at most exactly once in the lifetime of +an MPI process. This is different than the MPI session model, which +can be initialized and finalized multiple times in an MPI process. +See :ref:`MPI_Session_init` and :ref:`MPI_Session_finalize`. -:ref:`MPI_Init_thread`, as compared to :ref:`MPI_Init`, has a provision to request a -certain level of thread support in *required*: +See `MPI-5.0:11.4.1 `_ for a list of MPI +functionality that is available (e.g., even when the MPI +world model has not yet initialized or has already been finalized). -MPI_THREAD_SINGLE - Only one thread will execute. +The MPI world model can be initialized at most once; subsequent calls +to :ref:`MPI_Init` or :ref:`MPI_Init_thread` are erroneous. -MPI_THREAD_FUNNELED - If the process is multithreaded, only the thread that called - :ref:`MPI_Init_thread` will make MPI calls. +Alternatively, instead of the MPI world model, MPI applications can +use the sessions model; see :ref:`MPI_Session_init`. -MPI_THREAD_SERIALIZED - If the process is multithreaded, only one thread will make MPI - library calls at one time. +Upon return, the level of thread support available to the program is +set in *provided*. In Open MPI, the value is dependent on how the +library was configured and built. Note that there is no guarantee that +*provided* will be greater than or equal to *required*. -MPI_THREAD_MULTIPLE - If the process is multithreaded, multiple threads may call MPI at - once with no restrictions. 
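
(To make the *required*/*provided* contract concrete, a hedged sketch
that relies on the documented ordering of the ``MPI_THREAD_*``
constants; the abort policy is just an example:)

.. code-block:: c

   #include <stdio.h>
   #include <mpi.h>

   int main(int argc, char *argv[]) {
       int provided;
       MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
       /* provided is not guaranteed to be >= the requested level */
       if (provided < MPI_THREAD_MULTIPLE) {
           fprintf(stderr, "no MPI_THREAD_MULTIPLE (got %d)\n", provided);
           MPI_Abort(MPI_COMM_WORLD, 1);
       }
       MPI_Finalize();
       return 0;
   }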
+Open MPI accepts the C *argc* and *argv* arguments to main, but +neither modifies, interprets, nor distributes them: -The level of thread support available to the program is set in -*provided*. In Open MPI, the value is dependent on how the library was -configured and built. Note that there is no guarantee that *provided* -will be greater than or equal to *required*. +.. code-block:: c -Also note that calling :ref:`MPI_Init_thread` with a *required* value of -MPI_THREAD_SINGLE is equivalent to calling :ref:`MPI_Init`. + #include -All MPI programs must contain a call to :ref:`MPI_Init` or :ref:`MPI_Init_thread`. -Open MPI accepts the C *argc* and *argv* arguments to main, but neither -modifies, interprets, nor distributes them: + int main(int argv, char *argv[]) { + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + /* ...body of main MPI pogram... */ + MPI_Finalize(); + return 0; + } -.. code-block:: c - /* declare variables */ - MPI_Init_thread(&argc, &argv, req, &prov); - /* parse arguments */ - /* main program */ - MPI_Finalize(); +:ref:`MPI_Init_thread` has both a direct and an indirect mechanism to +request a specific level of thread support. :ref:`MPI_Init` only has +an indirect mechanism to request a specific level of thread support. + +Direct request of thread level +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:ref:`MPI_Init_thread` has the *required* parameter, which can be set +to any one of the following constants (from ``mpi.h``): + +* ``MPI_THREAD_SINGLE``: Indicating that only one thread will execute. + +* ``MPI_THREAD_FUNNELED``: Indicating that if the process is + multithreaded, only the thread that called :ref:`MPI_Init_thread` + will make MPI calls. + +* ``MPI_THREAD_SERIALIZED``: Indicating that if the process is + multithreaded, only one thread will make MPI library calls at one + time. + +* ``MPI_THREAD_MULTIPLE``: Indicating that if the process is + multithreaded, multiple threads may call MPI at once with no + restrictions. + +The values of these constants adhere to the following relationships: + +.. math:: + :nowrap: + + \begin{eqnarray} + MPI\_THREAD\_SINGLE & < & MPI\_THREAD\_FUNNELED \\ + MPI\_THREAD\_FUNNELED & < & MPI\_THREAD\_SERIALIZED \\ + MPI\_THREAD\_SERIALIZED & < & MPI\_THREAD\_MULTIPLE \\ + \end{eqnarray} + +Indirect request of thread level +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Both :ref:`MPI_Init_thread` and :ref:`MPI_Init` support an indirect +method of indicating the required thread level: setting the +``OMPI_MPI_THREAD_LEVEL`` environment variable: + +* If the ``OMPI_MPI_THREAD_LEVEL`` environment variable is set at the + time :ref:`MPI_Init` is invoked, it behaves as if + :ref:`MPI_Init_thread` was invoked with the corresponding + ``MPI_THREAD_*`` constant value passed via the *required* parameter. + +* If the ``OMPI_MPI_THREAD_LEVEL`` environment variable is set at the + time :ref:`MPI_Init_thread` is invoked, the ``MPI_THREAD_*`` + constant value corresponding to the environment variable value + overrides the value passed via the *required* parameter. + +The ``OMPI_MPI_THREAD_LEVEL`` environment variable can be set to any +of the values listed below. + +.. 
list-table:: + :header-rows: 1 + + * - Value that Open MPI uses + - Allowable values (case-insensitive) + + * - ``MPI_THREAD_SINGLE`` + - ``MPI_THREAD_SINGLE``, ``SINGLE``, 0 + + * - ``MPI_THREAD_FUNNELED`` + - ``MPI_THREAD_FUNNELED``, ``FUNNELED``, 1 + + * - ``MPI_THREAD_SERIALIZED`` + - ``MPI_THREAD_SERIALIZED``, ``SERIALIZED``, 2 + + * - ``MPI_THREAD_MULTIPLE`` + - ``MPI_THREAD_MULTIPLE``, ``MULTIPLE``, 3 + +.. note:: Prior to Open MPI v6.0.0, only the integer values 0 through + 3 were acceptable values for the ``OMPI_MPI_THREAD_LEVEL`` + environment variable. + Starting with Open MPI v6.0.0, the Open MPI community + recomends using one of the string name variants so that it + can be correctly mapped to the corresponding Open MPI ABI + value or the MPI Standard ABI value, as relevant. NOTES ----- The Fortran version does not have provisions for ``argc`` and ``argv`` and -takes only ``IERROR``. +takes only ``REQUIRED``, ``PROVIDED``, and ``IERROR``. It is the caller's responsibility to check the value of ``provided``, as it may be less than what was requested in ``required``. -The MPI Standard does not say what a program can do before an -:ref:`MPI_Init_thread` or after an :ref:`MPI_Finalize`. In the Open MPI -implementation, it should do as little as possible. In particular, avoid -anything that changes the external state of the program, such as opening -files, reading standard input, or writing to standard output. +The MPI Standard does not specify what a program using the MPI world +model can do before invoking :ref:`MPI_Init` or :ref:`MPI_Init_thread` +or after invoking :ref:`MPI_Finalize`. In the Open MPI implementation, +it should do as little as possible. In particular, avoid anything that +changes the external state of the program, such as opening files, +reading standard input, or writing to standard output. MPI_THREAD_MULTIPLE Support @@ -129,7 +207,7 @@ Open MPI was built supports threading. You can check the output of :ref:`ompi_info(1) ` to see if Open MPI has ``MPI_THREAD_MULTIPLE`` support: -:: +.. code-block:: bash shell$ ompi_info | grep "Thread support" Thread support: posix (MPI_THREAD_MULTIPLE: yes, OPAL support: yes, OMPI progress: no, Event lib: yes) @@ -153,3 +231,5 @@ ERRORS * :ref:`MPI_Initialized` * :ref:`MPI_Finalize` * :ref:`MPI_Finalized` + * :ref:`MPI_Session_finalize` + * :ref:`MPI_Session_init` diff --git a/docs/man-openmpi/man3/MPI_Initialized.3.rst b/docs/man-openmpi/man3/MPI_Initialized.3.rst index 069d089c7dc..46ccf0ba5b8 100644 --- a/docs/man-openmpi/man3/MPI_Initialized.3.rst +++ b/docs/man-openmpi/man3/MPI_Initialized.3.rst @@ -6,7 +6,7 @@ MPI_Initialized .. include_body -:ref:`MPI_Initialized` |mdash| Checks whether MPI has been initialized +:ref:`MPI_Initialized` |mdash| Checks whether the MPI world model has been initialized SYNTAX @@ -48,15 +48,20 @@ Fortran 2008 Syntax OUTPUT PARAMETERS ----------------- -* ``flag``: True if MPI has been initialized, and false otherwise (logical). +* ``flag``: True if the MPI world model has been initialized, and false otherwise (logical). * ``ierror``: Fortran only: Error status (integer). DESCRIPTION ----------- -This routine may be used to determine whether MPI has been initialized. -It is one of a small number of routines that may be called before MPI is -initialized and after MPI has been finalized (:ref:`MPI_Finalized` is another). +This routine may be used to determine whether the MPI world model has +been initialized. 
A different routine |mdash| :ref:`MPI_Finalized` +|mdash| is used to indicate whether the MPI world model has been +finalized. + +See `MPI-5.0:11.4.1 `_ for a list of MPI +functionality that is available (e.g., even when the MPI +world model has not yet initialized or has already been finalized). ERRORS @@ -69,3 +74,5 @@ ERRORS * :ref:`MPI_Init_thread` * :ref:`MPI_Finalize` * :ref:`MPI_Finalized` + * :ref:`MPI_Session_init` + * :ref:`MPI_Session_finalize` diff --git a/docs/man-openmpi/man3/MPI_Session_finalize.3.rst b/docs/man-openmpi/man3/MPI_Session_finalize.3.rst index b91e2085a6b..4cf4820c1ab 100644 --- a/docs/man-openmpi/man3/MPI_Session_finalize.3.rst +++ b/docs/man-openmpi/man3/MPI_Session_finalize.3.rst @@ -56,9 +56,15 @@ DESCRIPTION :ref:`MPI_Session_finalize` releases all MPI state associated with the supplied session. Every instantiated session must be finalized using -:ref:`MPI_Session_finalize`. The handle session is set to MPI_SESSION_NULL by +:ref:`MPI_Session_finalize`. The handle session is set to ``MPI_SESSION_NULL`` by the call. +Multiple sessions can be created and destroyed during the lifetime of +an MPI process. This is different than MPI world model, which can be +initialized at most exactly once (and then subsequently finalized) +during the lifetime of an MPI process. + + NOTES ----- @@ -83,4 +89,10 @@ ERRORS .. include:: ./ERRORS.rst -.. seealso:: :ref:`MPI_Session_init` +.. seealso:: + * :ref:`MPI_Init` + * :ref:`MPI_Initialized` + * :ref:`MPI_Init_thread` + * :ref:`MPI_Finalize` + * :ref:`MPI_Finalized` + * :ref:`MPI_Session_init` diff --git a/docs/man-openmpi/man3/MPI_Session_init.3.rst b/docs/man-openmpi/man3/MPI_Session_init.3.rst index 813ad196a83..43432c3e8d6 100644 --- a/docs/man-openmpi/man3/MPI_Session_init.3.rst +++ b/docs/man-openmpi/man3/MPI_Session_init.3.rst @@ -59,19 +59,27 @@ OUTPUT PARAMETERS DESCRIPTION ----------- -:ref:`MPI_Session_init` is used to instantiate an MPI Session. The returned -session handle can be used to query the runtime system about -characteristics of the job within which the process is running, as well -as other system resources. An application can make multiple calls to -:ref:`MPI_Session_init` and the related :ref:`MPI_Session_finalize` routine. +:ref:`MPI_Session_init` is used to instantiate an MPI Session. The +returned session handle can be used to query the runtime system about +characteristics of the job within which the process is running, as +well as other system resources. Other MPI communications can also be +initiated in the context of an MPI session handle. All sessions must +be finalized via :ref:`MPI_Session_finalize` before the MPI process +terminates. + +Multiple sessions can be created and destroyed during the lifetime of +an MPI process. This is different than MPI world model, which can be +initialized at most exactly once (and then subsequently finalized) +during the lifetime of an MPI process. + NOTES ----- -The info argument is used to request MPI functionality requirements and -possible MPI implementation specific capabilities. +The *info* argument is used to request MPI functionality requirements +and possible MPI implementation specific capabilities. -The errhandler argument specifies an error handler to invoke in the +The *errhandler* argument specifies an error handler to invoke in the event that the Session instantiation call encounters an error. ERRORS @@ -79,4 +87,24 @@ ERRORS .. include:: ./ERRORS.rst -.. seealso:: :ref:`MPI_Session_get_num_psets` MPI_Session_group_from_pset +.. 
seealso:: + * :ref:`MPI_Init` + * :ref:`MPI_Initialized` + * :ref:`MPI_Init_thread` + * :ref:`MPI_Finalize` + * :ref:`MPI_Finalized` + * :ref:`MPI_Group_from_session_pset` + * :ref:`MPI_Session_c2f` + * :ref:`MPI_Session_call_errhandler` + * :ref:`MPI_Session_create_errhandler` + * :ref:`MPI_Session_f2c` + * :ref:`MPI_Session_finalize` + * :ref:`MPI_Session_get_errhandler` + * :ref:`MPI_Session_get_info` + * :ref:`MPI_Session_get_nth_pset` + * :ref:`MPI_Session_get_num_psets` + * :ref:`MPI_Session_get_pset_info` + * :ref:`MPI_Session_init` + * :ref:`MPI_Session_set_errhandler` + * :ref:`MPI_T_pvar_session_create` + * :ref:`MPI_T_pvar_session_free` From 1bea4f7e7e087faed20652e136eaacf6a03b6703 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 10 Jul 2025 16:28:28 -0400 Subject: [PATCH 04/24] Add missing file MPI_Session_c2f.3.rst referenced from MPI_Session_init doc. Signed-off-by: Aurelien Bouteiller --- docs/man-openmpi/man3/MPI_Session_c2f.3.rst | 9 +++++++++ docs/man-openmpi/man3/index.rst | 1 + 2 files changed, 10 insertions(+) create mode 100644 docs/man-openmpi/man3/MPI_Session_c2f.3.rst diff --git a/docs/man-openmpi/man3/MPI_Session_c2f.3.rst b/docs/man-openmpi/man3/MPI_Session_c2f.3.rst new file mode 100644 index 00000000000..c798c3f0a7a --- /dev/null +++ b/docs/man-openmpi/man3/MPI_Session_c2f.3.rst @@ -0,0 +1,9 @@ +.. _mpi_session_c2f: + +MPI_Session_c2f +=============== + .. include_body + +.. include:: ../man3/MPI_Session_f2c.3.rst + :start-after: .. include_body + diff --git a/docs/man-openmpi/man3/index.rst b/docs/man-openmpi/man3/index.rst index 7df08ccb542..cb9961970ca 100644 --- a/docs/man-openmpi/man3/index.rst +++ b/docs/man-openmpi/man3/index.rst @@ -324,6 +324,7 @@ MPI API manual pages (section 3) MPI_Sendrecv_replace.3.rst MPI_Session_call_errhandler.3.rst MPI_Session_create_errhandler.3.rst + MPI_Session_c2f.3.rst MPI_Session_f2c.3.rst MPI_Session_finalize.3.rst MPI_Session_get_errhandler.3.rst From 91a8d34fa3c6f365ffa8e698343885bb458d9f8e Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 10 Jul 2025 17:06:20 -0400 Subject: [PATCH 05/24] Update hardcoded version values in documentation for Init_threads Signed-off-by: Aurelien Bouteiller --- docs/man-openmpi/man3/MPI_Init_thread.3.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/man-openmpi/man3/MPI_Init_thread.3.rst b/docs/man-openmpi/man3/MPI_Init_thread.3.rst index aca6307a347..dd37b866ef1 100644 --- a/docs/man-openmpi/man3/MPI_Init_thread.3.rst +++ b/docs/man-openmpi/man3/MPI_Init_thread.3.rst @@ -173,14 +173,12 @@ of the values listed below. * - ``MPI_THREAD_MULTIPLE`` - ``MPI_THREAD_MULTIPLE``, ``MULTIPLE``, 3 -.. note:: Prior to Open MPI v6.0.0, only the integer values 0 through +.. note:: In Open MPI v5.0.8 and prior, only the integer values 0 through 3 were acceptable values for the ``OMPI_MPI_THREAD_LEVEL`` environment variable. - Starting with Open MPI v6.0.0, the Open MPI community - recomends using one of the string name variants so that it - can be correctly mapped to the corresponding Open MPI ABI - value or the MPI Standard ABI value, as relevant. + Starting with Open MPI v5.0.9, the Open MPI community + recomends using one of the string name variants. 
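
A hedged sketch of detecting that an environment override took effect
(the string value follows the v5.0.9-and-newer spelling described in
the note above; older releases accept only the integers 0 through 3):

.. code-block:: c

   #include <stdio.h>
   #include <mpi.h>

   /* Run as, e.g.:  OMPI_MPI_THREAD_LEVEL=multiple ./a.out */
   int main(int argc, char *argv[]) {
       int provided;
       MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &provided);
       if (provided != MPI_THREAD_SINGLE) {
           printf("required was overruled; provided=%d\n", provided);
       }
       MPI_Finalize();
       return 0;
   }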
NOTES ----- From 147548988a64769c822bb8f22f8ba93d63534340 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 26 Aug 2025 09:14:01 -0400 Subject: [PATCH 06/24] fortran: fix ompi string c2f where len(fstr) < len(cstr) Thanks to Ben Menadue for pointing out that ompi_fortran_string_c2f() missed a case to properly terminate the resulting Fortran string when copying from a longer C source string. Signed-off-by: Jeff Squyres (cherry picked from commit 694e78aa864e5dc219172a710bda8e129542094a) --- ompi/mpi/fortran/base/strings.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ompi/mpi/fortran/base/strings.c b/ompi/mpi/fortran/base/strings.c index 5bbd96d5eea..63ea0db6f15 100644 --- a/ompi/mpi/fortran/base/strings.c +++ b/ompi/mpi/fortran/base/strings.c @@ -12,6 +12,7 @@ * Copyright (c) 2010-2018 Cisco Systems, Inc. All rights reserved * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Jeffrey M. Squyres. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -95,8 +96,19 @@ int ompi_fortran_string_c2f(const char *cstr, char *fstr, int len) int i; opal_string_copy(fstr, cstr, len); - for (i = strlen(cstr); i < len; ++i) { - fstr[i] = ' '; + + // If len < len(cstr), then opal_string_copy() will have copied a + // trailing \0 into the last position in fstr. This is not what + // Fortran wants; overwrite that \0 with the actual last character + // that will fit into fstr. + if (len < strlen(cstr)) { + fstr[len - 1] = cstr[len - 1]; + } else { + // Otherwise, pad the end of the resulting Fortran string with + // spaces. + for (i = strlen(cstr); i < len; ++i) { + fstr[i] = ' '; + } } return OMPI_SUCCESS; From cc72cc7d9679f2a36223a5f20bee1d227bcea758 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Wed, 27 Aug 2025 10:07:51 -0400 Subject: [PATCH 07/24] fortran: fix off-by-one string copy error Followup to commit 694e78aa8: Ben Menadue correctly pointed out that < should have been <=. Signed-off-by: Jeff Squyres (cherry picked from commit cc03d5b14ad7d30991f07a57dfb0f3d004bb631b) --- ompi/mpi/fortran/base/strings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mpi/fortran/base/strings.c b/ompi/mpi/fortran/base/strings.c index 63ea0db6f15..1016b850aaf 100644 --- a/ompi/mpi/fortran/base/strings.c +++ b/ompi/mpi/fortran/base/strings.c @@ -101,7 +101,7 @@ int ompi_fortran_string_c2f(const char *cstr, char *fstr, int len) // trailing \0 into the last position in fstr. This is not what // Fortran wants; overwrite that \0 with the actual last character // that will fit into fstr. - if (len < strlen(cstr)) { + if (len <= strlen(cstr)) { fstr[len - 1] = cstr[len - 1]; } else { // Otherwise, pad the end of the resulting Fortran string with From 9c7e2f49c4c3f9bb9c6911637e2e77aa1ec77a6b Mon Sep 17 00:00:00 2001 From: Philippe Blain Date: Wed, 27 Aug 2025 21:22:41 -0400 Subject: [PATCH 08/24] docs/mca.rst: fix MCA environment variable prefix for PRRTE The table added in 061f90860f (A variety of docs updates:, 2022-09-12) mentioning the different prefixes for Open MPI, PMIx and PRRTE MCA parameters set via environment variables has one too many "R"'s in 'PRRTE_MCA_': the correct prefix is 'PRTE_MCA_'. Fix that, and make it clear that it is not a typo. 
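
An illustrative sketch of the three prefixes in use from C before the
process initializes MPI (the parameter names below are placeholders,
not recommendations):

    #include <stdlib.h>

    /* Each project reads only its own prefix; note the single "R"
       in PRTE_MCA_. */
    setenv("OMPI_MCA_btl", "self,tcp", 1);
    setenv("PMIX_MCA_gds", "hash", 1);
    setenv("PRTE_MCA_prte_tmpdir_base", "/tmp", 1);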
Signed-off-by: Philippe Blain (cherry picked from commit bd9adb41c1644775f4792221fbc01f552d1d637c) --- docs/mca.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/mca.rst b/docs/mca.rst index ff640817806..3ab33d46fe0 100644 --- a/docs/mca.rst +++ b/docs/mca.rst @@ -252,13 +252,13 @@ shells): When setting PMIx- and PRRTE-specific MCA parameters via environment variables, use a different prefix: - +----------+----------------+ - | Open MPI | ``OMPI_MCA_`` | - +----------+----------------+ - | PMIx | ``PMIX_MCA_`` | - +----------+----------------+ - | PRRTE | ``PRRTE_MCA_`` | - +----------+----------------+ + +----------+-----------------------------------+ + | Open MPI | ``OMPI_MCA_`` | + +----------+-----------------------------------+ + | PMIx | ``PMIX_MCA_`` | + +----------+-----------------------------------+ + | PRRTE | ``PRTE_MCA_`` (with a single "R") | + +----------+-----------------------------------+ Tuning MCA parameter files ^^^^^^^^^^^^^^^^^^^^^^^^^^ From b357357c09915779eabd362f6857f61b41680329 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 4 Sep 2025 07:17:37 -0600 Subject: [PATCH 09/24] Update submodules to latest PMIx/PRRTE releases PMIx v5.0.9 PRRTE v3.0.12 Signed-off-by: Ralph Castain --- 3rd-party/openpmix | 2 +- 3rd-party/prrte | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rd-party/openpmix b/3rd-party/openpmix index 907b1ccaeec..a84ed686ae8 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit 907b1ccaeec61a1197f0ee5264d4fef20b257b84 +Subproject commit a84ed686ae84fb6a4b251b29b75ecc38f4621ad9 diff --git a/3rd-party/prrte b/3rd-party/prrte index 222f03fbb98..2e893392405 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 222f03fbb98b71abd293aa205b38fa9a38e57965 +Subproject commit 2e893392405afd914717a2c077accf1c1ec9ee55 From d0a7aff77ce929636368e2f687fcf15fd639a974 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Fri, 19 Sep 2025 14:40:21 -0600 Subject: [PATCH 10/24] Fix bug in MCA_PML_OB1_ADD_ACK_TO_PENDING that causes memory overruns or failure The MCA_PML_OB1_ADD_ACK_TO_PENDING method creates a mca_pml_ob1_pckt_pending_t to hold an ack to be sent later. This method builds the pending packet then puts it on the mca_pml_ob1.pckt_pending list for later transmission. It does not, however, set the required hdr_size field on the struct. This leads to issues when the packet is later sent because it could contain any value. With some btls this will lead to memory corruption (if the size is not checked against btl_max_send_size) or just allocation failure because the size is too big. In other situations it could lead to a truncated packet being send (if the size previously in hdr_size is smaller than an ack). To fix the issue this commit gets rid of the macro entirely and replaces it with a new inline helper method that does the same thing. This helper uses the existing mca_pml_ob1_add_to_pending helper (which sets hdr_size) to reduce duplicated code. Tested and verified this fixes a critical issue triggered on our hardware. 
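
A simplified, hypothetical sketch of the bug class being fixed here
(the struct and values are illustrative, not the real ob1 types):

    #include <stdint.h>
    #include <stdlib.h>

    typedef struct { int hdr_type; uint64_t hdr_size; } pkt_t;

    void sketch(void) {
        /* Buggy pattern: field-by-field stores into freshly allocated
           memory leave any forgotten field (here hdr_size) as garbage. */
        pkt_t *bad = malloc(sizeof(*bad));
        bad->hdr_type = 1;          /* hdr_size never written */

        /* Pattern the fix adopts: a designated initializer zero-fills
           every field that is not explicitly named. */
        pkt_t good = { .hdr_type = 1, .hdr_size = sizeof(good) };
        (void) good;
        free(bad);
    }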
Signed-off-by: Nathan Hjelm (cherry picked from commit 48490b95d636517c7a00e2f6c4301878e6cdd0c2) --- ompi/mca/pml/ob1/pml_ob1_recvreq.h | 34 ++++++++++++++---------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 402d3f4dcec..a266e3388bb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -407,23 +407,21 @@ static inline void mca_pml_ob1_recv_request_schedule( (void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl); } -#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \ - do { \ - mca_pml_ob1_pckt_pending_t *_pckt; \ - \ - MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \ - _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK; \ - _pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \ - _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \ - _pckt->hdr.hdr_ack.hdr_send_offset = (O); \ - _pckt->hdr.hdr_ack.hdr_send_size = (Sz); \ - _pckt->proc = (P); \ - _pckt->bml_btl = NULL; \ - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \ - opal_list_append(&mca_pml_ob1.pckt_pending, \ - (opal_list_item_t*)_pckt); \ - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); \ - } while(0) +static inline void mca_pml_ob1_add_ack_to_pending(ompi_proc_t *proc, uintptr_t src_req, void *dst_req, + uint64_t send_offset, uint64_t send_size) { + mca_pml_ob1_hdr_t hdr = { + .hdr_ack = { + .hdr_common = { .hdr_type = MCA_PML_OB1_HDR_TYPE_ACK }, + .hdr_src_req = { .lval = src_req }, + .hdr_dst_req = { .pval = dst_req }, + .hdr_send_offset = send_offset, + .hdr_send_size = send_size, + }, + }; + + mca_pml_ob1_add_to_pending(proc, /*bml_btl=*/NULL, /*order=*/0, + &hdr, sizeof(hdr.hdr_ack)); +} int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, @@ -455,7 +453,7 @@ mca_pml_ob1_recv_request_ack_send(mca_btl_base_module_t* btl, } } - MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req, + mca_pml_ob1_add_ack_to_pending(proc, hdr_src_req, hdr_dst_req, hdr_send_offset, size); return OMPI_ERR_OUT_OF_RESOURCE; From 2a08db2be13ddf8739c7ffdf06312a99cd4e145f Mon Sep 17 00:00:00 2001 From: Tomislav Janjusic Date: Tue, 23 Sep 2025 21:39:11 +0300 Subject: [PATCH 11/24] v5.0.x: prepare v5.0.9rc1 release - Update VERSION file to v5.0.9rc1 with correct date (23 September 2025) - Update NEWS with actual changes from v5.0.8 to v5.0.9rc1 including: * PMIx v5.0.9 and PRRTE v3.0.12 updates * GPFS 5.2.3-0+ support * OFI accelerator memory enhancements * Critical PML OB1 bug fix for memory overruns * Fortran string conversion fixes * Threading improvements * Various documentation and build system fixes Signed-off-by: Tomislav Janjusic --- VERSION | 4 ++-- docs/release-notes/changelog/v5.0.x.rst | 27 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index b53a821da8a..f37241ba12d 100644 --- a/VERSION +++ b/VERSION @@ -41,7 +41,7 @@ flex_min_version=2.5.4 # requirement is that it must be entirely printable ASCII characters # and have no white space. -greek=a1 +greek=rc1 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" @@ -62,7 +62,7 @@ tarball_version=gitclone # The date when this release was created -date="30 May 2025" +date="23 September 2025" # The shared library version of each of Open MPI's public libraries. 
# These versions are maintained in accordance with the "Library diff --git a/docs/release-notes/changelog/v5.0.x.rst b/docs/release-notes/changelog/v5.0.x.rst index a4b042d8831..460db24ecb9 100644 --- a/docs/release-notes/changelog/v5.0.x.rst +++ b/docs/release-notes/changelog/v5.0.x.rst @@ -4,6 +4,33 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. +Open MPI Version v5.0.9rc1 +------------------------------ +:Date: 23 September 2025 + +- Internal PMIx and PRRTe versions: + - PMIx (v5.0.9). Repo: ``https://github.com/openpmix/openpmix``. Commit hash: ``b357357c09915779eabd362f6857f61b41680329``. + - PRRTE (v3.0.12). Repo: ``https://github.com/openpmix/prrte``. Commit hash: ``b357357c09915779eabd362f6857f61b41680329``. + +- Functionality Enhancements + - GPFS: Added support for GPFS 5.2.3-0 and newer versions + - OFI: Enhanced accelerator memory support with proper rcache flag handling + - OFI: Added memory monitor export for better memory management + - ROCm: Added missing header for memcpy operations in accelerator component + +- Bug Fixes and Minor Enhancements + - PML OB1: Fixed critical bug in MCA_PML_OB1_ADD_ACK_TO_PENDING that could cause memory overruns or allocation failures + - Fortran: Fixed off-by-one string copy error in C2F string conversion + - Fortran: Fixed ompi string c2f conversion when Fortran string length is less than C string length + - Threading: Fixed OMPI_MPI_THREAD_LEVEL environment variable handling to allow useful overrides in threaded library use cases + - Threading: Enhanced OMPI_MPI_THREAD_LEVEL to accept both numeric (0-3) and string ('multiple', 'MPI_THREAD_MULTIPLE', etc.) values + - OSC: Fixed rdma component when not using ob1 PML + - S390x: Fixed alignment of opal_atomic_int128_t to be 16-byte aligned + - Configury: Improved Fortran complex(real16) testing and module file cleanup + - Documentation: Fixed MCA environment variable prefix documentation for PRRTE + - Documentation: Updated MPI_Init*/MPI_Finalize*/MPI_Session_* man pages with numerous improvements + - Build system: Removed whitespace from conftestval-style tests and cleaned up configuration + Open MPI Version v5.0.8 ------------------------------ :Date: 30 May 2025 From fa0a52b75399c697533a6b58129d253801665ef9 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Wed, 24 Sep 2025 19:25:28 +0300 Subject: [PATCH 12/24] OMPI/MCA/PML/UCX: Set node local id - v5.0.x Signed-off-by: Mikhail Brinskii --- config/ompi_check_ucx.m4 | 3 ++- ompi/mca/osc/ucx/osc_ucx_component.c | 5 +++++ ompi/mca/pml/ucx/pml_ucx.c | 5 +++++ oshmem/mca/spml/ucx/spml_ucx_component.c | 5 +++++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index 1da2455f1b7..8f609167f31 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -99,7 +99,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK, UCP_OP_ATTR_FLAG_MULTI_SEND, UCP_MEM_MAP_SYMMETRIC_RKEY, - UCS_MEMORY_TYPE_RDMA], + UCS_MEMORY_TYPE_RDMA, + UCP_PARAM_FIELD_NODE_LOCAL_ID], [], [], [#include ]) AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index e01968ae8ae..1196ba61936 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -270,6 +270,11 @@ static int ucp_context_init(bool enable_mt, int proc_world_size) { context_params.field_mask |= 
UCP_PARAM_FIELD_ESTIMATED_NUM_PPN; #endif +#if HAVE_DECL_UCP_PARAM_FIELD_NODE_LOCAL_ID + context_params.node_local_id = opal_process_info.my_local_rank; + context_params.field_mask |= UCP_PARAM_FIELD_NODE_LOCAL_ID; +#endif + status = ucp_init(&context_params, config, &mca_osc_ucx_component.wpool->ucp_ctx); if (UCS_OK != status) { OSC_UCX_VERBOSE(1, "ucp_init failed: %d", status); diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index dd16a27b154..91fb879c5b1 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -248,6 +248,11 @@ int mca_pml_ucx_open(void) params.field_mask |= UCP_PARAM_FIELD_ESTIMATED_NUM_PPN; #endif +#if HAVE_DECL_UCP_PARAM_FIELD_NODE_LOCAL_ID + params.node_local_id = opal_process_info.my_local_rank; + params.field_mask |= UCP_PARAM_FIELD_NODE_LOCAL_ID; +#endif + status = ucp_init(¶ms, config, &ompi_pml_ucx.ucp_context); ucp_config_release(config); diff --git a/oshmem/mca/spml/ucx/spml_ucx_component.c b/oshmem/mca/spml/ucx/spml_ucx_component.c index affc73f3f88..a57c987397d 100644 --- a/oshmem/mca/spml/ucx/spml_ucx_component.c +++ b/oshmem/mca/spml/ucx/spml_ucx_component.c @@ -292,6 +292,11 @@ static int spml_ucx_init(void) params.field_mask |= UCP_PARAM_FIELD_ESTIMATED_NUM_PPN; #endif +#if HAVE_DECL_UCP_PARAM_FIELD_NODE_LOCAL_ID + params.node_local_id = opal_process_info.my_local_rank; + params.field_mask |= UCP_PARAM_FIELD_NODE_LOCAL_ID; +#endif + err = ucp_init(¶ms, ucp_config, &mca_spml_ucx.ucp_context); ucp_config_release(ucp_config); if (UCS_OK != err) { From e3beb104187e184ac2a6f4ebcc018984305489bc Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Thu, 25 Sep 2025 15:46:59 +0200 Subject: [PATCH 13/24] COLL/UCC: set node local id Signed-off-by: Sergey Lebedev (cherry picked from commit 0caae60bc84517b3990391d8950e548072f94bd3) --- ompi/mca/coll/ucc/coll_ucc_module.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ompi/mca/coll/ucc/coll_ucc_module.c b/ompi/mca/coll/ucc/coll_ucc_module.c index 3cf73d4ee25..fd6df963633 100644 --- a/ompi/mca/coll/ucc/coll_ucc_module.c +++ b/ompi/mca/coll/ucc/coll_ucc_module.c @@ -291,6 +291,9 @@ static int mca_coll_ucc_init_ctx() { ucc_thread_mode_t tm_requested; ucc_lib_params_t lib_params; ucc_context_params_t ctx_params; + unsigned ucc_api_major, ucc_api_minor, ucc_api_patch; + + ucc_get_version(&ucc_api_major, &ucc_api_minor, &ucc_api_patch); tm_requested = ompi_mpi_thread_multiple ? UCC_THREAD_MULTIPLE : UCC_THREAD_SINGLE; @@ -354,6 +357,15 @@ static int mca_coll_ucc_init_ctx() { goto cleanup_lib; } + if (ucc_api_major > 1 || (ucc_api_major == 1 && ucc_api_minor >= 6)) { + sprintf(str_buf, "%u", opal_process_info.my_local_rank); + if (UCC_OK != ucc_context_config_modify(ctx_config, NULL, "NODE_LOCAL_ID", + str_buf)) { + UCC_ERROR("UCC context config modify failed for node_local_id"); + goto cleanup_lib; + } + } + if (UCC_OK != ucc_context_create(cm->ucc_lib, &ctx_params, ctx_config, &cm->ucc_context)) { UCC_ERROR("UCC context create failed"); From 384f57a8397ad49cdea6b345e3e3b0be5d0eac5c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 2 Oct 2025 02:16:57 +0300 Subject: [PATCH 14/24] Only pick one CUDA In some cases the CUDA install directory contains two libcuda.so and this breaks OMPI CUDA detection. Pick the first of these libraries seems to be a good soltuion for all cases. 
Signed-off-by: George Bosilca (cherry picked from commit c7e27b9c575374f53a1c1a8e93b347fd2c2b050e) --- config/opal_check_cuda.m4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index ebd70d59e32..a6bf80a1b2a 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -60,8 +60,8 @@ AC_ARG_WITH([cuda-libdir], [Search for CUDA libraries in DIR])], [], [AS_IF([test -d "$with_cuda"], - [with_cuda_libdir=$(dirname $(find -H $with_cuda -name libcuda.so 2> /dev/null) 2> /dev/null)], - [with_cuda_libdir=$(dirname $(find -H /usr/local/cuda -name libcuda.so 2> /dev/null) 2> /dev/null)]) + [with_cuda_libdir=$(dirname $(find -H $with_cuda -name libcuda.so 2> /dev/null | head -n 1) 2> /dev/null)], + [with_cuda_libdir=$(dirname $(find -H /usr/local/cuda -name libcuda.so 2> /dev/null) 2> /dev/null | head -n 1)]) ]) # Note that CUDA support is off by default. To turn it on, the user has to From b156808e01be80b520762dd06786e0181ff8b594 Mon Sep 17 00:00:00 2001 From: xbw <78337767+xbw22109@users.noreply.github.com> Date: Wed, 15 Oct 2025 05:14:43 +0000 Subject: [PATCH 15/24] Fix `see-also` errors in the document. Signed-off-by: xbw <78337767+xbw22109@users.noreply.github.com> (cherry picked from commit 0999325ad5f92a675c80acca3dfd603928bfebdb) --- docs/man-openmpi/man3/MPI_Recv_init.3.rst | 2 +- docs/man-openmpi/man3/MPI_Rsend_init.3.rst | 2 +- docs/man-openmpi/man3/MPI_Start.3.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/man-openmpi/man3/MPI_Recv_init.3.rst b/docs/man-openmpi/man3/MPI_Recv_init.3.rst index e47265348f7..cd44cfa63bc 100644 --- a/docs/man-openmpi/man3/MPI_Recv_init.3.rst +++ b/docs/man-openmpi/man3/MPI_Recv_init.3.rst @@ -92,7 +92,7 @@ ERRORS * :ref:`MPI_Bsend_init` * :ref:`MPI_Rsend_init` * :ref:`MPI_Send_init` - * MPI_Sssend_init + * :ref:`MPI_Ssend_init` * :ref:`MPI_Start` * :ref:`MPI_Startall` * :ref:`MPI_Request_free` diff --git a/docs/man-openmpi/man3/MPI_Rsend_init.3.rst b/docs/man-openmpi/man3/MPI_Rsend_init.3.rst index b76563e8b66..77a18d9b5a6 100644 --- a/docs/man-openmpi/man3/MPI_Rsend_init.3.rst +++ b/docs/man-openmpi/man3/MPI_Rsend_init.3.rst @@ -84,7 +84,7 @@ ERRORS .. seealso:: * :ref:`MPI_Bsend_init` * :ref:`MPI_Send_init` - * MPI_Sssend_init + * :ref:`MPI_Ssend_init` * :ref:`MPI_Recv_init` * :ref:`MPI_Start` * :ref:`MPI_Startall` diff --git a/docs/man-openmpi/man3/MPI_Start.3.rst b/docs/man-openmpi/man3/MPI_Start.3.rst index 1cb01efb6d8..16a49d4e288 100644 --- a/docs/man-openmpi/man3/MPI_Start.3.rst +++ b/docs/man-openmpi/man3/MPI_Start.3.rst @@ -90,6 +90,6 @@ ERRORS * :ref:`MPI_Bsend_init` * :ref:`MPI_Rsend_init` * :ref:`MPI_Send_init` - * MPI_Sssend_init + * :ref:`MPI_Ssend_init` * :ref:`MPI_Recv_init` * :ref:`MPI_Startall` From 314d738b3c48e03b13b466d225f24de79372688a Mon Sep 17 00:00:00 2001 From: charlesgwaldman <120225331+charlesgwaldman@users.noreply.github.com> Date: Tue, 14 Oct 2025 15:29:11 -0500 Subject: [PATCH 16/24] Update history.rst (spelling) Signed-off-by: charlesgwaldman <120225331+charlesgwaldman@users.noreply.github.com> (cherry picked from commit a6b8cd3c98587f3b8fbccc4fe3e592a41e17f7cc) --- docs/history.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/history.rst b/docs/history.rst index 8cb1c38eb3d..55fa867b716 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -7,7 +7,7 @@ Open MPI represents the merger of three prior MPI implementations: center and later migrated to the University of Notre Dame. #. 
LA-MPI: from the US Department of Energy Los Alamos National Laboratory. -#. FT-MPI: from the University of Tennassee at Knoxville. One of the +#. FT-MPI: from the University of Tennessee at Knoxville. One of the UTK developers moved back to the University of Stuttgart in late 2004, which effectively added their team into the project. From 8050607031880b3d171d336cbba43fe2af965ff6 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Wed, 15 Oct 2025 09:28:49 -0400 Subject: [PATCH 17/24] NVIDIA github workflows: use unique workflow names Use unique, NVIDIA-specific workflow names so that it's easier to identify these workflows on the github dashboard backend. Signed-off-by: Jeff Squyres (cherry picked from commit dcac103c7e7917c55433e000133dde2ab1e05798) --- .github/workflows/ompi_nvidia.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ompi_nvidia.yaml b/.github/workflows/ompi_nvidia.yaml index 6a3201a3648..ed850df454c 100644 --- a/.github/workflows/ompi_nvidia.yaml +++ b/.github/workflows/ompi_nvidia.yaml @@ -2,7 +2,7 @@ name: ompi_NVIDIA CI on: [pull_request] jobs: - deployment: + nvidia_deployment: if: github.repository == 'open-mpi/ompi' runs-on: [self-hosted, linux, x64, nvidia] steps: @@ -17,25 +17,25 @@ jobs: path: ompi_ci - name: Deployment infrastructure run: /start deploy - build: - needs: [deployment] + nvidia_build: + needs: [nvidia_deployment] runs-on: [self-hosted, linux, x64, nvidia] steps: - name: Building OMPI,UCX and tests run: /start build - test: - needs: [deployment, build] + nvidia_test: + needs: [nvidia_deployment, nvidia_build] runs-on: [self-hosted, linux, x64, nvidia] steps: - name: Running tests run: /start test - clean: -# always() should be used to run "clean" even when the workflow was canceled + nvidia_clean: +# always() should be used to run "clean" even when the workflow was canceled # ( in case of the right repository name) # The second condition doesn't work when the workflow was canceled if: always() && (github.repository == 'open-mpi/ompi') - needs: [deployment, build, test] + needs: [nvidia_deployment, nvidia_build, nvidia_test] runs-on: [self-hosted, linux, x64, nvidia] steps: - name: Cleaning From 37bf4483dee0c122159983e0d52c7205b12bfd18 Mon Sep 17 00:00:00 2001 From: Tomislav Janjusic Date: Fri, 10 Oct 2025 17:14:00 +0300 Subject: [PATCH 18/24] v5.0.x: prepare v5.0.9rc2 release Signed-off-by: Tomislav Janjusic --- VERSION | 4 ++-- docs/release-notes/changelog/v5.0.x.rst | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index f37241ba12d..dd4245eef57 100644 --- a/VERSION +++ b/VERSION @@ -41,7 +41,7 @@ flex_min_version=2.5.4 # requirement is that it must be entirely printable ASCII characters # and have no white space. -greek=rc1 +greek=rc2 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always" @@ -62,7 +62,7 @@ tarball_version=gitclone # The date when this release was created -date="23 September 2025" +date="15 October 2025" # The shared library version of each of Open MPI's public libraries. 
# These versions are maintained in accordance with the "Library diff --git a/docs/release-notes/changelog/v5.0.x.rst b/docs/release-notes/changelog/v5.0.x.rst index 460db24ecb9..ee5f6f49131 100644 --- a/docs/release-notes/changelog/v5.0.x.rst +++ b/docs/release-notes/changelog/v5.0.x.rst @@ -4,6 +4,21 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. +Open MPI Version v5.0.9rc2 +------------------------------ +:Date: 10 October 2025 + +- Internal PMIx and PRRTe versions: + - PMIx (v5.0.9). Repo: ``https://github.com/openpmix/openpmix``. Commit hash: ``b357357c09915779eabd362f6857f61b41680329``. + - PRRTE (v3.0.12). Repo: ``https://github.com/openpmix/prrte``. Commit hash: ``b357357c09915779eabd362f6857f61b41680329``. + +- Functionality Enhancements + - UCC: Set node local id for better collective operation performance + - UCX: Set node local id in PML component for improved communication + +- Bug Fixes and Minor Enhancements + - CUDA: Fixed CUDA detection when multiple libcuda.so files exist in CUDA install directory + Open MPI Version v5.0.9rc1 ------------------------------ :Date: 23 September 2025 From 8d88926356a3c9820f4a67c52a9b7b05cfa0efaf Mon Sep 17 00:00:00 2001 From: Kento Hasegawa Date: Wed, 15 Oct 2025 16:49:23 +0900 Subject: [PATCH 19/24] COLL/UCC: Fix initialization in non-blocking and persistent Signed-off-by: Kento Hasegawa (cherry picked from commit 4b1b9a9c947b7cfad8389ebec52e74788747abe8) --- .mailmap | 2 ++ ompi/mca/coll/ucc/coll_ucc_allgather.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_allgatherv.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_allreduce.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_alltoall.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_alltoallv.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_barrier.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_bcast.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_common.h | 4 +++- ompi/mca/coll/ucc/coll_ucc_gather.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_gatherv.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_reduce.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_reduce_scatter.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_reduce_scatter_block.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_scatter.c | 3 ++- ompi/mca/coll/ucc/coll_ucc_scatterv.c | 3 ++- 16 files changed, 33 insertions(+), 15 deletions(-) diff --git a/.mailmap b/.mailmap index 42895b1ddd6..17c977feab8 100644 --- a/.mailmap +++ b/.mailmap @@ -137,3 +137,5 @@ George Katevenis Brian Barrett Andrii Bilokur B-a-S + +Kento Hasegawa hasegawa.kento diff --git a/ompi/mca/coll/ucc/coll_ucc_allgather.c b/ompi/mca/coll/ucc/coll_ucc_allgather.c index c80aebb2a2c..08eb6eeb5e8 100644 --- a/ompi/mca/coll/ucc/coll_ucc_allgather.c +++ b/ompi/mca/coll/ucc/coll_ucc_allgather.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -98,7 +99,7 @@ int mca_coll_ucc_iallgather(const void *sbuf, int scount, struct ompi_datatype_t mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc iallgather"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_allgather_init(sbuf, scount, sdtype, rbuf, rcount, rdtype, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_allgatherv.c b/ompi/mca/coll/ucc/coll_ucc_allgatherv.c index 1a3ba27f053..64ff9856c58 100644 --- a/ompi/mca/coll/ucc/coll_ucc_allgatherv.c +++ b/ompi/mca/coll/ucc/coll_ucc_allgatherv.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -99,7 +100,7 @@ int mca_coll_ucc_iallgatherv(const void *sbuf, int scount, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc iallgatherv"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_allgatherv_init(sbuf, scount, sdtype, rbuf, rcounts, rdisps, rdtype, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_allreduce.c b/ompi/mca/coll/ucc/coll_ucc_allreduce.c index 3ed8e8cc372..5eb2793985a 100644 --- a/ompi/mca/coll/ucc/coll_ucc_allreduce.c +++ b/ompi/mca/coll/ucc/coll_ucc_allreduce.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -89,7 +90,7 @@ int mca_coll_ucc_iallreduce(const void *sbuf, void *rbuf, int count, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc iallreduce"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_allreduce_init(sbuf, rbuf, count, dtype, op, ucc_module, &req, coll_req)); COLL_UCC_POST_AND_CHECK(req); diff --git a/ompi/mca/coll/ucc/coll_ucc_alltoall.c b/ompi/mca/coll/ucc/coll_ucc_alltoall.c index 1fce7b1f733..26700932bf2 100644 --- a/ompi/mca/coll/ucc/coll_ucc_alltoall.c +++ b/ompi/mca/coll/ucc/coll_ucc_alltoall.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -97,7 +98,7 @@ int mca_coll_ucc_ialltoall(const void *sbuf, int scount, struct ompi_datatype_t mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc ialltoall"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_alltoall_init(sbuf, scount, sdtype, rbuf, rcount, rdtype, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_alltoallv.c b/ompi/mca/coll/ucc/coll_ucc_alltoallv.c index 53fd0cfa4d7..82ee126309e 100644 --- a/ompi/mca/coll/ucc/coll_ucc_alltoallv.c +++ b/ompi/mca/coll/ucc/coll_ucc_alltoallv.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -100,7 +101,7 @@ int mca_coll_ucc_ialltoallv(const void *sbuf, const int *scounts, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc ialltoallv"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_alltoallv_init(sbuf, scounts, sdisps, sdtype, rbuf, rcounts, rdisps, rdtype, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_barrier.c b/ompi/mca/coll/ucc/coll_ucc_barrier.c index 9790fffc2f9..a8cc72eaf95 100644 --- a/ompi/mca/coll/ucc/coll_ucc_barrier.c +++ b/ompi/mca/coll/ucc/coll_ucc_barrier.c @@ -1,5 +1,6 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,7 +49,7 @@ int mca_coll_ucc_ibarrier(struct ompi_communicator_t *comm, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc ibarrier"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_barrier_init(ucc_module, &req, coll_req)); COLL_UCC_POST_AND_CHECK(req); *request = &coll_req->super; diff --git a/ompi/mca/coll/ucc/coll_ucc_bcast.c b/ompi/mca/coll/ucc/coll_ucc_bcast.c index fb80fb03f8a..8b3f5593de9 100644 --- a/ompi/mca/coll/ucc/coll_ucc_bcast.c +++ b/ompi/mca/coll/ucc/coll_ucc_bcast.c @@ -1,5 +1,6 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -66,7 +67,7 @@ int mca_coll_ucc_ibcast(void *buf, int count, struct ompi_datatype_t *dtype, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc ibcast"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_bcast_init(buf, count, dtype, root, ucc_module, &req, coll_req)); COLL_UCC_POST_AND_CHECK(req); diff --git a/ompi/mca/coll/ucc/coll_ucc_common.h b/ompi/mca/coll/ucc/coll_ucc_common.h index 9d9163aa46d..7f9b3c84c62 100644 --- a/ompi/mca/coll/ucc/coll_ucc_common.h +++ b/ompi/mca/coll/ucc/coll_ucc_common.h @@ -1,5 +1,6 @@ /** Copyright (c) 2021 Mellanox Technologies. All rights reserved. + Copyright (c) 2025 Fujitsu Limited. All rights reserved. $COPYRIGHT$ Additional copyrights may follow $HEADER$ @@ -25,7 +26,7 @@ } \ } while(0) -#define COLL_UCC_GET_REQ(_coll_req) do { \ +#define COLL_UCC_GET_REQ(_coll_req, _comm) do { \ opal_free_list_item_t *item; \ item = opal_free_list_wait (&mca_coll_ucc_component.requests); \ if (OPAL_UNLIKELY(NULL == item)) { \ @@ -40,6 +41,7 @@ _coll_req->super.req_state = OMPI_REQUEST_ACTIVE; \ _coll_req->super.req_free = mca_coll_ucc_req_free; \ _coll_req->super.req_type = OMPI_REQUEST_COLL; \ + _coll_req->super.req_mpi_object.comm = _comm; \ } while(0) #define COLL_UCC_REQ_INIT(_coll_req, _req, _coll, _module) do{ \ diff --git a/ompi/mca/coll/ucc/coll_ucc_gather.c b/ompi/mca/coll/ucc/coll_ucc_gather.c index 8ede6a58e58..c8cf40908a3 100644 --- a/ompi/mca/coll/ucc/coll_ucc_gather.c +++ b/ompi/mca/coll/ucc/coll_ucc_gather.c @@ -2,6 +2,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -114,7 +115,7 @@ int mca_coll_ucc_igather(const void *sbuf, int scount, struct ompi_datatype_t *s mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc igather"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_gather_init(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_gatherv.c b/ompi/mca/coll/ucc/coll_ucc_gatherv.c index 13049a76e0f..4cd9b651d4e 100644 --- a/ompi/mca/coll/ucc/coll_ucc_gatherv.c +++ b/ompi/mca/coll/ucc/coll_ucc_gatherv.c @@ -2,6 +2,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -106,7 +107,7 @@ int mca_coll_ucc_igatherv(const void *sbuf, int scount, struct ompi_datatype_t * mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc igatherv"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_gatherv_init(sbuf, scount, sdtype, rbuf, rcounts, disps, rdtype, root, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_reduce.c b/ompi/mca/coll/ucc/coll_ucc_reduce.c index 0de4b2ff421..3595a448d35 100644 --- a/ompi/mca/coll/ucc/coll_ucc_reduce.c +++ b/ompi/mca/coll/ucc/coll_ucc_reduce.c @@ -1,5 +1,6 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -92,7 +93,7 @@ int mca_coll_ucc_ireduce(const void *sbuf, void* rbuf, int count, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc ireduce"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_reduce_init(sbuf, rbuf, count, dtype, op, root, ucc_module, &req, coll_req)); COLL_UCC_POST_AND_CHECK(req); diff --git a/ompi/mca/coll/ucc/coll_ucc_reduce_scatter.c b/ompi/mca/coll/ucc/coll_ucc_reduce_scatter.c index 93a9b295ac0..a0a33f3c1ac 100644 --- a/ompi/mca/coll/ucc/coll_ucc_reduce_scatter.c +++ b/ompi/mca/coll/ucc/coll_ucc_reduce_scatter.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -103,7 +104,7 @@ int mca_coll_ucc_ireduce_scatter(const void *sbuf, void *rbuf, const int *rcount mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc ireduce_scatter"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_reduce_scatter_init(sbuf, rbuf, rcounts, dtype, op, ucc_module, &req, coll_req)); COLL_UCC_POST_AND_CHECK(req); diff --git a/ompi/mca/coll/ucc/coll_ucc_reduce_scatter_block.c b/ompi/mca/coll/ucc/coll_ucc_reduce_scatter_block.c index e9352d669e2..ebc22e5a409 100644 --- a/ompi/mca/coll/ucc/coll_ucc_reduce_scatter_block.c +++ b/ompi/mca/coll/ucc/coll_ucc_reduce_scatter_block.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -98,7 +99,7 @@ int mca_coll_ucc_ireduce_scatter_block(const void *sbuf, void *rbuf, int rcount, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc ireduce_scatter_block"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_reduce_scatter_block_init(sbuf, rbuf, rcount, dtype, op, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_scatter.c b/ompi/mca/coll/ucc/coll_ucc_scatter.c index 548ce290bdf..c31fcb04ec4 100644 --- a/ompi/mca/coll/ucc/coll_ucc_scatter.c +++ b/ompi/mca/coll/ucc/coll_ucc_scatter.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -117,7 +118,7 @@ int mca_coll_ucc_iscatter(const void *sbuf, int scount, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc iscatter"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_scatter_init(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, ucc_module, &req, coll_req)); diff --git a/ompi/mca/coll/ucc/coll_ucc_scatterv.c b/ompi/mca/coll/ucc/coll_ucc_scatterv.c index 738aa14a953..121c894179e 100644 --- a/ompi/mca/coll/ucc/coll_ucc_scatterv.c +++ b/ompi/mca/coll/ucc/coll_ucc_scatterv.c @@ -1,6 +1,7 @@ /** * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Fujitsu Limited. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -110,7 +111,7 @@ int mca_coll_ucc_iscatterv(const void *sbuf, const int *scounts, mca_coll_ucc_req_t *coll_req = NULL; UCC_VERBOSE(3, "running ucc iscatterv"); - COLL_UCC_GET_REQ(coll_req); + COLL_UCC_GET_REQ(coll_req, comm); COLL_UCC_CHECK(mca_coll_ucc_scatterv_init(sbuf, scounts, disps, sdtype, rbuf, rcount, rdtype, root, ucc_module, &req, coll_req)); From 6fe1db16b7ff0037079eaff5cc39466b49d68cf6 Mon Sep 17 00:00:00 2001 From: Tomislav Janjusic Date: Thu, 30 Oct 2025 20:31:47 +0200 Subject: [PATCH 20/24] v5.0.9: consolidate news. Signed-off-by: Tomislav Janjusic --- docs/release-notes/changelog/v5.0.x.rst | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/docs/release-notes/changelog/v5.0.x.rst b/docs/release-notes/changelog/v5.0.x.rst index ee5f6f49131..0c966e4c0f0 100644 --- a/docs/release-notes/changelog/v5.0.x.rst +++ b/docs/release-notes/changelog/v5.0.x.rst @@ -4,24 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI Version v5.0.9rc2 +Open MPI Version v5.0.9 ------------------------------ -:Date: 10 October 2025 - -- Internal PMIx and PRRTe versions: - - PMIx (v5.0.9). Repo: ``https://github.com/openpmix/openpmix``. Commit hash: ``b357357c09915779eabd362f6857f61b41680329``. - - PRRTE (v3.0.12). Repo: ``https://github.com/openpmix/prrte``. Commit hash: ``b357357c09915779eabd362f6857f61b41680329``. 
- -- Functionality Enhancements - - UCC: Set node local id for better collective operation performance - - UCX: Set node local id in PML component for improved communication - -- Bug Fixes and Minor Enhancements - - CUDA: Fixed CUDA detection when multiple libcuda.so files exist in CUDA install directory - -Open MPI Version v5.0.9rc1 ------------------------------- -:Date: 23 September 2025 +:Date: 30 October 2025 - Internal PMIx and PRRTe versions: - PMIx (v5.0.9). Repo: ``https://github.com/openpmix/openpmix``. Commit hash: ``b357357c09915779eabd362f6857f61b41680329``. @@ -32,9 +17,12 @@ Open MPI Version v5.0.9rc1 - OFI: Enhanced accelerator memory support with proper rcache flag handling - OFI: Added memory monitor export for better memory management - ROCm: Added missing header for memcpy operations in accelerator component + - UCC: Set node local id for better collective operation performance + - UCX: Set node local id in PML component for improved communication - Bug Fixes and Minor Enhancements - PML OB1: Fixed critical bug in MCA_PML_OB1_ADD_ACK_TO_PENDING that could cause memory overruns or allocation failures + - CUDA: Fixed CUDA detection when multiple libcuda.so files exist in CUDA install directory - Fortran: Fixed off-by-one string copy error in C2F string conversion - Fortran: Fixed ompi string c2f conversion when Fortran string length is less than C string length - Threading: Fixed OMPI_MPI_THREAD_LEVEL environment variable handling to allow useful overrides in threaded library use cases From 655d6610bd74c6c2d073ba989fef3c75db8ea4a3 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 14 Oct 2025 15:33:21 -0700 Subject: [PATCH 21/24] btl/ofi: Set domain threading model based on MPI thread support Signed-off-by: Jessie Yang (cherry picked from commit f65f900bbbd044fa17e9153adb3bdf2906d2b28d) --- opal/mca/btl/ofi/btl_ofi_component.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 110004f8094..5a0baee44fd 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -339,6 +339,12 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, domain_attr.control_progress = progress_mode; domain_attr.data_progress = progress_mode; + if (enable_mpi_threads) { + domain_attr.threading = FI_THREAD_SAFE; + } else { + domain_attr.threading = FI_THREAD_DOMAIN; + } + /* select endpoint type */ ep_attr.type = FI_EP_RDM; From 84c0e93ba4102d9ff8def25d317243764f35feb6 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 14 Oct 2025 15:35:18 -0700 Subject: [PATCH 22/24] btl/ofi: Add FI_COMPLETION flag to tx and rx attributes Add FI_COMPLETION flag to ensure completion entries are generated for all data transfer operations. 
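As an illustrative aside (a sketch, not part of the change itself):
under libfabric's selective-completion model, an operation only
produces a completion queue entry when FI_COMPLETION is requested,
either per call or through the endpoint's default op_flags. Setting the
flag in the default tx/rx attributes, roughly as below, makes
completions unconditional; the helper name is hypothetical, while the
flags and attribute structures are the standard libfabric ones.

    #include <rdma/fabric.h>
    #include <rdma/fi_endpoint.h>

    /* Hypothetical helper: request completion entries for every data
     * transfer operation by default. FI_DELIVERY_COMPLETE is kept so
     * that transmit completions still imply delivery to the target. */
    static void set_default_completion_flags(struct fi_tx_attr *tx_attr,
                                             struct fi_rx_attr *rx_attr)
    {
        tx_attr->op_flags = FI_DELIVERY_COMPLETE | FI_COMPLETION;
        rx_attr->op_flags = FI_COMPLETION;
    }
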
Signed-off-by: Jessie Yang (cherry picked from commit 15fe24645cd21c1c99f5ce7796e86344f442971d) --- opal/mca/btl/ofi/btl_ofi_component.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 5a0baee44fd..6c0af9be3b0 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -365,7 +365,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, tx_attr.iov_limit = 1; rx_attr.iov_limit = 1; - tx_attr.op_flags = FI_DELIVERY_COMPLETE; + tx_attr.op_flags = FI_DELIVERY_COMPLETE | FI_COMPLETION; + rx_attr.op_flags = FI_COMPLETION; mca_btl_ofi_component.module_count = 0; From 29382f346b634c4af58ca188fa13f7da496437ea Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 7 Oct 2025 23:52:01 +0000 Subject: [PATCH 23/24] ofi: Share domain between MTL and BTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Share the domain between the MTL and BTL layers to reduce the total number of domains created. This helps avoid hitting system resource limits on platforms with high core counts. Instead of having the common code allocate a single domain with the superset of all required capabilities, we attempt to reuse an existing fabric and domain if the providers can support MTL’s and BTL’s different capability sets. This approach allows providers that support domain sharing to reuse resources efficiently while still preserving flexibility. If the providers cannot reuse the fabric and domain due to incompatible requirements, separate domains will be created as before. Signed-off-by: Jessie Yang (cherry picked from commit 69d273793dfb5e26fe93e2e3de58d511cb35b3f1) --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 35 ++++-- opal/mca/btl/ofi/btl_ofi_component.c | 17 ++- opal/mca/btl/ofi/btl_ofi_module.c | 4 +- opal/mca/common/ofi/common_ofi.c | 161 ++++++++++++++++++++++++++- opal/mca/common/ofi/common_ofi.h | 57 +++++++++- 5 files changed, 253 insertions(+), 21 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 0ca9b31aad7..c26b6118195 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -694,6 +694,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, } hints->domain_attr->resource_mgmt = FI_RM_ENABLED; + hints->domain_attr->domain = opal_common_ofi.domain; + hints->fabric_attr->fabric = opal_common_ofi.fabric; /** * The EFA provider in Libfabric versions prior to 1.10 contains a bug @@ -715,10 +717,16 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints_dup->fabric_attr->prov_name = strdup("efa"); ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers); + if (FI_ENODATA == -ret && (hints_dup->fabric_attr->fabric || hints_dup->domain_attr->domain)) { + /* Retry without fabric and domain */ + hints_dup->fabric_attr->fabric = NULL; + hints_dup->domain_attr->domain = NULL; + ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers); + } if (FI_ENOSYS == -ret) { /* libfabric is not new enough, fallback to use older version of API */ ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints_dup, &providers); - } + } opal_output_verbose(1, opal_common_ofi.output, "%s:%d: EFA specific fi_getinfo(): %s\n", @@ -756,6 +764,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, 0ULL, /* Optional flag */ hints, /* In: Hints to filter providers */ &providers); 
/* Out: List of matching providers */ + if (FI_ENODATA == -ret && (hints->fabric_attr->fabric || hints->domain_attr->domain)) { + hints->fabric_attr->fabric = NULL; + hints->domain_attr->domain = NULL; + ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints, &providers); + } if (FI_ENOSYS == -ret) { ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints, &providers); } @@ -972,9 +985,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, * instantiate the virtual or physical network. This opens a "fabric * provider". See man fi_fabric for details. */ - ret = fi_fabric(prov->fabric_attr, /* In: Fabric attributes */ - &ompi_mtl_ofi.fabric, /* Out: Fabric handle */ - NULL); /* Optional context for fabric events */ + ret = opal_common_ofi_fi_fabric(prov->fabric_attr, /* In: Fabric attributes */ + &ompi_mtl_ofi.fabric); /* Out: Fabric handle */ if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_fabric", @@ -988,10 +1000,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, * hardware port/collection of ports. Returns a domain object that can be * used to create endpoints. See man fi_domain for details. */ - ret = fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */ - prov, /* In: Provider */ - &ompi_mtl_ofi.domain, /* Out: Domain object */ - NULL); /* Optional context for domain events */ + ret = opal_common_ofi_fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */ + prov, /* In: Provider */ + &ompi_mtl_ofi.domain); /* Out: Domain object */ if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_domain", @@ -1158,10 +1169,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, (void) fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq); } if (ompi_mtl_ofi.domain) { - (void) fi_close((fid_t)ompi_mtl_ofi.domain); + (void) opal_common_ofi_domain_release(ompi_mtl_ofi.domain); } if (ompi_mtl_ofi.fabric) { - (void) fi_close((fid_t)ompi_mtl_ofi.fabric); + (void) opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric); } if (ompi_mtl_ofi.comm_to_context) { free(ompi_mtl_ofi.comm_to_context); @@ -1209,11 +1220,11 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) } } - if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) { + if ((ret = opal_common_ofi_domain_release(ompi_mtl_ofi.domain))) { goto finalize_err; } - if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) { + if ((ret = opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric))) { goto finalize_err; } diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 6c0af9be3b0..e0a16848da1 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -379,9 +379,18 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, no_hmem: #endif + hints.fabric_attr->fabric = opal_common_ofi.fabric; + hints.domain_attr->domain = opal_common_ofi.domain; + /* Do the query. The earliest version that supports FI_HMEM hints is 1.9. 
* The earliest version the explictly allow provider to call CUDA API is 1.18 */ rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list); + if (FI_ENODATA == -rc && (hints.fabric_attr->fabric || hints.domain_attr->domain)) { + /* Retry without fabric and domain */ + hints.fabric_attr->fabric = NULL; + hints.domain_attr->domain = NULL; + rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list); + } if (FI_ENOSYS == -rc) { rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list); } @@ -560,14 +569,14 @@ static int mca_btl_ofi_init_device(struct fi_info *info) ("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name)); /* fabric */ - rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL); + rc = opal_common_ofi_fi_fabric(ofi_info->fabric_attr, &fabric); if (0 != rc) { BTL_VERBOSE(("%s failed fi_fabric with err=%s", linux_device_name, fi_strerror(-rc))); goto fail; } /* domain */ - rc = fi_domain(fabric, ofi_info, &domain, NULL); + rc = opal_common_ofi_fi_domain(fabric, ofi_info, &domain); if (0 != rc) { BTL_VERBOSE(("%s failed fi_domain with err=%s", linux_device_name, fi_strerror(-rc))); goto fail; @@ -750,11 +759,11 @@ static int mca_btl_ofi_init_device(struct fi_info *info) } if (NULL != domain) { - fi_close(&domain->fid); + opal_common_ofi_domain_release(domain); } if (NULL != fabric) { - fi_close(&fabric->fid); + opal_common_ofi_fabric_release(fabric); } free(module); diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index 330ebbae66c..17c4d281a3d 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -380,11 +380,11 @@ int mca_btl_ofi_finalize(mca_btl_base_module_t *btl) } if (NULL != ofi_btl->domain) { - fi_close(&ofi_btl->domain->fid); + opal_common_ofi_domain_release(ofi_btl->domain); } if (NULL != ofi_btl->fabric) { - fi_close(&ofi_btl->fabric->fid); + opal_common_ofi_fabric_release(ofi_btl->fabric); } if (NULL != ofi_btl->fabric_info) { diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 82a331e2527..7624727eb98 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -6,7 +6,7 @@ * reserved. * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. - * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights + * Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights * reserved. * Copyright (c) 2023 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -40,7 +40,11 @@ opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, .prov_exclude = NULL, - .output = -1}; + .output = -1, + .fabric = NULL, + .domain = NULL, + .fabric_ref_count = 0, + .domain_ref_count = 0}; static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic,net"; static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; static int opal_common_ofi_verbose_level = 0; @@ -1037,3 +1041,156 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add } return ret; } + +/** + * Get or create fabric object + * + * Reuses existing fabric from fabric_attr->fabric if available, + * otherwise creates new fabric using fi_fabric(). 
+ * + * @param fabric_attr (IN) Fabric attributes + * @param fabric (OUT) Fabric object (new or existing) + * + * @return OPAL_SUCCESS or error code + */ +int opal_common_ofi_fi_fabric(struct fi_fabric_attr *fabric_attr, + struct fid_fabric **fabric) +{ + int ret; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (fabric_attr->fabric) { + *fabric = fabric_attr->fabric; + opal_common_ofi.fabric_ref_count++; + opal_output_verbose(1, opal_common_ofi.output, "Reusing existing fabric: %s", + fabric_attr->name); + } else { + ret = fi_fabric(fabric_attr, fabric, NULL); + if (0 != ret) { + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; + } + opal_common_ofi.fabric = *fabric; + opal_common_ofi.fabric_ref_count = 1; + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return OPAL_SUCCESS; +} + +/** + * Get or create domain object + * + * Reuses existing domain from info->domain_attr->domain if available, + * otherwise creates new domain using fi_domain(). + * + * @param fabric (IN) Fabric object + * @param info (IN) Provider info + * @param domain (OUT) Domain object (new or existing) + * + * @return OPAL_SUCCESS or OPAL error code + */ +int opal_common_ofi_fi_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain) +{ + int ret; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (info->domain_attr->domain) { + *domain = info->domain_attr->domain; + opal_common_ofi.domain_ref_count++; + opal_output_verbose(1, opal_common_ofi.output, "Reusing existing domain: %s", + info->domain_attr->name); + } else { + ret = fi_domain(fabric, info, domain, NULL); + if (0 != ret) { + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; + } + opal_common_ofi.domain = *domain; + opal_common_ofi.domain_ref_count = 1; + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return OPAL_SUCCESS; +} + +/** + * Release fabric reference + * + * Decrements fabric reference count and closes fabric if count reaches zero. + * + * @param fabric (IN) Fabric object to release + * + * @return OPAL_SUCCESS or error code + */ +int opal_common_ofi_fabric_release(struct fid_fabric *fabric) +{ + int ret = OPAL_SUCCESS; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (fabric == opal_common_ofi.fabric && opal_common_ofi.fabric_ref_count > 0) { + opal_common_ofi.fabric_ref_count--; + if (opal_common_ofi.fabric_ref_count == 0) { + ret = fi_close(&fabric->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for fabric: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + opal_common_ofi.fabric = NULL; + } + } else { + ret = fi_close(&fabric->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for fabric: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; +} + +/** + * Release domain reference + * + * Decrements domain reference count and closes domain if count reaches zero. 
+ * + * @param domain (IN) Domain object to release + * + * @return OPAL_SUCCESS or error code + */ +int opal_common_ofi_domain_release(struct fid_domain *domain) +{ + int ret = OPAL_SUCCESS; + + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (domain == opal_common_ofi.domain && opal_common_ofi.domain_ref_count > 0) { + opal_common_ofi.domain_ref_count--; + if (opal_common_ofi.domain_ref_count == 0) { + ret = fi_close(&domain->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for domain: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + opal_common_ofi.domain = NULL; + } + } else { + ret = fi_close(&domain->fid); + if (0 != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: fi_close failed for domain: %s (%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret); + } + } + + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); + return ret; +} diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 3deeb0c63ec..4357840604f 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -5,7 +5,7 @@ * reserved. * Copyright (c) 2020-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights + * Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights * reserved. * * $COPYRIGHT$ @@ -30,6 +30,10 @@ typedef struct opal_common_ofi_module { char **prov_include; char **prov_exclude; int output; + struct fid_fabric *fabric; + struct fid_domain *domain; + int fabric_ref_count; + int domain_ref_count; } opal_common_ofi_module_t; /** @@ -223,6 +227,57 @@ OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *pr */ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *addrlen); +/** + * Get or create fabric object + * + * Reuses existing fabric from fabric_attr->fabric if available, + * otherwise creates new fabric using fi_fabric(). + * + * @param fabric_attr (IN) Fabric attributes + * @param fabric (OUT) Fabric object (new or existing) + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fi_fabric(struct fi_fabric_attr *fabric_attr, + struct fid_fabric **fabric); + +/** + * Get or create domain object + * + * Reuses existing domain from info->domain_attr->domain if available, + * otherwise creates new domain using fi_domain(). + * + * @param fabric (IN) Fabric object + * @param info (IN) Provider info + * @param domain (OUT) Domain object (new or existing) + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fi_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain); + +/** + * Release fabric reference + * + * Decrements fabric reference count and closes fabric if count reaches zero. + * + * @param fabric (IN) Fabric object to release + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fabric_release(struct fid_fabric *fabric); + +/** + * Release domain reference + * + * Decrements domain reference count and closes domain if count reaches zero. 
+ * + * @param domain (IN) Domain object to release + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_domain_release(struct fid_domain *domain); + END_C_DECLS #endif /* OPAL_MCA_COMMON_OFI_H */ From 22b7e2eed7f140602a2fffb461957f01a4d8bd2a Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 24 Nov 2025 14:16:05 -0800 Subject: [PATCH 24/24] dist: Prep for 5.0.9amzn1 release Signed-off-by: Brian Barrett --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index dd4245eef57..6e355fb0ea8 100644 --- a/VERSION +++ b/VERSION @@ -41,7 +41,7 @@ flex_min_version=2.5.4 # requirement is that it must be entirely printable ASCII characters # and have no white space. -greek=rc2 +greek=amzn1 # If repo_rev is empty, then the repository version number will be # obtained during "make dist" via the "git describe --tags --always"
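
For PATCH 23 above, a minimal usage sketch of the shared fabric/domain
helpers, assuming only the functions that patch adds
(opal_common_ofi_fi_fabric(), opal_common_ofi_fi_domain(), and their
release counterparts); the wrapper open_shared_domain() is hypothetical
and error handling is abbreviated:

    #include "opal/mca/common/ofi/common_ofi.h"

    /* Hypothetical wrapper: open (or reuse) the process-wide fabric and
     * domain for a selected provider through the common-OFI helpers, so
     * the MTL and BTL can share one domain instead of each calling
     * fi_fabric()/fi_domain() directly. */
    static int open_shared_domain(struct fi_info *prov,
                                  struct fid_fabric **fabric,
                                  struct fid_domain **domain)
    {
        int ret = opal_common_ofi_fi_fabric(prov->fabric_attr, fabric);
        if (OPAL_SUCCESS != ret) {
            return ret;
        }

        ret = opal_common_ofi_fi_domain(*fabric, prov, domain);
        if (OPAL_SUCCESS != ret) {
            /* Drop the fabric reference taken above. */
            (void) opal_common_ofi_fabric_release(*fabric);
            return ret;
        }

        /* Teardown mirrors setup:
         *   opal_common_ofi_domain_release(*domain);
         *   opal_common_ofi_fabric_release(*fabric);
         */
        return OPAL_SUCCESS;
    }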