From 03ec24a5f9d31aafa2fe27282618eba30e3167f3 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 13 Oct 2025 12:11:31 -0400 Subject: [PATCH] Fix for issue #13432 @mentOS31 correctly identified few issues with the handling of persistent requests with regard to their error codes (more info in the issue). This PR fixes all wait and test function to have a consistent outcome for all types of requests. It also prevents the error handler invocation from releasing persistent requests. Signed-off-by: George Bosilca --- ompi/errhandler/errhandler_invoke.c | 2 ++ ompi/request/req_test.c | 53 ++++++++++++++--------------- ompi/request/req_wait.c | 18 +++++----- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/ompi/errhandler/errhandler_invoke.c b/ompi/errhandler/errhandler_invoke.c index d9d3ede4677..7cb4e343d50 100644 --- a/ompi/errhandler/errhandler_invoke.c +++ b/ompi/errhandler/errhandler_invoke.c @@ -18,6 +18,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2023 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2025 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -180,6 +181,7 @@ int ompi_errhandler_request_invoke(int count, that had an error. */ for (; i < count; ++i) { if (MPI_REQUEST_NULL != requests[i] && + !requests[i]->req_persistent && MPI_SUCCESS != requests[i]->req_status.MPI_ERROR) { #if OPAL_ENABLE_FT_MPI /* Special case for MPI_ANY_SOURCE when marked as diff --git a/ompi/request/req_test.c b/ompi/request/req_test.c index b28ade8a67a..8eaa37e0d29 100644 --- a/ompi/request/req_test.c +++ b/ompi/request/req_test.c @@ -13,6 +13,7 @@ * Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2025 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -136,7 +137,7 @@ int ompi_request_default_test_any( if( request->req_persistent ) { request->req_state = OMPI_REQUEST_INACTIVE; - return OMPI_SUCCESS; + return request->req_status.MPI_ERROR; } /* If there is an error on the request, don't free it */ if (MPI_SUCCESS != request->req_status.MPI_ERROR) { @@ -248,27 +249,26 @@ int ompi_request_default_test_all( ompi_grequest_invoke_query(request, &request->req_status); } OMPI_COPY_STATUS(&statuses[i], request->req_status, true); - if( request->req_persistent ) { - request->req_state = OMPI_REQUEST_INACTIVE; - continue; - } - /* MPI-2:4.5.1 says that we can return MPI_ERR_IN_STATUS - even if MPI_STATUSES_IGNORE was used. Woot! */ - /* Only free the request if there was no error on it */ if (MPI_SUCCESS == request->req_status.MPI_ERROR) { - int tmp = ompi_request_free(rptr); - if (tmp != OMPI_SUCCESS) { - return tmp; - } - } else { rc = MPI_ERR_IN_STATUS; #if OPAL_ENABLE_FT_MPI if (MPI_ERR_PROC_FAILED == request->req_status.MPI_ERROR - || MPI_ERR_REVOKED == request->req_status.MPI_ERROR) { + || MPI_ERR_REVOKED == request->req_status.MPI_ERROR) { rc = request->req_status.MPI_ERROR; } #endif /* OPAL_ENABLE_FT_MPI */ } + if (request->req_persistent) { + request->req_state = OMPI_REQUEST_INACTIVE; + } else if (MPI_SUCCESS == request->req_status.MPI_ERROR) { + /* MPI-2:4.5.1 says that we can return MPI_ERR_IN_STATUS + even if MPI_STATUSES_IGNORE was used. Woot! */ + /* Only free the request if there was no error on it */ + int tmp = ompi_request_free(rptr); + if (tmp != OMPI_SUCCESS) { + return tmp; + } + } } } else { /* free request if required */ @@ -283,25 +283,24 @@ int ompi_request_default_test_all( if (OMPI_REQUEST_GEN == request->req_type) { ompi_grequest_invoke_query(request, &request->req_status); } - if( request->req_persistent ) { - request->req_state = OMPI_REQUEST_INACTIVE; - continue; - } - /* Only free the request if there was no error */ - if (MPI_SUCCESS == request->req_status.MPI_ERROR) { - int tmp = ompi_request_free(rptr); - if (tmp != OMPI_SUCCESS) { - return tmp; - } - } else { + if (MPI_SUCCESS != request->req_status.MPI_ERROR) { rc = MPI_ERR_IN_STATUS; #if OPAL_ENABLE_FT_MPI if (MPI_ERR_PROC_FAILED == request->req_status.MPI_ERROR - || MPI_ERR_REVOKED == request->req_status.MPI_ERROR) { + || MPI_ERR_REVOKED == request->req_status.MPI_ERROR) { rc = request->req_status.MPI_ERROR; } #endif /* OPAL_ENABLE_FT_MPI */ } + if (request->req_persistent) { + request->req_state = OMPI_REQUEST_INACTIVE; + } else if (MPI_SUCCESS == request->req_status.MPI_ERROR) { + /* Only free the request if there was no error */ + int tmp = ompi_request_free(rptr); + if (tmp != OMPI_SUCCESS) { + return tmp; + } + } } } @@ -398,7 +397,7 @@ int ompi_request_default_test_some( #endif /* OPAL_ENABLE_FT_MPI */ } - if( request->req_persistent ) { + if (request->req_persistent) { request->req_state = OMPI_REQUEST_INACTIVE; } else { /* Only free the request if there was no error */ diff --git a/ompi/request/req_wait.c b/ompi/request/req_wait.c index d66eccf9f79..0b9bafcaab6 100644 --- a/ompi/request/req_wait.c +++ b/ompi/request/req_wait.c @@ -18,6 +18,7 @@ * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -356,13 +357,12 @@ int ompi_request_default_wait_all( size_t count, if( request->req_persistent ) { request->req_state = OMPI_REQUEST_INACTIVE; - continue; - } - /* Only free the request if there is no error on it */ - if (MPI_SUCCESS == request->req_status.MPI_ERROR) { + } else if (MPI_SUCCESS == request->req_status.MPI_ERROR) { + /* Only free the request if there is no error on it */ + /* If there's an error while freeing the request, - assume that the request is still there. - Otherwise, Bad Things will happen later! */ + assume that the request is still there. + Otherwise, Bad Things will happen later! */ int tmp = ompi_request_free(rptr); if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) { mpi_error = tmp; @@ -417,11 +417,9 @@ int ompi_request_default_wait_all( size_t count, rc = ompi_grequest_invoke_query(request, &request->req_status); } - rc = request->req_status.MPI_ERROR; - - if( request->req_persistent ) { + if (request->req_persistent) { request->req_state = OMPI_REQUEST_INACTIVE; - } else if (MPI_SUCCESS == rc) { + } else if (MPI_SUCCESS == request->req_status.MPI_ERROR) { /* Only free the request if there is no error on it */ int tmp = ompi_request_free(rptr); if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) {