From 8873d901e882c9cdfe764767125bfe6ab8558cce Mon Sep 17 00:00:00 2001 From: Boris Karasev Date: Mon, 30 Jul 2018 16:55:52 +0600 Subject: [PATCH] pmix: added check for pmix fence status Signed-off-by: Boris Karasev (cherry picked from commit 57683366ca300fe353e91c52dc9aa0f657120d4d) Conflicts: opal/mca/common/ucx/common_ucx.c opal/mca/common/ucx/common_ucx.h Modified: ompi/mca/pml/ucx/pml_ucx.c oshmem/mca/spml/ucx/spml_ucx.c --- ompi/dpm/dpm.c | 6 +++- ompi/mca/bml/r2/bml_r2_ft.c | 10 +++++-- ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 20 +++++++++---- ompi/mca/pml/bfo/pml_bfo.c | 40 ++++++++++++++++++++------ ompi/mca/pml/ob1/pml_ob1.c | 40 ++++++++++++++++++++------ ompi/mca/pml/ucx/pml_ucx.c | 6 +++- ompi/mca/pml/yalla/pml_yalla.c | 5 +++- ompi/runtime/ompi_mpi_finalize.c | 12 ++++++-- ompi/runtime/ompi_mpi_init.c | 43 ++++++++++++++++++++++------ opal/mca/common/ucx/common_ucx.c | 12 ++++++-- opal/mca/common/ucx/common_ucx.h | 2 +- orte/mca/ess/pmi/ess_pmi_module.c | 5 +++- orte/mca/snapc/full/snapc_full_app.c | 12 ++++++-- oshmem/mca/spml/ucx/spml_ucx.c | 6 +++- 14 files changed, 174 insertions(+), 45 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 14810f6b028..a9a2de586c4 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -589,7 +589,11 @@ int ompi_dpm_disconnect(ompi_communicator_t *comm) /* ensure we tell the host RM to disconnect us - this * is a blocking operation so just use a fence */ - ret = opal_pmix.fence(&coll, false); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(&coll, false))) { + OMPI_ERROR_LOG(ret); + OPAL_LIST_DESTRUCT(&coll); + return ret; + } OPAL_LIST_DESTRUCT(&coll); return ret; diff --git a/ompi/mca/bml/r2/bml_r2_ft.c b/ompi/mca/bml/r2/bml_r2_ft.c index 95fc6ade66b..8dc45d4f1e3 100644 --- a/ompi/mca/bml/r2/bml_r2_ft.c +++ b/ompi/mca/bml/r2/bml_r2_ft.c @@ -155,7 +155,10 @@ int mca_bml_r2_ft_event(int state) * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. 
*/ - opal_pmix.fence(NULL, 0); + if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n"); + return ret; + } /* * Re-open the BTL framework to get the full list of components. @@ -224,7 +227,10 @@ int mca_bml_r2_ft_event(int state) * Barrier to make all processes have been successfully restarted before * we try to remove some restart only files. */ - opal_pmix.fence(NULL, 0); + if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n"); + return ret; + } /* * Re-open the BTL framework to get the full list of components. diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c index 7d5e480095a..3276df26d0a 100644 --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c @@ -3028,7 +3028,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event( if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0); - opal_pmix.fence(NULL, 0); + if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + exit_status = ret; + goto DONE; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0); @@ -3096,7 +3099,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event( if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1); - opal_pmix.fence(NULL, 0); + if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + exit_status = ret; + goto DONE; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2); } @@ -6207,14 +6213,16 @@ static void clear_timers(void) { static void display_all_timers(int state) { bool report_ready = false; double barrier_start, barrier_stop; - int i; + int i, ret; if( 0 != OMPI_PROC_MY_NAME->vpid ) { if( 2 > timing_enabled ) { return; } else if( 2 == timing_enabled ) { - opal_pmix.fence(NULL, 0); + if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + OPAL_ERROR_LOG(ret); + } return; } } @@ -6235,7 +6243,9 @@ static void 
display_all_timers(int state) { if( timing_enabled >= 2) { barrier_start = get_time(); - opal_pmix.fence(NULL, 0); + if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + OPAL_ERROR_LOG(ret); + } barrier_stop = get_time(); opal_output(0, "crcp:bkmrk: timing(%20s): %20s = %10.2f s\n", diff --git a/ompi/mca/pml/bfo/pml_bfo.c b/ompi/mca/pml/bfo/pml_bfo.c index e3a1beb447a..ce33b0d57be 100644 --- a/ompi/mca/pml/bfo/pml_bfo.c +++ b/ompi/mca/pml/bfo/pml_bfo.c @@ -666,7 +666,10 @@ int mca_pml_bfo_ft_event( int state ) if(OPAL_CRS_CHECKPOINT == state) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0); @@ -677,7 +680,10 @@ int mca_pml_bfo_ft_event( int state ) if( !first_continue_pass ) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2); } @@ -777,7 +783,10 @@ int mca_pml_bfo_ft_event( int state ) if( !first_continue_pass ) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3); } @@ -787,7 +796,10 @@ int mca_pml_bfo_ft_event( int state ) * Exchange the modex information once again. * BTLs will have republished their modex information. 
*/ - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } /* * Startup the PML stack now that the modex is running again @@ -799,7 +811,10 @@ int mca_pml_bfo_ft_event( int state ) } /* Is this barrier necessary ? JJH */ - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } if( NULL != procs ) { for(p = 0; p < (int)num_procs; ++p) { @@ -812,7 +827,10 @@ int mca_pml_bfo_ft_event( int state ) if( !first_continue_pass ) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1); } @@ -825,7 +843,10 @@ int mca_pml_bfo_ft_event( int state ) * Exchange the modex information once again. * BTLs will have republished their modex information. */ - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } /* * Startup the PML stack now that the modex is running again @@ -837,7 +858,10 @@ int mca_pml_bfo_ft_event( int state ) } /* Is this barrier necessary ? 
JJH */ - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete"); + return ret; + } if( NULL != procs ) { for(p = 0; p < (int)num_procs; ++p) { diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index 5adf19028a8..f4cc24c8ba0 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -807,7 +807,10 @@ int mca_pml_ob1_ft_event( int state ) if(OPAL_CRS_CHECKPOINT == state) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0); @@ -818,7 +821,10 @@ int mca_pml_ob1_ft_event( int state ) if( !first_continue_pass ) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2); } @@ -918,13 +924,19 @@ int mca_pml_ob1_ft_event( int state ) if( !first_continue_pass ) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3); } if (opal_cr_continue_like_restart && !first_continue_pass) { - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } /* * Startup the PML stack now that the modex is running again @@ -936,7 +948,10 @@ int mca_pml_ob1_ft_event( int state ) } /* Is this barrier necessary ? 
JJH */ - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } if( NULL != procs ) { for(p = 0; p < (int)num_procs; ++p) { @@ -949,7 +964,10 @@ int mca_pml_ob1_ft_event( int state ) if( !first_continue_pass ) { if( opal_cr_timing_barrier_enabled ) { OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } } OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1); } @@ -962,7 +980,10 @@ int mca_pml_ob1_ft_event( int state ) * Exchange the modex information once again. * BTLs will have republished their modex information. */ - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } /* * Startup the PML stack now that the modex is running again @@ -974,7 +995,10 @@ int mca_pml_ob1_ft_event( int state ) } /* Is this barrier necessary ? 
JJH */ - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete"); + return ret; + } if( NULL != procs ) { for(p = 0; p < (int)num_procs; ++p) { diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 9dcae3dd6b5..697dd078e38 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -389,6 +389,7 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) void *dreq, **dreqs; ucp_ep_h ep; size_t i; + int ret; max_reqs = ompi_pml_ucx.num_disconnect; if (max_reqs > nprocs) { @@ -433,7 +434,10 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) mca_pml_ucx_waitall(dreqs, &num_reqs); free(dreqs); - opal_common_ucx_mca_pmix_fence(ompi_pml_ucx.ucp_worker); + if (OMPI_SUCCESS != (ret = opal_common_ucx_mca_pmix_fence( + ompi_pml_ucx.ucp_worker))) { + return ret; + } return OMPI_SUCCESS; } diff --git a/ompi/mca/pml/yalla/pml_yalla.c b/ompi/mca/pml/yalla/pml_yalla.c index 7890293c330..03bb65d420d 100644 --- a/ompi/mca/pml/yalla/pml_yalla.c +++ b/ompi/mca/pml/yalla/pml_yalla.c @@ -265,6 +265,7 @@ int mca_pml_yalla_add_procs(struct ompi_proc_t **procs, size_t nprocs) int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs) { size_t i; + int ret; if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) { PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown"); @@ -276,7 +277,9 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs) PML_YALLA_VERBOSE(2, "disconnected from rank %s", OPAL_NAME_PRINT(procs[i]->super.proc_name)); procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL; } - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + return ret; + } return OMPI_SUCCESS; } diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index a235f6ba2d2..b636ddfbaab 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ 
b/ompi/runtime/ompi_mpi_finalize.c @@ -257,7 +257,13 @@ int ompi_mpi_finalize(void) * communications/actions to complete. See * https://github.com/open-mpi/ompi/issues/1576 for the * original bug report. */ - opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active); + if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc, + (void*)&active))) { + OMPI_ERROR_LOG(ret); + /* Reset the active flag to false, to avoid waiting for + * completion when the fence has failed. */ + active = false; + } OMPI_LAZY_WAIT_FOR_COMPLETION(active); } else { /* However, we cannot guarantee that the provided PMIx has @@ -268,7 +274,9 @@ int ompi_mpi_finalize(void) ompi_communicator_t *comm = &ompi_mpi_comm_world.comm; comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + OMPI_ERROR_LOG(ret); + } } } diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 00e450c923a..b35c491b735 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -662,9 +662,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, #if (OPAL_ENABLE_TIMING) if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex && opal_pmix_collect_all_data) { - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + error = "timing: pmix-barrier-1 failed"; + goto error; + } OMPI_TIMING_NEXT("pmix-barrier-1"); - opal_pmix.fence(NULL, 0); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + error = "timing: pmix-barrier-2 failed"; + goto error; + } OMPI_TIMING_NEXT("pmix-barrier-2"); } #endif @@ -687,19 +693,32 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, background_fence = true; active = true; OPAL_POST_OBJECT(&active); - opal_pmix.fence_nb(NULL, true, fence_release, (void*)&active); + if( OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, true, + fence_release, + (void*)&active))) { + error = 
"opal_pmix.fence_nb() failed"; + goto error; + } + } else if (!opal_pmix_base_async_modex) { /* we want to do the modex */ active = true; OPAL_POST_OBJECT(&active); - opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data, - fence_release, (void*)&active); + if( OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, + opal_pmix_collect_all_data, fence_release, (void*)&active))) { + error = "opal_pmix.fence_nb() failed"; + goto error; + } /* cannot just wait on thread as we need to call opal_progress */ OMPI_LAZY_WAIT_FOR_COMPLETION(active); } /* otherwise, we don't want to do the modex, so fall thru */ } else if (!opal_pmix_base_async_modex || opal_pmix_collect_all_data) { - opal_pmix.fence(NULL, opal_pmix_collect_all_data); + if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, + opal_pmix_collect_all_data))) { + error = "opal_pmix.fence() failed"; + goto error; + } } OMPI_TIMING_NEXT("modex"); @@ -877,11 +896,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, if (NULL != opal_pmix.fence_nb) { active = true; OPAL_POST_OBJECT(&active); - opal_pmix.fence_nb(NULL, false, - fence_release, (void*)&active); + if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, false, + fence_release, (void*)&active))) { + error = "opal_pmix.fence_nb() failed"; + goto error; + } OMPI_LAZY_WAIT_FOR_COMPLETION(active); } else { - opal_pmix.fence(NULL, false); + if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, false))) { + error = "opal_pmix.fence() failed"; + goto error; + } } } diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index cd54490e4dd..62ee70ff47c 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -97,13 +97,19 @@ static void opal_common_ucx_mca_fence_complete_cb(int status, void *fenced) *(int*)fenced = 1; } -OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker) +OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker) { volatile int fenced = 0; + int ret = 
OPAL_SUCCESS; + + if (OPAL_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, + opal_common_ucx_mca_fence_complete_cb, (void*)&fenced))){ + return ret; + } - opal_pmix.fence_nb(NULL, 0, opal_common_ucx_mca_fence_complete_cb, (void*)&fenced); while (!fenced) { ucp_worker_progress(worker); } -} + return ret; +} diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 13a03000e83..0fe345c0695 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -65,7 +65,7 @@ extern opal_common_ucx_module_t opal_common_ucx; OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); -OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker); +OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker); static inline int opal_common_ucx_wait_request(ucs_status_ptr_t request, ucp_worker_h worker, diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 563b8115dcf..1515ae98885 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -458,7 +458,10 @@ static int rte_init(void) if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { /* need to commit the data before we fence */ opal_pmix.commit(); - opal_pmix.fence(NULL, 0); + if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + error = "opal_pmix.fence() failed"; + goto error; + } } OPAL_TIMING_ENV_NEXT(rte_init, "rte_init_done"); diff --git a/orte/mca/snapc/full/snapc_full_app.c b/orte/mca/snapc/full/snapc_full_app.c index 1dd5a8d5edd..3a436bc81d4 100644 --- a/orte/mca/snapc/full/snapc_full_app.c +++ b/orte/mca/snapc/full/snapc_full_app.c @@ -150,7 +150,11 @@ int app_coord_init() "app) Startup Barrier...")); } - opal_pmix.fence(NULL, 0); + if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto 
cleanup; + } if( 0 == ORTE_PROC_MY_NAME->vpid ) { OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle, @@ -216,7 +220,11 @@ int app_coord_finalize() "app) Shutdown Barrier...")); } - opal_pmix.fence(NULL, 0); + if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } if( 0 == ORTE_PROC_MY_NAME->vpid ) { OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle, diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 8cc1153658b..489f314b10e 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -125,6 +125,7 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) void *dreq, **dreqs; ucp_ep_h ep; size_t i, n; + int ret; oshmem_shmem_barrier(); @@ -175,7 +176,10 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) free(dreqs); free(mca_spml_ucx.remote_addrs_tbl); - opal_common_ucx_mca_pmix_fence(mca_spml_ucx_ctx_default.ucp_worker); + if (OSHMEM_SUCCESS != (ret = opal_common_ucx_mca_pmix_fence( + mca_spml_ucx_ctx_default.ucp_worker))) { + return ret; + } free(mca_spml_ucx_ctx_default.ucp_peers); mca_spml_ucx_ctx_default.ucp_peers = NULL; return OSHMEM_SUCCESS;