From f0baed629657aaab631be626c839699ffa15b5df Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Tue, 5 Jun 2018 17:22:03 +0300 Subject: [PATCH 1/2] PML/UCX: fixed hang on MPI_Finalize fixes issue https://github.com/openucx/ucx/issues/2656 added flush for worker object to complete all pending operations Signed-off-by: Sergey Oblomov (cherry picked from commit 0a8261f3b00123822a596485d11fa994c1dbf9df) --- ompi/mca/pml/ucx/pml_ucx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 712deb071b2..9e6e8f73782 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -429,6 +429,10 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) mca_pml_ucx_waitall(dreqs, &num_reqs); free(dreqs); + /* flush worker to allow all pending operations to complete. + * ignore error (we can do nothing here), just try to + * finalize gracefully */ + ucp_worker_flush(ompi_pml_ucx.ucp_worker); opal_pmix.fence(NULL, 0); From 4613d53fe6cf52276641e521fc7f7d94c84fb719 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Sat, 9 Jun 2018 15:06:53 +0300 Subject: [PATCH 2/2] PML/UCX: suppressed coverity issue - added debug output for worker flush failure Signed-off-by: Sergey Oblomov (cherry picked from commit c3dbc865429ab170ee7017438fbda1c0288dc350) --- ompi/mca/pml/ucx/pml_ucx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 9e6e8f73782..9c4fddce2fa 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -388,6 +388,7 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) void *dreq, **dreqs; ucp_ep_h ep; size_t i; + ucs_status_t ret; max_reqs = ompi_pml_ucx.num_disconnect; if (max_reqs > nprocs) { @@ -432,7 +433,11 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) /* flush worker to allow all pending operations to complete. 
* ignore error (we can do nothing here), just try to * finalize gracefully */ - ucp_worker_flush(ompi_pml_ucx.ucp_worker); + ret = ucp_worker_flush(ompi_pml_ucx.ucp_worker); + if (UCS_OK != ret) { + PML_UCX_ERROR("ucp_worker_flush failed: %s", + ucs_status_string(ret)); + } opal_pmix.fence(NULL, 0);