From 923e15978ff5d0359c7c7525f1b5dfd14d3f7947 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Tue, 5 Jun 2018 17:22:03 +0300 Subject: [PATCH 1/2] PML/UCX: fixed hang on MPI_Finalize fixes issue https://github.com/openucx/ucx/issues/2656 added flush for worker object to complete all pending operations Signed-off-by: Sergey Oblomov (cherry picked from commit 0a8261f3b00123822a596485d11fa994c1dbf9df) --- ompi/mca/pml/ucx/pml_ucx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 98352a951fb..d504de7f437 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -435,6 +435,10 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) mca_pml_ucx_waitall(dreqs, &num_reqs); free(dreqs); + /* flush worker to allow all pending operations to complete. + * ignore error (we can do nothing here), just try to + * finalize gracefully */ + ucp_worker_flush(ompi_pml_ucx.ucp_worker); opal_pmix.fence(NULL, 0); From 2bf00978248f11c29e1f094635d782a803177126 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Sat, 9 Jun 2018 15:06:53 +0300 Subject: [PATCH 2/2] PML/UCX: suppressed coverity issue - added debug output for worker flush failure Signed-off-by: Sergey Oblomov (cherry picked from commit c3dbc865429ab170ee7017438fbda1c0288dc350) --- ompi/mca/pml/ucx/pml_ucx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index d504de7f437..4a2f353c8ca 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -394,6 +394,7 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) void *dreq, **dreqs; ucp_ep_h ep; size_t i; + ucs_status_t ret; max_reqs = ompi_pml_ucx.num_disconnect; if (max_reqs > nprocs) { @@ -438,7 +439,11 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) /* flush worker to allow all pending operations to complete. 
* ignore error (we can do nothing here), just try to * finalize gracefully */ - ucp_worker_flush(ompi_pml_ucx.ucp_worker); + ret = ucp_worker_flush(ompi_pml_ucx.ucp_worker); + if (UCS_OK != ret) { + PML_UCX_ERROR("ucp_worker_flush failed: %s", + ucs_status_string(ret)); + } opal_pmix.fence(NULL, 0);