From 96717ed88b0f1af848bfb60d1c752cce7e765cae Mon Sep 17 00:00:00 2001
From: Sergey Oblomov
Date: Wed, 20 Jun 2018 14:54:56 +0300
Subject: [PATCH] PML/UCX: fixed hang on MPI_Finalize

- use non-blocking fence to progress UCX

Signed-off-by: Sergey Oblomov
(cherry picked from commit 10f2d831ecf58f482c5faee73d2956e0eeb801bf)
---
 ompi/mca/pml/ucx/pml_ucx.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c
index 4a2f353c8ca..a99cf3cda2a 100644
--- a/ompi/mca/pml/ucx/pml_ucx.c
+++ b/ompi/mca/pml/ucx/pml_ucx.c
@@ -387,14 +387,19 @@ static void mca_pml_ucx_waitall(void **reqs, size_t *count_p)
     *count_p = 0;
 }
 
+static void mca_pml_fence_complete_cb(int status, void *fenced)
+{
+    *(int*)fenced = 1;
+}
+
 int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
 {
+    int fenced = 0;
     ompi_proc_t *proc;
     size_t num_reqs, max_reqs;
     void *dreq, **dreqs;
     ucp_ep_h ep;
     size_t i;
-    ucs_status_t ret;
 
     max_reqs = ompi_pml_ucx.num_disconnect;
     if (max_reqs > nprocs) {
@@ -436,16 +441,11 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
     mca_pml_ucx_waitall(dreqs, &num_reqs);
     free(dreqs);
 
-    /* flush worker to allow all pending operations to complete.
-     * ignore error (we can do nothing here), just try to
-     * finalize gracefully */
-    ret = ucp_worker_flush(ompi_pml_ucx.ucp_worker);
-    if (UCS_OK != ret) {
-        PML_UCX_ERROR("ucp_worker_flush failed: %s",
-                      ucs_status_string(ret));
-    }
 
-    opal_pmix.fence(NULL, 0);
+    opal_pmix.fence_nb(NULL, 0, mca_pml_fence_complete_cb, &fenced);
+    while (!fenced) {
+        ucp_worker_progress(ompi_pml_ucx.ucp_worker);
+    }
 
     return OMPI_SUCCESS;
 }