From e7d60dc8b2cd22578c9f8c437d458cf8cd5b7380 Mon Sep 17 00:00:00 2001 From: Sergey Oblomov Date: Wed, 20 Jun 2018 14:54:56 +0300 Subject: [PATCH 1/2] PML/UCX: fixed hang on MPI_Finalize - use non-blocking fence to progress UCX Signed-off-by: Sergey Oblomov (cherry picked from commit 96717ed88b0f1af848bfb60d1c752cce7e765cae) --- ompi/mca/pml/ucx/pml_ucx.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 9c4fddce2fa..7aae470320b 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -381,14 +381,19 @@ static void mca_pml_ucx_waitall(void **reqs, size_t *count_p) *count_p = 0; } +static void mca_pml_fence_complete_cb(int status, void *fenced) +{ + *(int*)fenced = 1; +} + int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) { + int fenced = 0; ompi_proc_t *proc; size_t num_reqs, max_reqs; void *dreq, **dreqs; ucp_ep_h ep; size_t i; - ucs_status_t ret; max_reqs = ompi_pml_ucx.num_disconnect; if (max_reqs > nprocs) { @@ -430,16 +435,11 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) mca_pml_ucx_waitall(dreqs, &num_reqs); free(dreqs); - /* flush worker to allow all pending operations to complete. 
- * ignore error (we can do nothing here), just try to - * finalize gracefully */ - ret = ucp_worker_flush(ompi_pml_ucx.ucp_worker); - if (UCS_OK != ret) { - PML_UCX_ERROR("ucp_worker_flush failed: %s", - ucs_status_string(ret)); - } - opal_pmix.fence(NULL, 0); + opal_pmix.fence_nb(NULL, 0, mca_pml_fence_complete_cb, &fenced); + while (!fenced) { + ucp_worker_progress(ompi_pml_ucx.ucp_worker); + } return OMPI_SUCCESS; } From 0399f778f172e760320c0e722666b23a90bd8063 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 22 Jun 2018 13:11:42 +0900 Subject: [PATCH 2/2] pml/ucx: silence a warning declare 'fenced' volatile in order to silence CID 1437465 Signed-off-by: Gilles Gouaillardet (cherry picked from commit open-mpi/ompi@edd02b71447c03a9392c3afb33c588a0fe52cf9a) --- ompi/mca/pml/ucx/pml_ucx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 7aae470320b..da5303b1ec1 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -1,8 +1,10 @@ /* - * Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED. + * Copyright (C) 2001-2011 Mellanox Technologies Ltd. ALL RIGHTS RESERVED. * Copyright (c) 2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -388,7 +390,7 @@ static void mca_pml_fence_complete_cb(int status, void *fenced) int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs) { - int fenced = 0; + volatile int fenced = 0; ompi_proc_t *proc; size_t num_reqs, max_reqs; void *dreq, **dreqs;