From 14c271f993a292d74b64f48ef48870cbaccf7fd6 Mon Sep 17 00:00:00 2001
From: Sergey Oblomov
Date: Tue, 12 Mar 2019 21:14:27 +0200
Subject: [PATCH 1/2] PML/SPML/UCX: added evaluation of mmap events

- there was a set of UCX-related issues reported which were caused by
  mmap API hook conflicts. We added diagnostics for such problems to
  simplify the bug-resolving pipeline

Signed-off-by: Sergey Oblomov
(cherry picked from commit d8e3562bae700d84873c1d5ca9c45c846d7387ed)
---
 config/ompi_check_ucx.m4         |  3 +++
 ompi/mca/pml/ucx/pml_ucx.c       |  1 +
 opal/mca/common/ucx/common_ucx.c | 22 ++++++++++++++++++++++
 opal/mca/common/ucx/common_ucx.h |  6 ++++++
 oshmem/mca/spml/ucx/spml_ucx.c   |  2 ++
 5 files changed, 34 insertions(+)

diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4
index 668b0ff1478..044b599dc3b 100644
--- a/config/ompi_check_ucx.m4
+++ b/config/ompi_check_ucx.m4
@@ -112,6 +112,9 @@ AC_DEFUN([OMPI_CHECK_UCX],[
                                        ucp_request_check_status, ucp_put_nb, ucp_get_nb],
                                       [], [],
                                       [#include <ucp/api/ucp.h>])
+                       AC_CHECK_DECLS([ucm_test_events],
+                                      [], [],
+                                      [#include <ucm/api/ucm.h>])
                        AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND,
                                        UCP_ATOMIC_POST_OP_OR,
                                        UCP_ATOMIC_POST_OP_XOR,
diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c
index 00a95644c22..e2a614e242b 100644
--- a/ompi/mca/pml/ucx/pml_ucx.c
+++ b/ompi/mca/pml/ucx/pml_ucx.c
@@ -422,6 +422,7 @@ int mca_pml_ucx_add_procs(struct ompi_proc_t **procs, size_t nprocs)
         }
     }
 
+    opal_common_ucx_mca_proc_added();
     return OMPI_SUCCESS;
 }
 
diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c
index 2213f118eb2..254ac3a032a 100644
--- a/opal/mca/common/ucx/common_ucx.c
+++ b/opal/mca/common/ucx/common_ucx.c
@@ -132,6 +132,28 @@ static void opal_common_ucx_mca_fence_complete_cb(int status, void *fenced)
     *(int*)fenced = 1;
 }
 
+void opal_common_ucx_mca_proc_added(void)
+{
+#if HAVE_DECL_UCM_TEST_EVENTS
+    static int warned = 0;
+    static char *mem_hooks_suggestion = "Try to add command line agrument "
+                                        "'--mca opal_common_ucx_opal_mem_hooks 1' to resolve "
+                                        "this issue.";
+    ucs_status_t status;
+
+    if (!warned) {
+        status = ucm_test_events(UCM_EVENT_VM_UNMAPPED);
+        if (status != UCS_OK) {
+            MCA_COMMON_UCX_WARN("UCX is unable to handle VM_UNMAP event. "
+                                "This may cause performance degradation or data "
+                                "corruption. %s",
+                                opal_common_ucx.opal_mem_hooks ? "" : mem_hooks_suggestion);
+            warned = 1;
+        }
+    }
+#endif
+}
+
 OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
 {
     volatile int fenced = 0;
diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h
index 0cf46e5c28a..0bd33f9cfa8 100644
--- a/opal/mca/common/ucx/common_ucx.h
+++ b/opal/mca/common/ucx/common_ucx.h
@@ -44,6 +44,11 @@ BEGIN_C_DECLS
                         __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
                         " Error: " __VA_ARGS__)
 
+#define MCA_COMMON_UCX_WARN(...)                                    \
+    opal_output_verbose(0, opal_common_ucx.output,                  \
+                        __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
+                        " Warning: " __VA_ARGS__)
+
 #define MCA_COMMON_UCX_VERBOSE(_level, ... )                                \
     if (((_level) <= MCA_COMMON_UCX_MAX_VERBOSE) &&                         \
         ((_level) <= opal_common_ucx.verbose)) {                            \
@@ -96,6 +101,7 @@ extern opal_common_ucx_module_t opal_common_ucx;
 
 OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
 OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
+OPAL_DECLSPEC void opal_common_ucx_mca_proc_added(void);
 OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
 OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
 OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component);
diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c
index 67bfc7ceabe..d20bfd95838 100644
--- a/oshmem/mca/spml/ucx/spml_ucx.c
+++ b/oshmem/mca/spml/ucx/spml_ucx.c
@@ -138,6 +138,8 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
 
     mca_spml_ucx_ctx_default.ucp_peers = NULL;
 
+    opal_common_ucx_mca_proc_added();
+
     return ret;
 }
 
From bed814108880d5efdb23cc0a66cf3c9e2e421705 Mon Sep 17 00:00:00 2001
From: Sergey Oblomov
Date: Thu, 14 Mar 2019 11:00:57 +0200
Subject: [PATCH 2/2] COMMON/UCX: rewording of hooks suggestion

- also updated output macro

Signed-off-by: Sergey Oblomov
(cherry picked from commit c319cf9adefb69c78a73eb4a83a40dee5b697a53)
---
 opal/mca/common/ucx/common_ucx.c |  5 ++---
 opal/mca/common/ucx/common_ucx.h | 14 +++++---------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c
index 254ac3a032a..69f8b0c4678 100644
--- a/opal/mca/common/ucx/common_ucx.c
+++ b/opal/mca/common/ucx/common_ucx.c
@@ -136,9 +136,8 @@ void opal_common_ucx_mca_proc_added(void)
 {
 #if HAVE_DECL_UCM_TEST_EVENTS
     static int warned = 0;
-    static char *mem_hooks_suggestion = "Try to add command line agrument "
-                                        "'--mca opal_common_ucx_opal_mem_hooks 1' to resolve "
-                                        "this issue.";
+    static char *mem_hooks_suggestion = "Pls try adding --mca opal_common_ucx_opal_mem_hooks 1 "
+                                        "to mpirun/oshrun command line to resolve this issue.";
     ucs_status_t status;
 
     if (!warned) {
diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h
index 0bd33f9cfa8..7db964447e9 100644
--- a/opal/mca/common/ucx/common_ucx.h
+++ b/opal/mca/common/ucx/common_ucx.h
@@ -39,15 +39,11 @@ BEGIN_C_DECLS
 #define MCA_COMMON_UCX_QUOTE(_x) \
     _MCA_COMMON_UCX_QUOTE(_x)
 
-#define MCA_COMMON_UCX_ERROR(...)                                   \
-    opal_output_verbose(0, opal_common_ucx.output,                  \
-                        __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
-                        " Error: " __VA_ARGS__)
-
-#define MCA_COMMON_UCX_WARN(...)                                    \
-    opal_output_verbose(0, opal_common_ucx.output,                  \
-                        __FILE__ ":" MCA_COMMON_UCX_QUOTE(__LINE__) \
-                        " Warning: " __VA_ARGS__)
+#define MCA_COMMON_UCX_ERROR(...)                                   \
+    MCA_COMMON_UCX_VERBOSE(0, " Error: " __VA_ARGS__)
+
+#define MCA_COMMON_UCX_WARN(...)                                    \
+    MCA_COMMON_UCX_VERBOSE(0, " Warning: " __VA_ARGS__)
 
 #define MCA_COMMON_UCX_VERBOSE(_level, ... )                                \
     if (((_level) <= MCA_COMMON_UCX_MAX_VERBOSE) &&                         \