From 47ec3e4d2bd30d3feaaca3989124452fca60f5d6 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 26 Sep 2019 08:57:00 -0700 Subject: [PATCH 1/3] btl/uct: add support for OpenUCX v1.8 API changes OpenUCX broke the UCT API again in v1.8. This commit updates btl/uct to fix compilation with current OpenUCX master (future v1.8). Further changes will likely be needed for the final release. Signed-off-by: Nathan Hjelm (cherry picked from commit 526775dfd7ad75c308532784de4fb3ffed25458f) --- opal/mca/btl/uct/btl_uct.h | 4 ++ opal/mca/btl/uct/btl_uct_amo.c | 4 +- opal/mca/btl/uct/btl_uct_component.c | 82 +++++++++++++++++++++++++++- opal/mca/btl/uct/btl_uct_rdma.c | 4 +- opal/mca/btl/uct/btl_uct_rdma.h | 14 +++++ opal/mca/btl/uct/btl_uct_tl.c | 6 ++ 6 files changed, 107 insertions(+), 7 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 38756794430..73640103c07 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -85,6 +85,10 @@ struct mca_btl_uct_module_t { /** array containing the am_tl and rdma_tl */ mca_btl_uct_tl_t *comm_tls[2]; +#if UCT_API > UCT_VERSION(1, 7) + uct_component_h uct_component; +#endif + /** registration cache */ mca_rcache_base_module_t *rcache; diff --git a/opal/mca/btl/uct/btl_uct_amo.c b/opal/mca/btl/uct/btl_uct_amo.c index f7d02326884..72398ce7369 100644 --- a/opal/mca/btl/uct/btl_uct_amo.c +++ b/opal/mca/btl/uct/btl_uct_amo.c @@ -110,7 +110,7 @@ int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end mca_btl_uct_uct_completion_release (comp); } - uct_rkey_release (&rkey); + mca_btl_uct_rkey_release (uct_btl, &rkey); return rc; } @@ -184,7 +184,7 @@ int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e mca_btl_uct_uct_completion_release (comp); } - uct_rkey_release (&rkey); + mca_btl_uct_rkey_release (uct_btl, &rkey); return rc; } diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index f968cb9c31c..538872860f9 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -314,7 +314,12 @@ ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsig return UCS_OK; } +#if UCT_API > UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_md (uct_component_h component, uct_md_resource_desc_t *md_desc, + char **allowed_ifaces) +#else static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc, char **allowed_ifaces) +#endif { mca_rcache_base_resources_t rcache_resources; uct_tl_resource_desc_t *tl_desc; @@ -348,8 +353,14 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc md = OBJ_NEW(mca_btl_uct_md_t); + +#if UCT_API > UCT_VERSION(1, 7) + uct_md_config_read (component, NULL, NULL, &uct_config); + uct_md_open (component, md_desc->md_name, uct_config, &md->uct_md); +#else uct_md_config_read (md_desc->md_name, NULL, NULL, &uct_config); uct_md_open (md_desc->md_name, uct_config, &md->uct_md); +#endif uct_config_release (uct_config); uct_md_query (md->uct_md, &md_attr); @@ -375,6 +386,10 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc return OPAL_ERR_NOT_AVAILABLE; } +#if UCT_API > UCT_VERSION(1, 7) + module->uct_component = component; +#endif + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable @@ -400,6 +415,42 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc return OPAL_SUCCESS; } +#if UCT_API > UCT_VERSION(1, 7) +static int mca_btl_uct_component_process_uct_component (uct_component_h component, char **allowed_ifaces) +{ + uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT}; + ucs_status_t ucs_status; + int rc; + + ucs_status = uct_component_query (component, &attr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + BTL_VERBOSE(("processing uct component %s", attr.name)); + + attr.md_resources = calloc (attr.md_resource_count, sizeof (*attr.md_resources)); + attr.field_mask |= UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES; + ucs_status = uct_component_query (component, &attr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + for (int i = 0 ; i < attr.md_resource_count ; ++i) { + rc = mca_btl_uct_component_process_uct_md (component, attr.md_resources + i, + allowed_ifaces); + if (OPAL_SUCCESS != rc) { + break; + } + } + + free (attr.md_resources); + + return OPAL_SUCCESS; +} +#endif /* UCT_API > UCT_VERSION(1, 7) */ + /* * UCT component initialization: * (1) read interface list from kernel and compare against component parameters @@ -415,6 +466,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, struct mca_btl_base_module_t **base_modules; uct_md_resource_desc_t *resources; unsigned resource_count; + ucs_status_t ucs_status; char **allowed_ifaces; int rc; @@ -431,10 +483,32 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, return NULL; } - uct_query_md_resources (&resources, &resource_count); - mca_btl_uct_component.module_count = 0; +#if UCT_API > UCT_VERSION(1, 7) + uct_component_h *components; + unsigned num_components; + + ucs_status = uct_query_components(&components, &num_components); + if (UCS_OK != ucs_status) { + BTL_ERROR(("could not query UCT components")); + return NULL; + } + + /* generate all suitable btl modules */ + for (unsigned i = 0 ; i < num_components ; ++i) { + rc = mca_btl_uct_component_process_uct_component (components[i], allowed_ifaces); + if (OPAL_SUCCESS != rc) { + break; + } + } + + uct_release_component_list (components); + +#else /* UCT 1.6 and older */ + + uct_query_md_resources (&resources, &resource_count); + /* generate all suitable btl modules */ for (unsigned i = 0 ; i < resource_count ; ++i) { rc = mca_btl_uct_component_process_uct_md (resources + i, allowed_ifaces); @@ -443,9 +517,11 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, } } - opal_argv_free (allowed_ifaces); uct_release_md_resource_list (resources); +#endif /* UCT_API > UCT_VERSION(1, 7) */ + + opal_argv_free (allowed_ifaces); mca_btl_uct_modex_send (); /* pass module array back to caller */ diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c index 2d2d1c3f04b..9ee9530f260 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.c +++ b/opal/mca/btl/uct/btl_uct_rdma.c @@ -132,7 +132,7 @@ int mca_btl_uct_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi BTL_VERBOSE(("get issued. status = %d", ucs_status)); - uct_rkey_release (&rkey); + mca_btl_uct_rkey_release (uct_btl, &rkey); return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY; } @@ -237,7 +237,7 @@ int mca_btl_uct_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi mca_btl_uct_uct_completion_release (comp); } - uct_rkey_release (&rkey); + mca_btl_uct_rkey_release (uct_btl, &rkey); return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY; } diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h index e9b0d6b19dc..609fec91f52 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.h +++ b/opal/mca/btl/uct/btl_uct_rdma.h @@ -55,8 +55,22 @@ static inline int mca_btl_uct_get_rkey (mca_btl_uct_module_t *module, return rc; } +#if UCT_API > UCT_VERSION(1, 7) + ucs_status = uct_rkey_unpack (module->uct_component, (void *) remote_handle, rkey); +#else ucs_status = uct_rkey_unpack ((void *) remote_handle, rkey); +#endif return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR; } +static inline void mca_btl_uct_rkey_release (mca_btl_uct_module_t *uct_btl, uct_rkey_bundle_t *rkey) +{ +#if UCT_API > UCT_VERSION(1, 7) + uct_rkey_release (uct_btl->uct_component, rkey); +#else + (void) uct_btl; + uct_rkey_release (rkey); +#endif +} + #endif /* !defined(BTL_UCT_RDMA_H) */ diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index a711a41ce99..dcf00f23524 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -516,7 +516,13 @@ static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl * come up with a better estimate. */ /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */ +#if UCT_API > UCT_VERSION(1, 7) + module->super.btl_bandwidth = (uint32_t) ((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated + + MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared / + (opal_process_info.num_local_peers + 1)) / 1048576.0); +#else module->super.btl_bandwidth = (uint32_t) (MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth / 1048576.0); +#endif /* TODO -- figure out how to translate UCT latency to us */ module->super.btl_latency = 1; } From 55e01220cd213ff2b24950becc49c764dd62de17 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 5 Nov 2019 12:52:42 -0800 Subject: [PATCH 2/3] btl/uct: fix compilation for UCX 1.7.0 Ref #7128 Signed-off-by: Nathan Hjelm (cherry picked from commit a3026c016a6a8be379f62585b6ddc070175c8106) --- opal/mca/btl/uct/btl_uct.h | 3 ++- opal/mca/btl/uct/btl_uct_component.c | 14 +++++++------- opal/mca/btl/uct/btl_uct_rdma.h | 4 ++-- opal/mca/btl/uct/btl_uct_tl.c | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h index 73640103c07..0e4ec9a9498 100644 --- a/opal/mca/btl/uct/btl_uct.h +++ b/opal/mca/btl/uct/btl_uct.h @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2019 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -85,7 +86,7 @@ struct mca_btl_uct_module_t { /** array containing the am_tl and rdma_tl */ mca_btl_uct_tl_t *comm_tls[2]; -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) uct_component_h uct_component; #endif diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c index 538872860f9..bff160736bc 100644 --- a/opal/mca/btl/uct/btl_uct_component.c +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -314,7 +314,7 @@ ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsig return UCS_OK; } -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) static int mca_btl_uct_component_process_uct_md (uct_component_h component, uct_md_resource_desc_t *md_desc, char **allowed_ifaces) #else @@ -354,7 +354,7 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc md = OBJ_NEW(mca_btl_uct_md_t); -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) uct_md_config_read (component, NULL, NULL, &uct_config); uct_md_open (component, md_desc->md_name, uct_config, &md->uct_md); #else @@ -386,7 +386,7 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc return OPAL_ERR_NOT_AVAILABLE; } -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) module->uct_component = component; #endif @@ -415,7 +415,7 @@ static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc return OPAL_SUCCESS; } -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) static int mca_btl_uct_component_process_uct_component (uct_component_h component, char **allowed_ifaces) { uct_component_attr_t attr = {.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | @@ -449,7 +449,7 @@ static int mca_btl_uct_component_process_uct_component (uct_component_h componen return OPAL_SUCCESS; } -#endif /* UCT_API > UCT_VERSION(1, 7) */ +#endif /* UCT_API >= UCT_VERSION(1, 7) */ /* * UCT component initialization: @@ -485,7 +485,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, mca_btl_uct_component.module_count = 0; -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) uct_component_h *components; unsigned num_components; @@ -519,7 +519,7 @@ static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, uct_release_md_resource_list (resources); -#endif /* UCT_API > UCT_VERSION(1, 7) */ +#endif /* UCT_API >= UCT_VERSION(1, 7) */ opal_argv_free (allowed_ifaces); mca_btl_uct_modex_send (); diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h index 609fec91f52..ab790371afe 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.h +++ b/opal/mca/btl/uct/btl_uct_rdma.h @@ -55,7 +55,7 @@ static inline int mca_btl_uct_get_rkey (mca_btl_uct_module_t *module, return rc; } -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) ucs_status = uct_rkey_unpack (module->uct_component, (void *) remote_handle, rkey); #else ucs_status = uct_rkey_unpack ((void *) remote_handle, rkey); @@ -65,7 +65,7 @@ static inline int mca_btl_uct_get_rkey (mca_btl_uct_module_t *module, static inline void mca_btl_uct_rkey_release (mca_btl_uct_module_t *uct_btl, uct_rkey_bundle_t *rkey) { -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) uct_rkey_release (uct_btl->uct_component, rkey); #else (void) uct_btl; diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index dcf00f23524..e69c769b41f 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -516,7 +516,7 @@ static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl * come up with a better estimate. */ /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */ -#if UCT_API > UCT_VERSION(1, 7) +#if UCT_API >= UCT_VERSION(1, 7) module->super.btl_bandwidth = (uint32_t) ((MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.dedicated + MCA_BTL_UCT_TL_ATTR(tl, 0).bandwidth.shared / (opal_process_info.num_local_peers + 1)) / 1048576.0); From 59b24ab4f7f9856e06007d57aa0b41636def672a Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 5 Nov 2019 09:28:34 -0700 Subject: [PATCH 3/3] btl/uct: add UCT API version check to configury related to #7128 The UCX crew is no longer guaranteeing that the UCT API is going to be frozen, so this is kind of a whack-a-mole problem trying to keep the BTL UCT working with various changing UCT APIs. Signed-off-by: Howard Pritchard (cherry picked from commit 9d345d9aa000233bec148540b071cecffc94438c) --- opal/mca/btl/uct/configure.m4 | 37 +++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/opal/mca/btl/uct/configure.m4 b/opal/mca/btl/uct/configure.m4 index 82844857740..61c7c8cd0f3 100644 --- a/opal/mca/btl/uct/configure.m4 +++ b/opal/mca/btl/uct/configure.m4 @@ -16,6 +16,8 @@ # All rights reserved. # Copyright (c) 2018 Research Organization for Information Science # and Technology (RIST). All rights reserved. +# Copyright (c) 2019 Triad National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -35,6 +37,41 @@ AC_DEFUN([MCA_opal_btl_uct_CONFIG],[ OMPI_CHECK_UCX([btl_uct], [btl_uct_happy="yes"], [btl_uct_happy="no"]) +dnl +dnl check UCT version. UCT API can change at any given release +dnl so we only allow compiling against ones we know work. +dnl + AC_ARG_ENABLE([uct-version-check], + [AC_HELP_STRING([--enable-uct-version-check], + [enable UCT version check (default: enabled)])]) + AC_MSG_CHECKING([check uct version]) + if test "$enable_uct_version_check" != "no"; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + max_allowed_uct_major=1 + max_allowed_uct_minor=7 + if test "$btl_uct_happy" = "yes" && test "$enable_uct_version_check" != "no"; then + AC_MSG_CHECKING([UCT version compatibility]) + OPAL_VAR_SCOPE_PUSH([CPPFLAGS_save]) + CPPFLAGS_save="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $btl_uct_CPPFLAGS" + AC_PREPROC_IFELSE([AC_LANG_PROGRAM([#include + #if (UCT_VERNO_MAJOR > $max_allowed_uct_major) + #error "UCT MAJOR VERNO > $max_allowed_uct_major" + #endif + #if (UCT_VERNO_MINOR > $max_allowed_uct_minor) + #error "UCT MINOR VERNO > $max_allowed_uct_minor" + #endif], [])], + [AC_MSG_RESULT([UCT version compatible])], + [AC_MSG_RESULT([UCT version not compatible - need UCX $max_allowed_uct_major.$max_allowed_uct_minor or older]) + btl_uct_happy="no"]) + CPPFLAGS="$CPPFLAGS_save" + OPAL_VAR_SCOPE_POP + fi + if test "$btl_uct_happy" = "yes" ; then OPAL_VAR_SCOPE_PUSH([CPPFLAGS_save])