From 1c9e381752893d4448d9f51d964b25ff60c91730 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 1 Oct 2016 15:22:26 -0700 Subject: [PATCH 1/4] usnic: print a helpful message invoke PML error callback The previous message was unhelpful / confusing. Signed-off-by: Jeff Squyres (cherry picked from commit b13813810fabe181bb1f1b81db8bb7049c0c6456) --- opal/mca/btl/usnic/btl_usnic_util.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/opal/mca/btl/usnic/btl_usnic_util.c b/opal/mca/btl/usnic/btl_usnic_util.c index 9c1db480cd4..17eeb7650db 100644 --- a/opal/mca/btl/usnic/btl_usnic_util.c +++ b/opal/mca/btl/usnic/btl_usnic_util.c @@ -34,6 +34,8 @@ void opal_btl_usnic_exit(opal_btl_usnic_module_t *module) } /* If we didn't find a PML error callback, just exit. */ if (NULL == module) { + fprintf(stderr, "*** The Open MPI usnic BTL is aborting the MPI job (via exit(3)).\n"); + fflush(stderr); exit(1); } } @@ -47,7 +49,7 @@ void opal_btl_usnic_exit(opal_btl_usnic_module_t *module) module->pml_error_callback(&module->super, MCA_BTL_ERROR_FLAGS_FATAL, (opal_proc_t*) opal_proc_local_get(), - "usnic"); + "The usnic BTL is aborting the MPI job (via PML error callback)."); } /* If the PML error callback returns (or if there wasn't one), From 1b0cca02d3d3cd3b3da87c344666d84ba93ff0d2 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 3 Oct 2016 11:48:41 -0700 Subject: [PATCH 2/4] usnic: require libfabric >= v1.3 at run time There are critical usnic libfabric AV insert bugs before v1.3, so don't allow any version prior to v1.3 at run time (still allow *compiling* with earlier versions, though, since the ABI guarantees allow us to compile with an earlier libfabric and run with a later libfabric). Switch to using fi_version() to check the version (instead of calling fi_getinfo()) as a potentially lighter-weight / simpler solution. This allows us to only call fi_getinfo() once. Signed-off-by: Jeff Squyres (cherry picked from commit 345c07a252651783633d01db66fd950e4cf12ac1) --- opal/mca/btl/usnic/btl_usnic_component.c | 129 +++++++++++++---------- 1 file changed, 76 insertions(+), 53 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 2a372964e42..dcbb04b3f15 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -590,25 +590,6 @@ static void free_filter(usnic_if_filter_t *filter) free(filter); } -static int do_fi_getinfo(uint32_t version, struct fi_info **info_list) -{ - struct fi_info hints = {0}; - struct fi_ep_attr ep_attr = {0}; - struct fi_fabric_attr fabric_attr = {0}; - - /* We only want providers named "usnic" that are of type EP_DGRAM */ - fabric_attr.prov_name = "usnic"; - ep_attr.type = FI_EP_DGRAM; - - hints.caps = FI_MSG; - hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX; - hints.addr_format = FI_SOCKADDR; - hints.ep_attr = &ep_attr; - hints.fabric_attr = &fabric_attr; - - return fi_getinfo(version, NULL, 0, 0, &hints, info_list); -} - /* * UD component initialization: * (1) read interface list from kernel and compare against component @@ -652,40 +633,61 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t); - /* This code understands libfabric API versions v1.0, v1.1, and - v1.4. Even if we were compiled with libfabric API v1.0, we - still want to request v1.1 -- here's why: - - - In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did - not check the value of the "version" parameter passed into - fi_getinfo() - - - If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API - v1.1), the usnic provider will disable FI_MSG_PREFIX support - (on the assumption that the application will not handle - FI_MSG_PREFIX properly). This can happen if you compile OMPI - against libfabric v1.0.0 (i.e., API v1.0) and run OMPI - against libfabric v1.1.0 (i.e., API v1.1). - - So never request API v1.0 -- always request a minimum of - v1.1. - - The usnic provider changed the strings in the fabric and domain - names in API v1.4. With API <= v1.3: + /* There are multiple dimensions to consider when requesting an + API version number from libfabric: + + 1. This code understands libfabric API versions v1.0 through + v1.4. + + 2. Open MPI may be *compiled* against one version of libfabric, + but may be *running* with another. + + 3. There were usnic-specific bugs in Libfabric prior to + libfabric v1.3.0 (where "v1.3.0" is the tarball/package + version, not the API version; but happily, the API version + was also 1.3 in Libfabric v1.3.0): + + - In libfabric v1.0.0 (i.e., API v1.0), the usnic provider + did not check the value of the "version" parameter passed + into fi_getinfo() + - If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API + v1.1), the usnic provider will disable FI_MSG_PREFIX + support (on the assumption that the application will not + handle FI_MSG_PREFIX properly). This can happen if you + compile OMPI against libfabric v1.0.0 (i.e., API v1.0) and + run OMPI against libfabric v1.1.0 (i.e., API v1.1). + - Some critical AV bug fixes were included in libfabric + v1.3.0; prior versions can fail in fi_av_* operations in + unexpected ways (libnl: you win again!). + + So always request a minimum API version of v1.3. + + Note that the FI_MAJOR_VERSION and FI_MINOR_VERSION in + represent the API version, not the Libfabric + package (i.e., tarball) version. As of Libfabric v1.3, there + is currently no way to know a) what package version of + Libfabric you were compiled against, and b) what package + version of Libfabric you are running with. + + Also note that the usnic provider changed the strings in the + fabric and domain names in API v1.4. With API <= v1.3: - fabric name is "usnic_X" (device name) - domain name is NULL - With libfabric API >= v1.4: + With libfabric API >= v1.4, all Libfabric IP-based providers + (including usnic) follow the same convention: - fabric name is "a.b.c.d/e" (CIDR notation of network) - domain name is "usnic_X" (device name) NOTE: The configure.m4 in this component will require libfabric - >= v1.1.0 (i.e., it won't accept v1.0.0) because of a critical - bug in the usnic provider in libfabric v1.0.0. However, the - compatibility code with libfabric v1.0.0 in the usNIC BTL has - been retained, for two reasons: + >= v1.1.0 (i.e., it won't accept v1.0.0) because it needs + access to the usNIC extension header structures that only + became available in v1.1.0. + + All that being said, the compatibility code with libfabric + v1.0.0 in the usNIC BTL has been retained, for two reasons: 1. It's not harmful, nor overly complicated. So the compatibility code was not ripped out. @@ -695,19 +697,40 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, Someday, #2 may no longer be true, and we may therefore rip out the libfabric v1.0.0 compatibility code. */ - /* First try API version 1.4. If that doesn't work, try API - version 1.1. */ + /* First, check to see if the libfabric we are running with is <= + libfabric v1.3. If so, don't bother going further. */ uint32_t libfabric_api; - libfabric_api = FI_VERSION(1, 4); - ret = do_fi_getinfo(libfabric_api, &info_list); - // Libfabric core will return -FI_ENOSYS if it is too old - if (-FI_ENOSYS == ret) { - libfabric_api = FI_VERSION(1, 1); - ret = do_fi_getinfo(libfabric_api, &info_list); + libfabric_api = fi_version(); + if (libfabric_api < FI_VERSION(1, 3)) { + opal_output_verbose(5, USNIC_OUT, + "btl:usnic: disqualifiying myself because Libfabric does not support v1.3 of the API (v1.3 is *required* for correct usNIC functionality)."); + return NULL; + } + + /* Libfabric API 1.3 is fine. Above that, we know that Open MPI + works with libfabric API v1.4, so just use that. */ + if (libfabric_api > FI_VERSION(1, 3)) { + libfabric_api = FI_VERSION(1, 4); } + + struct fi_info hints = {0}; + struct fi_ep_attr ep_attr = {0}; + struct fi_fabric_attr fabric_attr = {0}; + + /* We only want providers named "usnic" that are of type EP_DGRAM */ + fabric_attr.prov_name = "usnic"; + ep_attr.type = FI_EP_DGRAM; + + hints.caps = FI_MSG; + hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX; + hints.addr_format = FI_SOCKADDR; + hints.ep_attr = &ep_attr; + hints.fabric_attr = &fabric_attr; + + ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list); if (0 != ret) { opal_output_verbose(5, USNIC_OUT, - "btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret); + "btl:usnic: disqualifiying myself due to fi_getinfo(3) failure: %s (%d)", strerror(-ret), ret); return NULL; } From fd79d4d5e7378ba6ad4981cbf375e1b5c528685e Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 3 Oct 2016 11:58:08 -0700 Subject: [PATCH 3/4] usnic: remove some legacy libfabric 1.0/1.1 code We only support running with libfabric v1.3 or greater. So it's safe to remove the legacy/adaptive cq_readerr() behavior. Signed-off-by: Jeff Squyres (cherry picked from commit 8b77359cacd04b0440519c6026a23907471cff31) --- opal/mca/btl/usnic/btl_usnic.h | 7 ---- opal/mca/btl/usnic/btl_usnic_component.c | 47 +++--------------------- 2 files changed, 6 insertions(+), 48 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic.h b/opal/mca/btl/usnic/btl_usnic.h index e2acf3b61d1..e8f6dafa2de 100644 --- a/opal/mca/btl/usnic/btl_usnic.h +++ b/opal/mca/btl/usnic/btl_usnic.h @@ -236,13 +236,6 @@ typedef struct opal_btl_usnic_component_t { the prefix is non-NULL) */ char *connectivity_map_prefix; - /** Expected return value from fi_cq_readerr() upon success. In - libfabric v1.0.0 / API v1.0, the usnic provider returned - sizeof(fi_cq_err_entry) upon success. In libfabric >=v1.1 / - API >=v1.1, the usnic provider returned 1 upon success. */ - ssize_t cq_readerr_success_value; - ssize_t cq_readerr_try_again_value; - /** Offset into the send buffer where the payload will go. For libfabric v1.0.0 / API v1.0, this is 0. For libfabric >=v1.1 / API >=v1.1, this is the endpoint.msg_prefix_size (i.e., diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index dcbb04b3f15..8ca8518d355 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -636,7 +636,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, /* There are multiple dimensions to consider when requesting an API version number from libfabric: - 1. This code understands libfabric API versions v1.0 through + 1. This code understands libfabric API versions v1.3 through v1.4. 2. Open MPI may be *compiled* against one version of libfabric, @@ -684,18 +684,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, NOTE: The configure.m4 in this component will require libfabric >= v1.1.0 (i.e., it won't accept v1.0.0) because it needs access to the usNIC extension header structures that only - became available in v1.1.0. - - All that being said, the compatibility code with libfabric - v1.0.0 in the usNIC BTL has been retained, for two reasons: - - 1. It's not harmful, nor overly complicated. So the - compatibility code was not ripped out. - 2. At least some versions of Cisco Open MPI are shipping with - an embedded (libfabric v1.0.0+critical bug fix). - - Someday, #2 may no longer be true, and we may therefore rip out - the libfabric v1.0.0 compatibility code. */ + became available in v1.1.0.*/ /* First, check to see if the libfabric we are running with is <= libfabric v1.3. If so, don't bother going further. */ @@ -761,29 +750,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, opal_output_verbose(5, USNIC_OUT, "btl:usnic: usNIC fabrics found"); - /* Due to ambiguities in documentation, in libfabric v1.0.0 (i.e., - API v1.0) the usnic provider returned sizeof(struct - fi_cq_err_entry) from fi_cq_readerr() upon success. - - The ambiguities were clarified in libfabric v1.1.0 (i.e., API - v1.1); the usnic provider returned 1 from fi_cq_readerr() upon - success. - - So query to see what version of the libfabric API we are - running with, and adapt accordingly. */ - libfabric_api = fi_version(); - if (1 == FI_MAJOR(libfabric_api) && - 0 == FI_MINOR(libfabric_api)) { - // Old fi_cq_readerr() behavior: success=sizeof(...), try again=0 - mca_btl_usnic_component.cq_readerr_success_value = - sizeof(struct fi_cq_err_entry); - mca_btl_usnic_component.cq_readerr_try_again_value = 0; - } else { - // New fi_cq_readerr() behavior: success=1, try again=-FI_EAGAIN - mca_btl_usnic_component.cq_readerr_success_value = 1; - mca_btl_usnic_component.cq_readerr_try_again_value = -FI_EAGAIN; - } - opal_proc_t *me = opal_proc_local_get(); opal_process_name_t *name = &(me->proc_name); mca_btl_usnic_component.my_hashed_rte_name = @@ -1279,12 +1245,11 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, } rc = fi_cq_readerr(channel->cq, &err_entry, 0); - if (rc == mca_btl_usnic_component.cq_readerr_try_again_value) { + if (rc == -FI_EAGAIN) { return; - } else if (rc != mca_btl_usnic_component.cq_readerr_success_value) { - BTL_ERROR(("%s: cq_readerr ret = %d (expected %d)", - module->linux_device_name, rc, - (int) mca_btl_usnic_component.cq_readerr_success_value)); + } else if (rc != 1) { + BTL_ERROR(("%s: cq_readerr ret = %d (expected 1)", + module->linux_device_name, rc)); channel->chan_error = true; } From 7aa4fbfe280ab463df397b0a9c22eb5ce7a08850 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 4 Oct 2016 18:17:05 -0700 Subject: [PATCH 4/4] usnic: fix one last stray fabric_attr->name --> linux_device_name Signed-off-by: Jeff Squyres (cherry picked from commit 67684be7c9ae0b96aa0e09d02482b377ea7b046d) --- opal/mca/btl/usnic/btl_usnic_module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 3f896f832ef..6ca1262467c 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -1684,7 +1684,7 @@ static int create_ep(opal_btl_usnic_module_t* module, } opal_output_verbose(15, USNIC_OUT, "btl:usnic:create_ep:%s: new usnic local endpoint channel %s: %s:%d", - module->fabric_info->fabric_attr->name, + module->linux_device_name, str, inet_ntoa(sin->sin_addr), ntohs(sin->sin_port)); @@ -2132,7 +2132,7 @@ static int init_mpool(opal_btl_usnic_module_t *module) "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "create rcache", __FILE__, __LINE__); return OPAL_ERROR; }