diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt index 2338d548f01..8131766ae00 100644 --- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt +++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt @@ -1,10 +1,18 @@ # -*- text -*- # -# Copyright (c) 2013-2015 Intel, Inc. All rights reserved +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved # +# Copyright (c) 2017 Cisco Systems, Inc. All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow # # $HEADER$ # +[OFI call fail] +Open MPI failed an OFI Libfabric library call (%s). This is highly +unusual; your job may behave unpredictably (and/or abort) after this. + + Local host: %s + Location: %s:%d + Error: %s (%zd) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 1128aca3d26..0ee125c7966 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved * * $COPYRIGHT$ * @@ -14,6 +14,7 @@ #include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/base/base.h" #include "opal/datatype/opal_convertor.h" +#include "opal/util/show_help.h" #include #include @@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void) assert(ofi_req); ret = ofi_req->event_callback(&wc, ofi_req); if (OMPI_SUCCESS != ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned by request event callback: %zd", - ret); - abort(); + opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, ret); + fflush(stderr); + exit(1); } } - } else if (ret == -FI_EAVAIL) { + } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { /** * An error occured and is being reported via the CQ. * Read the error and forward it to the upper layer. @@ -94,9 +96,11 @@ ompi_mtl_ofi_progress(void) &error, 0); if (0 > ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned from fi_cq_readerr: %zd", ret); - abort(); + opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, fi_strerror(-ret), ret); + fflush(stderr); + exit(1); } assert(error.op_context); @@ -104,16 +108,22 @@ ompi_mtl_ofi_progress(void) assert(ofi_req); ret = ofi_req->error_callback(&error, ofi_req); if (OMPI_SUCCESS != ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned by request error callback: %zd", - ret); - abort(); + opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, ret); + fflush(stderr); + exit(1); } } else { - /** - * The CQ is empty. Return. - */ - break; + if (ret == -FI_EAGAIN) { + break; + } else { + opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, fi_strerror(-ret), ret); + fflush(stderr); + exit(1); + } } } return count; diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 6b7058434df..89a71ed7b63 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -1,8 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved * - * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -14,6 +14,7 @@ #include "mtl_ofi.h" #include "opal/util/argv.h" +#include "opal/util/show_help.h" static int ompi_mtl_ofi_component_open(void); static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority); @@ -361,12 +362,16 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, NULL, /* Optional name or fabric to resolve */ NULL, /* Optional service name or port to request */ 0ULL, /* Optional flag */ - hints, /* In: Hints to filter providers */ + hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ - if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_getinfo failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + if (FI_ENODATA == -ret) { + // It is not an error if no information is returned. + goto error; + } else if (0 != ret) { + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_getinfo", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), -ret); goto error; } @@ -392,9 +397,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.fabric, /* Out: Fabric handle */ NULL); /* Optional context for fabric events */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_fabric failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_fabric", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), -ret); goto error; } @@ -408,9 +414,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.domain, /* Out: Domain oject */ NULL); /* Optional context for domain events */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_domain failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_domain", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), -ret); goto error; } @@ -426,9 +433,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.ep, /* Out: Endpoint object */ NULL); /* Optional context */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_endpoint failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_endpoint", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), -ret); goto error; } @@ -581,38 +589,40 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, int ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) { + ssize_t ret; + opal_progress_unregister(ompi_mtl_ofi_progress_no_inline); - /** - * * Close all the OFI objects - * */ - if (fi_close((fid_t)ompi_mtl_ofi.ep)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.cq)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.av)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.domain)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.fabric)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); + /* Close all the OFI objects */ + if ((ret = fi_close((fid_t)ompi_mtl_ofi.ep))) { + goto finalize_err; + } + + if ((ret = fi_close((fid_t)ompi_mtl_ofi.cq))) { + goto finalize_err; + } + + if ((ret = fi_close((fid_t)ompi_mtl_ofi.av))) { + goto finalize_err; + } + + if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) { + goto finalize_err; + } + + if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) { + goto finalize_err; } return OMPI_SUCCESS; + +finalize_err: + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_close", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), -ret); + + return OMPI_ERROR; }