Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion ompi/mca/mtl/ofi/help-mtl-ofi.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
# -*- text -*-
#
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved
#
# Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[OFI call fail]
Open MPI failed an OFI Libfabric library call (%s). This is highly
unusual; your job may behave unpredictably (and/or abort) after this.

Local host: %s
Location: %s:%d
Error: %s (%zd)
44 changes: 27 additions & 17 deletions ompi/mca/mtl/ofi/mtl_ofi.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
Expand All @@ -14,6 +14,7 @@
#include "ompi/mca/mtl/mtl.h"
#include "ompi/mca/mtl/base/base.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/util/show_help.h"

#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
Expand Down Expand Up @@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void)
assert(ofi_req);
ret = ofi_req->event_callback(&wc, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request event callback: %zd",
ret);
abort();
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, ret);
fflush(stderr);
exit(1);
}
}
} else if (ret == -FI_EAVAIL) {
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
/**
* An error occured and is being reported via the CQ.
* Read the error and forward it to the upper layer.
Expand All @@ -94,26 +96,34 @@ ompi_mtl_ofi_progress(void)
&error,
0);
if (0 > ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned from fi_cq_readerr: %zd", ret);
abort();
opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
}

assert(error.op_context);
ofi_req = TO_OFI_REQ(error.op_context);
assert(ofi_req);
ret = ofi_req->error_callback(&error, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request error callback: %zd",
ret);
abort();
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, ret);
fflush(stderr);
exit(1);
}
} else {
/**
* The CQ is empty. Return.
*/
break;
if (ret == -FI_EAGAIN) {
break;
} else {
opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
}
}
}
return count;
Expand Down
96 changes: 53 additions & 43 deletions ompi/mca/mtl/ofi/mtl_ofi_component.c
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
*
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
Expand All @@ -14,6 +14,7 @@

#include "mtl_ofi.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"

static int ompi_mtl_ofi_component_open(void);
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
Expand Down Expand Up @@ -361,12 +362,16 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
NULL, /* Optional name or fabric to resolve */
NULL, /* Optional service name or port to request */
0ULL, /* Optional flag */
hints, /* In: Hints to filter providers */
hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getinfo failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
if (FI_ENODATA == -ret) {
// It is not an error if no information is returned.
goto error;
} else if (0 != ret) {
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_getinfo",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
goto error;
}

Expand All @@ -392,9 +397,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
NULL); /* Optional context for fabric events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_fabric failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_fabric",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
goto error;
}

Expand All @@ -408,9 +414,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.domain, /* Out: Domain oject */
NULL); /* Optional context for domain events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_domain failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_domain",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
goto error;
}

Expand All @@ -426,9 +433,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
NULL); /* Optional context */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_endpoint failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_endpoint",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);
goto error;
}

Expand Down Expand Up @@ -581,38 +589,40 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
int
ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
{
ssize_t ret;

opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);

/**
* * Close all the OFI objects
* */
if (fi_close((fid_t)ompi_mtl_ofi.ep)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
}
if (fi_close((fid_t)ompi_mtl_ofi.cq)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
}
if (fi_close((fid_t)ompi_mtl_ofi.av)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
}
if (fi_close((fid_t)ompi_mtl_ofi.domain)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
}
if (fi_close((fid_t)ompi_mtl_ofi.fabric)) {
opal_output(ompi_mtl_base_framework.framework_output,
"fi_close failed: %s", strerror(errno));
abort();
/* Close all the OFI objects */
if ((ret = fi_close((fid_t)ompi_mtl_ofi.ep))) {
goto finalize_err;
}

if ((ret = fi_close((fid_t)ompi_mtl_ofi.cq))) {
goto finalize_err;
}

if ((ret = fi_close((fid_t)ompi_mtl_ofi.av))) {
goto finalize_err;
}

if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) {
goto finalize_err;
}

if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) {
goto finalize_err;
}

return OMPI_SUCCESS;

finalize_err:
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_close",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), -ret);

return OMPI_ERROR;
}


Expand Down