Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ opal/mca/installdirs/config/install_dirs.h
opal/mca/pmix/pmix112/pmix/include/pmix/autogen/config.h
opal/mca/pmix/pmix112/pmix/include/private/autogen/config.h
opal/mca/pmix/pmix112/pmix/include/private/autogen/config.h.in
opal/mca/pmix/pmix120/pmix/include/pmix/autogen/config.h
opal/mca/pmix/pmix120/pmix/include/private/autogen/config.h
opal/mca/pmix/pmix120/pmix/include/private/autogen/config.h.in


opal/tools/opal-checkpoint/opal-checkpoint
opal/tools/opal-checkpoint/opal-checkpoint.1
Expand Down
21 changes: 13 additions & 8 deletions config/opal_check_pmi.m4
Original file line number Diff line number Diff line change
Expand Up @@ -232,20 +232,25 @@ AC_DEFUN([OPAL_CHECK_PMIX],[

OPAL_VAR_SCOPE_PUSH([pmix_ext_install_dir])

AC_ARG_WITH([external-pmix],
[AC_HELP_STRING([--with-external-pmix(=DIR)],
[Use external PMIx support, optionally adding DIR to the search path (default: no)])],
[], with_external_pmix=no)
AC_ARG_WITH([pmix],
[AC_HELP_STRING([--with-pmix(=DIR)],
[Build PMIx support. DIR can take one of three values: "internal", "external", or a valid directory name. "internal" (or no DIR value) forces Open MPI to use its internal copy of PMIx. "external" forces Open MPI to use an external installation of PMIx. Supplying a valid directory name also forces Open MPI to use an external installation of PMIx, and adds DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries. Note that Open MPI does not support --without-pmix.])])

AC_MSG_CHECKING([if user requested PMIx support])
AS_IF([test "$with_external_pmix" = "no"],
AS_IF([test "$with_pmix" = "no"],
[AC_MSG_WARN([Open MPI requires PMIx support. It can be built])
AC_MSG_WARN([with either its own internal copy of PMIx, or with])
AC_MSG_WARN([an external copy that you supply.])
AC_MSG_ERROR([Cannot continue])])

AC_MSG_CHECKING([if user requested PMIx support($with_pmix)])
# An unset value, yes, and internal all select the bundled copy of PMIx
AS_IF([test -z "$with_pmix" || test "$with_pmix" = "yes" || test "$with_pmix" = "internal"],
[AC_MSG_RESULT([no])
opal_external_pmix_happy="no"],
[AC_MSG_RESULT([yes])
# check for external pmix lib
AS_IF([test "$with_external_pmix" == "yes" || test -z "$with_external_pmix"],
AS_IF([test "$with_pmix" = "external"],
[pmix_ext_install_dir=/usr],
[pmix_ext_install_dir=$with_external_pmix])
[pmix_ext_install_dir=$with_pmix])

# cannot use check_package because there are
# external dependencies to make the headers
Expand Down
44 changes: 32 additions & 12 deletions ompi/errhandler/errhandler.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -30,13 +31,18 @@
#include "ompi/errhandler/errhandler.h"
#include "ompi/errhandler/errhandler_predefined.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/pmix/pmix.h"


/*
* Table for Fortran <-> C errhandler handle conversion
*/
opal_pointer_array_t ompi_errhandler_f_to_c_table = {{0}};

/*
* default errhandler id
*
* Starts at -1 and is overwritten with the errhandler_ref value handed to
* ompi_errhandler_registration_callback(). ompi_errhandler_finalize()
* passes it to opal_pmix.deregister_errhandler().
*/
static int default_errhandler_id = -1;

/*
* Class information
Expand Down Expand Up @@ -157,6 +163,7 @@ int ompi_errhandler_finalize(void)

/* JMS Add stuff here checking for unreleased errorhandlers,
similar to communicators, info handles, etc. */
opal_pmix.deregister_errhandler(default_errhandler_id, NULL, NULL);

/* Remove errhandler F2C table */

Expand All @@ -169,7 +176,7 @@ int ompi_errhandler_finalize(void)


ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
ompi_errhandler_generic_handler_fn_t *func,
ompi_errhandler_generic_handler_fn_t *func,
ompi_errhandler_lang_t lang)
{
ompi_errhandler_t *new_errhandler;
Expand Down Expand Up @@ -213,20 +220,33 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
return new_errhandler;
}

/*
 * Registration callback for the default errhandler.
 *
 * Records the reference assigned to the handler and reports the outcome
 * of the registration through the tracker supplied as cbdata.
 *
 * status          result of the registration request
 * errhandler_ref  reference saved in default_errhandler_id
 * cbdata          pointer to the ompi_errhandler_errtrk_t to update
 */
void ompi_errhandler_registration_callback(int status,
                                           int errhandler_ref,
                                           void *cbdata)
{
    ompi_errhandler_errtrk_t *tracker = cbdata;

    default_errhandler_id = errhandler_ref;

    /* store the result first, then clear the flag that
     * ompi_mpi_init hands to OMPI_WAIT_FOR_COMPLETION */
    tracker->status = status;
    tracker->active = false;
}

/**
* Default runtime errhandler callback
* Default errhandler callback
*/
int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors) {
ompi_rte_error_report_t *err;
int errcode = 1;

if (NULL != errors &&
(NULL != (err = (ompi_rte_error_report_t*)opal_pointer_array_get_item(errors, 0)))) {
errcode = err->errcode;
/*
 * Default errhandler callback: gives the caller the chance to release
 * its data and then aborts.
 *
 * status  value passed to ompi_mpi_abort() on MPI_COMM_WORLD
 * procs   not examined by this handler
 * info    not examined by this handler
 * cbfunc  release function, invoked with cbdata when it is not NULL
 * cbdata  opaque pointer handed back to cbfunc
 */
void ompi_errhandler_callback(int status,
opal_list_t *procs,
opal_list_t *info,
opal_pmix_release_cbfunc_t cbfunc,
void *cbdata)
{
/* allow the caller to release its data */
if (NULL != cbfunc) {
cbfunc(cbdata);
}

/* NOTE(review): the next two lines use errcode and return a value from a
 * void function; they look like leftovers of the removed
 * ompi_errhandler_runtime_callback - confirm before relying on them */
ompi_mpi_abort(MPI_COMM_WORLD, errcode);
return OMPI_SUCCESS;
/* our default action is to abort */
ompi_mpi_abort(MPI_COMM_WORLD, status);
}

/**************************************************************************
Expand Down
37 changes: 19 additions & 18 deletions ompi/errhandler/errhandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -30,6 +31,7 @@
#include "opal/prefetch.h"
#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/pmix/pmix.h"

#include "ompi/mca/rte/rte.h"
#include "ompi/runtime/mpiruntime.h"
Expand Down Expand Up @@ -364,29 +366,28 @@ struct ompi_request_t;
ompi_errhandler_lang_t language);

/**
* Callback function from runtime layer to alert the MPI layer of an error at
* the runtime layer.
*
* @param errors A pointer array containing structs of type
* ompi_rte_error_report_t that consists of at least
* {
* ompi_process_name_t proc;
* int errcode;
* }
* Each RTE is allowed to add additional information
* as required
* Callback function to alert the MPI layer of an error or notification
* from the internal RTE and/or the resource manager.
*
* This function is used to alert the MPI layer to a specific fault detected by the
* runtime layer. This could be a process failure, a lost connection, or the inability
* runtime layer or host RM. This could be a process failure, a lost connection, or the inability
* to send an OOB message. The MPI layer has the option to perform whatever actions it
* needs to stabilize itself and continue running, abort, etc.
*
* Upon completion, the error handler should return OMPI_SUCCESS if the error has
* been resolved and no further callbacks are to be executed. Return of any other
* value will cause the RTE to continue executing error callbacks.
*/
OMPI_DECLSPEC int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors);

/* Tracker used by ompi_mpi_init to wait for the default errhandler
 * registration to finish and to learn its result */
typedef struct {
/* set to true before registering; set to false by
 * ompi_errhandler_registration_callback() */
volatile bool active;
/* registration result written by ompi_errhandler_registration_callback() */
int status;
} ompi_errhandler_errtrk_t;

/**
 * Default errhandler callback.
 *
 * @param status  Value passed on to ompi_mpi_abort()
 * @param procs   Not examined by the default handler
 * @param info    Not examined by the default handler
 * @param cbfunc  Release function; called with cbdata when not NULL
 * @param cbdata  Opaque pointer handed back to cbfunc
 */
OMPI_DECLSPEC void ompi_errhandler_callback(int status,
opal_list_t *procs,
opal_list_t *info,
opal_pmix_release_cbfunc_t cbfunc,
void *cbdata);

/**
 * Callback invoked with the result of registering the default errhandler.
 *
 * @param status          Result of the registration; copied into the tracker
 * @param errhandler_ref  Reference for the registered handler; saved so it
 *                        can be passed to deregister_errhandler at finalize
 * @param cbdata          Pointer to an ompi_errhandler_errtrk_t; its status
 *                        is set and its active flag is cleared
 */
OMPI_DECLSPEC void ompi_errhandler_registration_callback(int status,
int errhandler_ref,
void *cbdata);
/**
* Check to see if an errhandler is intrinsic.
*
Expand Down
7 changes: 1 addition & 6 deletions ompi/mca/rte/orte/rte_orte.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -83,12 +84,6 @@ typedef orte_local_rank_t ompi_local_rank_t;
OMPI_DECLSPEC void __opal_attribute_noreturn__
ompi_rte_abort(int error_code, char *fmt, ...);
#define ompi_rte_abort_peers(a, b, c) orte_errmgr.abort_peers(a, b, c)
#define OMPI_RTE_ERRHANDLER_FIRST ORTE_ERRMGR_CALLBACK_FIRST
#define OMPI_RTE_ERRHANDLER_LAST ORTE_ERRMGR_CALLBACK_LAST
#define OMPI_RTE_ERRHANDLER_PREPEND ORTE_ERRMGR_CALLBACK_PREPEND
#define OMPI_RTE_ERRHANDLER_APPEND ORTE_ERRMGR_CALLBACK_APPEND
typedef orte_error_t ompi_rte_error_report_t;
#define ompi_rte_register_errhandler(a, b) orte_errmgr.register_error_callback(a, b)
#define OMPI_ERROR_LOG ORTE_ERROR_LOG

/* Init and finalize objects and operations */
Expand Down
4 changes: 1 addition & 3 deletions ompi/mca/rte/rte.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights reserved.
* Copyright (c) 2013 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
Expand Down Expand Up @@ -111,8 +111,6 @@
* 2. int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs) -
* Abort the specified list of peers
* 3. OMPI_ERROR_LOG(rc) - print error message regarding the given return code
* 4. ompi_rte_register_errhandler - register a callback function for the RTE
* to report asynchronous errors to the caller
*
* (e) Init and finalize objects and operations
* 1. ompi_rte_init - a function to initialize the RTE. The function
Expand Down
18 changes: 13 additions & 5 deletions ompi/runtime/ompi_mpi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
size_t nprocs;
char *error = NULL;
char *cmd=NULL, *av=NULL;
ompi_errhandler_errtrk_t errtrk;
OPAL_TIMING_DECLARE(tm);
OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);

Expand Down Expand Up @@ -504,11 +505,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
}
}

/* Register the default errhandler callback - RTE will ignore if it
* doesn't support this capability
*/
ompi_rte_register_errhandler(ompi_errhandler_runtime_callback,
OMPI_RTE_ERRHANDLER_LAST);
/* Register the default errhandler callback */
errtrk.status = OPAL_ERROR;
errtrk.active = true;
opal_pmix.register_errhandler(NULL, ompi_errhandler_callback,
ompi_errhandler_registration_callback,
(void*)&errtrk);
OMPI_WAIT_FOR_COMPLETION(errtrk.active);
if (OPAL_SUCCESS != errtrk.status) {
error = "Error handler registration";
ret = errtrk.status;
goto error;
}

/* Figure out the final MPI thread levels. If we were not
compiled for support for MPI threads, then don't allow
Expand Down
15 changes: 15 additions & 0 deletions opal/mca/hwloc/external/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,21 @@ AC_DEFUN([MCA_opal_hwloc_external_CONFIG],[
[AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])
AC_MSG_ERROR([Cannot continue])])
AC_MSG_CHECKING([if external hwloc version is lower than 2.0])
AS_IF([test "$opal_hwloc_dir" != ""],
[opal_hwloc_external_CFLAGS_save=$CFLAGS
CFLAGS="-I$opal_hwloc_dir/include $opal_hwloc_external_CFLAGS_save"])
AC_COMPILE_IFELSE(
[AC_LANG_PROGRAM([[#include <hwloc.h>]],
[[
#if HWLOC_API_VERSION >= 0x00020000
#error "hwloc API version is greater or equal than 0x00020000"
#endif
]])],
[AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])
AC_MSG_ERROR([OMPI does not currently support hwloc v2 API
Cannot continue])])
AS_IF([test "$opal_hwloc_dir" != ""],
[CFLAGS=$opal_hwloc_external_CFLAGS_save])
$1],
Expand Down
12 changes: 9 additions & 3 deletions opal/mca/pmix/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,17 @@ OPAL_DECLSPEC int opal_pmix_base_select(void);

OPAL_DECLSPEC extern bool opal_pmix_base_allow_delayed_server;

OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err);
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(void);
/* Base-level errhandler registration: stores errhandler as the handler
 * used by opal_pmix_base_errhandler() and, when cbfunc is not NULL,
 * calls it with OPAL_SUCCESS, reference 0 and cbdata. The info list is
 * not examined by the base implementation. */
OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_list_t *info,
opal_pmix_notification_fn_t errhandler,
opal_pmix_errhandler_reg_cbfunc_t cbfunc,
void *cbdata);
/* Base-level errhandler deregistration: clears the stored handler and,
 * when cbfunc is not NULL, calls it with OPAL_SUCCESS and cbdata. The
 * errhandler reference is not examined by the base implementation. */
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(int errhandler,
opal_pmix_op_cbfunc_t cbfunc,
void *cbdata);
OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
opal_list_t *procs,
opal_list_t *info);
opal_list_t *info,
opal_pmix_release_cbfunc_t cbfunc, void *cbdata);
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
opal_pmix_pdata_t *pdat,
int timeout);
Expand Down
28 changes: 21 additions & 7 deletions opal/mca/pmix/base/pmix_base_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,26 +38,40 @@

#define OPAL_PMI_PAD 10

/******** ERRHANDLER SUPPORT ********/
static opal_pmix_errhandler_fn_t errhandler = NULL;

void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err)
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
******** DO NOT NATIVELY SUPPORT IT
********/
static opal_pmix_notification_fn_t errhandler = NULL;

/*
 * Install err as the handler invoked by opal_pmix_base_errhandler().
 *
 * info    not examined here
 * err     notification function to store; replaces any previous one
 * cbfunc  optional completion callback
 * cbdata  opaque pointer handed back to cbfunc
 */
void opal_pmix_base_register_handler(opal_list_t *info,
                                     opal_pmix_notification_fn_t err,
                                     opal_pmix_errhandler_reg_cbfunc_t cbfunc,
                                     void *cbdata)
{
    errhandler = err;

    if (NULL == cbfunc) {
        return;
    }

    /* registration cannot fail here; the reference reported is always 0 */
    cbfunc(OPAL_SUCCESS, 0, cbdata);
}

/*
 * Forward a notification to the handler stored by
 * opal_pmix_base_register_handler(); nothing is called when no handler
 * is stored.
 *
 * NOTE(review): this view shows two variants of the parameter list and
 * of the errhandler call (one-argument and five-argument) - confirm
 * which one belongs to the current function.
 */
void opal_pmix_base_errhandler(int status,
opal_list_t *procs,
opal_list_t *info)
opal_list_t *info,
opal_pmix_release_cbfunc_t cbfunc, void *cbdata)
{
if (NULL != errhandler) {
errhandler(status);
errhandler(status, procs, info, cbfunc, cbdata);
}
}

void opal_pmix_base_deregister_handler(void)
/*
 * Remove the handler stored by opal_pmix_base_register_handler().
 *
 * errid   not examined here; only one handler is ever stored
 * cbfunc  optional completion callback
 * cbdata  opaque pointer handed back to cbfunc
 */
void opal_pmix_base_deregister_handler(int errid,
                                       opal_pmix_op_cbfunc_t cbfunc,
                                       void *cbdata)
{
    errhandler = NULL;

    if (NULL == cbfunc) {
        return;
    }

    /* deregistration cannot fail here */
    cbfunc(OPAL_SUCCESS, cbdata);
}

struct lookup_caddy_t {
Expand Down
6 changes: 2 additions & 4 deletions opal/mca/pmix/external/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,5 @@ noinst_LTLIBRARIES = $(component_noinst)
libmca_pmix_external_la_SOURCES =$(sources)
libmca_pmix_external_la_CFLAGS =
libmca_pmix_external_la_CPPFLAGS = $(opal_pmix_ext_CPPFLAGS)
libmca_pmix_external_la_LDFLAGS = -module -avoid-version -L$(opal_pmix_ext_LDFLAGS)
libmca_pmix_external_la_LIBADD = $(opal_pmix_ext_LIBS) \
$(OPAL_TOP_BUILDDIR)/opal/mca/event/lib@OPAL_LIB_PREFIX@mca_event.la \
$(OPAL_TOP_BUILDDIR)/opal/mca/hwloc/lib@OPAL_LIB_PREFIX@mca_hwloc.la
libmca_pmix_external_la_LDFLAGS = -module -avoid-version $(opal_pmix_ext_LDFLAGS)
libmca_pmix_external_la_LIBADD = $(opal_pmix_ext_LIBS)
Loading