Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e8f9c61
Update ORTE to include all fixes since v3.x branching
May 5, 2017
dea38f8
Enable full operations under SLURM on Cray systems by co-locating a d…
May 7, 2017
d2a5a98
orte/util: fix vpids parsing in orte_util_nidmap_parse()
ggouaillardet May 8, 2017
2387024
Do not pass topologies during tree spawn of daemons as there is no wa…
Apr 26, 2017
47c4f88
pmix2x: plug a misc memory leak
ggouaillardet May 10, 2017
8f82486
Fix the nidmap computation to deal with hetero nodes
May 9, 2017
6bfacc3
Sigh - remove debug
May 10, 2017
a6a9a67
Add verbose output to nidmap code for debugging as this is a new, and…
May 10, 2017
c4aae19
Finally fix the problem - the key was knowing there were more than 2 …
May 10, 2017
ff44d69
When a daemon force-terminates, we don't get the show_help message it…
May 11, 2017
71f031d
Fix total_slots_allocated computation
May 12, 2017
bcf00d0
Fix --nolocal
May 12, 2017
964201f
Remove debug
May 12, 2017
612cd66
odls: fix handling of the orte fork agent
ggouaillardet May 8, 2017
0e205d4
Add debug verbosity to the orte data server and pmix pub/lookup funct…
May 12, 2017
372ae34
Fix ompi-server operations
May 26, 2017
6a1bc30
Update the connect/accept support so we check to see if we have the p…
May 27, 2017
27e8e3b
Silence coverity warnings
May 27, 2017
46a9f7c
Update to PMIx v2.0.0rc1
May 28, 2017
b910d55
Update PMIx to 2.0.0rc1 - correcting incomplete cherry-pick
May 29, 2017
3227e17
Protect against the condition where the port string is actually NULL
May 29, 2017
73c25a8
Add some debug code for detecting leaking file descriptors. At the en…
May 29, 2017
7a4418a
Ensure that data from a job that was stored in ompi-server is purged …
May 30, 2017
b5f470e
Fix the DVM
May 30, 2017
2d5e030
Fix cwd and preload-binary options
May 30, 2017
fe5d92a
Reorg ordering so that bare executable names also are found
May 30, 2017
f94b57b
Don't sweat the "sync" settings on file descriptors as those flags ar…
May 31, 2017
ddca65a
Update PMIx to new release state
May 31, 2017
0ea1c67
Fix uninitialized variable. Set exit codes for failed launch so we ge…
May 31, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ orte/test/mpi/memcached-dummy
orte/test/mpi/coll_test
orte/test/mpi/badcoll
orte/test/mpi/iof
orte/test/mpi/no-disconnect

orte/test/system/radix
orte/test/system/sigusr_trap
Expand Down
10 changes: 10 additions & 0 deletions config/orte_check_slurm.m4
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand Down Expand Up @@ -68,6 +69,15 @@ AC_DEFUN([ORTE_CHECK_SLURM],[
[orte_check_slurm_happy="yes"],
[orte_check_slurm_happy="no"])])

# check to see if this is a Cray nativized slurm env.

slurm_cray_env=0
OPAL_CHECK_ALPS([orte_slurm_cray],
[slurm_cray_env=1])

AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
[defined to 1 if slurm cray env, 0 otherwise])

OPAL_SUMMARY_ADD([[Resource Managers]],[[Slurm]],[$1],[$orte_check_slurm_happy])
fi

Expand Down
9 changes: 8 additions & 1 deletion ompi/dpm/dpm.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
Expand All @@ -40,6 +40,7 @@
#include "opal/util/argv.h"
#include "opal/util/opal_getcwd.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"
#include "opal/dss/dss.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/pmix.h"
Expand Down Expand Up @@ -112,6 +113,12 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
if (NULL == opal_pmix.publish || NULL == opal_pmix.connect ||
NULL == opal_pmix.unpublish ||
(NULL == opal_pmix.lookup && NULL == opal_pmix.lookup_nb)) {
/* print a nice message explaining we don't have support */
opal_show_help("help-mpi-runtime.txt", "noconxcpt", true);
return OMPI_ERR_NOT_SUPPORTED;
}
if (!ompi_rte_connect_accept_support(port_string)) {
/* they will have printed the help message */
return OMPI_ERR_NOT_SUPPORTED;
}

Expand Down
5 changes: 4 additions & 1 deletion ompi/mca/rte/orte/rte_orte.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
Expand Down Expand Up @@ -116,6 +116,9 @@ static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * nam
}
#endif

/* check dynamics support */
OMPI_DECLSPEC bool ompi_rte_connect_accept_support(const char *port);

END_C_DECLS

#endif /* MCA_OMPI_RTE_ORTE_H */
45 changes: 45 additions & 0 deletions ompi/mca/rte/orte/rte_orte_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_data_server.h"
Expand Down Expand Up @@ -190,3 +191,47 @@ void ompi_rte_wait_for_debugger(void)
opal_pmix.deregister_evhandler(handler, NULL, NULL);
}
}

/**
 * Check whether this process has the runtime support required to perform
 * a connect/accept operation on the given port.
 *
 * The port string is expected to start with a process name followed by
 * a ':' separator. The job family encoded in that name is compared with
 * our own: a match means no extra infrastructure is needed, while a
 * mismatch requires that a data server URI (orte_data_server_uri) be set.
 *
 * @param port  Port string to examine; may be NULL or empty.
 *
 * @return true if the operation can proceed (or no check is possible
 *         because we have no HNP URI or no port was given); false if the
 *         port is malformed, no data server is available, or memory for
 *         the working copy could not be allocated. A help message is
 *         printed for the malformed-port and missing-server cases.
 */
bool ompi_rte_connect_accept_support(const char *port)
{
    char *ptr, *tmp;
    orte_process_name_t name;

    /* were we launched by mpirun, or are we calling
     * without a defined port? if there is no HNP URI or
     * no port, there is nothing further to verify */
    if (NULL == orte_process_info.my_hnp_uri ||
        NULL == port || '\0' == port[0]) {
        return true;
    }

    /* is the job family in the port different than my own?
     * work on a private copy since we need to truncate the
     * string at the separator and the input is const */
    tmp = strdup(port);
    if (NULL == tmp) {
        /* out of memory - we cannot verify support, so refuse
         * rather than dereference a NULL pointer below */
        return false;
    }
    if (NULL == (ptr = strchr(tmp, ':'))) {
        /* this port didn't come from us! */
        orte_show_help("help-orterun.txt", "orterun:malformedport", true);
        free(tmp);
        return false;
    }
    /* isolate the leading process-name portion */
    *ptr = '\0';
    if (ORTE_SUCCESS != orte_util_convert_string_to_process_name(&name, tmp)) {
        free(tmp);
        orte_show_help("help-orterun.txt", "orterun:malformedport", true);
        return false;
    }
    free(tmp);

    if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(name.jobid)) {
        /* same job family, so our infrastructure is adequate */
        return true;
    }

    /* if the job family of the port is different than our own
     * and we were launched by mpirun, then we require ompi-server
     * support */
    if (NULL == orte_data_server_uri) {
        /* print a pretty help message */
        orte_show_help("help-orterun.txt", "orterun:server-unavailable", true);
        return false;
    }

    return true;
}
11 changes: 11 additions & 0 deletions ompi/runtime/help-mpi-runtime.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# All rights reserved.
# Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand Down Expand Up @@ -93,3 +94,13 @@ Open MPI with --enable-heterogeneous.
[no cuda support]
The user requested CUDA support with the --mca mpi_cuda_support 1 flag
but the library was not compiled with any support.
#
[noconxcpt]
The user has called an operation involving MPI_Comm_connect and/or
MPI_Comm_accept, but this environment lacks the necessary infrastructure
support for that operation. Open MPI relies on the PMIx_Publish/Lookup
(or one of its predecessors) APIs for this operation.

This typically happens when launching outside of mpirun where the underlying
resource manager does not provide publish/lookup support. One way of solving
the problem is to simply use mpirun to start the application.
7 changes: 4 additions & 3 deletions opal/include/opal/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -94,10 +94,11 @@ enum {
OPAL_ERR_PROC_RESTART = (OPAL_ERR_BASE - 63),
OPAL_ERR_PROC_CHECKPOINT = (OPAL_ERR_BASE - 64),
OPAL_ERR_PROC_MIGRATE = (OPAL_ERR_BASE - 65),
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66)
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66),
OPAL_ERR_HEARTBEAT_ALERT = (OPAL_ERR_BASE - 67),
OPAL_ERR_FILE_ALERT = (OPAL_ERR_BASE - 68)
};

#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

#endif /* OPAL_CONSTANTS_H */

3 changes: 2 additions & 1 deletion opal/mca/pmix/base/base.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -57,6 +57,7 @@ OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);

/* State shared across the PMIx base framework; a single instance is
 * exposed as the global opal_pmix_base. */
typedef struct {
/* event base used by the framework - presumably the one supplied
 * through opal_pmix_base_set_evbase(); TODO confirm against the setter */
opal_event_base_t *evbase;
/* value of the "exchange_timeout" MCA parameter, in seconds; initialized
 * to -1 before registration. When greater than zero it is used in place
 * of the caller-supplied timeout in opal_pmix_base_exchange() */
int timeout;
} opal_pmix_base_t;

extern opal_pmix_base_t opal_pmix_base;
Expand Down
55 changes: 44 additions & 11 deletions opal/mca/pmix/base/pmix_base_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
/*
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Mellanox Technologies, Inc.
Expand Down Expand Up @@ -118,6 +118,13 @@ static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata)
cd->active = false;
}

/*
 * Completion callback for a non-blocking operation that reports only a
 * status. cbdata is the lookup_caddy_t the caller is waiting on.
 */
static void opcbfunc(int status, void *cbdata)
{
    struct lookup_caddy_t *caddy = cbdata;

    /* record the result first, then drop the flag: the waiter polls
     * on 'active' and reads 'status' once it goes false */
    caddy->status = status;
    caddy->active = false;
}

int opal_pmix_base_exchange(opal_value_t *indat,
opal_pmix_pdata_t *outdat,
int timeout)
Expand All @@ -141,17 +148,37 @@ int opal_pmix_base_exchange(opal_value_t *indat,
opal_list_append(&ilist, &info->super);

/* publish it with "session" scope */
rc = opal_pmix.publish(&ilist);
OPAL_LIST_DESTRUCT(&ilist);
if (OPAL_SUCCESS != rc) {
OPAL_ERROR_LOG(rc);
return rc;
if (NULL == opal_pmix.publish_nb) {
rc = opal_pmix.publish(&ilist);
OPAL_LIST_DESTRUCT(&ilist);
if (OPAL_SUCCESS != rc) {
OPAL_ERROR_LOG(rc);
return rc;
}
} else {
caddy.status = -1;
caddy.active = true;
caddy.pdat = NULL;
rc = opal_pmix.publish_nb(&ilist, opcbfunc, &caddy);
if (OPAL_SUCCESS != rc) {
OPAL_ERROR_LOG(rc);
OPAL_LIST_DESTRUCT(&ilist);
return rc;
}
while (caddy.active) {
usleep(10);
}
OPAL_LIST_DESTRUCT(&ilist);
if (OPAL_SUCCESS != caddy.status) {
OPAL_ERROR_LOG(caddy.status);
return caddy.status;
}
}

/* lookup the other side's info - if a non-blocking form
* of lookup isn't available, then we use the blocking
* form and trust that the underlying system will WAIT
* until the other side publishes its data */
/* lookup the other side's info - if a non-blocking form
* of lookup isn't available, then we use the blocking
* form and trust that the underlying system will WAIT
* until the other side publishes its data */
pdat = OBJ_NEW(opal_pmix_pdata_t);
pdat->value.key = strdup(outdat->value.key);
pdat->value.type = outdat->value.type;
Expand All @@ -169,7 +196,12 @@ int opal_pmix_base_exchange(opal_value_t *indat,
info = OBJ_NEW(opal_value_t);
info->key = strdup(OPAL_PMIX_TIMEOUT);
info->type = OPAL_INT;
info->data.integer = timeout;
if (0 < opal_pmix_base.timeout) {
/* the user has overridden the default */
info->data.integer = opal_pmix_base.timeout;
} else {
info->data.integer = timeout;
}
opal_list_append(&mlist, &info->super);

/* if a non-blocking version of lookup isn't
Expand All @@ -185,6 +217,7 @@ int opal_pmix_base_exchange(opal_value_t *indat,
return rc;
}
} else {
caddy.status = -1;
caddy.active = true;
caddy.pdat = pdat;
keys = NULL;
Expand Down
8 changes: 7 additions & 1 deletion opal/mca/pmix/base/pmix_base_frame.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -47,6 +47,12 @@ static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
(void) mca_base_var_register("opal", "pmix", "base", "collect_data", "Collect all data during modex",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_collect_all_data);

opal_pmix_base.timeout = -1;
(void) mca_base_var_register("opal", "pmix", "base", "exchange_timeout",
"Time (in seconds) to wait for a data exchange to complete",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_base.timeout);
return OPAL_SUCCESS;
}

Expand Down
4 changes: 2 additions & 2 deletions opal/mca/pmix/ext2x/pmix2x.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Mellanox Technologies, Inc.
Expand Down Expand Up @@ -352,7 +352,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
if (NULL != chain->final_cbfunc) {
chain->final_cbfunc(PMIX_SUCCESS, chain->final_cbdata);
}

OBJ_RELEASE(chain);

return;
Expand Down
16 changes: 14 additions & 2 deletions opal/mca/pmix/pmix2x/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,27 @@ AC_DEFUN([MCA_opal_pmix_pmix2x_CONFIG],[
opal_pmix_pmix2x_sm_flag=--disable-dstore
fi

opal_pmix_pmix2x_args="--with-pmix-symbol-rename=OPAL_MCA_PMIX2X_ $opal_pmix_pmix2x_sm_flag --without-tests-examples --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\""
# allow the user to request PMIx timing measurements - the resulting
# flag is added to the arguments given to the embedded PMIx configure
AC_ARG_ENABLE([pmix-timing],
[AC_HELP_STRING([--enable-pmix-timing],
[Enable PMIx timing measurements (default: disabled)])])
AC_MSG_CHECKING([if PMIx timing is enabled])
# compare with a single "=" as the double form is a bash extension
# that is not understood by every POSIX shell
if test "$enable_pmix_timing" = "yes"; then
AC_MSG_RESULT([yes])
opal_pmix_pmix2x_timing_flag=--enable-pmix-timing
else
AC_MSG_RESULT([no (disabled)])
opal_pmix_pmix2x_timing_flag=--disable-pmix-timing
fi

opal_pmix_pmix2x_args="--with-pmix-symbol-rename=OPAL_MCA_PMIX2X_ $opal_pmix_pmix2x_sm_flag $opal_pmix_pmix2x_timing_flag --without-tests-examples --disable-pmix-backward-compatibility --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\" --enable-embedded-mode"
AS_IF([test "$enable_debug" = "yes"],
[opal_pmix_pmix2x_args="--enable-debug $opal_pmix_pmix2x_args"
CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"],
[opal_pmix_pmix2x_args="--disable-debug $opal_pmix_pmix2x_args"
CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS"])
AS_IF([test "$with_devel_headers" = "yes"],
[opal_pmix_pmix2x_args="--with-devel-headers $opal_pmix_pmix2x_args"],
[opal_pmix_pmix2x_args="--enable-embedded-mode $opal_pmix_pmix2x_args"])
[opal_pmix_pmix2x_args=$opal_pmix_pmix2x_args])
CPPFLAGS="-I$OPAL_TOP_SRCDIR -I$OPAL_TOP_BUILDDIR -I$OPAL_TOP_SRCDIR/opal/include -I$OPAL_TOP_BUILDDIR/opal/include $CPPFLAGS"

OPAL_CONFIG_SUBDIR([$opal_pmix_pmix2x_basedir/pmix],
Expand Down
29 changes: 28 additions & 1 deletion opal/mca/pmix/pmix2x/pmix/NEWS
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
Copyright (c) 2017 IBM Corporation. All rights reserved.
$COPYRIGHT$

Additional copyrights may follow
Expand All @@ -23,6 +24,32 @@ current release as well as the "stable" bug fix release branch.
Master (not on release branches yet)
------------------------------------

1.2.2 -- 21 March 2017
----------------------
- Compiler fix for Sun/Oracle CC (PR #322)
- Fix missing include (PR #326)
- Improve error checking around posix_fallocate (PR #329)
- Fix possible memory corruption (PR #331)


1.2.1 -- 21 Feb. 2017
----------------------
- dstore: Fix data corruption bug in key overwrite cases
- dstore: Performance and scalability fixes
- sm: Use posix_fallocate() before mmap
- pmi1/pmi2: Restore support
- dstore: Fix extension slot size allocation (Issue #280)


1.2.0 -- 14 Dec. 2016
----------------------
- Add shared memory data storage (dstore) option. Default: enabled
Configure option: --disable-dstore
- PMIx_Commit performance improvements
- Disable errhandler support
- Keep job info in the shared memory dstore
- PMIx_Get performance and memory improvements

1.1.5
-----
- Add pmix_version.h to support direct detection of PMIx library version
Expand Down
Loading