Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions orte/mca/errmgr/base/help-errmgr-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand Down Expand Up @@ -61,9 +62,10 @@ route found between them. Please check network connectivity
(including firewalls and network routing requirements).
#
[node-died]
ORTE has lost communication with its daemon located on node:
ORTE has lost communication with a remote daemon.

hostname: %s
HNP daemon : %s on node %s
Remote daemon: %s on node %s

This is usually due to either a failure of the TCP network
connection to the node, or possibly an internal failure of
Expand Down
7 changes: 6 additions & 1 deletion orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* Copyright (c) 2011-2015 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -369,7 +370,11 @@ static void proc_errors(int fd, short args, void *cbdata)
/* record the first one to fail */
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
/* output an error message so the user knows what happened */
orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
orte_show_help("help-errmgr-base.txt", "node-died", true,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
orte_process_info.nodename,
ORTE_NAME_PRINT(proc),
pptr->node->name);
/* mark the daemon job as failed */
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
/* point to the lowest rank to cause the problem */
Expand Down
14 changes: 7 additions & 7 deletions orte/mca/oob/tcp/oob_tcp_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -400,27 +401,26 @@ static int tcp_component_register(void)
&mca_oob_tcp_component.disable_ipv6_family);
#endif // OPAL_ENABLE_IPV6

// Default to keepalives every 60 seconds
mca_oob_tcp_component.keepalive_time = 60;
// Wait for this amount of time before sending the first keepalive probe
mca_oob_tcp_component.keepalive_time = 300;
(void)mca_base_component_var_register(component, "keepalive_time",
"Idle time in seconds before starting to send keepalives (keepalive_time <= 0 disables keepalive functionality)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.keepalive_time);

// Default to keepalive retry interval time of 5 seconds
mca_oob_tcp_component.keepalive_intvl = 5;
// Resend the keepalive probe every keepalive_intvl seconds while the peer has not responded
mca_oob_tcp_component.keepalive_intvl = 20;
(void)mca_base_component_var_register(component, "keepalive_intvl",
"Time between successive keepalive pings when peer has not responded, in seconds (ignored if keepalive_time <= 0)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.keepalive_intvl);

// Default to retrying a keepalive 3 times before declaring the
// peer kaput
mca_oob_tcp_component.keepalive_probes = 3;
// After keepalive_probes unanswered probes, sent keepalive_intvl seconds apart, consider the connection dead
mca_oob_tcp_component.keepalive_probes = 9;
(void)mca_base_component_var_register(component, "keepalive_probes",
"Number of keepalives that can be missed before declaring error (ignored if keepalive_time <= 0)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
Expand Down