From 8e165c81f83623f8365fb6228f06715ec93b2435 Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Wed, 15 Feb 2017 16:35:37 -0500 Subject: [PATCH 1/2] orte/errmgr: Improve help message on connection lost Signed-off-by: Joshua Hursey (cherry picked from commit c452f6849552e2454708cbf9bcd5189b7643d721) Signed-off-by: Joshua Hursey --- orte/mca/errmgr/base/help-errmgr-base.txt | 6 ++++-- orte/mca/errmgr/default_hnp/errmgr_default_hnp.c | 7 ++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/orte/mca/errmgr/base/help-errmgr-base.txt b/orte/mca/errmgr/base/help-errmgr-base.txt index c7e3051bb95..4aec50c04d4 100644 --- a/orte/mca/errmgr/base/help-errmgr-base.txt +++ b/orte/mca/errmgr/base/help-errmgr-base.txt @@ -11,6 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -61,9 +62,10 @@ route found between them. Please check network connectivity (including firewalls and network routing requirements). # [node-died] -ORTE has lost communication with its daemon located on node: +ORTE has lost communication with a remote daemon. - hostname: %s + HNP daemon : %s on node %s + Remote daemon: %s on node %s This is usually due to either a failure of the TCP network connection to the node, or possibly an internal failure of diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index a29d80e9b0a..fcdbe3acc30 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -10,6 +10,7 @@ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -369,7 +370,11 @@ static void proc_errors(int fd, short args, void *cbdata) /* record the first one to fail */ if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* output an error message so the user knows what happened */ - orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name); + orte_show_help("help-errmgr-base.txt", "node-died", true, + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_process_info.nodename, + ORTE_NAME_PRINT(proc), + pptr->node->name); /* mark the daemon job as failed */ jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ From 353f72045a3d78325dcfd41f1fd7ed09caf4c9a0 Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Fri, 17 Feb 2017 10:56:02 -0600 Subject: [PATCH 2/2] oob/tcp: Adjust TCP keepalive default values Signed-off-by: Joshua Hursey (cherry picked from commit df0f8e95cdd449b9408da6744c51bb0fc6233b76) Signed-off-by: Joshua Hursey --- orte/mca/oob/tcp/oob_tcp_component.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 45580a61b90..85cad6c83d6 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -16,6 +16,7 @@ * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -400,8 +401,8 @@ static int tcp_component_register(void) &mca_oob_tcp_component.disable_ipv6_family); #endif // OPAL_ENABLE_IPV6 - // Default to keepalives every 60 seconds - mca_oob_tcp_component.keepalive_time = 60; + // Wait for this amount of time before sending the first keepalive probe + mca_oob_tcp_component.keepalive_time = 300; (void)mca_base_component_var_register(component, "keepalive_time", "Idle time in seconds before starting to send keepalives (keepalive_time <= 0 disables keepalive functionality)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -409,8 +410,8 @@ static int tcp_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_time); - // Default to keepalive retry interval time of 5 seconds - mca_oob_tcp_component.keepalive_intvl = 5; + // Resend keepalive probe every INT seconds + mca_oob_tcp_component.keepalive_intvl = 20; (void)mca_base_component_var_register(component, "keepalive_intvl", "Time between successive keepalive pings when peer has not responded, in seconds (ignored if keepalive_time <= 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -418,9 +419,8 @@ static int tcp_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_intvl); - // Default to retrying a keepalive 3 times before declaring the - // peer kaput - mca_oob_tcp_component.keepalive_probes = 3; + // After sending PR probes every INT seconds consider the connection dead + mca_oob_tcp_component.keepalive_probes = 9; (void)mca_base_component_var_register(component, "keepalive_probes", "Number of keepalives that can be missed before declaring error (ignored if keepalive_time <= 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,