From 0cfb4f29aa6f6f42d34c7dbd0bfc2458767485ca Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 16 Mar 2015 09:59:21 -0700 Subject: [PATCH 1/5] Silence compiler warning --- ompi/mca/osc/base/osc_base_obj_convert.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ompi/mca/osc/base/osc_base_obj_convert.c b/ompi/mca/osc/base/osc_base_obj_convert.c index 98105236b1c..a5c3a694ecb 100644 --- a/ompi/mca/osc/base/osc_base_obj_convert.c +++ b/ompi/mca/osc/base/osc_base_obj_convert.c @@ -13,7 +13,8 @@ * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -114,7 +115,7 @@ int ompi_osc_base_process_op (void *outbuf, void *inbuf, size_t inbuflen, iov_count = OMPI_OSC_BASE_DECODE_MAX; done = opal_convertor_raw (&convertor, iov, &iov_count, &size); - for (int i = 0 ; i < iov_count ; ++i) { + for (uint32_t i = 0 ; i < iov_count ; ++i) { int primitive_count = iov[i].iov_len / primitive_size; ompi_op_reduce (op, inbuf, iov[i].iov_base, primitive_count, primitive_datatype); inbuf = (void *)((intptr_t) inbuf + iov[i].iov_len); From 69ac25bf559b0faca6712cd1d3f235bae9808286 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 16 Mar 2015 09:59:44 -0700 Subject: [PATCH 2/5] Add support for TCP keepalive on inter-node sockets --- orte/mca/oob/tcp/oob_tcp_common.c | 102 ++++++++++++++++++++++----- orte/mca/oob/tcp/oob_tcp_component.c | 31 ++++++++ orte/mca/oob/tcp/oob_tcp_component.h | 6 +- 3 files changed, 122 insertions(+), 17 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_common.c b/orte/mca/oob/tcp/oob_tcp_common.c index fb552bc4766..7777261d7b5 100644 --- a/orte/mca/oob/tcp/oob_tcp_common.c +++ b/orte/mca/oob/tcp/oob_tcp_common.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -42,32 +42,28 @@ #ifdef HAVE_NETINET_IN_H #include #endif +#ifdef HAVE_NETINET_TCP_H +#include +#endif #ifdef HAVE_ARPA_INET_H #include #endif #ifdef HAVE_NETDB_H #include #endif +#ifdef HAVE_SYS_SOCKET_H +#include +#endif #include -#include "opal/util/show_help.h" #include "opal/util/error.h" #include "opal/util/output.h" #include "opal/opal_socket_errno.h" #include "opal/util/if.h" #include "opal/util/net.h" -#include "opal/util/argv.h" #include "opal/class/opal_hash_table.h" -#include "opal/class/opal_list.h" #include "opal/mca/backtrace/backtrace.h" -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/ess/ess.h" -#include "orte/util/name_fns.h" -#include "orte/util/parse_options.h" -#include "orte/util/show_help.h" -#include "orte/runtime/orte_globals.h" - #include "orte/mca/oob/tcp/oob_tcp.h" #include "orte/mca/oob/tcp/oob_tcp_component.h" #include "oob_tcp_peer.h" @@ -82,7 +78,7 @@ void orte_oob_tcp_set_socket_options(int sd) #if defined(TCP_NODELAY) int optval; optval = 1; - if(setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) { + if (setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) { opal_backtrace_print(stderr, NULL, 1); opal_output(0, "[%s:%d] setsockopt(TCP_NODELAY) failed: %s (%d)", __FILE__, __LINE__, @@ -91,8 +87,8 @@ void orte_oob_tcp_set_socket_options(int sd) } #endif #if defined(SO_SNDBUF) - if(mca_oob_tcp_component.tcp_sndbuf > 0 && - setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_oob_tcp_component.tcp_sndbuf, sizeof(int)) < 0) { + if (mca_oob_tcp_component.tcp_sndbuf > 0 && + setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_oob_tcp_component.tcp_sndbuf, sizeof(int)) < 0) { opal_output(0, "[%s:%d] setsockopt(SO_SNDBUF) failed: %s (%d)", __FILE__, __LINE__, strerror(opal_socket_errno), @@ -100,14 +96,88 @@ void orte_oob_tcp_set_socket_options(int sd) } #endif #if defined(SO_RCVBUF) - if(mca_oob_tcp_component.tcp_rcvbuf > 0 && - setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_oob_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) { + if (mca_oob_tcp_component.tcp_rcvbuf > 0 && + setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_oob_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) { opal_output(0, "[%s:%d] setsockopt(SO_RCVBUF) failed: %s (%d)", __FILE__, __LINE__, strerror(opal_socket_errno), opal_socket_errno); } #endif +#if defined(SO_KEEPALIVE) + if (0 < mca_oob_tcp_component.keepalive_time) { + int option; + socklen_t optlen; + + /* see if the keepalive option is available */ + optlen = sizeof(option); + if (getsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, &optlen) < 0) { + /* not available, so just return */ + return; + } + + /* Set the option active */ + option = 1; + if (setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, optlen) < 0) { + opal_output(0, "[%s:%d] setsockopt(SO_KEEPALIVE) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } + if (mca_oob_tcp_component.tcp_proto < 0) { + /* we don't know the TCP protocol number */ + return; + } +#if defined(TCP_KEEPALIVE) + /* set the idle time */ + if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPALIVE, + &mca_oob_tcp_component.keepalive_time, + sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPALIVE) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } +#elif defined(TCP_KEEPIDLE) + /* set the idle time */ + if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPIDLE, + &mca_oob_tcp_component.keepalive_time, + sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPIDLE) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } +#endif // TCP_KEEPIDLE +#if defined(TCP_KEEPINTVL) + /* set the keepalive interval */ + if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPINTVL, + &mca_oob_tcp_component.keepalive_intvl, + sizeof(mca_oob_tcp_component.keepalive_intvl)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPINTVL) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } +#endif // TCP_KEEPINTVL +#if defined(TCP_KEEPCNT) + /* set the miss rate */ + if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPCNT, + &mca_oob_tcp_component.keepalive_probes, + sizeof(mca_oob_tcp_component.keepalive_probes)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPCNT) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + } + } +#endif // TCP_KEEPCNT +#endif // SO_KEEPALIVE + } mca_oob_tcp_peer_t* mca_oob_tcp_peer_lookup(const orte_process_name_t *name) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 383a078e5d9..d4dfdb80212 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -410,6 +410,37 @@ static int tcp_component_register(void) &mca_oob_tcp_component.disable_ipv6_family); #endif + + mca_oob_tcp_component.keepalive_time = -1; + (void)mca_base_component_var_register(component, "keepalive_time", + "Idle time in seconds before starting to send keepalives (num <= 0 => disable keepalive)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_oob_tcp_component.keepalive_time); + if (0 < mca_oob_tcp_component.keepalive_time) { + struct protoent *proto; + if (NULL != (proto = getprotobyname("TCP"))) { + mca_oob_tcp_component.tcp_proto = proto->p_proto; + } else { + mca_oob_tcp_component.tcp_proto = -1; + } + } + + mca_oob_tcp_component.keepalive_intvl = 5; + (void)mca_base_component_var_register(component, "keepalive_intvl", + "Time between keepalives, in seconds", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_oob_tcp_component.keepalive_intvl); + mca_oob_tcp_component.keepalive_probes = 3; + (void)mca_base_component_var_register(component, "keepalive_probes", + "Number of keepalives that can be missed before declaring error", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_oob_tcp_component.keepalive_probes); return ORTE_SUCCESS; } diff --git a/orte/mca/oob/tcp/oob_tcp_component.h b/orte/mca/oob/tcp/oob_tcp_component.h index a6657caace1..84abf8c696a 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.h +++ b/orte/mca/oob/tcp/oob_tcp_component.h @@ -12,7 +12,7 @@ * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved + * Copyright (c) 2014-2015 Intel, Inc. All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -77,6 +77,10 @@ typedef struct { bool listen_thread_active; struct timeval listen_thread_tv; /**< Timeout when using listen thread */ int stop_thread[2]; /**< pipe used to exit the listen thread */ + int keepalive_probes; /**< number of keepalives that can be missed before declaring error */ + int keepalive_time; /**< idle time in seconds before starting to send keepalives */ + int keepalive_intvl; /**< time between keepalives, in seconds */ + int tcp_proto; /**< TCP protocol number */ } mca_oob_tcp_component_t; ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component; From 019bba5cafc223199820f95588381beff31ad61c Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 16 Mar 2015 11:54:51 -0700 Subject: [PATCH 3/5] Cleanup a bit - don't need to lookup the protocol number if we just use the right define --- orte/mca/oob/tcp/oob_tcp_common.c | 12 ++++-------- orte/mca/oob/tcp/oob_tcp_component.c | 10 +--------- orte/mca/oob/tcp/oob_tcp_component.h | 1 - 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_common.c b/orte/mca/oob/tcp/oob_tcp_common.c index 7777261d7b5..a6538a55c98 100644 --- a/orte/mca/oob/tcp/oob_tcp_common.c +++ b/orte/mca/oob/tcp/oob_tcp_common.c @@ -125,13 +125,9 @@ void orte_oob_tcp_set_socket_options(int sd) opal_socket_errno); return; } - if (mca_oob_tcp_component.tcp_proto < 0) { - /* we don't know the TCP protocol number */ - return; - } #if defined(TCP_KEEPALIVE) /* set the idle time */ - if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPALIVE, + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPALIVE, &mca_oob_tcp_component.keepalive_time, sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { opal_output(0, "[%s:%d] setsockopt(TCP_KEEPALIVE) failed: %s (%d)", @@ -142,7 +138,7 @@ void orte_oob_tcp_set_socket_options(int sd) } #elif defined(TCP_KEEPIDLE) /* set the idle time */ - if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPIDLE, + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE, &mca_oob_tcp_component.keepalive_time, sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { opal_output(0, "[%s:%d] setsockopt(TCP_KEEPIDLE) failed: %s (%d)", @@ -154,7 +150,7 @@ void orte_oob_tcp_set_socket_options(int sd) #endif // TCP_KEEPIDLE #if defined(TCP_KEEPINTVL) /* set the keepalive interval */ - if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPINTVL, + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL, &mca_oob_tcp_component.keepalive_intvl, sizeof(mca_oob_tcp_component.keepalive_intvl)) < 0) { opal_output(0, "[%s:%d] setsockopt(TCP_KEEPINTVL) failed: %s (%d)", @@ -166,7 +162,7 @@ void orte_oob_tcp_set_socket_options(int sd) #endif // TCP_KEEPINTVL #if defined(TCP_KEEPCNT) /* set the miss rate */ - if (setsockopt(sd, mca_oob_tcp_component.tcp_proto, TCP_KEEPCNT, + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT, &mca_oob_tcp_component.keepalive_probes, sizeof(mca_oob_tcp_component.keepalive_probes)) < 0) { opal_output(0, "[%s:%d] setsockopt(TCP_KEEPCNT) failed: %s (%d)", diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index d4dfdb80212..680ca615c3f 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -411,21 +411,13 @@ static int tcp_component_register(void) #endif - mca_oob_tcp_component.keepalive_time = -1; + mca_oob_tcp_component.keepalive_time = 10; (void)mca_base_component_var_register(component, "keepalive_time", "Idle time in seconds before starting to send keepalives (num <= 0 => disable keepalive)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_time); - if (0 < mca_oob_tcp_component.keepalive_time) { - struct protoent *proto; - if (NULL != (proto = getprotobyname("TCP"))) { - mca_oob_tcp_component.tcp_proto = proto->p_proto; - } else { - mca_oob_tcp_component.tcp_proto = -1; - } - } mca_oob_tcp_component.keepalive_intvl = 5; (void)mca_base_component_var_register(component, "keepalive_intvl", diff --git a/orte/mca/oob/tcp/oob_tcp_component.h b/orte/mca/oob/tcp/oob_tcp_component.h index 84abf8c696a..cd48d2e639f 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.h +++ b/orte/mca/oob/tcp/oob_tcp_component.h @@ -80,7 +80,6 @@ typedef struct { int keepalive_probes; /**< number of keepalives that can be missed before declaring error */ int keepalive_time; /**< idle time in seconds before starting to send keepalives */ int keepalive_intvl; /**< time between keepalives, in seconds */ - int tcp_proto; /**< TCP protocol number */ } mca_oob_tcp_component_t; ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component; From 4ded049cbcefea3bb7543a0047b50d28291f45b6 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 16 Mar 2015 11:57:32 -0700 Subject: [PATCH 4/5] Modify MCA param description --- orte/mca/oob/tcp/oob_tcp_component.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 680ca615c3f..874c8d3611a 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -413,7 +413,7 @@ static int tcp_component_register(void) mca_oob_tcp_component.keepalive_time = 10; (void)mca_base_component_var_register(component, "keepalive_time", - "Idle time in seconds before starting to send keepalives (num <= 0 => disable keepalive)", + "Idle time in seconds before starting to send keepalives (num <= 0 ----> disable keepalive)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, From 64d11f170a7a3bcea8fe24a7cb5bb5a96753271e Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 16 Mar 2015 12:32:58 -0700 Subject: [PATCH 5/5] Adjust the default keepalive interval. Refactor the code when setting keepalive options --- orte/mca/oob/tcp/oob_tcp_common.c | 136 ++++++++++++++------------- orte/mca/oob/tcp/oob_tcp_component.c | 2 +- 2 files changed, 71 insertions(+), 67 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_common.c b/orte/mca/oob/tcp/oob_tcp_common.c index a6538a55c98..9e5b36ed75e 100644 --- a/orte/mca/oob/tcp/oob_tcp_common.c +++ b/orte/mca/oob/tcp/oob_tcp_common.c @@ -73,6 +73,75 @@ * Set socket buffering */ +static void set_keepalive(int sd) +{ + int option; + socklen_t optlen; + + /* see if the keepalive option is available */ + optlen = sizeof(option); + if (getsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, &optlen) < 0) { + /* not available, so just return */ + return; + } + + /* Set the option active */ + option = 1; + if (setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, optlen) < 0) { + opal_output(0, "[%s:%d] setsockopt(SO_KEEPALIVE) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } +#if defined(TCP_KEEPALIVE) + /* set the idle time */ + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPALIVE, + &mca_oob_tcp_component.keepalive_time, + sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPALIVE) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } +#elif defined(TCP_KEEPIDLE) + /* set the idle time */ + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE, + &mca_oob_tcp_component.keepalive_time, + sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPIDLE) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } +#endif // TCP_KEEPIDLE +#if defined(TCP_KEEPINTVL) + /* set the keepalive interval */ + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL, + &mca_oob_tcp_component.keepalive_intvl, + sizeof(mca_oob_tcp_component.keepalive_intvl)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPINTVL) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + return; + } +#endif // TCP_KEEPINTVL +#if defined(TCP_KEEPCNT) + /* set the miss rate */ + if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT, + &mca_oob_tcp_component.keepalive_probes, + sizeof(mca_oob_tcp_component.keepalive_probes)) < 0) { + opal_output(0, "[%s:%d] setsockopt(TCP_KEEPCNT) failed: %s (%d)", + __FILE__, __LINE__, + strerror(opal_socket_errno), + opal_socket_errno); + } +#endif // TCP_KEEPCNT +} + void orte_oob_tcp_set_socket_options(int sd) { #if defined(TCP_NODELAY) @@ -106,74 +175,9 @@ void orte_oob_tcp_set_socket_options(int sd) #endif #if defined(SO_KEEPALIVE) if (0 < mca_oob_tcp_component.keepalive_time) { - int option; - socklen_t optlen; - - /* see if the keepalive option is available */ - optlen = sizeof(option); - if (getsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, &optlen) < 0) { - /* not available, so just return */ - return; - } - - /* Set the option active */ - option = 1; - if (setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, optlen) < 0) { - opal_output(0, "[%s:%d] setsockopt(SO_KEEPALIVE) failed: %s (%d)", - __FILE__, __LINE__, - strerror(opal_socket_errno), - opal_socket_errno); - return; - } -#if defined(TCP_KEEPALIVE) - /* set the idle time */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPALIVE, - &mca_oob_tcp_component.keepalive_time, - sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { - opal_output(0, "[%s:%d] setsockopt(TCP_KEEPALIVE) failed: %s (%d)", - __FILE__, __LINE__, - strerror(opal_socket_errno), - opal_socket_errno); - return; - } -#elif defined(TCP_KEEPIDLE) - /* set the idle time */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE, - &mca_oob_tcp_component.keepalive_time, - sizeof(mca_oob_tcp_component.keepalive_time)) < 0) { - opal_output(0, "[%s:%d] setsockopt(TCP_KEEPIDLE) failed: %s (%d)", - __FILE__, __LINE__, - strerror(opal_socket_errno), - opal_socket_errno); - return; - } -#endif // TCP_KEEPIDLE -#if defined(TCP_KEEPINTVL) - /* set the keepalive interval */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL, - &mca_oob_tcp_component.keepalive_intvl, - sizeof(mca_oob_tcp_component.keepalive_intvl)) < 0) { - opal_output(0, "[%s:%d] setsockopt(TCP_KEEPINTVL) failed: %s (%d)", - __FILE__, __LINE__, - strerror(opal_socket_errno), - opal_socket_errno); - return; - } -#endif // TCP_KEEPINTVL -#if defined(TCP_KEEPCNT) - /* set the miss rate */ - if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT, - &mca_oob_tcp_component.keepalive_probes, - sizeof(mca_oob_tcp_component.keepalive_probes)) < 0) { - opal_output(0, "[%s:%d] setsockopt(TCP_KEEPCNT) failed: %s (%d)", - __FILE__, __LINE__, - strerror(opal_socket_errno), - opal_socket_errno); - } + set_keepalive(sd); } -#endif // TCP_KEEPCNT #endif // SO_KEEPALIVE - } mca_oob_tcp_peer_t* mca_oob_tcp_peer_lookup(const orte_process_name_t *name) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 874c8d3611a..5206bb5eafc 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -419,7 +419,7 @@ static int tcp_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_time); - mca_oob_tcp_component.keepalive_intvl = 5; + mca_oob_tcp_component.keepalive_intvl = 60; (void)mca_base_component_var_register(component, "keepalive_intvl", "Time between keepalives, in seconds", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,