Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions ompi/mca/osc/base/osc_base_obj_convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -114,7 +115,7 @@ int ompi_osc_base_process_op (void *outbuf, void *inbuf, size_t inbuflen,
iov_count = OMPI_OSC_BASE_DECODE_MAX;
done = opal_convertor_raw (&convertor, iov, &iov_count, &size);

for (int i = 0 ; i < iov_count ; ++i) {
for (uint32_t i = 0 ; i < iov_count ; ++i) {
int primitive_count = iov[i].iov_len / primitive_size;
ompi_op_reduce (op, inbuf, iov[i].iov_base, primitive_count, primitive_datatype);
inbuf = (void *)((intptr_t) inbuf + iov[i].iov_len);
Expand Down
102 changes: 86 additions & 16 deletions orte/mca/oob/tcp/oob_tcp_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* All rights reserved.
* Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -42,32 +42,28 @@
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_NETINET_TCP_H
#include <netinet/tcp.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#include <ctype.h>

#include "opal/util/show_help.h"
#include "opal/util/error.h"
#include "opal/util/output.h"
#include "opal/opal_socket_errno.h"
#include "opal/util/if.h"
#include "opal/util/net.h"
#include "opal/util/argv.h"
#include "opal/class/opal_hash_table.h"
#include "opal/class/opal_list.h"
#include "opal/mca/backtrace/backtrace.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/name_fns.h"
#include "orte/util/parse_options.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"

#include "orte/mca/oob/tcp/oob_tcp.h"
#include "orte/mca/oob/tcp/oob_tcp_component.h"
#include "oob_tcp_peer.h"
Expand All @@ -77,12 +73,81 @@
* Set socket buffering
*/

/**
 * Enable keepalive probing on a socket and apply the component's tuning.
 *
 * The function first queries SO_KEEPALIVE with getsockopt(); if that query
 * fails the option is treated as unavailable and nothing is changed.
 * Otherwise SO_KEEPALIVE is switched on and, for each tuning option the
 * platform headers define, the matching value from mca_oob_tcp_component
 * is applied:
 *   - TCP_KEEPALIVE or TCP_KEEPIDLE  <- keepalive_time
 *   - TCP_KEEPINTVL                  <- keepalive_intvl
 *   - TCP_KEEPCNT                    <- keepalive_probes
 *
 * Any setsockopt() failure is reported through opal_output() and ends the
 * configuration at that point; later options are left untouched.
 *
 * When SO_KEEPALIVE is not defined at compile time this function is a no-op,
 * so it still compiles on platforms that lack the option.
 *
 * @param sd  socket descriptor to configure
 */
static void set_keepalive(int sd)
{
#if defined(SO_KEEPALIVE)
    int option;
    socklen_t optlen;

    /* see if the keepalive option is available */
    optlen = sizeof(option);
    if (getsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, &optlen) < 0) {
        /* not available, so just return */
        return;
    }

    /* Set the option active. Pass the size of the value we are handing in
     * rather than the length getsockopt() wrote back, so the call does not
     * depend on what the query happened to report. */
    option = 1;
    if (setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &option, sizeof(option)) < 0) {
        opal_output(0, "[%s:%d] setsockopt(SO_KEEPALIVE) failed: %s (%d)",
                    __FILE__, __LINE__,
                    strerror(opal_socket_errno),
                    opal_socket_errno);
        return;
    }
#if defined(TCP_KEEPALIVE)
    /* set the idle time (platforms that spell the option TCP_KEEPALIVE) */
    if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPALIVE,
                   &mca_oob_tcp_component.keepalive_time,
                   sizeof(mca_oob_tcp_component.keepalive_time)) < 0) {
        opal_output(0, "[%s:%d] setsockopt(TCP_KEEPALIVE) failed: %s (%d)",
                    __FILE__, __LINE__,
                    strerror(opal_socket_errno),
                    opal_socket_errno);
        return;
    }
#elif defined(TCP_KEEPIDLE)
    /* set the idle time (platforms that spell the option TCP_KEEPIDLE) */
    if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE,
                   &mca_oob_tcp_component.keepalive_time,
                   sizeof(mca_oob_tcp_component.keepalive_time)) < 0) {
        opal_output(0, "[%s:%d] setsockopt(TCP_KEEPIDLE) failed: %s (%d)",
                    __FILE__, __LINE__,
                    strerror(opal_socket_errno),
                    opal_socket_errno);
        return;
    }
#endif  // TCP_KEEPIDLE
#if defined(TCP_KEEPINTVL)
    /* set the keepalive interval */
    if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL,
                   &mca_oob_tcp_component.keepalive_intvl,
                   sizeof(mca_oob_tcp_component.keepalive_intvl)) < 0) {
        opal_output(0, "[%s:%d] setsockopt(TCP_KEEPINTVL) failed: %s (%d)",
                    __FILE__, __LINE__,
                    strerror(opal_socket_errno),
                    opal_socket_errno);
        return;
    }
#endif  // TCP_KEEPINTVL
#if defined(TCP_KEEPCNT)
    /* set the miss rate */
    if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT,
                   &mca_oob_tcp_component.keepalive_probes,
                   sizeof(mca_oob_tcp_component.keepalive_probes)) < 0) {
        opal_output(0, "[%s:%d] setsockopt(TCP_KEEPCNT) failed: %s (%d)",
                    __FILE__, __LINE__,
                    strerror(opal_socket_errno),
                    opal_socket_errno);
    }
#endif  // TCP_KEEPCNT
#else
    /* keepalive is not supported here; keep the parameter "used" so the
     * build stays warning-free */
    (void)sd;
#endif  // SO_KEEPALIVE
}

void orte_oob_tcp_set_socket_options(int sd)
{
#if defined(TCP_NODELAY)
int optval;
optval = 1;
if(setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) {
if (setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)) < 0) {
opal_backtrace_print(stderr, NULL, 1);
opal_output(0, "[%s:%d] setsockopt(TCP_NODELAY) failed: %s (%d)",
__FILE__, __LINE__,
Expand All @@ -91,23 +156,28 @@ void orte_oob_tcp_set_socket_options(int sd)
}
#endif
#if defined(SO_SNDBUF)
if(mca_oob_tcp_component.tcp_sndbuf > 0 &&
setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_oob_tcp_component.tcp_sndbuf, sizeof(int)) < 0) {
if (mca_oob_tcp_component.tcp_sndbuf > 0 &&
setsockopt(sd, SOL_SOCKET, SO_SNDBUF, (char *)&mca_oob_tcp_component.tcp_sndbuf, sizeof(int)) < 0) {
opal_output(0, "[%s:%d] setsockopt(SO_SNDBUF) failed: %s (%d)",
__FILE__, __LINE__,
strerror(opal_socket_errno),
opal_socket_errno);
}
#endif
#if defined(SO_RCVBUF)
if(mca_oob_tcp_component.tcp_rcvbuf > 0 &&
setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_oob_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) {
if (mca_oob_tcp_component.tcp_rcvbuf > 0 &&
setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (char *)&mca_oob_tcp_component.tcp_rcvbuf, sizeof(int)) < 0) {
opal_output(0, "[%s:%d] setsockopt(SO_RCVBUF) failed: %s (%d)",
__FILE__, __LINE__,
strerror(opal_socket_errno),
opal_socket_errno);
}
#endif
#if defined(SO_KEEPALIVE)
if (0 < mca_oob_tcp_component.keepalive_time) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be > instead of <.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

argh, nevermind

set_keepalive(sd);
}
#endif // SO_KEEPALIVE
}

mca_oob_tcp_peer_t* mca_oob_tcp_peer_lookup(const orte_process_name_t *name)
Expand Down
23 changes: 23 additions & 0 deletions orte/mca/oob/tcp/oob_tcp_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,29 @@ static int tcp_component_register(void)
&mca_oob_tcp_component.disable_ipv6_family);
#endif


mca_oob_tcp_component.keepalive_time = 10;
(void)mca_base_component_var_register(component, "keepalive_time",
"Idle time in seconds before starting to send keepalives (num <= 0 ----> disable keepalive)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.keepalive_time);

mca_oob_tcp_component.keepalive_intvl = 60;
(void)mca_base_component_var_register(component, "keepalive_intvl",
"Time between keepalives, in seconds",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.keepalive_intvl);
mca_oob_tcp_component.keepalive_probes = 3;
(void)mca_base_component_var_register(component, "keepalive_probes",
"Number of keepalives that can be missed before declaring error",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_oob_tcp_component.keepalive_probes);
return ORTE_SUCCESS;
}

Expand Down
5 changes: 4 additions & 1 deletion orte/mca/oob/tcp/oob_tcp_component.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -77,6 +77,9 @@ typedef struct {
bool listen_thread_active;
struct timeval listen_thread_tv; /**< Timeout when using listen thread */
int stop_thread[2]; /**< pipe used to exit the listen thread */
int keepalive_probes; /**< number of keepalives that can be missed before declaring error */
int keepalive_time; /**< idle time in seconds before starting to send keepalives */
int keepalive_intvl; /**< time between keepalives, in seconds */
} mca_oob_tcp_component_t;

ORTE_MODULE_DECLSPEC extern mca_oob_tcp_component_t mca_oob_tcp_component;
Expand Down