Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,7 @@ test/datatype/ddt_raw
test/datatype/opal_datatype_test
test/datatype/position_noncontig
test/datatype/unpack_ooo
test/datatype/unpack_hetero

test/dss/dss_buffer
test/dss/dss_copy
Expand Down
93 changes: 66 additions & 27 deletions opal/mca/btl/tcp/btl_tcp_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Oak Ridge National Laboratory
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
Expand Down Expand Up @@ -1334,6 +1334,34 @@ static void mca_btl_tcp_component_accept_handler( int incoming_sd,
}


/*
 * Return a printable form of the address of the peer connected to
 * socket fd.
 *
 * The result is a heap-allocated, NUL-terminated string that the
 * caller must free() (casting away the const).  If the peer address
 * cannot be obtained, belongs to an unsupported address family, or
 * cannot be converted to text, the string is "Unknown".  NULL is
 * returned only when the result buffer itself cannot be allocated.
 */
static const char *get_peer_name(int fd)
{
    static const char unknown[] = "Unknown";
#if OPAL_ENABLE_IPV6
    const size_t buflen = INET6_ADDRSTRLEN;
#else
    const size_t buflen = INET_ADDRSTRLEN;
#endif
    /* sockaddr_storage is large enough for any address family, so
     * getpeername() never has to truncate the peer address. */
    struct sockaddr_storage ss;
    socklen_t sslen = (socklen_t) sizeof(ss);
    const void *addr = NULL;
    int family = AF_UNSPEC;
    char *str;

    /* One buffer serves both the success and the fallback case;
     * "Unknown" (8 bytes with the terminator) fits in
     * INET_ADDRSTRLEN. */
    str = malloc(buflen);
    if (NULL == str) {
        return NULL;
    }

    if (0 == getpeername(fd, (struct sockaddr *) &ss, &sslen)) {
        family = ss.ss_family;
        /* Only interpret the address as the family it really is */
        if (AF_INET == family) {
            addr = &((struct sockaddr_in *) &ss)->sin_addr;
        }
#if OPAL_ENABLE_IPV6
        else if (AF_INET6 == family) {
            addr = &((struct sockaddr_in6 *) &ss)->sin6_addr;
        }
#endif
    }

    /* Never hand NULL back for a lookup/conversion failure: callers
     * pass the result straight to a "%s" conversion. */
    if (NULL == addr ||
        NULL == inet_ntop(family, addr, str, (socklen_t) buflen)) {
        memcpy(str, unknown, sizeof(unknown));
    }

    return str;
}

/**
* Event callback when there is data available on the registered
* socket to recv. This callback is triggered only once per lifetime
Expand Down Expand Up @@ -1365,20 +1393,22 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
if (ENOPROTOOPT == errno) {
sockopt = false;
} else {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot get current recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
} else {
tv.tv_sec = 2;
tv.tv_usec = 0;
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot set new recv timeout value of the socket"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
}
Expand All @@ -1397,14 +1427,16 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
* This attempted connection will be ignored; your MPI job may or may not
* continue properly.
*/
if (sizeof(hs_msg) != retval) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"process did not receive full connect ACK "
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
opal_process_info.nodename,
getpid(),
(retval > 0) ? hs_msg.magic_id : "<nothing>",
"handshake message length");
if (sizeof(hs_msg) != retval) {
const char *peer = get_peer_name(sd);
opal_show_help("help-mpi-btl-tcp.txt",
"did not receive full magic id string",
true,
opal_process_info.nodename,
getpid(),
opal_version_string,
peer);
free((char*) peer);

/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
Expand All @@ -1413,12 +1445,18 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)

guid = hs_msg.guid;
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"process did not receive right magic string. "
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
opal_process_info.nodename,
getpid(), hs_msg.magic_id,
"string value");
const char *peer = get_peer_name(sd);
opal_show_help("help-mpi-btl-tcp.txt",
"received incorrect magic id string",
true,
opal_process_info.nodename,
getpid(),
opal_version_string,
peer,
hs_msg.magic_id,
mca_btl_tcp_magic_id_string);
free((char*) peer);

/* The other side probably isn't OMPI, so just hang up */
CLOSE_THE_SOCKET(sd);
return;
Expand All @@ -1427,10 +1465,11 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
if (sockopt) {
/* reset RECVTIMEO option to its original state */
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) {
opal_output_verbose(20, opal_btl_base_framework.framework_output,
"Cannot reset recv timeout value"
"Local_host:%s PID:%d",
opal_process_info.nodename, getpid());
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
true, opal_process_info.nodename,
getpid(),
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
strerror(opal_socket_errno), opal_socket_errno);
return;
}
}
Expand Down
52 changes: 47 additions & 5 deletions opal/mca/btl/tcp/help-mpi-btl-tcp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ values are in the range [1 .. 2^16-1]. This value will be ignored
WARNING: Open MPI failed to TCP connect to a peer MPI process. This
should not happen.

Your Open MPI job may now fail.
Your Open MPI job may now hang or fail.

Local host: %s
PID: %d
Expand All @@ -46,7 +46,7 @@ Your Open MPI job may now fail.
WARNING: Open MPI failed to handshake with a connecting peer MPI
process over TCP. This should not happen.

Your Open MPI job may now fail.
Your Open MPI job may now hang or fail.

Local host: %s
PID: %d
Expand Down Expand Up @@ -102,8 +102,11 @@ hopefully be able to continue).
Known IPs of peer: %s
#
[socket flag fail]
WARNING: Open MPI failed to set flags on a TCP socket. This should
not happen. It is likely that your MPI job will now fail.
WARNING: Open MPI failed to get or set flags on a TCP socket. This
should not happen.

This may cause unpredictable behavior, and may end up hanging or
aborting your job.

Local host: %s
PID: %d
Expand Down Expand Up @@ -164,4 +167,43 @@ Your Open MPI job may now fail.
PID: %d
Message: %s
Error: %s (%d)
#
#
[did not receive full magic id string]
The TCP BTL received an inbound socket connection from an unidentified
peer. This typically means one of two things:

1. A non-Open MPI process tried to connect to this Open MPI process.
2. An Open MPI process compiled against a different version of Open
MPI tried to connect to this Open MPI process.

Open MPI only supports running exactly the same version between all
processes in a single job.

This may cause unpredictable behavior, and may end up hanging or
aborting your job.

Local host: %s
Local PID: %d
Local Open MPI version: %s
Peer IP address: %s
#
[received incorrect magic id string]
The TCP BTL received an inbound socket connection from a peer that did
not identify itself correctly as an Open MPI process. This typically
means one of two things:

1. A non-Open MPI process tried to connect to this Open MPI process.
2. An Open MPI process compiled against a different version of Open
MPI tried to connect to this Open MPI process.

Open MPI only supports running exactly the same version between all
processes in a single job.

This may cause unpredictable behavior, and may end up hanging or
aborting your job.

Local host: %s
Local PID: %d
Local Open MPI version: %s
Peer IP address: %s
Peer identifier: %s (expected %s)
4 changes: 3 additions & 1 deletion opal/util/net.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
Expand Down Expand Up @@ -361,7 +361,9 @@ opal_net_addr_isipv4public(const struct sockaddr *addr)
bool
opal_net_addr_isipv6linklocal(const struct sockaddr *addr)
{
#if OPAL_ENABLE_IPV6
struct sockaddr_in6 if_addr;
#endif

switch (addr->sa_family) {
#if OPAL_ENABLE_IPV6
Expand Down
91 changes: 73 additions & 18 deletions opal/util/show_help.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2017 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
Expand All @@ -36,24 +36,22 @@
#include "opal/constants.h"


/*
* Local functions
*/
static int default_backend(const char *filename, const char *topic,
bool want_error_header, va_list arglist);


/*
* Private variables
*/
static const char *default_filename = "help-messages";
static const char *dash_line = "--------------------------------------------------------------------------\n";
static int output_stream = -1;
static char **search_dirs = NULL;

/*
* Local functions
*/
static int opal_show_vhelp_internal(const char *filename, const char *topic,
bool want_error_header, va_list arglist);
static int opal_show_help_internal(const char *filename, const char *topic,
bool want_error_header, ...);

opal_show_help_fn_t opal_show_help = opal_show_help_internal;
opal_show_vhelp_fn_t opal_show_vhelp = opal_show_vhelp_internal;
/* Gate consulted by opal_show_vhelp() and opal_show_vhelp_final():
 * while false, both return OPAL_SUCCESS without invoking the backend.
 * opal_show_vhelp_final() clears it just before dispatching its
 * message. */
static volatile bool enabled = true;
/* Function that actually emits a help message.  Starts out as
 * default_backend; replaced (or reset to default_backend when given
 * NULL) by opal_show_help_register_backend(). */
static opal_show_help_internal_fn_t registered_backend = default_backend;


int opal_show_help_init(void)
Expand Down Expand Up @@ -330,8 +328,17 @@ char *opal_show_help_string(const char *filename, const char *topic,
return output;
}

static int opal_show_vhelp_internal(const char *filename, const char *topic,
bool want_error_header, va_list arglist)
/* This is the default back-end to all the opal_show_*help()
* functions. All it does is render the string and then call
* opal_output().
*
* All decisions about whether show_help messages are enabled or not
* have been made by the time this function is invoked.
*/
static int default_backend(const char *filename,
const char *topic,
bool want_error_header,
va_list arglist)
{
char *output;

Expand All @@ -348,20 +355,68 @@ static int opal_show_vhelp_internal(const char *filename, const char *topic,
return (NULL == output) ? OPAL_ERROR : OPAL_SUCCESS;
}

static int opal_show_help_internal(const char *filename, const char *topic,
bool want_error_header, ...)
/*
 * va_list entry point for showing a help message.
 *
 * Hands the request to the currently registered backend and returns
 * its result.  When show_help output has been disabled, the backend
 * is not called and OPAL_SUCCESS is returned.
 */
int opal_show_vhelp(const char *filename, const char *topic,
                    bool want_error_header, va_list arglist)
{
    int status = OPAL_SUCCESS;

    if (enabled) {
        status = registered_backend(filename, topic,
                                    want_error_header, arglist);
    }

    return status;
}

/*
 * va_list entry point for showing one last help message.
 *
 * If show_help output is still enabled, it is disabled first and the
 * message is then passed to the registered backend, whose result is
 * returned.  If output was already disabled, nothing is shown and
 * OPAL_SUCCESS is returned.
 */
int opal_show_vhelp_final(const char *filename, const char *topic,
                          bool want_error_header, va_list arglist)
{
    if (enabled) {
        /* Turn further output off before dispatching this message */
        enabled = false;

        return registered_backend(filename, topic,
                                  want_error_header, arglist);
    }

    return OPAL_SUCCESS;
}

/*
 * Variadic entry point for showing a help message.
 *
 * Packages the trailing arguments into a va_list and forwards them,
 * together with filename, topic and want_error_header, to
 * opal_show_vhelp().  Returns whatever opal_show_vhelp() returns.
 */
int opal_show_help(const char *filename, const char *topic,
                   bool want_error_header, ...)
{
    va_list arglist;
    int rc = OPAL_SUCCESS;

    /* Forward the variadic arguments; rendering and the enabled
     * check both happen behind opal_show_vhelp() */
    va_start(arglist, want_error_header);
    rc = opal_show_vhelp(filename, topic, want_error_header, arglist);
    va_end(arglist);

    return rc;
}

/*
 * Variadic entry point for showing one last help message.
 *
 * Collects the trailing arguments into a va_list and passes
 * everything on to opal_show_vhelp_final(), returning its result.
 */
int opal_show_help_final(const char *filename, const char *topic,
                         bool want_error_header, ...)
{
    int status = OPAL_SUCCESS;
    va_list ap;

    va_start(ap, want_error_header);
    status = opal_show_vhelp_final(filename, topic,
                                   want_error_header, ap);
    va_end(ap);

    return status;
}

/*
 * Install func as the backend used to emit help messages.
 *
 * Passing NULL restores default_backend.  Always returns
 * OPAL_SUCCESS.
 */
int opal_show_help_register_backend(opal_show_help_internal_fn_t func)
{
    registered_backend = (NULL == func) ? default_backend : func;

    return OPAL_SUCCESS;
}

int opal_show_help_add_dir(const char *directory)
{
opal_argv_append_nosize(&search_dirs, directory);
Expand Down
Loading