Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 41 additions & 7 deletions opal/mca/btl/tcp/btl_tcp_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2017 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -41,6 +41,7 @@
#include "opal/util/if.h"
#include "opal/util/net.h"
#include "opal/util/proc.h"
#include "opal/util/show_help.h"

#include "btl_tcp.h"
#include "btl_tcp_proc.h"
Expand Down Expand Up @@ -125,16 +126,18 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
return btl_proc;
}

do {
do { /* This loop is only necessary so that we can break out of the serial code */
btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
if(NULL == btl_proc) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
break;
}

btl_proc->proc_opal = proc;

OBJ_RETAIN(btl_proc->proc_opal);
/* Retain the proc, but don't store the ref into the btl_proc just yet. This
* provides a way to release the btl_proc in case of failure without having to
* unlock the mutex.
*/
OBJ_RETAIN(proc);

/* lookup tcp parameters exported by this proc */
OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
Expand Down Expand Up @@ -184,12 +187,14 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
} while (0);

if (OPAL_SUCCESS == rc) {
btl_proc->proc_opal = proc; /* link with the proc */
/* add to the hash table of all proc instances */
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
proc->proc_name, btl_proc);
} else {
if (btl_proc) {
OBJ_RELEASE(btl_proc);
OBJ_RELEASE(btl_proc); /* release the local proc */
OBJ_RELEASE(proc); /* and the ref on the OMPI proc */
btl_proc = NULL;
}
}
Expand Down Expand Up @@ -807,9 +812,38 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
return;
}
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
/* No further use of this socket. Close it */
CLOSE_THE_SOCKET(sd);
{
size_t len = 1024;
char* addr_str = (char*)malloc(len);
if( NULL != addr_str ) {
memset(addr_str, 0, len);
for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) {
mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
continue;
}

if (addr_str[0] != '\0') {
strncat(addr_str, ", ", len);
len -= 2;
}
strncat(addr_str, inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_inet,
addr_str + 1024 - len, INET6_ADDRSTRLEN), len);
len = 1024 - strlen(addr_str);
}
}
opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
true, opal_process_info.nodename,
getpid(),
btl_proc->proc_opal->proc_hostname,
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
opal_net_get_hostname((struct sockaddr*)addr),
addr_str);
free(addr_str);
}
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
}

/*
Expand Down
17 changes: 16 additions & 1 deletion opal/mca/btl/tcp/help-mpi-btl-tcp.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2009-2017 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2015-2016 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
Expand Down Expand Up @@ -91,3 +91,18 @@ or other external events.
Local PID: %d
Peer host: %s
#
[dropped inbound connection]
Open MPI detected an inbound MPI TCP connection request from a peer
that appears to be part of this MPI job (i.e., it identified itself as
part of this Open MPI job), but the request came from an unexpected IP
address. This is highly unusual.

The inbound connection has been dropped, and the peer should simply
try again with a different IP interface, so the job should hopefully
be able to continue.

Local host: %s
Local PID: %d
Peer hostname: %s (%s)
Source IP of socket: %s
Known IPs of peer: %s
3 changes: 0 additions & 3 deletions opal/util/net.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,6 @@ opal_net_get_hostname(const struct sockaddr *addr)
if(NULL == inet_ntop(AF_INET6, &((struct sockaddr_in6*) addr)->sin6_addr,
name, NI_MAXHOST)) {
opal_output(0, "opal_sockaddr2str failed with error code %d", errno);
free(name);
return NULL;
}
return name;
Expand All @@ -394,7 +393,6 @@ opal_net_get_hostname(const struct sockaddr *addr)
#endif
break;
default:
free(name);
return NULL;
}

Expand All @@ -405,7 +403,6 @@ opal_net_get_hostname(const struct sockaddr *addr)
int err = errno;
opal_output (0, "opal_sockaddr2str failed:%s (return code %i)\n",
gai_strerror(err), error);
free (name);
return NULL;
}
/* strip any trailing % data as it isn't pertinent */
Expand Down