diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index ef7fcf0f998..94206e01b94 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -16,7 +16,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2017 Cisco Systems, Inc. All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,6 +41,7 @@ #include "opal/util/if.h" #include "opal/util/net.h" #include "opal/util/proc.h" +#include "opal/util/show_help.h" #include "btl_tcp.h" #include "btl_tcp_proc.h" @@ -122,16 +123,18 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) return btl_proc; } - do { + do { /* This loop is only necessary so that we can break out of the serial code */ btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); if(NULL == btl_proc) { rc = OPAL_ERR_OUT_OF_RESOURCE; break; } - btl_proc->proc_opal = proc; - - OBJ_RETAIN(btl_proc->proc_opal); + /* Retain the proc, but don't store the ref into the btl_proc just yet. This + * provides a way to release the btl_proc in case of failure without having to + * unlock the mutex. + */ + OBJ_RETAIN(proc); /* lookup tcp parameters exported by this proc */ OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version, @@ -181,12 +184,14 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) } while (0); if (OPAL_SUCCESS == rc) { + btl_proc->proc_opal = proc; /* link with the proc */ /* add to hash table of all proc instance. */ opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs, proc->proc_name, btl_proc); } else { if (btl_proc) { - OBJ_RELEASE(btl_proc); + OBJ_RELEASE(btl_proc); /* release the local proc */ + OBJ_RELEASE(proc); /* and the ref on the OMPI proc */ btl_proc = NULL; } } @@ -823,9 +828,38 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); return; } - OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); /* No further use of this socket. Close it */ CLOSE_THE_SOCKET(sd); + { + size_t len = 1024; + char* addr_str = (char*)malloc(len); + if( NULL != addr_str ) { + memset(addr_str, 0, len); + for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) { + mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i]; + if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) { + continue; + } + + if (addr_str[0] != '\0') { + strncat(addr_str, ", ", len); + len -= 2; + } + strncat(addr_str, inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_inet, + addr_str + 1024 - len, INET6_ADDRSTRLEN), len); + len = 1024 - strlen(addr_str); + } + } + opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection", + true, opal_process_info.nodename, + getpid(), + btl_proc->proc_opal->proc_hostname, + OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name), + opal_net_get_hostname((struct sockaddr*)addr), + addr_str); + free(addr_str); + } + OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); } /* diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index 1e6e55cee73..a5781f7ed0c 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2009-2017 Cisco Systems, Inc. All rights reserved # Copyright (c) 2015-2016 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. @@ -85,3 +85,18 @@ or other external events. Local PID: %d Peer host: %s # +[dropped inbound connection] +Open MPI detected an inbound MPI TCP connection request from a peer +that appears to be part of this MPI job (i.e., it identified itself as +part of this Open MPI job), but it is from an IP address that is +unexpected. This is highly unusual. + +The inbound connection has been dropped, and the peer should simply +try again with a different IP interface (i.e., the job should +hopefully be able to continue). + + Local host: %s + Local PID: %d + Peer hostname: %s (%s) + Source IP of socket: %s + Known IPs of peer: %s diff --git a/opal/util/net.c b/opal/util/net.c index 5765c913b02..190336ca7d3 100644 --- a/opal/util/net.c +++ b/opal/util/net.c @@ -385,7 +385,6 @@ opal_net_get_hostname(const struct sockaddr *addr) if(NULL == inet_ntop(AF_INET6, &((struct sockaddr_in6*) addr)->sin6_addr, name, NI_MAXHOST)) { opal_output(0, "opal_sockaddr2str failed with error code %d", errno); - free(name); return NULL; } return name; @@ -394,7 +393,6 @@ opal_net_get_hostname(const struct sockaddr *addr) #endif break; default: - free(name); return NULL; } @@ -405,7 +403,6 @@ opal_net_get_hostname(const struct sockaddr *addr) int err = errno; opal_output (0, "opal_sockaddr2str failed:%s (return code %i)\n", gai_strerror(err), error); - free (name); return NULL; } /* strip any trailing % data as it isn't pertinent */