From bd8457707089333983cb557fff16b5790558f918 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 1 Mar 2017 13:17:54 -0500 Subject: [PATCH 1/3] Allow a TCP proc release during the create. This is mostly for error cases, where we need to release the newly created proc. Currently the code deadlocks because the endpoint lock is help at the release and the lock is not recursive. Aslo added some code to print the IP addresses that don't match during the TCP connection step. Thanks to Evgueni Petrov @Welcome-Dear-Evgueni for reporting the issue. Signed-off-by: George Bosilca (cherry picked from commit ec4a235e6a55af7aaa03d7cdc30853016509651d) Signed-off-by: Jeff Squyres --- opal/mca/btl/tcp/btl_tcp_proc.c | 43 ++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index 68ea3f020c6..ad3745a5445 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -125,16 +125,18 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) return btl_proc; } - do { + do { /* This loop is only necessary so that we can break out of the serial code */ btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); if(NULL == btl_proc) { rc = OPAL_ERR_OUT_OF_RESOURCE; break; } - btl_proc->proc_opal = proc; - - OBJ_RETAIN(btl_proc->proc_opal); + /* Retain the proc, but don't store the ref into the btl_proc just yet. This + * provides a way to release the btl_proc in case of failure without having to + * unlock the mutex. + */ + OBJ_RETAIN(proc); /* lookup tcp parameters exported by this proc */ OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version, @@ -184,12 +186,14 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) } while (0); if (OPAL_SUCCESS == rc) { + btl_proc->proc_opal = proc; /* link with the proc */ /* add to hash table of all proc instance. */ opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs, proc->proc_name, btl_proc); } else { if (btl_proc) { - OBJ_RELEASE(btl_proc); + OBJ_RELEASE(btl_proc); /* release the local proc */ + OBJ_RELEASE(proc); /* and the ref on the OMPI proc */ btl_proc = NULL; } } @@ -807,9 +811,36 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); return; } - OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); /* No further use of this socket. Close it */ CLOSE_THE_SOCKET(sd); + { + size_t len = 1024; + char* addr_str = (char*)malloc(len); + if( NULL != addr_str ) { + memset(addr_str, 0, len); + for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) { + mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i]; + if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) { + continue; + } + + if (addr_str[0] != '\0') { + strncat(addr_str, ", ", len); + len -= 2; + } + strncat(addr_str, inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_inet, + addr_str + 1024 - len, INET6_ADDRSTRLEN), len); + len = 1024 - strlen(addr_str); + } + } + opal_output(0, "btl: tcp: Incoming connection from %s does not match known addresses for peer %s [hostname=%s addr=%s]. Drop !\n", + opal_net_get_hostname((struct sockaddr*)addr), + OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name), + btl_proc->proc_opal->proc_hostname, + addr_str); + free(addr_str); + } + OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); } /* From aec1235cc3941a3ac0418a06a159bba53052bd4c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 1 Mar 2017 13:20:21 -0500 Subject: [PATCH 2/3] Never free the statically allocated buffer. Signed-off-by: George Bosilca (cherry picked from commit b0f8d2c46098ec6103971784fe08834541510be6) --- opal/util/net.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/opal/util/net.c b/opal/util/net.c index 5765c913b02..190336ca7d3 100644 --- a/opal/util/net.c +++ b/opal/util/net.c @@ -385,7 +385,6 @@ opal_net_get_hostname(const struct sockaddr *addr) if(NULL == inet_ntop(AF_INET6, &((struct sockaddr_in6*) addr)->sin6_addr, name, NI_MAXHOST)) { opal_output(0, "opal_sockaddr2str failed with error code %d", errno); - free(name); return NULL; } return name; @@ -394,7 +393,6 @@ opal_net_get_hostname(const struct sockaddr *addr) #endif break; default: - free(name); return NULL; } @@ -405,7 +403,6 @@ opal_net_get_hostname(const struct sockaddr *addr) int err = errno; opal_output (0, "opal_sockaddr2str failed:%s (return code %i)\n", gai_strerror(err), error); - free (name); return NULL; } /* strip any trailing % data as it isn't pertinent */ From a0d29ac7491893f353684e43faadf341c55220e5 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Wed, 1 Mar 2017 15:55:49 -0800 Subject: [PATCH 3/3] btl/tcp: use show_help to print the dropped-TCP warning Make the message more friendly / more detailed, and de-duplicate it (just in case it happens a lot). Signed-off-by: Jeff Squyres (cherry picked from commit 5b484c91f4bb8e28197cead99d30fba92868e899) --- opal/mca/btl/tcp/btl_tcp_proc.c | 15 +++++++++------ opal/mca/btl/tcp/help-mpi-btl-tcp.txt | 17 ++++++++++++++++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index ad3745a5445..c2b55d88e2a 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -16,7 +16,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2017 Cisco Systems, Inc. All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,6 +41,7 @@ #include "opal/util/if.h" #include "opal/util/net.h" #include "opal/util/proc.h" +#include "opal/util/show_help.h" #include "btl_tcp.h" #include "btl_tcp_proc.h" @@ -833,11 +834,13 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr len = 1024 - strlen(addr_str); } } - opal_output(0, "btl: tcp: Incoming connection from %s does not match known addresses for peer %s [hostname=%s addr=%s]. Drop !\n", - opal_net_get_hostname((struct sockaddr*)addr), - OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name), - btl_proc->proc_opal->proc_hostname, - addr_str); + opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection", + true, opal_process_info.nodename, + getpid(), + btl_proc->proc_opal->proc_hostname, + OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name), + opal_net_get_hostname((struct sockaddr*)addr), + addr_str); free(addr_str); } OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index 4b3f4544a37..5fbb63763bb 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2009-2017 Cisco Systems, Inc. All rights reserved # Copyright (c) 2015-2016 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. @@ -91,3 +91,18 @@ or other external events. Local PID: %d Peer host: %s # +[dropped inbound connection] +Open MPI detected an inbound MPI TCP connection request from a peer +that appears to be part of this MPI job (i.e., it identified itself as +part of this Open MPI job), but it is from an IP address that is +unexpected. This is highly unusual. + +The inbound connection has been dropped, and the peer should simply +try again with a different IP interface (i.e., the job should +hopefully be able to continue). + + Local host: %s + Local PID: %d + Peer hostname: %s (%s) + Source IP of socket: %s + Known IPs of peer: %s