diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 408689daa8..b87aa00276 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -410,6 +410,8 @@ static int mca_bml_r2_add_proc (struct ompi_proc_t *proc) } if (!btl_in_use) { + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL; + OBJ_RELEASE(bml_endpoint); /* no btl is available for this proc */ if (mca_bml_r2.show_unreach_errors) { opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true, diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 893e6cebec..bd5fd767e3 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -140,6 +140,10 @@ int mca_pml_ob1_isend(const void *buf, int16_t seqn; int rc; + if (OPAL_UNLIKELY(NULL == endpoint)) { + return OMPI_ERR_UNREACH; + } + seqn = (uint16_t) OPAL_THREAD_ADD32(&ob1_proc->send_sequence, 1); if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) { @@ -189,6 +193,10 @@ int mca_pml_ob1_send(const void *buf, int16_t seqn; int rc; + if (OPAL_UNLIKELY(NULL == endpoint)) { + return OMPI_ERR_UNREACH; + } + if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_BUFFERED == sendmode)) { /* large buffered sends *need* a real request so use isend instead */ ompi_request_t *brequest; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 71fb8c3d5b..77300a8320 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -435,6 +435,8 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl; mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (proc); + assert (NULL != endpoint); + for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c index 67a4fb2954..4a915993f0 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c @@ -5,7 +5,7 @@ * Copyright (c) 2008 Mellanox Technologies. All rights reserved. * Copyright (c) 2009 Sandia National Laboratories. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2014 Intel, Inc. All rights reserved * Copyright (c) 2014 The University of Tennessee and The University @@ -185,6 +185,7 @@ typedef struct { #endif uint32_t rem_index; uint8_t qpnum; + opal_process_name_t rem_name; } __opal_attribute_packed__ private_data_t; #if !BTL_OPENIB_RDMACM_IB_ADDR @@ -376,68 +377,23 @@ static char *stringify(uint32_t addr) * the rdma_cm event id */ static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *contents, - struct rdma_cm_id *id, -#if BTL_OPENIB_RDMACM_IB_ADDR - uint64_t rem_port) -#else - uint16_t rem_port) -#endif + opal_process_name_t rem_name) { - int i; + mca_btl_openib_module_t *btl = contents->openib_btl; mca_btl_openib_endpoint_t *ep = NULL; - opal_pointer_array_t *endpoints = contents->openib_btl->device->endpoints; - - struct sockaddr *peeraddr = rdma_get_peer_addr(id); -#if BTL_OPENIB_RDMACM_IB_ADDR - union ibv_gid *ep_gid, peer_gid; - memcpy(peer_gid.raw, ((struct sockaddr_ib *) peeraddr)->sib_addr.sib_raw, sizeof peer_gid); -#else - uint32_t peeripaddr = ((struct sockaddr_in *) peeraddr)->sin_addr.s_addr; - -#if OPAL_ENABLE_DEBUG - char *a; -#endif - - OPAL_OUTPUT((-1, "remote peer requesting connection: %s port %d", - a = stringify(peeripaddr), rem_port)); -#if OPAL_ENABLE_DEBUG - free(a); -#endif -#endif + opal_proc_t *opal_proc; - for (i = 0; i < opal_pointer_array_get_size(endpoints); i++) { - mca_btl_openib_endpoint_t *endpoint; - modex_message_t *message; - - endpoint = (mca_btl_openib_endpoint_t *) opal_pointer_array_get_item(endpoints, i); - if (NULL == endpoint) { - continue; - } - - message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; -#if !BTL_OPENIB_RDMACM_IB_ADDR - OPAL_OUTPUT((-1, "message ipaddr = %s port %d", - a = stringify(message->ipaddr), message->tcp_port)); -#if OPAL_ENABLE_DEBUG - free(a); -#endif -#endif - -#if BTL_OPENIB_RDMACM_IB_ADDR - ep_gid = (union ibv_gid *) message->gid; - if (ep_gid->global.interface_id == peer_gid.global.interface_id && - ep_gid->global.subnet_prefix == peer_gid.global.subnet_prefix && - message->service_id == rem_port) { -#else - if (message->ipaddr == peeripaddr && message->tcp_port == rem_port) { -#endif - ep = endpoint; - break; - } + opal_proc = opal_proc_for_name (rem_name); + if (NULL == opal_proc) { + BTL_ERROR(("could not get proc associated with remote peer %s", + opal_process_name_print (rem_name))); + return NULL; } + ep = mca_btl_openib_get_ep (&btl->super, opal_proc); if (NULL == ep) { - BTL_ERROR(("can't find suitable endpoint for this peer")); + BTL_ERROR(("could not find endpoint for peer %s", + opal_process_name_print (rem_name))); } return ep; @@ -986,6 +942,7 @@ static int handle_connect_request(struct rdma_cm_event *event) rdmacm_contents_t *contents = listener_context->contents; mca_btl_openib_endpoint_t *endpoint; struct rdma_conn_param conn_param; + opal_process_name_t rem_name; modex_message_t *message; private_data_t msg; int rc = -1, qpnum; @@ -999,10 +956,11 @@ static int handle_connect_request(struct rdma_cm_event *event) qpnum = ((private_data_t *)event->param.conn.private_data)->qpnum; rem_port = ((private_data_t *)event->param.conn.private_data)->rem_port; rem_index = ((private_data_t *)event->param.conn.private_data)->rem_index; + rem_name = ((private_data_t *)event->param.conn.private_data)->rem_name; /* Determine which endpoint the remote side is trying to connect to; use the listener's context->contents to figure it out */ - endpoint = rdmacm_find_endpoint(contents, event->id, rem_port); + endpoint = rdmacm_find_endpoint(contents, rem_name); if (NULL == endpoint) { #if !BTL_OPENIB_RDMACM_IB_ADDR struct sockaddr *peeraddr = rdma_get_peer_addr(event->id); @@ -1145,6 +1103,7 @@ static int handle_connect_request(struct rdma_cm_event *event) /* Fill the private data being sent to the other side */ msg.qpnum = qpnum; msg.rem_index = endpoint->index; + msg.rem_name = OPAL_PROC_MY_NAME; /* Accepting the connection will result in a RDMA_CM_EVENT_ESTABLISHED event on both the client and server @@ -1617,6 +1576,7 @@ static int finish_connect(id_context_t *context) msg.qpnum = context->qpnum; msg.rem_index = contents->endpoint->index; + msg.rem_name = OPAL_PROC_MY_NAME; #if BTL_OPENIB_RDMACM_IB_ADDR memset(msg.librdmacm_header, 0, sizeof(msg.librdmacm_header)); msg.rem_port = contents->service_id;