From a004f16ab707c78c05a73df70aab27183f7d90d8 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 16 Feb 2016 16:42:50 -0700 Subject: [PATCH 1/7] btl/openib: fix regression in XRC support Commit open-mpi/ompi@400af6c52dadacf16a685c70fc56d21649cc4541 introduced a regression in XRC support. The commit reversed the ordering of shared receive queue (SRQ) and completion queue (CQ) completion. CQ creation must always precede SRQ creation when using XRC as the CQs are needed to create the SRQs. This commit fixes the ordering so that CQs are always created before SRQs. Signed-off-by: Nathan Hjelm (cherry picked from open-mpi/ompi@123a39ac3c4c3fd83d8db52de8a2001563803b2f) Signed-off-by: Nathan Hjelm --- opal/mca/btl/openib/btl_openib.c | 63 ++++++++++++++++++-------------- opal/mca/btl/openib/btl_openib.h | 3 +- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 2722f928e4..e5c4343efd 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2015 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. 
@@ -399,6 +399,8 @@ static int create_srq(mca_btl_openib_module_t *openib_btl) } } + openib_btl->srqs_created = true; + return OPAL_SUCCESS; } @@ -406,7 +408,7 @@ static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl) { int rc = OPAL_SUCCESS; opal_mutex_lock(&openib_btl->ib_lock); - if (0 == openib_btl->num_peers && + if (!openib_btl->srqs_created && (mca_btl_openib_component.num_srq_qps > 0 || mca_btl_openib_component.num_xrc_qps > 0)) { rc = create_srq(openib_btl); @@ -416,17 +418,12 @@ static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl) } -static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl, size_t nprocs) +static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl) { uint32_t send_cqes, recv_cqes; int rc = OPAL_SUCCESS, qp; mca_btl_openib_device_t *device = openib_btl->device; - if( 0 == nprocs){ - /* nothing to do */ - return OPAL_SUCCESS; - } - opal_mutex_lock(&openib_btl->ib_lock); /* figure out reasonable sizes for completion queues */ for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { @@ -435,7 +432,7 @@ static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl, si recv_cqes = mca_btl_openib_component.qp_infos[qp].rd_num; } else { send_cqes = (mca_btl_openib_component.qp_infos[qp].rd_num + - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) * nprocs; + mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) * openib_btl->num_peers; recv_cqes = send_cqes; } @@ -455,7 +452,6 @@ static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl, si goto out; } - openib_btl->num_peers += nprocs; out: opal_mutex_unlock(&openib_btl->ib_lock); return rc; @@ -1028,10 +1024,14 @@ int mca_btl_openib_add_procs( return rc; } - rc = openib_btl_prepare(openib_btl); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("could not prepare openib btl structure for usel")); - return rc; + if (0 == openib_btl->num_peers) { + /* ensure completion queues are created 
before attempting to + * make a loop-back queue pair */ + rc = openib_btl_size_queues(openib_btl); + if (OPAL_SUCCESS != rc) { + BTL_ERROR(("error creating cqs")); + return rc; + } } /* prepare all proc's and account them properly */ @@ -1080,10 +1080,20 @@ int mca_btl_openib_add_procs( } } - /* account this procs if need */ - rc = openib_btl_size_queues(openib_btl, nprocs_new); + if (nprocs_new) { + OPAL_THREAD_ADD32(&openib_btl->num_peers, nprocs_new); + + /* adjust cq sizes given the new procs */ + rc = openib_btl_size_queues (openib_btl); + if (OPAL_SUCCESS != rc) { + BTL_ERROR(("error creating cqs")); + return rc; + } + } + + rc = openib_btl_prepare (openib_btl); if (OPAL_SUCCESS != rc) { - BTL_ERROR(("error creating cqs")); + BTL_ERROR(("could not prepare openib btl module for use")); return rc; } @@ -1156,10 +1166,8 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul { mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl; volatile mca_btl_base_endpoint_t *endpoint = NULL; + int local_port_cnt = 0, btl_rank, rc; mca_btl_openib_proc_t *ib_proc; - int rc; - int local_port_cnt = 0, btl_rank; - size_t nprocs_new = 0; rc = prepare_device_for_use (openib_btl->device); if (OPAL_SUCCESS != rc) { @@ -1167,12 +1175,6 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul return NULL; } - rc = openib_btl_prepare(openib_btl); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("could not prepare openib btl structure for use")); - return NULL; - } - if (NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc))) { /* if we don't have connection info for this process, it's * okay because some other method might be able to reach it, @@ -1189,7 +1191,8 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul /* this is a new process to this openib btl * account this procs if need */ - rc = openib_btl_size_queues(openib_btl, nprocs_new); + OPAL_THREAD_ADD32(&openib_btl->num_peers, 1); 
+ rc = openib_btl_size_queues(openib_btl); if (OPAL_SUCCESS != rc) { BTL_ERROR(("error creating cqs")); return NULL; @@ -1214,6 +1217,12 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul return NULL; } + rc = openib_btl_prepare(openib_btl); + if (OPAL_SUCCESS != rc) { + BTL_ERROR(("could not prepare openib btl structure for use")); + goto exit; + } + for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) { endpoint = ib_proc->proc_endpoints[j]; if (endpoint->endpoint_btl == openib_btl) { diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index c0dd643e56..9b6464b272 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. @@ -465,6 +465,7 @@ struct mca_btl_openib_module_t { mca_btl_base_module_t super; bool btl_inited; + bool srqs_created; /** Common information about all ports */ mca_btl_openib_modex_message_t port_info; From 87500e15c7890cde42e9476f11ddccd66aa65eb6 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 16 Feb 2016 16:46:32 -0700 Subject: [PATCH 2/7] btl/openib: fix error in param check in mca_btl_openib_put mca_btl_openib_put incorrectly checks the qp inline max before allowing an inline put. This check will always fail for an endpoint that has not been connected. The commit changes the check to use the btl_put_local_registration_threshold instead. 
Signed-off-by: Nathan Hjelm (cherry picked from open-mpi/ompi@201c280e6c10653f363e9324c989f479a2f0364a) Signed-off-by: Nathan Hjelm --- opal/mca/btl/openib/btl_openib_put.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opal/mca/btl/openib/btl_openib_put.c b/opal/mca/btl/openib/btl_openib_put.c index 25b5d3f532..ae46e921bd 100644 --- a/opal/mca/btl/openib/btl_openib_put.c +++ b/opal/mca/btl/openib/btl_openib_put.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. @@ -49,7 +49,7 @@ int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint qp = mca_btl_openib_component.rdma_qp; } - if (OPAL_UNLIKELY((ep->qps[qp].ib_inline_max < size && !local_handle) || !remote_handle || + if (OPAL_UNLIKELY((btl->btl_put_local_registration_threshold < size && !local_handle) || !remote_handle || size > btl->btl_put_limit)) { return OPAL_ERR_BAD_PARAM; } @@ -164,7 +164,7 @@ int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base if (0 != (rc = ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr))) { qp_put_wqe(ep, qp); - return OPAL_ERROR;; + return OPAL_ERROR; } return OPAL_SUCCESS; From fd223cab55905d7347a063f87d8fe4e33968447d Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 16 Feb 2016 16:49:04 -0700 Subject: [PATCH 3/7] btl/openib/udcm: fix XRC support This commit fixes two bugs in XRC support - When dynamic add_procs support was added to master the remote process name was added to the non-XRC request structure. 
The same value was not added to the XRC xconnect structure. This error was not caught because the send/recv code was incorrectly using the wrong structure member. This commit adds the member and ensures the xconnect code uses the correct structure. - XRC loopback QP support has been fixed. It was 1) not setting the correct fields on the endpoint structure, 2) calling udcm_xrc_recv_qp_connect, and 3) was not initializing the endpoint data. Signed-off-by: Nathan Hjelm (cherry picked from open-mpi/ompi@bf8360388f6a0f4fe6fabf299d1f2476942a1517) Signed-off-by: Nathan Hjelm --- .../openib/connect/btl_openib_connect_udcm.c | 69 +++++++++---------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index 947ca371f0..87bd7074b7 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -3,7 +3,7 @@ * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
@@ -240,6 +240,7 @@ typedef struct udcm_msg_hdr { #if HAVE_XRC /* UDCM_MESSAGE_XCONNECT, UDCM_MESSAGE_XCONNECT2 */ struct msg_xrc_connect { + opal_process_name_t rem_name; int32_t rem_ep_index; uint8_t rem_port_num; uint32_t rem_qp_num; @@ -343,11 +344,7 @@ static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc, static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep); static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn); static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep); -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num); -#else -static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep); -#endif static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn); static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, uint8_t msg_type); @@ -529,27 +526,24 @@ static int udcm_component_finalize(void) static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep) { udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); + int32_t recv_qpn; int rc; opal_mutex_lock (&udep->udep_lock); do { -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - rc = udcm_xrc_recv_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num); -#else - lcl_ep->xrc_recv_qp_num = lcl_ep->qps[0].qp->lcl_qp->qp_num; - rc = udcm_xrc_recv_qp_connect (lcl_ep); -#endif - if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("error connecting loopback XRC receive queue pair")); + if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) { + BTL_VERBOSE(("error initializing loopback endpoint cpc data")); break; } - rc = mca_btl_openib_endpoint_post_recvs (lcl_ep); + rc = udcm_xrc_send_qp_create (lcl_ep); if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("error posting receives for loopback queue pair")); + BTL_VERBOSE(("error creating send queue 
pair for loopback endpoint")); break; } + lcl_ep->rem_info.rem_index = lcl_ep->index; + rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num, lcl_ep->qps[0].qp->lcl_psn); if (OPAL_SUCCESS != rc) { @@ -557,14 +551,22 @@ static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep) break; } - rc = udcm_xrc_send_qp_connect (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num, - lcl_ep->qps[0].qp->lcl_psn); +#if OPAL_HAVE_CONNECTX_XRC_DOMAINS + recv_qpn = lcl_ep->xrc_recv_qp->qp_num; +#else + recv_qpn = lcl_ep->xrc_recv_qp_num; +#endif + + lcl_ep->rem_info.rem_qps[0].rem_psn = lcl_ep->xrc_recv_psn; + lcl_ep->rem_info.rem_qps[0].rem_qp_num = recv_qpn; + + rc = udcm_xrc_send_qp_connect (lcl_ep, recv_qpn, lcl_ep->xrc_recv_psn); if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("error creating loopback XRC send queue pair")); + BTL_VERBOSE(("error connecting loopback XRC send queue pair")); break; } - lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED; + BTL_VERBOSE(("successfully created loopback queue pair")); /* need to hold the endpoint lock before calling udcm_finish_connection */ OPAL_THREAD_LOCK(&lcl_ep->endpoint_lock); @@ -606,8 +608,6 @@ static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep) break; } - lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTED; - /* need to hold the endpoint lock before calling udcm_finish_connection */ OPAL_THREAD_LOCK(&lcl_ep->endpoint_lock); rc = udcm_finish_connection (lcl_ep); @@ -2607,11 +2607,7 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep) /* mark: xrc receive qp */ /* Recv qp connect */ -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num) -#else -static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep) -#endif { mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl; @@ -2625,9 +2621,9 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep) BTL_VERBOSE(("Connecting 
Recv QP\n")); lcl_ep->xrc_recv_qp = ibv_open_qp(openib_btl->device->ib_dev_context, &attr); if (NULL == lcl_ep->xrc_recv_qp) { /* failed to regester the qp, so it is already die and we should create new one */ - /* Return NOT READY !!!*/ - BTL_ERROR(("Failed to register qp_num: %d, get error: %s (%d)\n. Replying with RNR", - qp_num, strerror(errno), errno)); + /* Return NOT READY !!!*/ + BTL_VERBOSE(("Failed to register qp_num: %d, get error: %s (%d)\n. Replying with RNR", + qp_num, strerror(errno), errno)); return OPAL_ERROR; } else { BTL_VERBOSE(("Connected to XRC Recv qp [%d]", lcl_ep->xrc_recv_qp->qp_num)); @@ -2635,13 +2631,16 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep) } #else int ret; + /* silence unused variable warning */ + (void) qp_num; + BTL_VERBOSE(("Connecting receive qp: %d", lcl_ep->xrc_recv_qp_num)); ret = ibv_reg_xrc_rcv_qp(openib_btl->device->xrc_domain, lcl_ep->xrc_recv_qp_num); if (ret) { /* failed to regester the qp, so it is already die and we should create new one */ /* Return NOT READY !!!*/ lcl_ep->xrc_recv_qp_num = 0; - BTL_ERROR(("Failed to register qp_num: %d , get error: %s (%d). Replying with RNR", - lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); + BTL_VERBOSE(("Failed to register qp_num: %d , get error: %s (%d). 
Replying with RNR", + lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); return OPAL_ERROR; } #endif @@ -2817,9 +2816,9 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_ return rc; } - msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index); - msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num; - msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME; + msg->data->hdr.data.xreq.rem_ep_index = htonl(lcl_ep->index); + msg->data->hdr.data.xreq.rem_port_num = m->modex.mm_port_num; + msg->data->hdr.data.xreq.rem_name = OPAL_PROC_MY_NAME; if (UDCM_MESSAGE_XCONNECT == msg_type) { BTL_VERBOSE(("Sending XConnect with qp: %d, psn: %d", lcl_ep->qps[0].qp->lcl_qp->qp_num, @@ -2923,11 +2922,7 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg if (UDCM_MESSAGE_XCONNECT2 == msg_hdr->type) { response_type = UDCM_MESSAGE_XRESPONSE2; -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS rc = udcm_xrc_recv_qp_connect (lcl_ep, msg_hdr->data.xreq.rem_qp_num); -#else - rc = udcm_xrc_recv_qp_connect (lcl_ep); -#endif if (OPAL_SUCCESS != rc) { /* return not ready. remote side will retry */ rej_reason = UDCM_REJ_NOT_READY; From 5137cea5ab7bc8f3c3a672e326ec8428a2461a28 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 17 Feb 2016 14:54:19 -0700 Subject: [PATCH 4/7] btl/openib/udcm: fix local XRC connections This commit ensures ib_addr->remote_xrc_rcv_qp_num value is set when creating the loopback queue pair. This is needed when communicating with any other local peer. 
Signed-off-by: Nathan Hjelm (cherry picked from commit open-mpi/ompi@4f4ea96940ef8ddd7c626644e01b80ac0680d50c) Signed-off-by: Nathan Hjelm --- opal/mca/btl/openib/connect/btl_openib_connect_udcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index 87bd7074b7..55c76af6e4 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -557,6 +557,7 @@ static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep) recv_qpn = lcl_ep->xrc_recv_qp_num; #endif + lcl_ep->ib_addr->remote_xrc_rcv_qp_num = recv_qpn; lcl_ep->rem_info.rem_qps[0].rem_psn = lcl_ep->xrc_recv_psn; lcl_ep->rem_info.rem_qps[0].rem_qp_num = recv_qpn; From 0043b0e3fe30f90aeff535865270659b4829d4cf Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 18 Feb 2016 14:45:07 -0700 Subject: [PATCH 5/7] btl/openib: XRC fix bug that could cause an invalid SRQ# to be used This commit fixes a bug that occurs when attempting a get or put operation on an endpoint that is not already connected. In this case the remote_srqn may be set to an invalid value as the rem_srqs array on the endpoint is not populated. This commit moves the usage of the rem_srqs array to the internal put/get functions where it is guaranteed this array is populated. 
Signed-off-by: Nathan Hjelm (cherry picked from commit open-mpi/ompi@4dc73d7765b5c0ca25214ea3c95d5e19998a8b5f) Signed-off-by: Nathan Hjelm --- opal/mca/btl/openib/btl_openib_atomic.c | 13 ++---------- opal/mca/btl/openib/btl_openib_get.c | 25 ++++++++++++---------- opal/mca/btl/openib/btl_openib_put.c | 28 +++++++++++++------------ 3 files changed, 31 insertions(+), 35 deletions(-) diff --git a/opal/mca/btl/openib/btl_openib_atomic.c b/opal/mca/btl/openib/btl_openib_atomic.c index 8d7ac86f73..0c6460f2cf 100644 --- a/opal/mca/btl/openib/btl_openib_atomic.c +++ b/opal/mca/btl/openib/btl_openib_atomic.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -73,16 +73,7 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st frag->sr_desc.wr.atomic.rkey = rkey; -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - frag->sr_desc.qp_type.xrc.remote_srqn = endpoint->rem_info.rem_srqs[qp].rem_srq_num; -#else - frag->sr_desc.xrc_remote_srq_num = endpoint->rem_info.rem_srqs[qp].rem_srq_num; -#endif - - } -#endif + /* NTH: the SRQ# is set in mca_btl_get_internal */ if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) { OPAL_THREAD_LOCK(&endpoint->endpoint_lock); diff --git a/opal/mca/btl/openib/btl_openib_get.c b/opal/mca/btl/openib/btl_openib_get.c index 2d335619c1..c8bc78105d 100644 --- a/opal/mca/btl/openib/btl_openib_get.c +++ b/opal/mca/btl/openib/btl_openib_get.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. 
- * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. @@ -92,16 +92,6 @@ int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint frag->sr_desc.wr.rdma.rkey = remote_handle->rkey; } -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; -#else - frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif - } -#endif - if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags); @@ -138,6 +128,19 @@ int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base int qp = to_base_frag(frag)->base.order; struct ibv_send_wr *bad_wr; +#if HAVE_XRC + if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { + /* NTH: the remote SRQ number is only available once the endpoint is connected. By + * setting the value here instead of mca_btl_openib_get we guarantee the rem_srqs + * array is initialized. 
*/ +#if OPAL_HAVE_CONNECTX_XRC_DOMAINS + frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; +#else + frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; +#endif + } +#endif + /* check for a send wqe */ if (qp_get_wqe(ep, qp) < 0) { qp_put_wqe(ep, qp); diff --git a/opal/mca/btl/openib/btl_openib_put.c b/opal/mca/btl/openib/btl_openib_put.c index ae46e921bd..2a9ee2ddd6 100644 --- a/opal/mca/btl/openib/btl_openib_put.c +++ b/opal/mca/btl/openib/btl_openib_put.c @@ -101,19 +101,6 @@ int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey; } -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { - -#if OPAL_HAVE_CONNECTX_XRC - to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; -#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS - to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; -#else -#error "that should never happen" -#endif - } -#endif - if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { OPAL_THREAD_LOCK(&ep->endpoint_lock); rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags); @@ -153,6 +140,21 @@ int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base struct ibv_send_wr *bad_wr; int rc; +#if HAVE_XRC + if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { + /* NTH: the remote SRQ number is only available once the endpoint is connected. By + * setting the value here instead of mca_btl_openib_put we guarantee the rem_srqs + * array is initialized. 
*/ +#if OPAL_HAVE_CONNECTX_XRC + to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; +#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS + to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; +#else +#error "that should never happen" +#endif + } +#endif + /* check for a send wqe */ if (qp_get_wqe(ep, qp) < 0) { qp_put_wqe(ep, qp); From 8109dca8102a0659e363eb8ad3604d2858bd995d Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 18 Feb 2016 15:55:34 -0700 Subject: [PATCH 6/7] btl/openib: fix locking bugs with XRC ib_addr lock This commit fixes two issues with the ib_addr lock: - The ib_addr lock must always be obtained regardless of opal_using_threads() as the CPC is run in a separate thread. - The ib_addr lock is held in mca_btl_openib_endpoint_connected when calling back into the CPC start_connect on any pending connections. This will attempt to obtain the ib_addr lock again. Since this is not a performance-critical part of the code the lock has been changed to be recursive. Signed-off-by: Nathan Hjelm (cherry picked from commit open-mpi/ompi@371df45bf8fc24e48a9d9b05692426eece09b09c) Signed-off-by: Nathan Hjelm --- opal/mca/btl/openib/btl_openib_endpoint.c | 6 +++--- opal/mca/btl/openib/btl_openib_xrc.c | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c index a4f84e0892..0186f8d5e2 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.c +++ b/opal/mca/btl/openib/btl_openib_endpoint.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2007 Voltaire All rights reserved. 
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. @@ -579,7 +579,7 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) opal_output(-1, "Now we are CONNECTED"); if (MCA_BTL_XRC_ENABLED) { - OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock); + opal_mutex_lock (&endpoint->ib_addr->addr_lock); if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) { /* We are not xrc master */ /* set our qp pointer to master qp */ @@ -622,7 +622,7 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) } } } - OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock); + opal_mutex_unlock (&endpoint->ib_addr->addr_lock); } diff --git a/opal/mca/btl/openib/btl_openib_xrc.c b/opal/mca/btl/openib/btl_openib_xrc.c index 3fc0e32c29..1952c31b12 100644 --- a/opal/mca/btl/openib/btl_openib_xrc.c +++ b/opal/mca/btl/openib/btl_openib_xrc.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. @@ -5,6 +6,8 @@ * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2016 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -122,7 +125,10 @@ static void ib_address_constructor(ib_address_t *ib_addr) ib_addr->lid = 0; ib_addr->status = MCA_BTL_IB_ADDR_CLOSED; ib_addr->qp = NULL; - OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t); + /* NTH: make the addr_lock recursive because mca_btl_openib_endpoint_connected can call + * into the CPC with the lock held. The alternative would be to drop the lock but the + * lock is never obtained in a critical path. 
*/ + OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_recursive_mutex_t); OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t); } From 71e56beb1082c87cfe9e74d77295e5d3566f9524 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 18 Feb 2016 20:55:48 -0700 Subject: [PATCH 7/7] btl/openib: XRC save SRQ#s on the loopback endpoint This commit fixes a bug that can occur when communicating via XRC to peers on the same node. UDCM was not saving the SRQ numbers on the loopback endpoint (which shares its ib_addr info with all local peers) so any messages to local peers use an invalid SRQ number. Fixes open-mpi/ompi#1383 Signed-off-by: Nathan Hjelm (cherry picked from commit open-mpi/ompi@2031bb6f01a3d78f83759db7c6260997916c44eb) Signed-off-by: Nathan Hjelm --- .../mca/btl/openib/connect/btl_openib_connect_udcm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index 55c76af6e4..ff735f9b72 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -551,6 +551,18 @@ static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep) break; } + for (int i = 0 ; i < mca_btl_openib_component.num_xrc_qps ; ++i) { + uint32_t srq_num; +#if OPAL_HAVE_CONNECTX_XRC_DOMAINS + if (ibv_get_srq_num(lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq, &srq_num)) { + BTL_ERROR(("BTL openib UDCM internal error: can't get srq num")); + } +#else + srq_num = lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq->xrc_srq_num; +#endif + lcl_ep->rem_info.rem_srqs[i].rem_srq_num = srq_num; + } + #if OPAL_HAVE_CONNECTX_XRC_DOMAINS recv_qpn = lcl_ep->xrc_recv_qp->qp_num; #else