diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 53d8c81e874..61dd951f4b5 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -425,13 +425,20 @@ static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl) static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl) { uint32_t send_cqes, recv_cqes; - int rc = OPAL_SUCCESS, qp; + int rc = OPAL_SUCCESS; mca_btl_openib_device_t *device = openib_btl->device; + uint32_t requested[BTL_OPENIB_MAX_CQ]; + bool need_resize = false; opal_mutex_lock(&openib_btl->ib_lock); + + for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) { + requested[cq] = 0; + } + /* figure out reasonable sizes for completion queues */ - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - if(BTL_OPENIB_QP_TYPE_SRQ(qp)) { + for (int qp = 0 ; qp < mca_btl_openib_component.num_qps ; qp++) { + if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { send_cqes = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; recv_cqes = mca_btl_openib_component.qp_infos[qp].rd_num; } else { @@ -440,24 +447,30 @@ static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl) recv_cqes = send_cqes; } - opal_mutex_lock(&openib_btl->device->device_lock); - openib_btl->device->cq_size[qp_cq_prio(qp)] += recv_cqes; - openib_btl->device->cq_size[BTL_OPENIB_LP_CQ] += send_cqes; - opal_mutex_unlock(&openib_btl->device->device_lock); + requested[qp_cq_prio(qp)] += recv_cqes; + requested[BTL_OPENIB_LP_CQ] += send_cqes; } - rc = adjust_cq(device, BTL_OPENIB_HP_CQ); - if (OPAL_SUCCESS != rc) { - goto out; - } + opal_mutex_lock (&openib_btl->device->device_lock); + for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) { + if (requested[cq] < mca_btl_openib_component.ib_cq_size[cq]) { + requested[cq] = mca_btl_openib_component.ib_cq_size[cq]; + } else if (requested[cq] > openib_btl->device->ib_dev_attr.max_cqe) { + requested[cq] = openib_btl->device->ib_dev_attr.max_cqe; + } - rc = adjust_cq(device, BTL_OPENIB_LP_CQ); - if (OPAL_SUCCESS != rc) { - goto out; - } + if (openib_btl->device->cq_size[cq] < requested[cq]) { + openib_btl->device->cq_size[cq] = requested[cq]; -out: + rc = adjust_cq (device, cq); + if (OPAL_SUCCESS != rc) { + break; + } + } + } + opal_mutex_unlock (&openib_btl->device->device_lock); opal_mutex_unlock(&openib_btl->ib_lock); + return rc; } @@ -1107,7 +1120,7 @@ int mca_btl_openib_add_procs( } if (nprocs_new) { - OPAL_THREAD_ADD32(&openib_btl->num_peers, nprocs_new); + opal_atomic_add_32 (&openib_btl->num_peers, nprocs_new); /* adjust cq sizes given the new procs */ rc = openib_btl_size_queues (openib_btl); @@ -1217,7 +1230,7 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul /* this is a new process to this openib btl * account this procs if need */ - OPAL_THREAD_ADD32(&openib_btl->num_peers, 1); + opal_atomic_add_32 (&openib_btl->num_peers, 1); rc = openib_btl_size_queues(openib_btl); if (OPAL_SUCCESS != rc) { BTL_ERROR(("error creating cqs")); diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index 656f7724b23..a8566a640c7 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -82,6 +82,12 @@ BEGIN_C_DECLS * Infiniband (IB) BTL component. */ +enum { + BTL_OPENIB_HP_CQ, + BTL_OPENIB_LP_CQ, + BTL_OPENIB_MAX_CQ, +}; + typedef enum { MCA_BTL_OPENIB_TRANSPORT_IB, MCA_BTL_OPENIB_TRANSPORT_IWARP, @@ -206,7 +212,7 @@ struct mca_btl_openib_component_t { uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */ uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */ - uint32_t ib_cq_size[2]; /**< Max outstanding CQE on the CQ */ + uint32_t ib_cq_size[BTL_OPENIB_MAX_CQ]; /**< Max outstanding CQE on the CQ */ int ib_max_inline_data; /**< Max size of inline data */ unsigned int ib_pkey_val; @@ -379,8 +385,8 @@ typedef struct mca_btl_openib_device_t { #endif struct ibv_device_attr ib_dev_attr; struct ibv_pd *ib_pd; - struct ibv_cq *ib_cq[2]; - uint32_t cq_size[2]; + struct ibv_cq *ib_cq[BTL_OPENIB_MAX_CQ]; + uint32_t cq_size[BTL_OPENIB_MAX_CQ]; mca_mpool_base_module_t *mpool; mca_rcache_base_module_t *rcache; /* MTU for this device */ @@ -863,11 +869,6 @@ extern int mca_btl_openib_ft_event(int state); */ void mca_btl_openib_show_init_error(const char *file, int line, const char *func, const char *dev); - -#define BTL_OPENIB_HP_CQ 0 -#define BTL_OPENIB_LP_CQ 1 - - /** * Post to Shared Receive Queue with certain priority *