Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion orte/mca/oob/ud/oob_ud.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ static inline void mca_oob_ud_fill_sge (struct ibv_sge *sge, void *addr,

struct mca_oob_ud_device_t {
opal_list_item_t super;

struct ibv_device_attr attr;
struct ibv_context *ib_context;
struct ibv_comp_channel *ib_channel;
struct ibv_pd *ib_pd;
Expand Down
5 changes: 2 additions & 3 deletions orte/mca/oob/ud/oob_ud_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,6 @@ static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
struct ibv_device *ib_device)
{
int rc, port_num;
struct ibv_device_attr dev_attr;

opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:device_setup attempting to setup ib device %p",
Expand All @@ -237,7 +236,7 @@ static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
return ORTE_ERROR;
}

rc = ibv_query_device (device->ib_context, &dev_attr);
rc = ibv_query_device (device->ib_context, &device->attr);
if (0 != rc) {
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:device_setup error querying device. errno = %d",
Expand All @@ -261,7 +260,7 @@ static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
return ORTE_ERROR;
}

for (port_num = 1 ; port_num <= dev_attr.phys_port_cnt ; ++port_num) {
for (port_num = 1 ; port_num <= device->attr.phys_port_cnt ; ++port_num) {
mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t);

if (NULL == port) {
Expand Down
25 changes: 16 additions & 9 deletions orte/mca/oob/ud/oob_ud_qp.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
*
*/

#include "oob_ud_component.h"
#include "oob_ud_qp.h"
#include "oob_ud.h"

Expand Down Expand Up @@ -72,12 +73,16 @@ int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
init_attr.send_cq = qp->ib_send_cq;
init_attr.recv_cq = qp->ib_recv_cq;

init_attr.cap.max_send_sge = 32;
init_attr.cap.max_recv_sge = 32; /* GRH, data */
mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) opal_list_get_first (&mca_oob_ud_component.ud_devices);
opal_output_verbose(80, orte_oob_base_framework.framework_output,
"%s oob:ud:qp_init create queue pair for device: device->attr.max_sge = %d, device->attr.max_qp_wr = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), device->attr.max_sge, device->attr.max_qp_wr);

init_attr.cap.max_send_sge = 1;
init_attr.cap.max_recv_sge = 2; /* GRH, data */
init_attr.cap.max_inline_data = 0; /* don't use inline data for now */
/* NTH: fix these */
init_attr.cap.max_recv_wr = 4096;
init_attr.cap.max_send_wr = 4096;
init_attr.cap.max_recv_wr = min(4096, device->attr.max_qp_wr);
init_attr.cap.max_send_wr = min(4096, device->attr.max_qp_wr);

qp->ib_qp = ibv_create_qp (port->device->ib_pd, &init_attr);
if (NULL == qp->ib_qp) {
Expand Down Expand Up @@ -258,29 +263,31 @@ int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr,
}

/**
 * Post the receive work request wr on the verbs queue pair held by qp.
 *
 * @param qp  queue pair wrapper; qp->ib_qp is passed to ibv_post_recv
 * @param wr  receive work request handed to ibv_post_recv
 *
 * @return ORTE_SUCCESS when ibv_post_recv returns 0, ORTE_ERROR otherwise
 *         (the failure is also reported through opal_output).
 */
int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr) {
    /* required out-parameter of ibv_post_recv; not inspected here */
    struct ibv_recv_wr *bad_wr = NULL;
    int rc;

    rc = ibv_post_recv (qp->ib_qp, wr, &bad_wr);
    if (0 != rc) {
        /* ibv_post_recv reports the failure reason through its return
         * value; the global errno is not guaranteed to be updated, so
         * log rc rather than a possibly stale errno. */
        opal_output (0, "%s oob:ud:qp_post_recv failed. errno = %d",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);

        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}

int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr) {
int rc;
int rc = ORTE_SUCCESS;
opal_free_list_item_t *item;

do {
item = opal_free_list_get_st (&port->data_qps);
if (NULL == item) {
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:qp_data_aquire error allocating new data qp",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
rc = ORTE_ERR_OUT_OF_RESOURCE;
"%s oob:ud:qp_data_aquire error allocating new data qp. error = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
break;
}

Expand Down
9 changes: 6 additions & 3 deletions orte/mca/oob/ud/oob_ud_send.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
const unsigned int mtu = send_req->req_mtu;
const struct timeval aquire_timeout = {0, 500000};
mca_oob_ud_msg_t *com_msg;
int data_len, rc;
int data_len;
int rc = ORTE_SUCCESS;

opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:ud:send_try sending to %s, tag = %d, "
Expand Down Expand Up @@ -504,7 +505,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
/* send data */
rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0);
if (ORTE_SUCCESS != rc) {
opal_output (0, "error posting send!");
opal_output (0, "%s oob:ud:send_try error posting send!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
break;
}

Expand Down Expand Up @@ -532,7 +534,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
}

if (ORTE_SUCCESS != rc) {
opal_output (0, "send error! rc = %d", rc);
opal_output (0, "%s oob:ud:send_try send error! rc = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
/* damn */
return mca_oob_ud_send_complete (send_req, rc);
}
Expand Down