Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions ompi/mca/mtl/portals4/mtl_portals4.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,15 @@ struct mca_mtl_portals4_module_t {

/* Use the logical to physical table to accelerate portals4 adressing: 1 (true) : 0 (false) */
int32_t use_logical;

/* Process_id */
ptl_process_t ptl_process_id;

/* Use flow control: 1 (true) : 0 (false) */
int32_t use_flowctl;

/** Short limit; Size limit for short messages */
uint64_t short_limit;
/** Eager limit; messages greater than this use a rendezvous protocol */
uint64_t eager_limit;
/** Size of short message blocks */
Expand All @@ -67,6 +73,8 @@ struct mca_mtl_portals4_module_t {

/** Network interface handle for matched interface */
ptl_handle_ni_t ni_h;
/** Limit given by portals after NIInit */
uint64_t max_msg_size_mtl;
/** Uid for current user */
ptl_uid_t uid;

Expand Down
40 changes: 40 additions & 0 deletions ompi/mca/mtl/portals4/mtl_portals4_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,18 @@ ompi_mtl_portals4_component_register(void)
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&param_priority);
ompi_mtl_portals4.short_limit = 2 * 1024;
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
"short_limit",
"Size limit for short messages",
MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG,
NULL,
0,
0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.short_limit);


ompi_mtl_portals4.eager_limit = 2 * 1024;
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
Expand Down Expand Up @@ -173,6 +185,19 @@ ompi_mtl_portals4_component_register(void)
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.protocol);

ompi_mtl_portals4.max_msg_size_mtl = PTL_SIZE_MAX;
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
"max_msg_size",
"Max size supported by portals4 (above that, a message is cut into messages less than that size)",
MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
NULL,
0,
0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.max_msg_size_mtl);

OBJ_RELEASE(new_enum);
if (0 > ret) {
return OMPI_ERR_NOT_SUPPORTED;
Expand All @@ -196,6 +221,12 @@ ompi_mtl_portals4_component_open(void)
"no"
#endif
);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"Max message size: %lu", (unsigned long)
ompi_mtl_portals4.max_msg_size_mtl);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"Short limit: %d", (int)
ompi_mtl_portals4.short_limit);
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"Eager limit: %d", (int)
ompi_mtl_portals4.eager_limit);
Expand Down Expand Up @@ -314,6 +345,11 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
goto error;
}

if (actual_limits.max_msg_size < ompi_mtl_portals4.max_msg_size_mtl)
ompi_mtl_portals4.max_msg_size_mtl = actual_limits.max_msg_size;
OPAL_OUTPUT_VERBOSE((10, ompi_mtl_base_framework.framework_output,
"Due to portals4 and user configuration messages will not go over the size of %lu", ompi_mtl_portals4.max_msg_size_mtl));

if (ompi_comm_rank(MPI_COMM_WORLD) == 0) {
opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_entries=%d", actual_limits.max_entries);
opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_unexpected_headers=%d", actual_limits.max_unexpected_headers);
Expand Down Expand Up @@ -350,6 +386,10 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
goto error;
}

ompi_mtl_portals4.ptl_process_id = id;
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output,
"PtlGetPhysId rank=%x nid=%x pid=%x\n", id.rank, id.phys.nid, id.phys.pid));

OPAL_MODEX_SEND(ret, OPAL_PMIX_GLOBAL,
&mca_mtl_portals4_component.mtl_version,
&id, sizeof(id));
Expand Down
11 changes: 8 additions & 3 deletions ompi/mca/mtl/portals4/mtl_portals4_flowctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ ompi_mtl_portals4_flowctl_init(void)
goto error;
}

if (ompi_mtl_portals4.flowctl_idx != REQ_FLOWCTL_TABLE_ID) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PtlPTAlloc did not allocate the requested PT: %d\n",
__FILE__, __LINE__, ompi_mtl_portals4.flowctl_idx);
goto error;
}

ret = PtlCTAlloc(ompi_mtl_portals4.ni_h,
&ompi_mtl_portals4.flowctl.trigger_ct_h);
if (OPAL_UNLIKELY(PTL_OK != ret)) {
Expand Down Expand Up @@ -291,9 +298,7 @@ ompi_mtl_portals4_flowctl_trigger(void)
{
int ret;

if (false == ompi_mtl_portals4.flowctl.flowctl_active) {
ompi_mtl_portals4.flowctl.flowctl_active = true;

if (true == OPAL_ATOMIC_CMPSET_32(&ompi_mtl_portals4.flowctl.flowctl_active, false, true)) {
/* send trigger to root */
ret = PtlPut(ompi_mtl_portals4.zero_md_h,
0,
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/mtl/portals4/mtl_portals4_flowctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ OBJ_CLASS_DECLARATION(ompi_mtl_portals4_pending_request_t);


struct ompi_mtl_portals4_flowctl_t {
bool flowctl_active;
int32_t flowctl_active;

int32_t send_slots;
int32_t max_send_slots;
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/mtl/portals4/mtl_portals4_probe.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ completion_fn(ptl_event_t *ev, ompi_mtl_portals4_base_request_t *ptl_base_reques
ompi_mtl_portals4_probe_request_t *ptl_request =
(ompi_mtl_portals4_probe_request_t*) ptl_base_request;

opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
opal_output_verbose(10, ompi_mtl_base_framework.framework_output,
"%s:%d: completion_fn: %d %d",
__FILE__, __LINE__, ev->type, ev->ni_fail_type);

Expand Down
95 changes: 56 additions & 39 deletions ompi/mca/mtl/portals4/mtl_portals4_recv.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
ptl_match_bits_t match_bits, ptl_size_t remote_offset,
ompi_mtl_portals4_recv_request_t *request)
{
int ret;
int ret, i;
ptl_size_t rest = length, asked = 0, frag_size;
int32_t pending_reply;

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
while (OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, -1) < 0)) {
Expand All @@ -48,19 +50,29 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
}
#endif

ret = PtlGet(ompi_mtl_portals4.send_md_h,
(ptl_size_t) start,
length,
target,
ompi_mtl_portals4.read_idx,
match_bits,
remote_offset,
request);
if (OPAL_UNLIKELY(PTL_OK != ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PtlGet failed: %d",
__FILE__, __LINE__, ret);
return OMPI_ERR_OUT_OF_RESOURCE;
request->pending_reply = (length + ompi_mtl_portals4.max_msg_size_mtl - 1) / ompi_mtl_portals4.max_msg_size_mtl;
pending_reply = request->pending_reply;

for (i = 0 ; i < pending_reply ; i++) {
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d) send",
i + 1, pending_reply));
frag_size = (OPAL_UNLIKELY(rest > ompi_mtl_portals4.max_msg_size_mtl)) ? ompi_mtl_portals4.max_msg_size_mtl : rest;
ret = PtlGet(ompi_mtl_portals4.send_md_h,
(ptl_size_t) start + i * ompi_mtl_portals4.max_msg_size_mtl,
frag_size,
target,
ompi_mtl_portals4.read_idx,
match_bits,
remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl,
request);
if (OPAL_UNLIKELY(PTL_OK != ret)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PtlGet failed: %d",
__FILE__, __LINE__, ret);
return OMPI_ERR_OUT_OF_RESOURCE;
}
rest -= frag_size;
asked += frag_size;
}

return OMPI_SUCCESS;
Expand Down Expand Up @@ -109,26 +121,30 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
}

if (ev->mlength < msg_length)
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "Truncated message, some PtlGet are required (protocol = %d)",
ompi_mtl_portals4.protocol));

#if OPAL_ENABLE_DEBUG
ptl_request->hdr_data = ev->hdr_data;
#endif

if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) {
/* If it's not a short message and we're doing rndv, we
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && msg_length > ev->mlength) {
/* If it's not a short message and we're doing rndv and the message is not complete, we
only have the first part of the message. Issue the get
to pull the second part of the message. */
ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit,
ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength,
((msg_length > ptl_request->delivery_len) ?
ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit,
ptl_request->delivery_len : msg_length) - ev->mlength,
ev->initiator,
ev->hdr_data,
ompi_mtl_portals4.eager_limit,
ev->mlength,
ptl_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
goto callback_error;
}

} else {
/* If we're either using the eager protocol or were a
short message, all data has been received, so complete
Expand All @@ -142,8 +158,6 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
__FILE__, __LINE__, ret);
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret;
}
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;

OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) completed, expected",
ptl_request->opcount, ptl_request->hdr_data));
Expand All @@ -165,12 +179,14 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
}

/* set the received length in the status, now that we know
excatly how much data was sent. */
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
if (ompi_mtl_portals4.protocol == rndv) {
ptl_request->super.super.ompi_req->req_status._ucount +=
ompi_mtl_portals4.eager_limit;
exactly how much data was sent. */
ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength;

ret = OPAL_THREAD_ADD32(&(ptl_request->pending_reply), -1);
if (ret > 0) {
return OMPI_SUCCESS;
}
assert(ptl_request->pending_reply == 0);

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
Expand All @@ -192,8 +208,8 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
}

OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) completed, reply",
ptl_request->opcount, ptl_request->hdr_data));
"Recv %lu (0x%lx) completed , reply (pending_reply: %d)",
ptl_request->opcount, ptl_request->hdr_data, ptl_request->pending_reply));
ptl_request->super.super.completion_callback(&ptl_request->super.super);
break;

Expand Down Expand Up @@ -281,17 +297,16 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
ptl_request->super.super.completion_callback(&ptl_request->super.super);

} else {
if (ev->mlength > 0) {
/* if rndv or triggered, copy the eager part to the right place */
memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength);
}

ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength,
((msg_length > ptl_request->delivery_len) ?
ptl_request->delivery_len : msg_length) - ev->mlength,
/* For long messages in the overflow list, ev->mlength = 0 */
ptl_request->super.super.ompi_req->req_status._ucount = 0;

ret = read_msg((char*) ptl_request->delivery_ptr,
(msg_length > ptl_request->delivery_len) ?
ptl_request->delivery_len : msg_length,
ev->initiator,
ev->hdr_data,
ev->mlength,
0,
ptl_request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
Expand Down Expand Up @@ -373,6 +388,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
ptl_request->delivery_len = length;
ptl_request->req_started = false;
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
ptl_request->pending_reply = 0;

OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n",
Expand All @@ -389,7 +405,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
PTL_ME_OP_PUT |
PTL_ME_USE_ONCE |
PTL_ME_EVENT_UNLINK_DISABLE;
if (length <= ompi_mtl_portals4.eager_limit) {
if (length <= ompi_mtl_portals4.short_limit) {
me.options |= PTL_ME_EVENT_LINK_DISABLE;
}
me.match_id = remote_proc;
Expand All @@ -413,7 +429,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
/* if a long message, spin until we either have a comm event or a
link event, guaranteeing progress for long unexpected
messages. */
if (length > ompi_mtl_portals4.eager_limit) {
if (length > ompi_mtl_portals4.short_limit) {
while (true != ptl_request->req_started) {
ompi_mtl_portals4_progress();
}
Expand Down Expand Up @@ -454,6 +470,7 @@ ompi_mtl_portals4_imrecv(struct mca_mtl_base_module_t* mtl,
ptl_request->delivery_ptr = start;
ptl_request->delivery_len = length;
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
ptl_request->pending_reply = 0;

OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Mrecv %lu of length %ld (0x%lx)\n",
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/mtl/portals4/mtl_portals4_recv_short.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ ompi_mtl_portals4_activate_block(ompi_mtl_portals4_recv_short_block_t *block)
me.start = block->start;
me.length = ompi_mtl_portals4.recv_short_size;
me.ct_handle = PTL_CT_NONE;
me.min_free = ompi_mtl_portals4.eager_limit;
me.min_free = ompi_mtl_portals4.short_limit;
me.uid = ompi_mtl_portals4.uid;
me.options =
PTL_ME_OP_PUT |
Expand Down
3 changes: 3 additions & 0 deletions ompi/mca/mtl/portals4/mtl_portals4_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ struct ompi_mtl_portals4_isend_request_t {
#if OMPI_MTL_PORTALS4_FLOW_CONTROL
struct ompi_mtl_portals4_pending_request_t *pending;
#endif
ptl_size_t length;
int32_t pending_get;
uint32_t event_count;
};
typedef struct ompi_mtl_portals4_isend_request_t ompi_mtl_portals4_isend_request_t;
Expand All @@ -73,6 +75,7 @@ struct ompi_mtl_portals4_recv_request_t {
void *delivery_ptr;
size_t delivery_len;
volatile bool req_started;
int32_t pending_reply;
#if OPAL_ENABLE_DEBUG
uint64_t opcount;
ptl_hdr_data_t hdr_data;
Expand Down
Loading