From d1656347c831d434f6a56400f9ce224a93fb0511 Mon Sep 17 00:00:00 2001 From: Todd Kordenbrock Date: Thu, 12 Mar 2015 11:57:48 -0500 Subject: [PATCH] btl-portals4: implement the BTL 3.0 interface --- opal/mca/btl/portals4/btl_portals4.c | 93 ++++++++++--------- opal/mca/btl/portals4/btl_portals4.h | 27 +++--- .../mca/btl/portals4/btl_portals4_component.c | 35 +++++-- opal/mca/btl/portals4/btl_portals4_frag.c | 4 +- opal/mca/btl/portals4/btl_portals4_frag.h | 8 ++ opal/mca/btl/portals4/btl_portals4_rdma.c | 43 +++++++-- 6 files changed, 135 insertions(+), 75 deletions(-) diff --git a/opal/mca/btl/portals4/btl_portals4.c b/opal/mca/btl/portals4/btl_portals4.c index bcebc946564..90eb8d9f72a 100644 --- a/opal/mca/btl/portals4/btl_portals4.c +++ b/opal/mca/btl/portals4/btl_portals4.c @@ -39,6 +39,17 @@ #include "btl_portals4.h" #include "btl_portals4_recv.h" + +mca_btl_base_registration_handle_t * +mca_btl_portals4_register_mem(mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, + size_t size, + uint32_t flags); + +int mca_btl_portals4_deregister_mem(mca_btl_base_module_t *btl, + mca_btl_base_registration_handle_t *handle); + mca_btl_portals4_module_t mca_btl_portals4_module = { .super = { .btl_component = &mca_btl_portals4_component.super, @@ -52,7 +63,8 @@ mca_btl_portals4_module_t mca_btl_portals4_module = { .btl_alloc = mca_btl_portals4_alloc, .btl_free = mca_btl_portals4_free, .btl_prepare_src = mca_btl_portals4_prepare_src, - .btl_prepare_dst = mca_btl_portals4_prepare_dst, + .btl_register_mem = mca_btl_portals4_register_mem, + .btl_deregister_mem = mca_btl_portals4_deregister_mem, .btl_send = mca_btl_portals4_send, .btl_get = mca_btl_portals4_get, .btl_dump = mca_btl_base_dump, @@ -222,7 +234,7 @@ mca_btl_portals4_alloc(struct mca_btl_base_module_t* btl_base, } frag->md_h = PTL_INVALID_HANDLE; - frag->base.des_local_count = 1; + frag->base.des_segment_count = 1; frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; frag->base.order = MCA_BTL_NO_ORDER; @@ -274,7 +286,6 @@ mca_btl_portals4_free(struct mca_btl_base_module_t* btl_base, mca_btl_base_descriptor_t* mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -312,7 +323,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base, } frag->segments[0].base.seg_len = max_data + reserve; - frag->base.des_local_count = 1; + frag->base.des_segment_count = 1; } else { /* no need to pack - rdma operation out of user's buffer */ @@ -347,7 +358,7 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base, frag->segments[0].base.seg_len = max_data; frag->segments[0].base.seg_addr.pval = iov.iov_base; frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1); - frag->base.des_local_count = 1; + frag->base.des_segment_count = 1; /* either a put or get. figure out which later */ OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, @@ -398,58 +409,50 @@ mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base, (void *)frag, frag->me_h, me.start, me.length, me.match_id.rank, me.match_id.phys.nid, me.match_id.phys.pid, me.match_bits)); } - frag->base.des_local = &frag->segments[0].base; - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; + + frag->base.des_segments = &frag->segments[0].base; frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; frag->base.order = MCA_BTL_NO_ORDER; return &frag->base; } -mca_btl_base_descriptor_t* -mca_btl_portals4_prepare_dst(struct mca_btl_base_module_t* btl_base, - struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) +mca_btl_base_registration_handle_t * +mca_btl_portals4_register_mem(mca_btl_base_module_t *btl_base, + mca_btl_base_endpoint_t *endpoint, + void *base, + size_t size, + uint32_t flags) { - struct mca_btl_portals4_module_t* portals4_btl = (struct mca_btl_portals4_module_t*) btl_base; - mca_btl_portals4_frag_t* frag; - - /* reserve space in the event queue for rdma operations immediately */ - while (OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, 1) > - portals4_btl->portals_max_outstanding_ops) { - OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1); - OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, "Call to mca_btl_portals4_component_progress (2)\n")); - mca_btl_portals4_component_progress(); - } + struct mca_btl_portals4_module_t *portals4_btl = (struct mca_btl_portals4_module_t*) btl_base; + mca_btl_base_registration_handle_t *handle = NULL; - OPAL_BTL_PORTALS4_FRAG_ALLOC_USER(portals4_btl, frag); - if (NULL == frag) { - OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1); + handle = (mca_btl_base_registration_handle_t *)malloc(sizeof(mca_btl_base_registration_handle_t)); + if (!handle) { return NULL; } + + handle->key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1); + OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, - "mca_btl_portals4_prepare_dst: Incrementing portals_outstanding_ops=%d\n", portals4_btl->portals_outstanding_ops)); - - frag->segments[0].base.seg_len = *size; - opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments[0].base.seg_addr.pval) ); - frag->segments[0].key = OPAL_THREAD_ADD64(&(portals4_btl->portals_rdma_key), 1); - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; - frag->base.des_local = &frag->segments[0].base; - frag->base.des_local_count = 1; - frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - frag->base.order = MCA_BTL_NO_ORDER; - frag->md_h = PTL_INVALID_HANDLE; + "mca_btl_portals4_register_mem NI=%d base=%p size=%ld handle=%p key=%ld\n", + portals4_btl->interface_num, base, size, (void *)handle, handle->key)); + + return handle; +} + +int +mca_btl_portals4_deregister_mem(mca_btl_base_module_t *btl_base, + mca_btl_base_registration_handle_t *handle) +{ + struct mca_btl_portals4_module_t *portals4_btl = (struct mca_btl_portals4_module_t*) btl_base; OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, - "mca_btl_portals4_prepare_dst &base=%p reserve=%ld size=%ld rank=%x pid=%x key=%ld\n", - (void *)&frag->base, reserve, *size, peer->ptl_proc.rank, peer->ptl_proc.phys.pid, frag->segments[0].key)); - return &frag->base; + "mca_btl_portals4_deregister_mem NI=%d handle=%p key=%ld\n", + portals4_btl->interface_num, (void *)handle, handle->key)); + + free(handle); + + return OPAL_SUCCESS; } int diff --git a/opal/mca/btl/portals4/btl_portals4.h b/opal/mca/btl/portals4/btl_portals4.h index 699e5fbddbe..7a234eab7db 100644 --- a/opal/mca/btl/portals4/btl_portals4.h +++ b/opal/mca/btl/portals4/btl_portals4.h @@ -238,23 +238,12 @@ int mca_btl_portals4_free(struct mca_btl_base_module_t* btl_base, mca_btl_base_descriptor_t* mca_btl_portals4_prepare_src(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, size_t* size, uint32_t flags); -mca_btl_base_descriptor_t* -mca_btl_portals4_prepare_dst(struct mca_btl_base_module_t* btl_base, - struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - int mca_btl_portals4_send(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* btl_peer, struct mca_btl_base_descriptor_t* descriptor, @@ -279,10 +268,24 @@ int mca_btl_portals4_put(struct mca_btl_base_module_t* btl_base, int mca_btl_portals4_get(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor); + void *local_address, + uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, + int flags, + int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata); int mca_btl_portals4_get_error(int ptl_error); +struct mca_btl_base_registration_handle_t { + /** Portals4 match bits */ + ptl_match_bits_t key; +}; + /* * global structures */ diff --git a/opal/mca/btl/portals4/btl_portals4_component.c b/opal/mca/btl/portals4/btl_portals4_component.c index e6f643b7265..ab1de618f66 100644 --- a/opal/mca/btl/portals4/btl_portals4_component.c +++ b/opal/mca/btl/portals4/btl_portals4_component.c @@ -222,7 +222,17 @@ mca_btl_portals4_component_open(void) mca_btl_portals4_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_RDMA_MATCHED; - mca_btl_portals4_module.super.btl_seg_size = sizeof (mca_btl_portals4_segment_t); + + mca_btl_portals4_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); + + mca_btl_portals4_module.super.btl_get_limit = SIZE_MAX; + mca_btl_portals4_module.super.btl_put_limit = 0; /* not implemented */ + mca_btl_portals4_module.super.btl_get_alignment = 0; + mca_btl_portals4_module.super.btl_put_alignment = 0; + + mca_btl_portals4_module.super.btl_get_local_registration_threshold = 0; + mca_btl_portals4_module.super.btl_put_local_registration_threshold = 0; + mca_btl_portals4_module.super.btl_bandwidth = 1000; mca_btl_portals4_module.super.btl_latency = 0; @@ -770,8 +780,8 @@ mca_btl_portals4_component_progress(void) tag = (unsigned char) (ev.hdr_data); - btl_base_descriptor.des_local = seg; - btl_base_descriptor.des_local_count = 1; + btl_base_descriptor.des_segments = seg; + btl_base_descriptor.des_segment_count = 1; seg[0].seg_addr.pval = ev.start; seg[0].seg_len = ev.mlength; @@ -785,6 +795,8 @@ mca_btl_portals4_component_progress(void) case PTL_EVENT_PUT_OVERFLOW: /* */ + OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, + "PTL_EVENT_OVERFLOW received\n")); goto done; break; @@ -810,8 +822,10 @@ mca_btl_portals4_component_progress(void) goto done; break; - case PTL_EVENT_GET: + case PTL_EVENT_GET: /* Generated on source (target) when a get from memory ends */ /* */ + OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, + "PTL_EVENT_GET received at target rlength=%ld mlength=%ld\n", ev.rlength, ev.mlength)); goto done; break; @@ -849,11 +863,14 @@ mca_btl_portals4_component_progress(void) } else { OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, - "PTL_EVENT_REPLY: Call to des_cbfunc: %lx\n", (uint64_t)frag->base.des_cbfunc)); - frag->base.des_cbfunc(&portals4_btl->super, - frag->endpoint, - &frag->base, - OPAL_SUCCESS); + "PTL_EVENT_REPLY: Call to rdma_cbfunc=%p\n", (void *)frag->rdma_cb.func)); + frag->rdma_cb.func(&portals4_btl->super, + frag->endpoint, + ev.start, + frag->rdma_cb.local_handle, + frag->rdma_cb.context, + frag->rdma_cb.data, + OPAL_SUCCESS); PtlMDRelease(frag->md_h); frag->md_h = PTL_INVALID_HANDLE; diff --git a/opal/mca/btl/portals4/btl_portals4_frag.c b/opal/mca/btl/portals4/btl_portals4_frag.c index 5efc89b3229..5358553c4a1 100644 --- a/opal/mca/btl/portals4/btl_portals4_frag.c +++ b/opal/mca/btl/portals4/btl_portals4_frag.c @@ -27,8 +27,8 @@ static void mca_btl_portals4_frag_common_send_constructor(mca_btl_portals4_frag_t* frag) { frag->base.des_flags = 0; - frag->base.des_local = &frag->segments[0].base; - frag->base.des_local_count = 2; + frag->base.des_segments = &frag->segments[0].base; + frag->base.des_segment_count = 2; frag->segments[0].base.seg_addr.pval = frag + 1; frag->segments[0].base.seg_len = frag->size; diff --git a/opal/mca/btl/portals4/btl_portals4_frag.h b/opal/mca/btl/portals4/btl_portals4_frag.h index d615bef84bf..814a80a82ee 100644 --- a/opal/mca/btl/portals4/btl_portals4_frag.h +++ b/opal/mca/btl/portals4/btl_portals4_frag.h @@ -57,6 +57,14 @@ struct mca_btl_portals4_frag_t { /* length for retransmit case */ ptl_process_t peer_proc; + /* the callback and context to complete an RDMA operation */ + struct { + mca_btl_base_rdma_completion_fn_t func; + void *context; + void *data; + mca_btl_base_registration_handle_t *local_handle; + } rdma_cb; + enum { BTL_PORTALS4_FRAG_TYPE_EAGER, BTL_PORTALS4_FRAG_TYPE_MAX, BTL_PORTALS4_FRAG_TYPE_USER } type; diff --git a/opal/mca/btl/portals4/btl_portals4_rdma.c b/opal/mca/btl/portals4/btl_portals4_rdma.c index 72c3545ab32..dde1f814425 100644 --- a/opal/mca/btl/portals4/btl_portals4_rdma.c +++ b/opal/mca/btl/portals4/btl_portals4_rdma.c @@ -37,23 +37,52 @@ mca_btl_portals4_put(struct mca_btl_base_module_t* btl_base, int mca_btl_portals4_get(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* descriptor) + void *local_address, + uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, + int flags, + int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata) { mca_btl_portals4_module_t *portals4_btl = (mca_btl_portals4_module_t *) btl_base; - mca_btl_portals4_segment_t *src_seg = (mca_btl_portals4_segment_t *) descriptor->des_remote; - mca_btl_portals4_frag_t *frag = (mca_btl_portals4_frag_t*) descriptor; + mca_btl_portals4_frag_t *frag = NULL; ptl_md_t md; int ret; + /* reserve space in the event queue for rdma operations immediately */ + while (OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, 1) > + portals4_btl->portals_max_outstanding_ops) { + OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1); + OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, "Call to mca_btl_portals4_component_progress (1)\n")); + mca_btl_portals4_component_progress(); + } + + OPAL_BTL_PORTALS4_FRAG_ALLOC_USER(portals4_btl, frag); + if (NULL == frag){ + OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1); + return OPAL_ERROR; + } + OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, + "mca_btl_portals4_prepare_src: Incrementing portals_outstanding_ops=%d\n", portals4_btl->portals_outstanding_ops)); + OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, - "mca_btl_portals4_get frag=%p src_seg=%p frag->md_h=%d\n", (void *)frag, (void *)src_seg, frag->md_h)); + "mca_btl_portals4_get frag=%p\n", (void *)frag)); + + frag->rdma_cb.func = cbfunc; + frag->rdma_cb.context = cbcontext; + frag->rdma_cb.data = cbdata; + frag->rdma_cb.local_handle = local_handle; frag->endpoint = btl_peer; frag->hdr.tag = MCA_BTL_TAG_MAX; /* Bind the memory */ - md.start = (void *)frag->segments[0].base.seg_addr.pval; - md.length = frag->segments[0].base.seg_len; + md.start = (void *)local_address; + md.length = size; md.options = 0; md.eq_handle = portals4_btl->recv_eq_h; md.ct_handle = PTL_CT_NONE; @@ -69,7 +98,7 @@ mca_btl_portals4_get(struct mca_btl_base_module_t* btl_base, return OPAL_ERROR; } - frag->match_bits = src_seg->key; + frag->match_bits = remote_handle->key; frag->length = md.length; frag->peer_proc = btl_peer->ptl_proc; ret = PtlGet(frag->md_h,