diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 1a2b9dde77..5d9335613c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -15,6 +15,11 @@ #include "osc_rdma_dynamic.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" +#include "opal/align.h" + +static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address, + mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size, + ompi_osc_rdma_request_t *request); static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, @@ -136,7 +141,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc ompi_osc_rdma_peer_t *peer, uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, int remote_count, ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len, - const ompi_osc_rdma_fn_t rdma_fn,const bool alloc_reqs) + const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs) { ompi_osc_rdma_module_t *module = sync->module; struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX]; @@ -575,11 +580,13 @@ static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struc assert (OPAL_SUCCESS == status); - if (NULL != frag) { + if (request->buffer || NULL != frag) { if (OPAL_LIKELY(OMPI_SUCCESS == status)) { memcpy (origin_addr, (void *) source, request->len); } + } + if (NULL != frag) { ompi_osc_rdma_frag_complete (frag); } else { ompi_osc_rdma_deregister (sync->module, local_handle); @@ -621,6 +628,27 @@ int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer) } +static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address, + mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size, + ompi_osc_rdma_request_t *request) { + ompi_osc_rdma_module_t *module = sync->module; + ompi_osc_rdma_request_t *subreq; + int ret; + + OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq); + subreq->internal = true; + subreq->type = OMPI_OSC_RDMA_TYPE_RDMA; + subreq->parent_request = request; + (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1); + + ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OMPI_OSC_RDMA_REQUEST_RETURN(subreq); + (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1); + } + + return ret; +} static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size, @@ -639,33 +667,81 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask; aligned_len = aligned_source_bound - aligned_source_base; - request->offset = source_address - aligned_source_base; - request->len = size; - request->origin_addr = target_buffer; - request->sync = sync; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p", size, source_address, target_buffer); if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) || (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - /* check for alignment */ - if (!(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - (void) ompi_osc_rdma_register (module, peer->data_endpoint, target_buffer, size, MCA_BTL_REG_FLAG_LOCAL_WRITE, + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { + /* region is too large for a buffered read */ + size_t subsize; + + if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { + /* remote region has the same alignment but the base is not aligned. perform a small + * buffered get of the beginning of the remote region */ + aligned_source_base = OPAL_ALIGN(source_address, module->selected_btl->btl_get_alignment, osc_rdma_base_t); + subsize = (size_t) (aligned_source_base - source_address); + + ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + source_address += subsize; + target_buffer = (void *) ((intptr_t) target_buffer + subsize); + size -= subsize; + + aligned_len = aligned_source_bound - aligned_source_base; + } + + if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && + (size & btl_alignment_mask)) { + /* remote region bases are aligned but the bounds are not. perform a + * small buffered get of the end of the remote region */ + aligned_len = size & ~btl_alignment_mask; + subsize = size - aligned_len; + size = aligned_len; + ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle, + (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + /* (remaining) user request is now correctly aligned */ + } + + if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + /* local and remote alignments differ */ + request->buffer = ptr = malloc (aligned_len); + } else { + ptr = target_buffer; + } + + if (NULL != ptr) { + (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, &local_handle); } if (OPAL_UNLIKELY(NULL == local_handle)) { - return OMPI_ERR_OUT_OF_RESOURCE; + free (request->buffer); + request->buffer = NULL; + return ret; } } else { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get", ptr, (void *) frag); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", + ptr, (void *) frag, aligned_len, (unsigned long) aligned_source_base); local_handle = frag->handle; } } + request->offset = source_address - aligned_source_base; + request->len = size; + request->origin_addr = target_buffer; + request->sync = sync; + ompi_osc_rdma_sync_rdma_inc (sync); do { diff --git a/ompi/mca/osc/rdma/osc_rdma_frag.h b/ompi/mca/osc/rdma/osc_rdma_frag.h index fcd7243fd0..e9636a24d2 100644 --- a/ompi/mca/osc/rdma/osc_rdma_frag.h +++ b/ompi/mca/osc/rdma/osc_rdma_frag.h @@ -60,7 +60,7 @@ static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size request_len = OPAL_ALIGN(request_len, 8, size_t); if (request_len > (mca_osc_rdma_component.buffer_size >> 1)) { - return OMPI_ERR_OUT_OF_RESOURCE; + return OMPI_ERR_VALUE_OUT_OF_BOUNDS; } OPAL_THREAD_LOCK(&module->lock); diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 8dedcc3f45..67e78a2a68 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -50,7 +50,7 @@ static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *mod { uint64_t lock = (uint64_t) (intptr_t) peer->state + offset; volatile bool atomic_complete = false; - void *temp; + void *temp = NULL; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing shared lock %" PRIx64 " on peer %d. value 0x%lx", lock, @@ -117,7 +117,7 @@ static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *mod { uint64_t lock = (uint64_t) peer->state + offset; volatile bool atomic_complete; - ompi_osc_rdma_lock_t *temp; + ompi_osc_rdma_lock_t *temp = NULL; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring shared lock %" PRIx64 " on peer %d. value 0x%lx", lock, @@ -292,7 +292,7 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t * { uint64_t lock = (uint64_t) (intptr_t) peer->state + offset; volatile bool atomic_complete = false; - void *temp; + void *temp = NULL; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank); diff --git a/ompi/mca/osc/rdma/osc_rdma_request.c b/ompi/mca/osc/rdma/osc_rdma_request.c index a5a8bb8084..86b088b1b6 100644 --- a/ompi/mca/osc/rdma/osc_rdma_request.c +++ b/ompi/mca/osc/rdma/osc_rdma_request.c @@ -59,7 +59,10 @@ static void request_construct(ompi_osc_rdma_request_t *request) request->super.req_free = request_free; request->super.req_cancel = request_cancel; request->super.req_complete_cb = request_complete; - request->parent_request = 0; + request->parent_request = NULL; + request->buffer = NULL; + request->internal = false; + request->outstanding_requests = 0; OBJ_CONSTRUCT(&request->convertor, opal_convertor_t); } diff --git a/ompi/mca/osc/rdma/osc_rdma_request.h b/ompi/mca/osc/rdma/osc_rdma_request.h index ad10e4c69a..3cec365a7a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_request.h +++ b/ompi/mca/osc/rdma/osc_rdma_request.h @@ -60,6 +60,7 @@ struct ompi_osc_rdma_request_t { /** synchronization object */ struct ompi_osc_rdma_sync_t *sync; + void *buffer; }; typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t; OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t); @@ -78,18 +79,19 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t); req = (ompi_osc_rdma_request_t*) item; \ OMPI_REQUEST_INIT(&req->super, false); \ req->super.req_mpi_object.win = module->win; \ - req->super.req_complete = false; \ req->super.req_state = OMPI_REQUEST_ACTIVE; \ req->module = rmodule; \ - req->internal = false; \ - req->outstanding_requests = 0; \ - req->parent_request = NULL; \ req->peer = (rpeer); \ } while (0) #define OMPI_OSC_RDMA_REQUEST_RETURN(req) \ do { \ OMPI_REQUEST_FINI(&(req)->super); \ + free ((req)->buffer); \ + (req)->buffer = NULL; \ + (req)->parent_request = NULL; \ + (req)->internal = false; \ + (req)->outstanding_requests = 0; \ opal_free_list_return (&mca_osc_rdma_component.requests, \ (opal_free_list_item_t *) (req)); \ } while (0)