diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 33abef80442..8f70409e1cf 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -8,7 +8,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. @@ -86,6 +86,12 @@ struct ompi_osc_rdma_component_t { /** Default value of the no_locks info key for new windows */ bool no_locks; + /** Accumulate operations will only operate on a single intrinsic datatype */ + bool acc_single_intrinsic; + + /** Use network AMOs when available */ + bool acc_use_amo; + /** Priority of the osc/rdma component */ unsigned int priority; @@ -121,12 +127,13 @@ struct ompi_osc_rdma_module_t { /** value of same_size info key for this window */ bool same_size; - /** window should have accumulate ordering... */ - bool accumulate_ordering; - /** passive-target synchronization will not be used in this window */ bool no_locks; + bool acc_single_intrinsic; + + bool acc_use_amo; + /** flavor of this window */ int flavor; diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index d3e771ae59c..b12c94aa21d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -1,7 +1,9 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,7 +27,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count int ret = OMPI_SUCCESS; do { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local regions"); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local region(s)"); if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); @@ -70,7 +72,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count return ret; } -static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void *compare_buffer, void *result_buffer, +static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, ompi_osc_rdma_module_t *module) @@ -79,10 +81,10 @@ static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - memcpy (result_buffer, (void *) (uintptr_t) target_address, datatype->super.size); + memcpy (result_addr, (void *) (uintptr_t) target_address, datatype->super.size); - if (0 == memcmp (compare_buffer, result_buffer, datatype->super.size)) { - memcpy ((void *) (uintptr_t) target_address, source_buffer, datatype->super.size); + if (0 == memcmp (compare_addr, result_addr, datatype->super.size)) { + memcpy ((void *) (uintptr_t) target_address, source_addr, datatype->super.size); } ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); @@ -258,15 +260,19 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v return OMPI_SUCCESS; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "accumulate 
btl operation faile with opal error code %d", ret); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "accumulate btl operation failed with opal error code %d", ret); + + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } ompi_osc_rdma_cleanup_rdma (sync, frag, NULL, NULL); return ret; } -static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_buffer, int source_count, - ompi_datatype_t *source_datatype, void *result_buffer, int result_count, +static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_addr, int source_count, + ompi_datatype_t *source_datatype, void *result_addr, int result_count, ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) @@ -304,15 +310,15 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (source_datatype) { (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); - source_buffer = (void *)((intptr_t) source_buffer + lb); + source_addr = (void *)((intptr_t) source_addr + lb); } if (result_datatype) { (void) ompi_datatype_get_extent (result_datatype, &lb, &extent); - result_buffer = (void *)((intptr_t) result_buffer + lb); + result_addr = (void *)((intptr_t) result_addr + lb); } - ret = ompi_osc_rdma_gacc_contig (sync, source_buffer, source_count, source_datatype, result_buffer, + ret = ompi_osc_rdma_gacc_contig (sync, source_addr, source_count, source_datatype, result_addr, result_count, result_datatype, peer, target_address, target_handle, target_count, target_datatype, op, request); @@ -323,12 +329,12 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (source_datatype) { /* the convertors will handle 
the lb */ (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); - source_buffer = (void *)((intptr_t) source_buffer - lb); + source_addr = (void *)((intptr_t) source_addr - lb); } if (result_datatype) { (void) ompi_datatype_get_extent (result_datatype, &lb, &extent); - result_buffer = (void *)((intptr_t) result_buffer - lb); + result_addr = (void *)((intptr_t) result_addr - lb); } } @@ -362,7 +368,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v /* the source may be NULL if using MPI_OP_NO_OP with MPI_Get_accumulate */ if (source_datatype) { OBJ_CONSTRUCT(&source_convertor, opal_convertor_t); - ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_buffer, + ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_addr, 0, &source_convertor); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -427,7 +433,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (result_datatype) { /* prepare a convertor for this part of the result */ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count, - result_buffer, 0, &subreq->convertor); + result_addr, 0, &subreq->convertor); opal_convertor_set_position (&subreq->convertor, &result_position); subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC; } else { @@ -478,41 +484,205 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v return OMPI_SUCCESS; } -#if 0 static void ompi_osc_rdma_cas_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, void *context, void *data, int status) { ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; - void *result_buffer = (void 
*)(intptr_t) ((int64_t *) local_address)[1]; + void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; + size_t size = ((int64_t *) local_address)[2]; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic compare-and-swap complete. result: 0x%" PRIx64, *((int64_t *) local_address)); /* copy the result */ - memcpy (result_buffer, local_address, 8); + memcpy (result_addr, local_address, size); ompi_osc_rdma_sync_rdma_dec (sync); ompi_osc_rdma_frag_complete (frag); } -static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, - void *result_buffer, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, +static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, + void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle) { ompi_osc_rdma_module_t *module = sync->module; + const size_t size = datatype->super.size; ompi_osc_rdma_frag_t *frag = NULL; + int64_t compare, source; + int ret, flags; char *ptr; + + if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->selected_btl->btl_flags))) { + return OMPI_ERR_NOT_SUPPORTED; + } + + compare = (8 == size) ? ((int64_t *) compare_addr)[0] : ((int32_t *) compare_addr)[0]; + source = (8 == size) ? ((int64_t *) source_addr)[0] : ((int32_t *) source_addr)[0]; + flags = (4 == size) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. 
compare: 0x%" + PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); + + ret = ompi_osc_rdma_frag_alloc (module, 24, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + /* store the destination and size in the temporary buffer */ + ((int64_t *) ptr)[1] = (intptr_t) result_addr; + ((int64_t *) ptr)[2] = size; + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address, + frag->handle, target_handle, compare, source, flags, MCA_BTL_NO_ORDER, + ompi_osc_rdma_cas_atomic_complete, sync, frag); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + + if (1 == ret) { + memcpy (result_addr, ptr, size); + ret = OMPI_SUCCESS; + } + + ompi_osc_rdma_frag_complete (frag); + } + + return ret; +} + +static inline void ompi_osc_rdma_fetch_and_op_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; + ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; + void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; + ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) (intptr_t) ((int64_t *) local_address)[2]; + size_t size = ((int64_t *) local_address)[3]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic fetch-and-op complete. 
result: 0x%" PRIx64, + *((int64_t *) local_address)); + + /* copy the result */ + if (result_addr) { + memcpy (result_addr, local_address, size); + } + + ompi_osc_rdma_sync_rdma_dec (sync); + ompi_osc_rdma_frag_complete (frag); + if (req) { + ompi_osc_rdma_request_complete (req, status); + } +} + +static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES] = { + [OMPI_OP_MAX] = MCA_BTL_ATOMIC_MAX, + [OMPI_OP_MIN] = MCA_BTL_ATOMIC_MIN, + [OMPI_OP_SUM] = MCA_BTL_ATOMIC_ADD, + [OMPI_OP_BAND] = MCA_BTL_ATOMIC_AND, + [OMPI_OP_BOR] = MCA_BTL_ATOMIC_OR, + [OMPI_OP_BXOR] = MCA_BTL_ATOMIC_XOR, + [OMPI_OP_LAND] = MCA_BTL_ATOMIC_LAND, + [OMPI_OP_LOR] = MCA_BTL_ATOMIC_LOR, + [OMPI_OP_LXOR] = MCA_BTL_ATOMIC_LXOR, + [OMPI_OP_REPLACE] = MCA_BTL_ATOMIC_SWAP, +}; + +static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, + ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + ompi_osc_rdma_frag_t *frag = NULL; + int ret, btl_op, flags; + char *ptr = NULL; + int64_t origin; + + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { + return OMPI_ERR_NOT_SUPPORTED; + } + + flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) { + flags |= MCA_BTL_ATOMIC_FLAG_FLOAT; + } + + btl_op = ompi_osc_rdma_op_mapping[op->op_type]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using %d-bit btl atomics. origin: 0x%" PRIx64, + (4 == extent) ? 
32 : 64, *((int64_t *) origin_addr)); + + ret = ompi_osc_rdma_frag_alloc (module, 32, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0]; + + /* store the destination, request, and extent in the temporary buffer for the callback */ + ((int64_t *) ptr)[1] = (intptr_t) result_addr; + ((int64_t *) ptr)[2] = (intptr_t) req; + ((int64_t *) ptr)[3] = extent; + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->data_endpoint, ptr, target_address, + frag->handle, target_handle, btl_op, origin, flags, + MCA_BTL_NO_ORDER, ompi_osc_rdma_fetch_and_op_atomic_complete, + sync, frag); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + + if (OPAL_LIKELY(1 == ret)) { + memcpy (result_addr, ptr, extent); + if (req) { + ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); + } + ret = OPAL_SUCCESS; + } + + ompi_osc_rdma_frag_complete (frag); + } + + return ret; +} + +static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, + ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + ompi_osc_rdma_frag_t *frag = NULL; + uint64_t address, offset; + char *ptr = NULL; int ret; - /* XXX -- TODO -- Update the BTL interface to allow for other CAS sizes */ - if (datatype->super.size != 8) { + if (extent > 8) { return OMPI_ERR_NOT_SUPPORTED; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using 64-bit btl atomics. 
compare: 0x%" - PRIx64 ", origin: 0x%" PRIx64, *((int64_t *) compare_buffer), *((int64_t *) source_buffer)); + /* align the address. the user should not call with an unaligned address so don't need to range check here */ + address = target_address & ~7; + offset = target_address & ~address; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap. origin: 0x%" PRIx64, + *((int64_t *) origin_addr)); ret = ompi_osc_rdma_frag_alloc (module, 16, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -520,27 +690,124 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo } /* store the destination in the temporary buffer */ - ((int64_t *) ptr)[1] = (intptr_t) result_buffer; + do { + volatile bool complete = false; - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address, - frag->handle, target_handle, ((int64_t *)compare_buffer)[0], - *((int64_t *) source_buffer), 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_atomic_complete, module, frag); - if (OPAL_UNLIKELY(0 > ret)) { - return ret; + ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, ptr, 8); + if (OMPI_SUCCESS != ret) { + ompi_osc_rdma_frag_complete (frag); + return ret; + } + + ((int64_t *) ptr)[1] = ((int64_t *) ptr)[0]; + + if (&ompi_mpi_op_no_op.op == op) { + memcpy (ptr + offset, origin_addr, extent); + } else { + ompi_op_reduce (op, (void *) origin_addr, ptr + offset, 1, dt); + } + + do { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, address, + frag->handle, target_handle, ((int64_t *) ptr)[1], + ((int64_t *) ptr)[0], 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_atomic_complete, (void *) &complete, NULL); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + break; + } + + while 
(!complete) { + ompi_osc_rdma_progress (module); + } + + if (((int64_t *) ptr)[1] == ((int64_t *) ptr)[0]) { + break; + } + } while (1); + + if (result_addr) { + memcpy (result_addr, ptr + 8 + offset, extent); } - if (1 != ret) { - ompi_osc_rdma_sync_rdma_inc (sync); - } else { - memcpy (result_buffer, ptr, 8); + ompi_osc_rdma_frag_complete (frag); - ompi_osc_rdma_frag_complete (frag); + return ret; +} + +static void ompi_osc_rdma_acc_single_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; + ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) data; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic accumulate complete"); + + ompi_osc_rdma_sync_rdma_dec (sync); + if (req) { + ompi_osc_rdma_request_complete (req, status); + } +} + +static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, ompi_datatype_t *dt, ptrdiff_t extent, + ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, + ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + int ret, btl_op, flags; + int64_t origin; + + if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + /* btl put atomics not supported or disabled. 
fall back on fetch-and-op */ + return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, op, req); } - return OMPI_SUCCESS; + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { + return OMPI_ERR_NOT_SUPPORTED; + } + + origin = (8 == extent) ? ((uint64_t *) origin_addr)[0] : ((uint32_t *) origin_addr)[0]; + + /* set the appropriate flags for this atomic */ + flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) { + flags |= MCA_BTL_ATOMIC_FLAG_FLOAT; + } + + btl_op = ompi_osc_rdma_op_mapping[op->op_type]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate using 64-bit btl atomics. origin: 0x%" PRIx64, + *((int64_t *) origin_addr)); + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->data_endpoint, target_address, + target_handle, btl_op, origin, flags, MCA_BTL_NO_ORDER, + ompi_osc_rdma_acc_single_atomic_complete, sync, req); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + if (1 == ret) { + if (req) { + ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); + } + ret = OMPI_SUCCESS; + } + } + + return ret; } -#endif /** * ompi_osc_rdma_cas_get_complete: @@ -561,45 +828,49 @@ static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, s OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "remote compare-and-swap get complete on sync %p. 
" "status %d", (void *) sync, status); - if (OMPI_SUCCESS == status) { - /* copy data to the user buffer (for gacc) */ - memcpy (request->result_addr, (void *) source, request->len); - - if (0 == memcmp ((void *) source, request->compare_addr, request->len)) { - /* the target and compare buffers match so write the source to the target */ - memcpy ((void *) source, request->origin_addr, request->len); - - ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address, - request->target_address, local_handle, - (mca_btl_base_registration_handle_t *) request->ctx, - request->len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_acc_put_complete, request, NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code " - "%d", ret); - } - - /* TODO -- we can do better. probably should queue up the next step and handle it in progress */ - assert (OPAL_SUCCESS == ret); - } else { - /* this is a no-op. nothing more to do except release the accumulate lock */ - ompi_osc_rdma_frag_complete (frag); + if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + return; + } - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer, - offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } + /* copy data to the user buffer (for gacc) */ + memcpy (request->result_addr, (void *) source, request->len); - /* the request is now complete and the outstanding rdma operation is complete */ - ompi_osc_rdma_request_complete (request, status); + if (0 == memcmp ((void *) source, request->compare_addr, request->len)) { + /* the target and compare buffers match. 
write the source to the target */ + memcpy ((void *) source, request->origin_addr, request->len); - ompi_osc_rdma_sync_rdma_dec (sync); - peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING; + ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address, + request->target_address, local_handle, + (mca_btl_base_registration_handle_t *) request->ctx, + request->len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_acc_put_complete, request, NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code " + "%d", ret); } + + /* TODO -- we can do better. probably should queue up the next step and handle it in progress */ + assert (OPAL_SUCCESS == ret); + + return; + } + + /* this is a no-op. nothing more to do except release the accumulate lock */ + ompi_osc_rdma_frag_complete (frag); + + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer, + offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } + + /* the request is now complete and the outstanding rdma operation is complete */ + ompi_osc_rdma_request_complete (request, status); + + ompi_osc_rdma_sync_rdma_dec (sync); + peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING; } -static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, void *result_buffer, +static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle) { @@ -649,10 +920,10 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe /* set up the request */ request->frag = frag; - request->origin_addr = (void *) source_buffer; + request->origin_addr = (void *) source_addr; request->ctx = (void *) target_handle; - 
request->result_addr = result_buffer; - request->compare_addr = compare_buffer; + request->result_addr = result_addr; + request->compare_addr = compare_addr; request->result_dt = datatype; request->offset = (ptrdiff_t) offset; request->target_address = target_address; @@ -670,6 +941,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe } if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } ompi_osc_rdma_frag_complete (frag); return ret; } @@ -684,8 +958,8 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr, - struct ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, - struct ompi_win_t *win) + ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, + ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -708,8 +982,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare return ret; } -#if 0 - if (MCA_OSC_RDMA_SAME_OP <= module->accumulate_ops) { + if (win->w_acc_ops <= OMPI_WIN_ACCUMULATE_OPS_SAME_OP) { /* the user has indicated that they will only use the same op (or same op and no op) * for operations on overlapping memory ranges. that indicates it is safe to go ahead * and use network atomic operations. 
*/ @@ -718,8 +991,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare if (OMPI_SUCCESS == ret) { return OMPI_SUCCESS; } - } else -#endif + } if (ompi_osc_rdma_peer_local_base (peer)) { return ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt, @@ -733,15 +1005,16 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare static inline int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, + ompi_datatype_t *origin_datatype, void *result_addr, int result_count, + ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, int target_rank, MPI_Aint target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, + ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; + ptrdiff_t lb, extent; int ret; /* short-circuit case. 
note that origin_count may be 0 if op is MPI_NO_OP */ @@ -753,12 +1026,35 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo return OMPI_SUCCESS; } - ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_datatype->super.size * target_count, - &target_address, &target_handle); + (void) ompi_datatype_get_extent (origin_datatype, &lb, &extent); + + ret = osc_rdma_get_remote_segment (module, peer, target_disp, extent * target_count, &target_address, &target_handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } + if (module->acc_single_intrinsic && extent <= 8) { + if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) { + if (NULL == result_addr) { + ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + } else { + ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + } + + if (OMPI_SUCCESS == ret) { + return OMPI_SUCCESS; + } + } + + ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + if (OMPI_SUCCESS == ret) { + return OMPI_SUCCESS; + } + } + if (ompi_osc_rdma_peer_local_base (peer)) { /* local/self optimization */ return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count, @@ -771,13 +1067,10 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo target_datatype, op, request); } -int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win) +int 
ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -801,14 +1094,10 @@ int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, } -int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win, - ompi_request_t **request) +int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win, ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -842,31 +1131,9 @@ int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, return OMPI_SUCCESS; } -int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, struct ompi_op_t *op, struct ompi_win_t *win) -{ - ompi_osc_rdma_module_t *module = GET_MODULE(win); - ompi_osc_rdma_peer_t *peer; - ompi_osc_rdma_sync_t *sync; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name, target_rank, - (unsigned long) target_disp, op->o_name, win->w_name); - - sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer); - if (OPAL_UNLIKELY(NULL 
== sync)) { - return OMPI_ERR_RMA_SYNC; - } - - return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer, target_rank, - target_disp, 1, dt, op, NULL); -} - - -int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, - struct ompi_win_t *win, struct ompi_request_t **request) +int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win, ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -898,11 +1165,9 @@ int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, return OMPI_SUCCESS; } -int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, - struct ompi_win_t *win) +int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -921,3 +1186,23 @@ int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, NULL, peer, target_rank, target_disp, target_count, target_datatype, op, NULL); } + + +int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi_datatype_t *dt, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, ompi_op_t *op, ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + 
ompi_osc_rdma_peer_t *peer; + ompi_osc_rdma_sync_t *sync; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name, + target_rank, (unsigned long) target_disp, op->o_name, win->w_name); + + sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer); + if (OPAL_UNLIKELY(NULL == sync)) { + return OMPI_ERR_RMA_SYNC; + } + + return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer, + target_rank, target_disp, 1, dt, op, NULL); +} diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.h b/ompi/mca/osc/rdma/osc_rdma_accumulate.h index 8f6f1bb4b73..7ab370ab2b8 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.h +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -14,44 +14,30 @@ #include "osc_rdma.h" -int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, - void *result_addr, struct ompi_datatype_t *dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - struct ompi_win_t *win); - -int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, struct ompi_win_t *win); -int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, - struct ompi_datatype_t *dt, int target, - OPAL_PTRDIFF_TYPE target_disp, - struct ompi_op_t *op, struct ompi_win_t *win); - -int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct 
ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win); - -int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, struct ompi_win_t *win, - struct ompi_request_t **request); - -int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win, - struct ompi_request_t **request); +int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr, + ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, + ompi_win_t *win); + +int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win); + +int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi_datatype_t *dt, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, ompi_op_t *op, ompi_win_t *win); + +int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win); + +int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win, ompi_request_t **request); + +int 
ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win, ompi_request_t **request); #endif /* OSC_RDMA_ACCUMULATE_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index c453891839e..7efde7c39be 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -732,7 +732,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } } else { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", - ptr, (void *) frag, aligned_len, (unsigned long) aligned_source_base); + ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); local_handle = frag->handle; } } @@ -772,9 +772,9 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, + ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) + ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *target_handle; @@ -807,9 +807,9 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_osc_rdma_put_contig, false); } -static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +static inline int ompi_osc_rdma_get_w_req 
(ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) + ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *source_handle; @@ -841,9 +841,9 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori source_handle, source_count, source_datatype, request, module->selected_btl->btl_get_limit, ompi_osc_rdma_get_contig, true); } -int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, ompi_win_t *win) + ompi_datatype_t *target_datatype, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -862,10 +862,10 @@ int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_da target_count, target_datatype, NULL); } -int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_win_t *win, - struct ompi_request_t **request) + ompi_datatype_t *target_datatype, ompi_win_t *win, + ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -897,9 +897,9 @@ int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_d return OMPI_SUCCESS; } -int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct 
ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, struct ompi_win_t *win) + ompi_datatype_t *source_datatype, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -918,10 +918,10 @@ int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype source_disp, source_count, source_datatype, NULL); } -int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, struct ompi_win_t *win, - struct ompi_request_t **request) + ompi_datatype_t *source_datatype, ompi_win_t *win, + ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index c011eea3ed0..e9b048c56ee 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -96,23 +96,23 @@ static inline int osc_rdma_get_remote_segment (ompi_osc_rdma_module_t *module, o /* prototypes for implementations of MPI RMA window functions. 
these will be called from the * mpi interface (ompi/mpi/c) */ -int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + ompi_datatype_t *target_dt, ompi_win_t *win); -int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + ompi_datatype_t *target_dt, ompi_win_t *win); -int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win, - struct ompi_request_t **request); + ompi_datatype_t *target_dt, ompi_win_t *win, + ompi_request_t **request); -int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win, - struct ompi_request_t **request); + ompi_datatype_t *target_dt, ompi_win_t *win, + ompi_request_t **request); /** * @brief read data from a remote memory region (blocking) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 8b52933007d..c951a767610 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -173,6 +173,20 @@ static int ompi_osc_rdma_component_register (void) 
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks); + mca_osc_rdma_component.acc_single_intrinsic = false; + (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic", + "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes " + "that will not use anything more than a single predefined datatype (default: false)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic); + + mca_osc_rdma_component.acc_use_amo = true; + (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", + "Enable the use of network atomic memory operations when using single " + "intrinsic optimizations. If not set network compare-and-swap will be " + "used instread (default: true)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_use_amo); + mca_osc_rdma_component.buffer_size = 32768; (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", "Size of temporary buffers (default: 32k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, @@ -585,7 +599,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } } - if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) { + if (MPI_WIN_FLAVOR_CREATE == module->flavor) { ret = ompi_osc_rdma_initialize_region (module, base, size); if (OMPI_SUCCESS != ret) { break; @@ -600,6 +614,20 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s opal_shmem_unlink (&module->seg_ds); } + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *) module->state->regions; + module->state->disp_unit = module->disp_unit; + module->state->region_count = 1; + region->base = state_region->base + my_base_offset; + region->len = size; + if 
(module->selected_btl->btl_register_mem) { + memcpy (region->btl_handle_data, state_region->btl_handle_data, module->selected_btl->btl_registration_handle_size); + } + } + + /* barrier to make sure all ranks have attached */ + shared_comm->c_coll.coll_barrier(shared_comm, shared_comm->c_coll.coll_barrier_module); + offset = data_base; for (int i = 0 ; i < local_size ; ++i) { ompi_osc_rdma_peer_extended_t *ex_peer; @@ -646,21 +674,18 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { if (temp[i].size) { - ex_peer->super.base = (uint64_t) (uintptr_t) module->segment_base + offset; + ex_peer->super.base = state_region->base + offset; + offset += temp[i].size; } else { ex_peer->super.base = 0; } + } - peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; - - offset += temp[i].size; - } else { - ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions; + ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions; - ex_peer->super.base = peer_region->base; - if (module->selected_btl->btl_register_mem) { - ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data; - } + ex_peer->super.base = peer_region->base; + if (module->selected_btl->btl_register_mem) { + ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data; } } @@ -1020,6 +1045,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->same_disp_unit = check_config_value_bool ("same_disp_unit", info); module->same_size = check_config_value_bool ("same_size", info); module->no_locks = check_config_value_bool ("no_locks", info); + module->acc_single_intrinsic = check_config_value_bool ("ompi_single_accumulate", info); + module->acc_use_amo = mca_osc_rdma_component.acc_use_amo; module->all_sync.module = module; @@ -1047,14 +1074,6 @@ static int ompi_osc_rdma_component_select 
(struct ompi_win_t *win, void **base, } } - /* options */ - /* FIX ME: should actually check this value... */ -#if 1 - module->accumulate_ordering = 1; -#else - ompi_osc_base_config_value_equal("accumulate_ordering", info, "none"); -#endif - ret = ompi_comm_dup(comm, &module->comm); if (OMPI_SUCCESS != ret) { ompi_osc_rdma_free (win); @@ -1132,17 +1151,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } } - ret = ompi_osc_rdma_share_data (module); - if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); - ompi_osc_rdma_free (win); - return ret; - } - - - /* for now the leader is always rank 0 in the communicator */ - module->leader = ompi_osc_rdma_module_peer (module, 0); - /* lock data */ if (module->no_locks) { win->w_flags |= OMPI_WIN_NO_LOCKS; @@ -1177,20 +1185,19 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* sync memory - make sure all initialization completed */ opal_atomic_mb(); - /* barrier to prevent arrival of lock requests before we're - fully created */ - ret = module->comm->c_coll.coll_barrier(module->comm, - module->comm->c_coll.coll_barrier_module); + ret = ompi_osc_rdma_share_data (module); if (OMPI_SUCCESS != ret) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); ompi_osc_rdma_free (win); - return ret; - } - + } else { + /* for now the leader is always rank 0 in the communicator */ + module->leader = ompi_osc_rdma_module_peer (module, 0); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d", - ompi_comm_get_cid(module->comm)); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d", + ompi_comm_get_cid(module->comm)); + } - return OMPI_SUCCESS; + return ret; } diff --git a/ompi/mca/osc/rdma/osc_rdma_passive_target.c b/ompi/mca/osc/rdma/osc_rdma_passive_target.c index f3e1a0ac85b..720fbbb64a8 100644 --- 
a/ompi/mca/osc/rdma/osc_rdma_passive_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_passive_target.c @@ -43,12 +43,6 @@ int ompi_osc_rdma_flush (int target, struct ompi_win_t *win) OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush: %d, %s", target, win->w_name); - if (ompi_comm_rank (module->comm) == target) { - /* nothing to flush. call one round of progress */ - ompi_osc_rdma_progress (module); - return OMPI_SUCCESS; - } - OPAL_THREAD_LOCK(&module->lock); lock = ompi_osc_rdma_module_sync_lookup (module, target, &peer); diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index 44d9a0e45a3..7d7967ef66a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -217,6 +217,10 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd memcpy (ex_peer->super.base_handle, base_region->btl_handle_data, registration_handle_size); } + + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + ex_peer->super.super.data_endpoint = ex_peer->super.super.state_endpoint; + } } return OMPI_SUCCESS; diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.h b/ompi/mca/osc/rdma/osc_rdma_peer.h index 34fb22a3885..6716733a43a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.h +++ b/ompi/mca/osc/rdma/osc_rdma_peer.h @@ -75,6 +75,9 @@ struct ompi_osc_rdma_peer_basic_t { /** remote peer's base pointer */ osc_rdma_base_t base; + /** local pointer to peer's base */ + osc_rdma_base_t local_base; + /** registration handle associated with the base */ mca_btl_base_registration_handle_t *base_handle; }; diff --git a/opal/mca/btl/base/btl_base_frame.c b/opal/mca/btl/base/btl_base_frame.c index 6cb49e5f49c..f5f15c86544 100644 --- a/opal/mca/btl/base/btl_base_frame.c +++ b/opal/mca/btl/base/btl_base_frame.c @@ -61,6 +61,12 @@ mca_base_var_enum_value_flag_t mca_btl_base_atomic_enum_flags[] = { {MCA_BTL_ATOMIC_SUPPORTS_AND, "and", 0}, {MCA_BTL_ATOMIC_SUPPORTS_OR, "or", 0}, {MCA_BTL_ATOMIC_SUPPORTS_XOR, "xor", 0}, + 
{MCA_BTL_ATOMIC_SUPPORTS_LAND, "land", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_LOR, "lor", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_LXOR, "lxor", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_SWAP, "swap", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_MIN, "min", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_MAX, "max", 0}, {MCA_BTL_ATOMIC_SUPPORTS_CSWAP, "compare-and-swap", 0}, {MCA_BTL_ATOMIC_SUPPORTS_GLOB, "global"}, {0, NULL, 0} diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index ed6771da774..19af3630084 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. @@ -290,12 +290,45 @@ enum { MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400, /** The btl supports atomic bitwise exclusive or */ MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800, + + /** The btl supports logical and */ + MCA_BTL_ATOMIC_SUPPORTS_LAND = 0x00001000, + /** The btl supports logical or */ + MCA_BTL_ATOMIC_SUPPORTS_LOR = 0x00002000, + /** The btl supports logical exclusive or */ + MCA_BTL_ATOMIC_SUPPORTS_LXOR = 0x00004000, + + /** The btl supports atomic swap */ + MCA_BTL_ATOMIC_SUPPORTS_SWAP = 0x00010000, + + /** The btl supports atomic min */ + MCA_BTL_ATOMIC_SUPPORTS_MIN = 0x00100000, + /** The btl supports atomic max */ + MCA_BTL_ATOMIC_SUPPORTS_MAX = 0x00200000, + + /** The btl supports 32-bit integer operations. Keep in mind the btl may + * support only a subset of the available atomics. */ + MCA_BTL_ATOMIC_SUPPORTS_32BIT = 0x01000000, + + /** The btl supports floating-point operations.
Keep in mind the btl may + * support only a subset of the available atomics and may not support + * both 64 or 32-bit floating point. */ + MCA_BTL_ATOMIC_SUPPORTS_FLOAT = 0x02000000, + /** The btl supports atomic compare-and-swap */ MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000, + /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */ MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000, }; +enum { + /** Use 32-bit atomics */ + MCA_BTL_ATOMIC_FLAG_32BIT = 0x00000001, + /** Use floating-point atomics */ + MCA_BTL_ATOMIC_FLAG_FLOAT = 0x00000002, +}; + enum mca_btl_base_atomic_op_t { /** Atomic add: (*remote_address) = (*remote_address) + operand */ MCA_BTL_ATOMIC_ADD = 0x0001, @@ -305,6 +338,20 @@ enum mca_btl_base_atomic_op_t { MCA_BTL_ATOMIC_OR = 0x0012, /** Atomic xor: (*remote_address) = (*remote_address) ^ operand */ MCA_BTL_ATOMIC_XOR = 0x0014, + /** Atomic logical and: (*remote_address) = (*remote_address) && operand */ + MCA_BTL_ATOMIC_LAND = 0x0015, + /** Atomic logical or: (*remote_address) = (*remote_address) || operand */ + MCA_BTL_ATOMIC_LOR = 0x0016, + /** Atomic logical xor: (*remote_address) = (*remote_address) != operand */ + MCA_BTL_ATOMIC_LXOR = 0x0017, + /** Atomic swap: (*remote_address) = operand */ + MCA_BTL_ATOMIC_SWAP = 0x001a, + /** Atomic min */ + MCA_BTL_ATOMIC_MIN = 0x0020, + /** Atomic max */ + MCA_BTL_ATOMIC_MAX = 0x0021, + + MCA_BTL_ATOMIC_LAST, }; typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t; @@ -974,7 +1021,7 @@ typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl, * (remote_address, remote_address + 8) * @param op (IN) Operation to perform * @param operand (IN) Operand for the operation - * @param flags (IN) Flags for this put operation + * @param flags (IN) Flags for this atomic operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback @@ -1018,7 +1065,7 @@ typedef int 
(*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_ * (remote_address, remote_address + 8) * @param op (IN) Operation to perform * @param operand (IN) Operand for the operation - * @param flags (IN) Flags for this put operation + * @param flags (IN) Flags for this atomic operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback @@ -1064,7 +1111,7 @@ typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module * (remote_address, remote_address + 8) * @param compare (IN) Operand for the operation * @param value (IN) Value to store on success - * @param flags (IN) Flags for this put operation + * @param flags (IN) Flags for this atomic operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback diff --git a/opal/mca/btl/openib/btl_openib_atomic.c b/opal/mca/btl/openib/btl_openib_atomic.c index 0c6460f2cf3..ec0eb644f1a 100644 --- a/opal/mca/btl/openib/btl_openib_atomic.c +++ b/opal/mca/btl/openib/btl_openib_atomic.c @@ -112,7 +112,7 @@ int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl void *cbcontext, void *cbdata) { - if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op)) { + if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op || (MCA_BTL_ATOMIC_FLAG_32BIT & flags))) { return OPAL_ERR_NOT_SUPPORTED; } @@ -128,6 +128,10 @@ int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_b uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { + if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_FLAG_32BIT & flags)) { + return OPAL_ERR_NOT_SUPPORTED; + } + return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle, remote_handle, IBV_WR_ATOMIC_CMP_AND_SWP, compare, value, flags, order, cbfunc, cbcontext, cbdata); diff --git 
a/opal/mca/btl/ugni/btl_ugni_atomic.c b/opal/mca/btl/ugni/btl_ugni_atomic.c index 981bc759ee9..3c62670da89 100644 --- a/opal/mca/btl/ugni/btl_ugni_atomic.c +++ b/opal/mca/btl/ugni/btl_ugni_atomic.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -11,18 +11,66 @@ #include "btl_ugni_rdma.h" -static gni_fma_cmd_type_t famo_cmds[] = { - [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD, - [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND, - [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR, - [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR, +static gni_fma_cmd_type_t amo_cmds[][MCA_BTL_ATOMIC_LAST] = { + [OPAL_INT32] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_IADD_S, + [MCA_BTL_ATOMIC_LAND] = GNI_FMA_ATOMIC2_AND_S, + [MCA_BTL_ATOMIC_LOR] = GNI_FMA_ATOMIC2_OR_S, + [MCA_BTL_ATOMIC_LXOR] = GNI_FMA_ATOMIC2_XOR_S, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_SWAP_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_IMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_IMAX_S, + }, + [OPAL_INT64] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD, + [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND, + [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR, + [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_SWAP, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_IMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_IMAX, + }, + [OPAL_FLOAT] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FPADD_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FPMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FPMAX_S, + }, + [OPAL_DOUBLE] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FPADD, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FPMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FPMAX, + }, }; -static gni_fma_cmd_type_t amo_cmds[] = { - [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD, - [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND, - [MCA_BTL_ATOMIC_OR] = 
GNI_FMA_ATOMIC_OR, - [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR, +static gni_fma_cmd_type_t famo_cmds[][MCA_BTL_ATOMIC_LAST] = { + [OPAL_INT32] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FIADD_S, + [MCA_BTL_ATOMIC_LAND] = GNI_FMA_ATOMIC2_FAND_S, + [MCA_BTL_ATOMIC_LOR] = GNI_FMA_ATOMIC2_FOR_S, + [MCA_BTL_ATOMIC_LXOR] = GNI_FMA_ATOMIC2_FXOR_S, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_FSWAP_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FIMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FIMAX_S, + }, + [OPAL_INT64] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD, + [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND, + [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR, + [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_FSWAP, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FIMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FIMAX, + }, + [OPAL_FLOAT] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FFPADD_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FFPMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FFPMAX_S, + }, + [OPAL_DOUBLE] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FFPADD, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FFPMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FFPMAX, + }, }; int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, @@ -32,7 +80,20 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end { gni_mem_handle_t dummy = {0, 0}; mca_btl_ugni_post_descriptor_t *post_desc; - int rc; + int gni_op, rc, type; + size_t size; + + size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + if (MCA_BTL_ATOMIC_FLAG_FLOAT & flags) { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? OPAL_FLOAT : OPAL_DOUBLE; + } else { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
OPAL_INT32 : OPAL_INT64; + } + + gni_op = amo_cmds[type][op]; + if (0 == gni_op) { + return OPAL_ERR_NOT_SUPPORTED; + } rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -45,8 +106,8 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end } init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address, - remote_handle->gni_handle, 8, 0); - post_desc->desc.base.amo_cmd = amo_cmds[op]; + remote_handle->gni_handle, size, 0); + post_desc->desc.base.amo_cmd = gni_op; post_desc->desc.base.first_operand = operand; @@ -54,6 +115,10 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base); OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); if (GNI_RC_SUCCESS != rc) { + mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + if (GNI_RC_ILLEGAL_OP == rc) { + return OPAL_ERR_NOT_SUPPORTED; + } return OPAL_ERR_OUT_OF_RESOURCE; } @@ -67,7 +132,20 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; - int rc; + int gni_op, rc, type; + size_t size; + + size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + if (MCA_BTL_ATOMIC_FLAG_FLOAT & flags) { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? OPAL_FLOAT : OPAL_DOUBLE; + } else { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
OPAL_INT32 : OPAL_INT64; + } + + gni_op = famo_cmds[type][op]; + if (0 == gni_op) { + return OPAL_ERR_NOT_SUPPORTED; + } rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -81,8 +159,8 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, - remote_address, remote_handle->gni_handle, 8, 0); - post_desc->desc.base.amo_cmd = famo_cmds[op]; + remote_address, remote_handle->gni_handle, size, 0); + post_desc->desc.base.amo_cmd = gni_op; post_desc->desc.base.first_operand = operand; @@ -91,6 +169,9 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); if (GNI_RC_SUCCESS != rc) { mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + if (GNI_RC_ILLEGAL_OP == rc) { + return OPAL_ERR_NOT_SUPPORTED; + } return OPAL_ERR_OUT_OF_RESOURCE; } @@ -103,7 +184,11 @@ int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_ int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; - int rc; + int gni_op, rc; + size_t size; + + gni_op = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? GNI_FMA_ATOMIC2_CSWAP_S : GNI_FMA_ATOMIC_CSWAP; + size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
4 : 8; rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -117,8 +202,8 @@ int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_ init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, - remote_address, remote_handle->gni_handle, 8, 0); - post_desc->desc.base.amo_cmd = GNI_FMA_ATOMIC_CSWAP; + remote_address, remote_handle->gni_handle, size, 0); + post_desc->desc.base.amo_cmd = gni_op; post_desc->desc.base.first_operand = compare; post_desc->desc.base.second_operand = value; diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c index 7e8198df20f..6d283128353 100644 --- a/opal/mca/btl/ugni/btl_ugni_component.c +++ b/opal/mca/btl/ugni/btl_ugni_component.c @@ -288,6 +288,13 @@ btl_ugni_component_register(void) MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR | MCA_BTL_ATOMIC_SUPPORTS_CSWAP; + if (GNI_DEVICE_ARIES == device_type) { + /* aries supports additional atomic operations */ + mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX | + MCA_BTL_ATOMIC_SUPPORTS_LAND | MCA_BTL_ATOMIC_SUPPORTS_LOR | MCA_BTL_ATOMIC_SUPPORTS_LXOR | + MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_FLOAT; + } + mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */