From 4f2a4e86a1e925c7ad10b1d04e64a9fb5f785f24 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 25 May 2016 10:39:34 -0600 Subject: [PATCH 1/6] btl: add support for more atomics This commit adds support for more atomic operations and types. The operations added are logical and, logical or, logical xor, swap, min, and max. New types are 32-bit int by using the MCA_BTL_ATOMIC_FLAG_32BIT flag, 64-bit float by using the MCA_BTL_ATOMIC_FLAG_FLOAT flag, and 32-bit float by using both flags. Floating point numbers are supported by packing the number in as an int64_t or int32_t. We will update the btl interface in the future to make this less confusing. Signed-off-by: Nathan Hjelm (cherry picked from commit 23fe19a9568a69474fdd30fdb108380b8ca41119) --- opal/mca/btl/base/btl_base_frame.c | 6 +++ opal/mca/btl/btl.h | 56 +++++++++++++++++++++++-- opal/mca/btl/openib/btl_openib_atomic.c | 6 ++- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/opal/mca/btl/base/btl_base_frame.c b/opal/mca/btl/base/btl_base_frame.c index 6cb49e5f49c..f5f15c86544 100644 --- a/opal/mca/btl/base/btl_base_frame.c +++ b/opal/mca/btl/base/btl_base_frame.c @@ -61,6 +61,12 @@ mca_base_var_enum_value_flag_t mca_btl_base_atomic_enum_flags[] = { {MCA_BTL_ATOMIC_SUPPORTS_AND, "and", 0}, {MCA_BTL_ATOMIC_SUPPORTS_OR, "or", 0}, {MCA_BTL_ATOMIC_SUPPORTS_XOR, "xor", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_LAND, "land", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_LOR, "lor", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_LXOR, "lxor", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_SWAP, "swap", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_MIN, "min", 0}, + {MCA_BTL_ATOMIC_SUPPORTS_MAX, "max", 0}, {MCA_BTL_ATOMIC_SUPPORTS_CSWAP, "compare-and-swap", 0}, {MCA_BTL_ATOMIC_SUPPORTS_GLOB, "global"}, {0, NULL, 0} diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index ed6771da774..49720857330 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. 
* Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. @@ -290,10 +290,44 @@ enum { MCA_BTL_ATOMIC_SUPPORTS_OR = 0x00000400, /** The btl supports atomic bitwise exclusive or */ MCA_BTL_ATOMIC_SUPPORTS_XOR = 0x00000800, + + /** The btl supports logical and */ + MCA_BTL_ATOMIC_SUPPORTS_LAND = 0x00001000, + /** The btl supports logical or */ + MCA_BTL_ATOMIC_SUPPORTS_LOR = 0x00002000, + /** The btl supports logical exclusive or */ + MCA_BTL_ATOMIC_SUPPORTS_LXOR = 0x00004000, + + /** The btl supports atomic swap */ + MCA_BTL_ATOMIC_SUPPORTS_SWAP = 0x00010000, + + /** The btl supports atomic min */ + MCA_BTL_ATOMIC_SUPPORTS_MIN = 0x00100000, + /** The btl supports atomic min */ + MCA_BTL_ATOMIC_SUPPORTS_MAX = 0x00200000, + /** The btl supports atomic compare-and-swap */ MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000, + /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */ MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000, + + + /** The btl supports 32-bit integer operations. Keep in mind the btl may + * support only a subset of the available atomics. */ + MCA_BTL_ATOMIC_SUPPORTS_32BIT = 0x40000000, + + /** The btl supports floating-point operations. Keep in mind the btl may + * support only a subset of the available atomics and may not support + * both 64 or 32-bit floating point. 
*/ + MCA_BTL_ATOMIC_SUPPORTS_FLOAT = 0x80000000, +}; + +enum { + /** Use 32-bit atomics */ + MCA_BTL_ATOMIC_FLAG_32BIT = 0x00000001, + /** Use floating-point atomics */ + MCA_BTL_ATOMIC_FLAG_FLOAT = 0x00000002, }; enum mca_btl_base_atomic_op_t { @@ -305,6 +339,20 @@ enum mca_btl_base_atomic_op_t { MCA_BTL_ATOMIC_OR = 0x0012, /** Atomic xor: (*remote_address) = (*remote_address) ^ operand */ MCA_BTL_ATOMIC_XOR = 0x0014, + /** Atomic logical and: (*remote_address) = (*remote_address) && operand */ + MCA_BTL_ATOMIC_LAND = 0x0015, + /** Atomic logical or: (*remote_address) = (*remote_address) || operand */ + MCA_BTL_ATOMIC_LOR = 0x0016, + /** Atomic logical xor: (*remote_address) = (*remote_address) != operand */ + MCA_BTL_ATOMIC_LXOR = 0x0017, + /** Atomic swap: (*remote_address) = operand */ + MCA_BTL_ATOMIC_SWAP = 0x001a, + /** Atomic min */ + MCA_BTL_ATOMIC_MIN = 0x0020, + /** Atomic max */ + MCA_BTL_ATOMIC_MAX = 0x0021, + + MCA_BTL_ATOMIC_LAST, }; typedef enum mca_btl_base_atomic_op_t mca_btl_base_atomic_op_t; @@ -974,7 +1022,7 @@ typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl, * (remote_address, remote_address + 8) * @param op (IN) Operation to perform * @param operand (IN) Operand for the operation - * @param flags (IN) Flags for this put operation + * @param flags (IN) Flags for this atomic operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback @@ -1018,7 +1066,7 @@ typedef int (*mca_btl_base_module_atomic_op64_fn_t) (struct mca_btl_base_module_ * (remote_address, remote_address + 8) * @param op (IN) Operation to perform * @param operand (IN) Operand for the operation - * @param flags (IN) Flags for this put operation + * @param flags (IN) Flags for this atomic operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback @@ -1064,7 +1112,7 @@ 
typedef int (*mca_btl_base_module_atomic_fop64_fn_t) (struct mca_btl_base_module * (remote_address, remote_address + 8) * @param compare (IN) Operand for the operation * @param value (IN) Value to store on success - * @param flags (IN) Flags for this put operation + * @param flags (IN) Flags for this atomic operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback diff --git a/opal/mca/btl/openib/btl_openib_atomic.c b/opal/mca/btl/openib/btl_openib_atomic.c index 0c6460f2cf3..ec0eb644f1a 100644 --- a/opal/mca/btl/openib/btl_openib_atomic.c +++ b/opal/mca/btl/openib/btl_openib_atomic.c @@ -112,7 +112,7 @@ int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl void *cbcontext, void *cbdata) { - if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op)) { + if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op || (MCA_BTL_ATOMIC_FLAG_32BIT & flags))) { return OPAL_ERR_NOT_SUPPORTED; } @@ -128,6 +128,10 @@ int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_b uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { + if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_FLAG_32BIT & flags)) { + return OPAL_ERR_NOT_SUPPORTED; + } + return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle, remote_handle, IBV_WR_ATOMIC_CMP_AND_SWP, compare, value, flags, order, cbfunc, cbcontext, cbdata); From 4d72591fadcc1bae1ef27fc53c95c799f6709aca Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 2 Jun 2016 19:21:34 -0600 Subject: [PATCH 2/6] btl: adjust values of new atomic flags Signed-off-by: Nathan Hjelm (cherry picked from commit 6169d03ea354f363a8d65e7e5313c55fcccd05ef) --- opal/mca/btl/btl.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 49720857330..19af3630084 100644 --- a/opal/mca/btl/btl.h +++ 
b/opal/mca/btl/btl.h @@ -306,21 +306,20 @@ enum { /** The btl supports atomic min */ MCA_BTL_ATOMIC_SUPPORTS_MAX = 0x00200000, - /** The btl supports atomic compare-and-swap */ - MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000, - - /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */ - MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000, - - /** The btl supports 32-bit integer operations. Keep in mind the btl may * support only a subset of the available atomics. */ - MCA_BTL_ATOMIC_SUPPORTS_32BIT = 0x40000000, + MCA_BTL_ATOMIC_SUPPORTS_32BIT = 0x01000000, /** The btl supports floating-point operations. Keep in mind the btl may * support only a subset of the available atomics and may not support * both 64 or 32-bit floating point. */ - MCA_BTL_ATOMIC_SUPPORTS_FLOAT = 0x80000000, + MCA_BTL_ATOMIC_SUPPORTS_FLOAT = 0x02000000, + + /** The btl supports atomic compare-and-swap */ + MCA_BTL_ATOMIC_SUPPORTS_CSWAP = 0x10000000, + + /** The btl guarantees global atomicity (can mix btl atomics with cpu atomics) */ + MCA_BTL_ATOMIC_SUPPORTS_GLOB = 0x20000000, }; enum { From 1bad2a9a2094f8222458020900fabd2a8ad8f839 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 1 Sep 2016 12:47:44 -0600 Subject: [PATCH 3/6] osc/rdma: add support for network AMOs This commit adds support for using network AMOs for MPI_Accumulate, MPI_Fetch_and_op, and MPI_Compare_and_swap. This support is only enabled if the ompi_single_intrinsic info key is specified or the acc_single_intrinsic MCA variable is set. This configuration indicates to this implementation that no long accumulates will be performed since these do not currently mix with the AMO implementation. This commit also cleans up the code somewhat. This includes removing unnecessary struct keywords where the type is also typedef'd. 
Signed-off-by: Nathan Hjelm (cherry picked from commit 1ce5847e8b56ef65ca760cc783658354e8e346e6) --- ompi/mca/osc/rdma/osc_rdma.h | 15 +- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 547 +++++++++++++++----- ompi/mca/osc/rdma/osc_rdma_accumulate.h | 64 +-- ompi/mca/osc/rdma/osc_rdma_comm.c | 28 +- ompi/mca/osc/rdma/osc_rdma_comm.h | 20 +- ompi/mca/osc/rdma/osc_rdma_component.c | 87 ++-- ompi/mca/osc/rdma/osc_rdma_passive_target.c | 6 - ompi/mca/osc/rdma/osc_rdma_peer.c | 4 + ompi/mca/osc/rdma/osc_rdma_peer.h | 3 + 9 files changed, 530 insertions(+), 244 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 33abef80442..8f70409e1cf 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -8,7 +8,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. @@ -86,6 +86,12 @@ struct ompi_osc_rdma_component_t { /** Default value of the no_locks info key for new windows */ bool no_locks; + /** Accumulate operations will only operate on a single intrinsic datatype */ + bool acc_single_intrinsic; + + /** Use network AMOs when available */ + bool acc_use_amo; + /** Priority of the osc/rdma component */ unsigned int priority; @@ -121,12 +127,13 @@ struct ompi_osc_rdma_module_t { /** value of same_size info key for this window */ bool same_size; - /** window should have accumulate ordering... 
*/ - bool accumulate_ordering; - /** passive-target synchronization will not be used in this window */ bool no_locks; + bool acc_single_intrinsic; + + bool acc_use_amo; + /** flavor of this window */ int flavor; diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index d3e771ae59c..65617bab7b6 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -25,7 +25,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count int ret = OMPI_SUCCESS; do { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local regions"); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local region(s)"); if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); @@ -70,7 +70,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count return ret; } -static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void *compare_buffer, void *result_buffer, +static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, ompi_osc_rdma_module_t *module) @@ -79,10 +79,10 @@ static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - memcpy (result_buffer, (void *) (uintptr_t) target_address, datatype->super.size); + memcpy (result_addr, (void *) 
(uintptr_t) target_address, datatype->super.size); - if (0 == memcmp (compare_buffer, result_buffer, datatype->super.size)) { - memcpy ((void *) (uintptr_t) target_address, source_buffer, datatype->super.size); + if (0 == memcmp (compare_addr, result_addr, datatype->super.size)) { + memcpy ((void *) (uintptr_t) target_address, source_addr, datatype->super.size); } ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); @@ -258,15 +258,19 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v return OMPI_SUCCESS; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "accumulate btl operation faile with opal error code %d", ret); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "accumulate btl operation failed with opal error code %d", ret); + + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } ompi_osc_rdma_cleanup_rdma (sync, frag, NULL, NULL); return ret; } -static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_buffer, int source_count, - ompi_datatype_t *source_datatype, void *result_buffer, int result_count, +static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_addr, int source_count, + ompi_datatype_t *source_datatype, void *result_addr, int result_count, ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) @@ -304,15 +308,15 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (source_datatype) { (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); - source_buffer = (void *)((intptr_t) source_buffer + lb); + source_addr = (void *)((intptr_t) source_addr + lb); } if (result_datatype) { 
(void) ompi_datatype_get_extent (result_datatype, &lb, &extent); - result_buffer = (void *)((intptr_t) result_buffer + lb); + result_addr = (void *)((intptr_t) result_addr + lb); } - ret = ompi_osc_rdma_gacc_contig (sync, source_buffer, source_count, source_datatype, result_buffer, + ret = ompi_osc_rdma_gacc_contig (sync, source_addr, source_count, source_datatype, result_addr, result_count, result_datatype, peer, target_address, target_handle, target_count, target_datatype, op, request); @@ -323,12 +327,12 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (source_datatype) { /* the convertors will handle the lb */ (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); - source_buffer = (void *)((intptr_t) source_buffer - lb); + source_addr = (void *)((intptr_t) source_addr - lb); } if (result_datatype) { (void) ompi_datatype_get_extent (result_datatype, &lb, &extent); - result_buffer = (void *)((intptr_t) result_buffer - lb); + result_addr = (void *)((intptr_t) result_addr - lb); } } @@ -362,7 +366,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v /* the source may be NULL if using MPI_OP_NO_OP with MPI_Get_accumulate */ if (source_datatype) { OBJ_CONSTRUCT(&source_convertor, opal_convertor_t); - ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_buffer, + ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_addr, 0, &source_convertor); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -427,7 +431,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v if (result_datatype) { /* prepare a convertor for this part of the result */ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count, - result_buffer, 0, &subreq->convertor); + result_addr, 0, &subreq->convertor); 
opal_convertor_set_position (&subreq->convertor, &result_position); subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC; } else { @@ -478,69 +482,331 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v return OMPI_SUCCESS; } -#if 0 static void ompi_osc_rdma_cas_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, void *context, void *data, int status) { ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; - void *result_buffer = (void *)(intptr_t) ((int64_t *) local_address)[1]; + void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; + size_t size = ((int64_t *) local_address)[2]; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic compare-and-swap complete. result: 0x%" PRIx64, *((int64_t *) local_address)); /* copy the result */ - memcpy (result_buffer, local_address, 8); + memcpy (result_addr, local_address, size); ompi_osc_rdma_sync_rdma_dec (sync); ompi_osc_rdma_frag_complete (frag); } -static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, - void *result_buffer, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, +static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, + void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle) { ompi_osc_rdma_module_t *module = sync->module; + const size_t size = datatype->super.size; ompi_osc_rdma_frag_t *frag = NULL; + int64_t compare, source; + int ret, flags; char *ptr; - int ret; - /* XXX -- TODO -- Update the BTL interface to allow for other CAS sizes */ - if (datatype->super.size != 8) { + if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & 
module->selected_btl->btl_flags))) { return OMPI_ERR_NOT_SUPPORTED; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using 64-bit btl atomics. compare: 0x%" - PRIx64 ", origin: 0x%" PRIx64, *((int64_t *) compare_buffer), *((int64_t *) source_buffer)); + compare = (8 == size) ? ((int64_t *) compare_addr)[0] : ((int32_t *) compare_addr)[0]; + source = (8 == size) ? ((int64_t *) source_addr)[0] : ((int32_t *) source_addr)[0]; + flags = (4 == size) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; - ret = ompi_osc_rdma_frag_alloc (module, 16, &frag, &ptr); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%" + PRIx64 ", origin: 0x%" PRIx64, size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); + + ret = ompi_osc_rdma_frag_alloc (module, 24, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } - /* store the destination in the temporary buffer */ - ((int64_t *) ptr)[1] = (intptr_t) result_buffer; + /* store the destination and size in the temporary buffer */ + ((int64_t *) ptr)[1] = (intptr_t) result_addr; + ((int64_t *) ptr)[2] = size; + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address, + frag->handle, target_handle, compare, source, flags, MCA_BTL_NO_ORDER, + ompi_osc_rdma_cas_atomic_complete, sync, frag); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + + if (1 == ret) { + memcpy (result_addr, ptr, size); + ret = OMPI_SUCCESS; + } + + ompi_osc_rdma_frag_complete (frag); + } + + return ret; +} + +static inline void ompi_osc_rdma_fetch_and_op_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + 
void *context, void *data, int status) +{ + ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; + ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data; + void *result_addr = (void *)(intptr_t) ((int64_t *) local_address)[1]; + ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) (intptr_t) ((int64_t *) local_address)[2]; + size_t size = ((int64_t *) local_address)[3]; - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address, - frag->handle, target_handle, ((int64_t *)compare_buffer)[0], - *((int64_t *) source_buffer), 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_atomic_complete, module, frag); - if (OPAL_UNLIKELY(0 > ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic fetch-and-op complete. result: 0x%" PRIx64, + *((int64_t *) local_address)); + + /* copy the result */ + if (result_addr) { + memcpy (result_addr, local_address, size); + } + + ompi_osc_rdma_sync_rdma_dec (sync); + ompi_osc_rdma_frag_complete (frag); + if (req) { + ompi_osc_rdma_request_complete (req, status); + } +} + +static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES] = { + [OMPI_OP_MAX] = MCA_BTL_ATOMIC_MAX, + [OMPI_OP_MIN] = MCA_BTL_ATOMIC_MIN, + [OMPI_OP_SUM] = MCA_BTL_ATOMIC_ADD, + [OMPI_OP_BAND] = MCA_BTL_ATOMIC_AND, + [OMPI_OP_BOR] = MCA_BTL_ATOMIC_OR, + [OMPI_OP_BXOR] = MCA_BTL_ATOMIC_XOR, + [OMPI_OP_LAND] = MCA_BTL_ATOMIC_LAND, + [OMPI_OP_LOR] = MCA_BTL_ATOMIC_LOR, + [OMPI_OP_LXOR] = MCA_BTL_ATOMIC_LXOR, + [OMPI_OP_REPLACE] = MCA_BTL_ATOMIC_SWAP, +}; + +static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, + ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + ompi_osc_rdma_frag_t *frag = 
NULL; + int ret, btl_op, flags; + char *ptr = NULL; + int64_t origin; + + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { + return OMPI_ERR_NOT_SUPPORTED; + } + + flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) { + flags |= MCA_BTL_ATOMIC_FLAG_FLOAT; + } + + btl_op = ompi_osc_rdma_op_mapping[op->op_type]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using %d-bit btl atomics. origin: 0x%" PRIx64, + (4 == extent) ? 32 : 64, *((int64_t *) origin_addr)); + + ret = ompi_osc_rdma_frag_alloc (module, 32, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } - if (1 != ret) { - ompi_osc_rdma_sync_rdma_inc (sync); - } else { - memcpy (result_buffer, ptr, 8); + origin = (8 == extent) ? 
((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0]; + + /* store the destination, request, and extent in the temporary buffer for the callback */ + ((int64_t *) ptr)[1] = (intptr_t) result_addr; + ((int64_t *) ptr)[2] = (intptr_t) req; + ((int64_t *) ptr)[3] = extent; + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->data_endpoint, ptr, target_address, + frag->handle, target_handle, btl_op, origin, flags, + MCA_BTL_NO_ORDER, ompi_osc_rdma_fetch_and_op_atomic_complete, + sync, frag); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + + if (OPAL_LIKELY(1 == ret)) { + memcpy (result_addr, ptr, extent); + if (req) { + ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); + } + ret = OPAL_SUCCESS; + } ompi_osc_rdma_frag_complete (frag); } - return OMPI_SUCCESS; + return ret; +} + +static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt, + ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address, + mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + ompi_osc_rdma_frag_t *frag = NULL; + uint64_t address, offset; + char *ptr = NULL; + int ret, btl_op; + + if (extent > 8) { + return OMPI_ERR_NOT_SUPPORTED; + } + + /* align the address. the user should not call with an unaligned address so don't need to range check here */ + address = target_address & ~7; + offset = target_address & ~address; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap. 
origin: 0x%" PRIx64, + *((int64_t *) origin_addr)); + + ret = ompi_osc_rdma_frag_alloc (module, 16, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + /* store the destination in the temporary buffer */ + do { + bool complete = false; + + ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, ptr, 8); + if (OMPI_SUCCESS != ret) { + ompi_osc_rdma_frag_complete (frag); + return ret; + } + + ((int64_t *) ptr)[1] = ((int64_t *) ptr)[0]; + + if (&ompi_mpi_op_no_op.op == op) { + memcpy (ptr + offset, origin_addr, extent); + } else { + ompi_op_reduce (op, (void *) origin_addr, ptr + offset, 1, dt); + } + + do { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, address, + frag->handle, target_handle, ((int64_t *) ptr)[1], + ((int64_t *) ptr)[0], 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_atomic_complete, (void *) &complete, NULL); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + break; + } + + while (!complete) { + ompi_osc_rdma_progress (module); + } + + if (((int64_t *) ptr)[1] == ((int64_t *) ptr)[0]) { + break; + } + } while (1); + + if (result_addr) { + memcpy (result_addr, ptr + 8 + offset, extent); + } + + ompi_osc_rdma_frag_complete (frag); + + return ret; +} + +static void ompi_osc_rdma_acc_single_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context; + ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) data; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "atomic accumulate complete"); + + ompi_osc_rdma_sync_rdma_dec (sync); + if (req) { + ompi_osc_rdma_request_complete (req, status); + } +} + +static int 
ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, ompi_datatype_t *dt, ptrdiff_t extent, + ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, + ompi_op_t *op, ompi_osc_rdma_request_t *req) +{ + ompi_osc_rdma_module_t *module = sync->module; + int32_t atomic_flags = module->selected_btl->btl_atomic_flags; + int ret, btl_op, flags; + int64_t origin; + + if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + /* btl put atomics not supported or disabled. fall back on fetch-and-op */ + return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, op, req); + } + + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { + return OMPI_ERR_NOT_SUPPORTED; + } + + origin = (8 == extent) ? ((uint64_t *) origin_addr)[0] : ((uint32_t *) origin_addr)[0]; + + /* set the appropriate flags for this atomic */ + flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; + if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) { + flags |= MCA_BTL_ATOMIC_FLAG_FLOAT; + } + + btl_op = ompi_osc_rdma_op_mapping[op->op_type]; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate using 64-bit btl atomics. 
origin: 0x%" PRIx64, + *((int64_t *) origin_addr)); + + ompi_osc_rdma_sync_rdma_inc (sync); + + do { + ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->data_endpoint, target_address, + target_handle, btl_op, origin, flags, MCA_BTL_NO_ORDER, + ompi_osc_rdma_acc_single_atomic_complete, sync, req); + + ompi_osc_rdma_progress (module); + } while (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret || OPAL_ERR_TEMP_OUT_OF_RESOURCE == ret)); + + if (OPAL_SUCCESS != ret) { + ompi_osc_rdma_sync_rdma_dec (sync); + if (1 == ret) { + if (req) { + ompi_osc_rdma_request_complete (req, OMPI_SUCCESS); + } + ret = OMPI_SUCCESS; + } + } + + return ret; } -#endif /** * ompi_osc_rdma_cas_get_complete: @@ -561,45 +827,49 @@ static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, s OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "remote compare-and-swap get complete on sync %p. " "status %d", (void *) sync, status); - if (OMPI_SUCCESS == status) { - /* copy data to the user buffer (for gacc) */ - memcpy (request->result_addr, (void *) source, request->len); - - if (0 == memcmp ((void *) source, request->compare_addr, request->len)) { - /* the target and compare buffers match so write the source to the target */ - memcpy ((void *) source, request->origin_addr, request->len); - - ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address, - request->target_address, local_handle, - (mca_btl_base_registration_handle_t *) request->ctx, - request->len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_acc_put_complete, request, NULL); - if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code " - "%d", ret); - } - - /* TODO -- we can do better. probably should queue up the next step and handle it in progress */ - assert (OPAL_SUCCESS == ret); - } else { - /* this is a no-op. 
nothing more to do except release the accumulate lock */ - ompi_osc_rdma_frag_complete (frag); + if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + return; + } - if (!ompi_osc_rdma_peer_is_exclusive (peer)) { - (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer, - offsetof (ompi_osc_rdma_state_t, accumulate_lock)); - } + /* copy data to the user buffer (for gacc) */ + memcpy (request->result_addr, (void *) source, request->len); - /* the request is now complete and the outstanding rdma operation is complete */ - ompi_osc_rdma_request_complete (request, status); + if (0 == memcmp ((void *) source, request->compare_addr, request->len)) { + /* the target and compare buffers match. write the source to the target */ + memcpy ((void *) source, request->origin_addr, request->len); - ompi_osc_rdma_sync_rdma_dec (sync); - peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING; + ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address, + request->target_address, local_handle, + (mca_btl_base_registration_handle_t *) request->ctx, + request->len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_acc_put_complete, request, NULL); + if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code " + "%d", ret); } + + /* TODO -- we can do better. probably should queue up the next step and handle it in progress */ + assert (OPAL_SUCCESS == ret); + + return; } + + /* this is a no-op. 
nothing more to do except release the accumulate lock */ + ompi_osc_rdma_frag_complete (frag); + + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer, + offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } + + /* the request is now complete and the outstanding rdma operation is complete */ + ompi_osc_rdma_request_complete (request, status); + + ompi_osc_rdma_sync_rdma_dec (sync); + peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING; } -static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, void *result_buffer, +static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, void *result_addr, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle) { @@ -649,10 +919,10 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe /* set up the request */ request->frag = frag; - request->origin_addr = (void *) source_buffer; + request->origin_addr = (void *) source_addr; request->ctx = (void *) target_handle; - request->result_addr = result_buffer; - request->compare_addr = compare_buffer; + request->result_addr = result_addr; + request->compare_addr = compare_addr; request->result_dt = datatype; request->offset = (ptrdiff_t) offset; request->target_address = target_address; @@ -670,6 +940,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe } if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { + if (!ompi_osc_rdma_peer_is_exclusive (peer)) { + (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); + } ompi_osc_rdma_frag_complete (frag); return ret; } @@ -684,8 +957,8 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffe int 
ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr, - struct ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, - struct ompi_win_t *win) + ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, + ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -708,8 +981,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare return ret; } -#if 0 - if (MCA_OSC_RDMA_SAME_OP <= module->accumulate_ops) { + if (win->w_acc_ops <= OMPI_WIN_ACCUMULATE_OPS_SAME_OP) { /* the user has indicated that they will only use the same op (or same op and no op) * for operations on overlapping memory ranges. that indicates it is safe to go ahead * and use network atomic operations. */ @@ -718,8 +990,7 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare if (OMPI_SUCCESS == ret) { return OMPI_SUCCESS; } - } else -#endif + } if (ompi_osc_rdma_peer_local_base (peer)) { return ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt, @@ -733,15 +1004,16 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare static inline int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, + ompi_datatype_t *origin_datatype, void *result_addr, int result_count, + ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, int target_rank, MPI_Aint target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, + ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; + ptrdiff_t lb, extent; int 
ret; /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */ @@ -753,12 +1025,35 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo return OMPI_SUCCESS; } - ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_datatype->super.size * target_count, - &target_address, &target_handle); + (void) ompi_datatype_get_extent (origin_datatype, &lb, &extent); + + ret = osc_rdma_get_remote_segment (module, peer, target_disp, extent * target_count, &target_address, &target_handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } + if (module->acc_single_intrinsic && extent <= 8) { + if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) { + if (NULL == result_addr) { + ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + } else { + ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + } + + if (OMPI_SUCCESS == ret) { + return OMPI_SUCCESS; + } + } + + ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, extent, peer, target_address, + target_handle, op, request); + if (OMPI_SUCCESS == ret) { + return OMPI_SUCCESS; + } + } + if (ompi_osc_rdma_peer_local_base (peer)) { /* local/self optimization */ return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count, @@ -771,13 +1066,10 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo target_datatype, op, request); } -int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct 
ompi_win_t *win) +int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -801,14 +1093,10 @@ int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, } -int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win, - ompi_request_t **request) +int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win, ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -842,31 +1130,9 @@ int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, return OMPI_SUCCESS; } -int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, struct ompi_op_t *op, struct ompi_win_t *win) -{ - ompi_osc_rdma_module_t *module = GET_MODULE(win); - ompi_osc_rdma_peer_t *peer; - ompi_osc_rdma_sync_t *sync; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name, target_rank, - (unsigned long) target_disp, op->o_name, win->w_name); - - sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer); - if 
(OPAL_UNLIKELY(NULL == sync)) { - return OMPI_ERR_RMA_SYNC; - } - - return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer, target_rank, - target_disp, 1, dt, op, NULL); -} - - -int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, - struct ompi_win_t *win, struct ompi_request_t **request) +int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win, ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -898,11 +1164,9 @@ int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, return OMPI_SUCCESS; } -int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, int target_rank, - OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, - struct ompi_win_t *win) +int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -921,3 +1185,24 @@ int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, NULL, peer, target_rank, target_disp, target_count, target_datatype, op, NULL); } + + +int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi_datatype_t *dt, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, ompi_op_t *op, ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = 
GET_MODULE(win); + ompi_osc_rdma_peer_t *peer; + ompi_osc_rdma_sync_t *sync; + int ret; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name, + target_rank, (unsigned long) target_disp, op->o_name, win->w_name); + + sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer); + if (OPAL_UNLIKELY(NULL == sync)) { + return OMPI_ERR_RMA_SYNC; + } + + return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer, + target_rank, target_disp, 1, dt, op, NULL); +} diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.h b/ompi/mca/osc/rdma/osc_rdma_accumulate.h index 8f6f1bb4b73..7ab370ab2b8 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.h +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -14,44 +14,30 @@ #include "osc_rdma.h" -int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, - void *result_addr, struct ompi_datatype_t *dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - struct ompi_win_t *win); - -int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, struct ompi_win_t *win); -int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, - struct ompi_datatype_t *dt, int target, - OPAL_PTRDIFF_TYPE target_disp, - struct ompi_op_t *op, struct ompi_win_t *win); - -int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win); - -int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, struct ompi_win_t *win, - struct ompi_request_t **request); - -int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, - void *result_addr, int result_count, - struct ompi_datatype_t *result_datatype, - int target_rank, MPI_Aint target_disp, - int target_count, struct ompi_datatype_t *target_datatype, - struct ompi_op_t *op, struct ompi_win_t *win, - struct ompi_request_t **request); +int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr, + ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp, + ompi_win_t *win); + +int 
ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win); + +int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi_datatype_t *dt, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, ompi_op_t *op, ompi_win_t *win); + +int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win); + +int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, + ompi_win_t *win, ompi_request_t **request); + +int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_win_t *win, ompi_request_t **request); #endif /* OSC_RDMA_ACCUMULATE_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index c453891839e..f1339d34b49 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -772,9 +772,9 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, + ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct 
ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) + ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *target_handle; @@ -807,9 +807,9 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_osc_rdma_put_contig, false); } -static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) + ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; mca_btl_base_registration_handle_t *source_handle; @@ -841,9 +841,9 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori source_handle, source_count, source_datatype, request, module->selected_btl->btl_get_limit, ompi_osc_rdma_get_contig, true); } -int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, ompi_win_t *win) + ompi_datatype_t *target_datatype, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -862,10 +862,10 @@ int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_da target_count, target_datatype, NULL); } -int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_rput (const void *origin_addr, int 
origin_count, ompi_datatype_t *origin_datatype, int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_datatype, struct ompi_win_t *win, - struct ompi_request_t **request) + ompi_datatype_t *target_datatype, ompi_win_t *win, + ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -897,9 +897,9 @@ int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_d return OMPI_SUCCESS; } -int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, struct ompi_win_t *win) + ompi_datatype_t *source_datatype, ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; @@ -918,10 +918,10 @@ int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype source_disp, source_count, source_datatype, NULL); } -int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype, +int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count, - struct ompi_datatype_t *source_datatype, struct ompi_win_t *win, - struct ompi_request_t **request) + ompi_datatype_t *source_datatype, ompi_win_t *win, + ompi_request_t **request) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index c011eea3ed0..e9b048c56ee 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -96,23 +96,23 @@ static inline int osc_rdma_get_remote_segment (ompi_osc_rdma_module_t *module, o /* prototypes for implementations of MPI RMA 
window functions. these will be called from the * mpi interface (ompi/mpi/c) */ -int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + ompi_datatype_t *target_dt, ompi_win_t *win); -int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_get (void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win); + ompi_datatype_t *target_dt, ompi_win_t *win); -int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win, - struct ompi_request_t **request); + ompi_datatype_t *target_dt, ompi_win_t *win, + ompi_request_t **request); -int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, +int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, - struct ompi_datatype_t *target_dt, struct ompi_win_t *win, - struct ompi_request_t **request); + ompi_datatype_t *target_dt, ompi_win_t *win, + ompi_request_t **request); /** * @brief read data from a remote memory region (blocking) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 8b52933007d..c951a767610 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -173,6 +173,20 @@ static int 
ompi_osc_rdma_component_register (void) MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks); + mca_osc_rdma_component.acc_single_intrinsic = false; + (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic", + "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes " + "that will not use anything more than a single predefined datatype (default: false)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic); + + mca_osc_rdma_component.acc_use_amo = true; + (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", + "Enable the use of network atomic memory operations when using single " + "intrinsic optimizations. If not set network compare-and-swap will be " + "used instread (default: true)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_use_amo); + mca_osc_rdma_component.buffer_size = 32768; (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", "Size of temporary buffers (default: 32k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, @@ -585,7 +599,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } } - if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) { + if (MPI_WIN_FLAVOR_CREATE == module->flavor) { ret = ompi_osc_rdma_initialize_region (module, base, size); if (OMPI_SUCCESS != ret) { break; @@ -600,6 +614,20 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s opal_shmem_unlink (&module->seg_ds); } + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *) module->state->regions; + module->state->disp_unit = module->disp_unit; + module->state->region_count = 1; + region->base = state_region->base + my_base_offset; + 
region->len = size; + if (module->selected_btl->btl_register_mem) { + memcpy (region->btl_handle_data, state_region->btl_handle_data, module->selected_btl->btl_registration_handle_size); + } + } + + /* barrier to make sure all ranks have attached */ + shared_comm->c_coll.coll_barrier(shared_comm, shared_comm->c_coll.coll_barrier_module); + offset = data_base; for (int i = 0 ; i < local_size ; ++i) { ompi_osc_rdma_peer_extended_t *ex_peer; @@ -646,21 +674,18 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { if (temp[i].size) { - ex_peer->super.base = (uint64_t) (uintptr_t) module->segment_base + offset; + ex_peer->super.base = state_region->base + offset; + offset += temp[i].size; } else { ex_peer->super.base = 0; } + } - peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; - - offset += temp[i].size; - } else { - ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions; + ompi_osc_rdma_region_t *peer_region = (ompi_osc_rdma_region_t *) peer_state->regions; - ex_peer->super.base = peer_region->base; - if (module->selected_btl->btl_register_mem) { - ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data; - } + ex_peer->super.base = peer_region->base; + if (module->selected_btl->btl_register_mem) { + ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data; } } @@ -1020,6 +1045,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->same_disp_unit = check_config_value_bool ("same_disp_unit", info); module->same_size = check_config_value_bool ("same_size", info); module->no_locks = check_config_value_bool ("no_locks", info); + module->acc_single_intrinsic = check_config_value_bool ("ompi_single_accumulate", info); + module->acc_use_amo = mca_osc_rdma_component.acc_use_amo; module->all_sync.module = module; @@ -1047,14 +1074,6 @@ static int 
ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } } - /* options */ - /* FIX ME: should actually check this value... */ -#if 1 - module->accumulate_ordering = 1; -#else - ompi_osc_base_config_value_equal("accumulate_ordering", info, "none"); -#endif - ret = ompi_comm_dup(comm, &module->comm); if (OMPI_SUCCESS != ret) { ompi_osc_rdma_free (win); @@ -1132,17 +1151,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } } - ret = ompi_osc_rdma_share_data (module); - if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); - ompi_osc_rdma_free (win); - return ret; - } - - - /* for now the leader is always rank 0 in the communicator */ - module->leader = ompi_osc_rdma_module_peer (module, 0); - /* lock data */ if (module->no_locks) { win->w_flags |= OMPI_WIN_NO_LOCKS; @@ -1177,20 +1185,19 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* sync memory - make sure all initialization completed */ opal_atomic_mb(); - /* barrier to prevent arrival of lock requests before we're - fully created */ - ret = module->comm->c_coll.coll_barrier(module->comm, - module->comm->c_coll.coll_barrier_module); + ret = ompi_osc_rdma_share_data (module); if (OMPI_SUCCESS != ret) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); ompi_osc_rdma_free (win); - return ret; - } - + } else { + /* for now the leader is always rank 0 in the communicator */ + module->leader = ompi_osc_rdma_module_peer (module, 0); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d", - ompi_comm_get_cid(module->comm)); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d", + ompi_comm_get_cid(module->comm)); + } - return OMPI_SUCCESS; + return ret; } diff --git a/ompi/mca/osc/rdma/osc_rdma_passive_target.c b/ompi/mca/osc/rdma/osc_rdma_passive_target.c index 
f3e1a0ac85b..720fbbb64a8 100644 --- a/ompi/mca/osc/rdma/osc_rdma_passive_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_passive_target.c @@ -43,12 +43,6 @@ int ompi_osc_rdma_flush (int target, struct ompi_win_t *win) OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush: %d, %s", target, win->w_name); - if (ompi_comm_rank (module->comm) == target) { - /* nothing to flush. call one round of progress */ - ompi_osc_rdma_progress (module); - return OMPI_SUCCESS; - } - OPAL_THREAD_LOCK(&module->lock); lock = ompi_osc_rdma_module_sync_lookup (module, target, &peer); diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index 44d9a0e45a3..7d7967ef66a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -217,6 +217,10 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd memcpy (ex_peer->super.base_handle, base_region->btl_handle_data, registration_handle_size); } + + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + ex_peer->super.super.data_endpoint = ex_peer->super.super.state_endpoint; + } } return OMPI_SUCCESS; diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.h b/ompi/mca/osc/rdma/osc_rdma_peer.h index 34fb22a3885..6716733a43a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.h +++ b/ompi/mca/osc/rdma/osc_rdma_peer.h @@ -75,6 +75,9 @@ struct ompi_osc_rdma_peer_basic_t { /** remote peer's base pointer */ osc_rdma_base_t base; + /** local pointer to peer's base */ + osc_rdma_base_t local_base; + /** registration handle associated with the base */ mca_btl_base_registration_handle_t *base_handle; }; From 7b950be138a0d167d9622e93417f8a3de3000532 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 12 Oct 2016 10:16:03 -0600 Subject: [PATCH 4/6] osc/rdma: fix warnings Signed-off-by: Nathan Hjelm (cherry picked from commit e8ef503bee882d9544076818e12a3e8d46d7d310) --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 6 ++---- ompi/mca/osc/rdma/osc_rdma_comm.c | 2 +- 2 files changed, 3 insertions(+), 5 
deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 65617bab7b6..fc5609804c9 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -521,7 +521,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo flags = (4 == size) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%" - PRIx64 ", origin: 0x%" PRIx64, size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); + PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); ret = ompi_osc_rdma_frag_alloc (module, 24, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -666,11 +666,10 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - int32_t atomic_flags = module->selected_btl->btl_atomic_flags; ompi_osc_rdma_frag_t *frag = NULL; uint64_t address, offset; char *ptr = NULL; - int ret, btl_op; + int ret; if (extent > 8) { return OMPI_ERR_NOT_SUPPORTED; @@ -1193,7 +1192,6 @@ int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_peer_t *peer; ompi_osc_rdma_sync_t *sync; - int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name, target_rank, (unsigned long) target_disp, op->o_name, win->w_name); diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index f1339d34b49..7efde7c39be 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -732,7 +732,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } } else { 
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", - ptr, (void *) frag, aligned_len, (unsigned long) aligned_source_base); + ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); local_handle = frag->handle; } } From 8d44cdfed9087f705ac13224f4d959f566dd0c24 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Thu, 13 Oct 2016 16:10:07 +0900 Subject: [PATCH 5/6] osc/rdma: silence a warning declare a local variable volatile and silence CID 1372692 (cherry picked from commit 958e29f929b740dee9771794f7fca9c833d69848) --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index fc5609804c9..b12c94aa21d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -2,6 +2,8 @@ /* * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -689,7 +691,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi /* store the destination in the temporary buffer */ do { - bool complete = false; + volatile bool complete = false; ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, ptr, 8); if (OMPI_SUCCESS != ret) { From 0fae33ba3492f224aa42c8e2c0b26aa0482341fb Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 25 May 2016 10:42:52 -0600 Subject: [PATCH 6/6] btl/ugni: add support for additional atomic operations This commit adds support for Cray Aries atomic operations. This includes 32-bit and floating point support. 
Signed-off-by: Nathan Hjelm (cherry picked from commit c19426ac1b7aea25ebc368001e6d4c225351435d) --- opal/mca/btl/ugni/btl_ugni_atomic.c | 125 +++++++++++++++++++++---- opal/mca/btl/ugni/btl_ugni_component.c | 7 ++ 2 files changed, 112 insertions(+), 20 deletions(-) diff --git a/opal/mca/btl/ugni/btl_ugni_atomic.c b/opal/mca/btl/ugni/btl_ugni_atomic.c index 981bc759ee9..3c62670da89 100644 --- a/opal/mca/btl/ugni/btl_ugni_atomic.c +++ b/opal/mca/btl/ugni/btl_ugni_atomic.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -11,18 +11,66 @@ #include "btl_ugni_rdma.h" -static gni_fma_cmd_type_t famo_cmds[] = { - [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD, - [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND, - [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR, - [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR, +static gni_fma_cmd_type_t amo_cmds[][MCA_BTL_ATOMIC_LAST] = { + [OPAL_INT32] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_IADD_S, + [MCA_BTL_ATOMIC_LAND] = GNI_FMA_ATOMIC2_AND_S, + [MCA_BTL_ATOMIC_LOR] = GNI_FMA_ATOMIC2_OR_S, + [MCA_BTL_ATOMIC_LXOR] = GNI_FMA_ATOMIC2_XOR_S, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_SWAP_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_IMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_IMAX_S, + }, + [OPAL_INT64] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD, + [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND, + [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR, + [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_SWAP, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_IMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_IMAX, + }, + [OPAL_FLOAT] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FPADD_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FPMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FPMAX_S, + }, + [OPAL_DOUBLE] = { + [MCA_BTL_ATOMIC_ADD] = 
GNI_FMA_ATOMIC2_FPADD, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FPMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FPMAX, + }, }; -static gni_fma_cmd_type_t amo_cmds[] = { - [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_ADD, - [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_AND, - [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_OR, - [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_XOR, +static gni_fma_cmd_type_t famo_cmds[][MCA_BTL_ATOMIC_LAST] = { + [OPAL_INT32] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FIADD_S, + [MCA_BTL_ATOMIC_LAND] = GNI_FMA_ATOMIC2_FAND_S, + [MCA_BTL_ATOMIC_LOR] = GNI_FMA_ATOMIC2_FOR_S, + [MCA_BTL_ATOMIC_LXOR] = GNI_FMA_ATOMIC2_FXOR_S, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_FSWAP_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FIMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FIMAX_S, + }, + [OPAL_INT64] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC_FADD, + [MCA_BTL_ATOMIC_AND] = GNI_FMA_ATOMIC_FAND, + [MCA_BTL_ATOMIC_OR] = GNI_FMA_ATOMIC_FOR, + [MCA_BTL_ATOMIC_XOR] = GNI_FMA_ATOMIC_FXOR, + [MCA_BTL_ATOMIC_SWAP] = GNI_FMA_ATOMIC2_FSWAP, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FIMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FIMAX, + }, + [OPAL_FLOAT] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FFPADD_S, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FFPMIN_S, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FFPMAX_S, + }, + [OPAL_DOUBLE] = { + [MCA_BTL_ATOMIC_ADD] = GNI_FMA_ATOMIC2_FFPADD, + [MCA_BTL_ATOMIC_MIN] = GNI_FMA_ATOMIC2_FFPMIN, + [MCA_BTL_ATOMIC_MAX] = GNI_FMA_ATOMIC2_FFPMAX, + }, }; int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, @@ -32,7 +80,20 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end { gni_mem_handle_t dummy = {0, 0}; mca_btl_ugni_post_descriptor_t *post_desc; - int rc; + int gni_op, rc, type; + size_t size; + + size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + if (MCA_BTL_ATOMIC_FLAG_FLOAT & flags) { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
OPAL_FLOAT : OPAL_DOUBLE; + } else { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? OPAL_INT32 : OPAL_INT64; + } + + gni_op = amo_cmds[type][op]; + if (0 == gni_op) { + return OPAL_ERR_NOT_SUPPORTED; + } rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -45,8 +106,8 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end } init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, 0, dummy, remote_address, - remote_handle->gni_handle, 8, 0); - post_desc->desc.base.amo_cmd = amo_cmds[op]; + remote_handle->gni_handle, size, 0); + post_desc->desc.base.amo_cmd = gni_op; post_desc->desc.base.first_operand = operand; @@ -54,6 +115,10 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end rc = GNI_PostFma (endpoint->rdma_ep_handle, &post_desc->desc.base); OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); if (GNI_RC_SUCCESS != rc) { + mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + if (GNI_RC_ILLEGAL_OP == rc) { + return OPAL_ERR_NOT_SUPPORTED; + } return OPAL_ERR_OUT_OF_RESOURCE; } @@ -67,7 +132,20 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; - int rc; + int gni_op, rc, type; + size_t size; + + size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + if (MCA_BTL_ATOMIC_FLAG_FLOAT & flags) { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? OPAL_FLOAT : OPAL_DOUBLE; + } else { + type = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
OPAL_INT32 : OPAL_INT64; + } + + gni_op = famo_cmds[type][op]; + if (0 == gni_op) { + return OPAL_ERR_NOT_SUPPORTED; + } rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -81,8 +159,8 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, - remote_address, remote_handle->gni_handle, 8, 0); - post_desc->desc.base.amo_cmd = famo_cmds[op]; + remote_address, remote_handle->gni_handle, size, 0); + post_desc->desc.base.amo_cmd = gni_op; post_desc->desc.base.first_operand = operand; @@ -91,6 +169,9 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en OPAL_THREAD_UNLOCK(&endpoint->btl->device->dev_lock); if (GNI_RC_SUCCESS != rc) { mca_btl_ugni_return_post_descriptor (endpoint->btl, post_desc); + if (GNI_RC_ILLEGAL_OP == rc) { + return OPAL_ERR_NOT_SUPPORTED; + } return OPAL_ERR_OUT_OF_RESOURCE; } @@ -103,7 +184,11 @@ int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_ int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; - int rc; + int gni_op, rc; + size_t size; + + gni_op = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? GNI_FMA_ATOMIC2_CSWAP_S : GNI_FMA_ATOMIC_CSWAP; + size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 
4 : 8; rc = mca_btl_ugni_check_endpoint_state_rdma (endpoint); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -117,8 +202,8 @@ int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_ init_gni_post_desc (&post_desc->desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle, - remote_address, remote_handle->gni_handle, 8, 0); - post_desc->desc.base.amo_cmd = GNI_FMA_ATOMIC_CSWAP; + remote_address, remote_handle->gni_handle, size, 0); + post_desc->desc.base.amo_cmd = gni_op; post_desc->desc.base.first_operand = compare; post_desc->desc.base.second_operand = value; diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c index 7e8198df20f..6d283128353 100644 --- a/opal/mca/btl/ugni/btl_ugni_component.c +++ b/opal/mca/btl/ugni/btl_ugni_component.c @@ -288,6 +288,13 @@ btl_ugni_component_register(void) MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR | MCA_BTL_ATOMIC_SUPPORTS_CSWAP; + if (GNI_DEVICE_ARIES == device_type) { + /* aries supports additional atomic operations */ + mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX | + MCA_BTL_ATOMIC_SUPPORTS_LAND | MCA_BTL_ATOMIC_SUPPORTS_LOR | MCA_BTL_ATOMIC_SUPPORTS_LXOR | + MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_FLOAT; + } + mca_btl_ugni_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */