From 35ff1dadd67fff8bbac21a8e7cefd28bc9254dbd Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 3 Mar 2023 18:42:19 -0500 Subject: [PATCH 01/74] Initial draft of CUDA device support for ops Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/Makefile.am | 71 ++ ompi/mca/op/cuda/configure.m4 | 36 + ompi/mca/op/cuda/op_cuda.h | 50 + ompi/mca/op/cuda/op_cuda_component.c | 329 +++++ ompi/mca/op/cuda/op_cuda_functions.cu | 1668 +++++++++++++++++++++++++ ompi/op/op.c | 19 + ompi/op/op.h | 46 +- 7 files changed, 2216 insertions(+), 3 deletions(-) create mode 100644 ompi/mca/op/cuda/Makefile.am create mode 100644 ompi/mca/op/cuda/configure.m4 create mode 100644 ompi/mca/op/cuda/op_cuda.h create mode 100644 ompi/mca/op/cuda/op_cuda_component.c create mode 100644 ompi/mca/op/cuda/op_cuda_functions.cu diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am new file mode 100644 index 00000000000..0e844271d67 --- /dev/null +++ b/ompi/mca/op/cuda/Makefile.am @@ -0,0 +1,71 @@ +# +# Copyright (c) 2019-2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2020-2021 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This component provides support for offloading reduce ops to CUDA devices. +# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +AM_CPPFLAGS = $(common_cuda_CPPFLAGS) + +sources = op_cuda_component.c op_cuda.h +sources_extended = op_cuda_functions.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_cuda_DSO +component_install = mca_op_cuda.la +else +component_install = +component_noinst = libmca_cuda_avx.la +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_cuda_la_SOURCES = $(sources) +mca_op_cuda_la_LIBADD = $(specialized_op_libs) +mca_op_cuda_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. 
+ +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_cuda_la_SOURCES = $(sources) +libmca_op_cuda_la_LIBADD = $(specialized_op_libs) +libmca_op_cuda_la_LDFLAGS = -module -avoid-version + diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4 new file mode 100644 index 00000000000..7ea9c31f0dc --- /dev/null +++ b/ompi/mca/op/cuda/configure.m4 @@ -0,0 +1,36 @@ +# -*- shell-script -*- +# +# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2013 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. +# All Rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If CUDA support was requested, then build the CUDA support library. +# This code checks makes sure the check was done earlier by the +# opal_check_cuda.m4 code. It also copies the flags and libs under +# opal_cuda_CPPFLAGS, opal_cuda_LDFLAGS, and opal_cuda_LIBS + +AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[ + + AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile]) + + OPAL_CHECK_CUDA([op_cuda]) + + AS_IF([test "x$CUDA_SUPPORT" = "x1"], + [$1], + [$2]) + + AC_SUBST([accelerator_cuda_CPPFLAGS]) + AC_SUBST([accelerator_cuda_LDFLAGS]) + AC_SUBST([accelerator_cuda_LIBS]) + +])dnl diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h new file mode 100644 index 00000000000..b21223813c0 --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_AVX_EXPORT_H +#define MCA_OP_AVX_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. + */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* a stream on which to schedule kernel calls */ + opal_accelerator_stream_t *stream; +} ompi_op_cuda_component_t; + +/** + * Globally exported variable. Note that it is a *cuda* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_cuda_component_t + mca_op_cuda_component; + +END_C_DECLS + +#endif /* MCA_OP_AVX_EXPORT_H */ diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c new file mode 100644 index 00000000000..a2f01a373e2 --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2019-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2021 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "avx" component source code. 
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/avx/op_avx.h" + +static int avx_component_open(void); +static int avx_component_close(void); +static int avx_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + avx_component_op_query(struct ompi_op_t *op, int *priority); +static int avx_component_register(void); + +static mca_base_var_enum_value_flag_t avx_support_flags[] = { + { .flag = 0x001, .string = "SSE" }, + { .flag = 0x002, .string = "SSE2" }, + { .flag = 0x004, .string = "SSE3" }, + { .flag = 0x008, .string = "SSE4.1" }, + { .flag = 0x010, .string = "AVX" }, + { .flag = 0x020, .string = "AVX2" }, + { .flag = 0x100, .string = "AVX512F" }, + { .flag = 0x200, .string = "AVX512BW" }, + { .flag = 0, .string = NULL }, +}; + +/** + * A slightly modified code from + * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + */ +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300) + +#include + +static uint32_t has_intel_AVX_features(void) +{ + uint32_t flags = 0; + + flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0; + flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0; + flags |= _may_i_use_cpu_feature(_FEATURE_AVX2) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0; + flags |= _may_i_use_cpu_feature(_FEATURE_AVX) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0; + flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0; + flags |= _may_i_use_cpu_feature(_FEATURE_SSE3) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0; + flags |= _may_i_use_cpu_feature(_FEATURE_SSE2) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0; + flags |= _may_i_use_cpu_feature(_FEATURE_SSE) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0; + return flags; +} +#else /* non-Intel compiler */ +#include + +#if defined(_MSC_VER) +#include +#endif + +static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + uint32_t ebx = 0, edx = 0; +#if defined( __i386__ ) && defined ( __PIC__ ) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), +#else + __asm__ ( "cpuid" : "+b" (ebx), +#endif /* defined( __i386__ ) && defined ( __PIC__ ) */ + "+a" (eax), "+c" (ecx), "=d" (edx) ); + abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; +#endif +} + +static uint32_t has_intel_AVX_features(void) +{ + /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */ + const uint32_t avx512f_mask = (1U << 16); // AVX512F (EAX = 7, ECX = 0) : EBX + const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW (EAX = 7, ECX = 0) : EBX + const uint32_t avx2_mask = (1U << 5); // AVX2 (EAX = 7, ECX = 0) : EBX + const uint32_t avx_mask = (1U << 28); // AVX (EAX = 1, ECX = 0) : ECX + const uint32_t sse4_1_mask = (1U << 19); // SSE4.1 (EAX = 1, ECX = 0) : ECX + const uint32_t sse3_mask = (1U << 0); // SSE3 (EAX = 1, ECX = 0) : ECX + const uint32_t sse2_mask = (1U << 26); // SSE2 (EAX = 1, ECX = 0) : EDX + const uint32_t sse_mask = (1U << 15); // SSE (EAX = 1, ECX = 0) : EDX + uint32_t flags = 0, abcd[4]; + + run_cpuid( 1, 0, abcd ); + flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0; + flags |= (abcd[2] & sse4_1_mask) ? 
OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0; + flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0; + flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0; + flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0; +#if defined(__APPLE__) + uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */ + // OS supports extended processor state management ? + if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask ) + return 0; +#endif /* defined(__APPLE__) */ + + run_cpuid( 7, 0, abcd ); + flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0; + flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0; + flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0; + return flags; +} +#endif /* non-Intel compiler */ + +ompi_op_avx_component_t mca_op_avx_component = { + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "avx", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = avx_component_open, + .mca_close_component = avx_component_close, + .mca_register_component_params = avx_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = avx_component_init_query, + .opc_op_query = avx_component_op_query, + }, +}; + +/* + * Component open + */ +static int avx_component_open(void) +{ + /* We checked the flags during register, so if they are set to + * zero either the architecture is not suitable or the user disabled + * AVX support. + * + * A first level check to see what level of AVX is available on the + * hardware. + * + * Note that if this function returns non-OMPI_SUCCESS, then this + * component won't even be shown in ompi_info output (which is + * probably not what you want). + */ + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int avx_component_close(void) +{ + /* If avx was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. 
+ */ +static int +avx_component_register(void) +{ + mca_op_avx_component.supported = + mca_op_avx_component.flags = has_intel_AVX_features(); + + // MCA var enum flag for conveniently seeing SSE/MMX/AVX support + // values + mca_base_var_enum_flag_t *new_enum_flag = NULL; + (void) mca_base_var_enum_create_flag("op_avx_support_flags", + avx_support_flags, &new_enum_flag); + + (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version, + "capabilities", + "Level of SSE/MMX/AVX support available in the current environment", + MCA_BASE_VAR_TYPE_INT, + &(new_enum_flag->super), 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_CONSTANT, + &mca_op_avx_component.supported); + + (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version, + "support", + "Level of SSE/MMX/AVX support to be used, capped by the local architecture capabilities", + MCA_BASE_VAR_TYPE_INT, + &(new_enum_flag->super), 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_op_avx_component.flags); + OBJ_RELEASE(new_enum_flag); + + mca_op_avx_component.flags &= mca_op_avx_component.supported; + + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. + */ +static int +avx_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if( 0 == mca_op_avx_component.flags ) + return OMPI_ERR_NOT_SUPPORTED; + return OMPI_SUCCESS; +} + +#if OMPI_MCA_OP_HAVE_AVX512 + extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +#endif +#if OMPI_MCA_OP_HAVE_AVX2 + extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +#endif +#if OMPI_MCA_OP_HAVE_AVX + extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +#endif +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t* +avx_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = NULL; + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. 
*/ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + module = OBJ_NEW(ompi_op_base_module_t); + for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { +#if OMPI_MCA_OP_HAVE_AVX512 + if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) { + module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i]; + module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i]; + } +#endif +#if OMPI_MCA_OP_HAVE_AVX2 + if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) { + if( NULL == module->opm_fns[i] ) { + module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i]; + } + if( NULL == module->opm_3buff_fns[i] ) { + module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i]; + } + } +#endif +#if OMPI_MCA_OP_HAVE_AVX + if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) { + if( NULL == module->opm_fns[i] ) { + module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i]; + } + if( NULL == module->opm_3buff_fns[i] ) { + module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i]; + } + } +#endif + if( NULL != module->opm_fns[i] ) { + OBJ_RETAIN(module); + } + if( NULL != module->opm_3buff_fns[i] ) { + OBJ_RETAIN(module); + } + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + case OMPI_OP_BASE_FORTRAN_LOR: + case OMPI_OP_BASE_FORTRAN_LXOR: + case OMPI_OP_BASE_FORTRAN_MAXLOC: + case OMPI_OP_BASE_FORTRAN_MINLOC: + case OMPI_OP_BASE_FORTRAN_REPLACE: + default: + break; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *avx* component pointer + (vs. a *base* component pointer -- where an *avx* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. */ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/cuda/op_cuda_functions.cu b/ompi/mca/op/cuda/op_cuda_functions.cu new file mode 100644 index 00000000000..ad1741ffaa6 --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda_functions.cu @@ -0,0 +1,1668 @@ +/* + * Copyright (c) 2019-2021 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#include "opal/util/output.h"
+
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/mca/op/cuda/op_cuda.h"
+
+#define THREADS_PER_BLOCK 512
+
+#define OP_FUNC(name, type_name, type, op) \
+    __global__ void \
+    ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \
+        int i = blockIdx.x*blockDim.x + threadIdx.x; \
+        if (i < n) inout[i] = inout[i] op in[i]; \
+    } \
+    void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \
+                                                 struct ompi_datatype_t **dtype, \
+                                                 struct ompi_op_cuda_module_1_0_0_t *module) { \
+        int threads = THREADS_PER_BLOCK; \
+        int blocks = (*count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; \
+        type *inout_ = (type*)inout; \
+        const type *in_ = (const type*)in; \
+        int n = *count; \
+        ompi_op_cuda_2buff_##name##_##type_name##_kernel<<<blocks, threads>>>(in_, inout_, n); \
+    }
+
+
+#define FUNC_FUNC(name, type_name, type) \
+    __global__ void \
+    ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \
+        int i = blockIdx.x*blockDim.x + threadIdx.x; \
+        if (i < n) inout[i] = current_func(inout[i], in[i]); \
+    } \
+    static void \
+    ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \
+                                            struct ompi_datatype_t **dtype, \
+                                            struct ompi_op_cuda_module_1_0_0_t *module) { \
+        int threads = THREADS_PER_BLOCK; \
+        int blocks = (*count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; \
+        type *inout_ = (type*)inout; \
+        const type *in_ = (const type*)in; \
+        int n = *count; \
+        ompi_op_cuda_2buff_##name##_##type_name##_kernel<<<blocks, threads>>>(in_, inout_, n); \
+    }
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for minloc and maxloc
+ */
+#define LOC_STRUCT(type_name, type1, type2) \
+    typedef struct { \
+        type1 v; \
+        type2 k; \
+    } ompi_op_predefined_##type_name##_t;
+
+#define LOC_FUNC(name, type_name, op) \
+    __global__ void \
+    ompi_op_cuda_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in, \
+                                                     ompi_op_predefined_##type_name##_t *inout, \
+                                                     int n) \
+    { \
+        int i = blockIdx.x*blockDim.x + threadIdx.x; \
+        if (i < n) { \
+            const ompi_op_predefined_##type_name##_t *a = &in[i]; \
+            ompi_op_predefined_##type_name##_t *b = &inout[i]; \
+            if (a->v op b->v) { \
+                b->v = a->v; \
+                b->k = a->k; \
+            } else if (a->v == b->v) { \
+                b->k = (b->k < a->k ? b->k : a->k); \
+            } \
+        } \
+    } \
+    static void \
+    ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *out, int *count, \
+                                            struct ompi_datatype_t **dtype, \
+                                            struct ompi_op_cuda_module_1_0_0_t *module) \
+    { \
+        int threads = THREADS_PER_BLOCK; \
+        int blocks = (*count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; \
+        int n = *count; \
+        const ompi_op_predefined_##type_name##_t *a = (const ompi_op_predefined_##type_name##_t*) in; \
+        ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \
+        ompi_op_cuda_2buff_##name##_##type_name##_kernel<<<blocks, threads>>>(a, b, n); \
+    }
+
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+
+#undef current_func
+#define current_func(a, b) ((a) > (b) ?
(a) : (b)) +/* C integer */ +FUNC_FUNC(max, int8_t, int8_t) +FUNC_FUNC(max, uint8_t, uint8_t) +FUNC_FUNC(max, int16_t, int16_t) +FUNC_FUNC(max, uint16_t, uint16_t) +FUNC_FUNC(max, int32_t, int32_t) +FUNC_FUNC(max, uint32_t, uint32_t) +FUNC_FUNC(max, int64_t, int64_t) +FUNC_FUNC(max, uint64_t, uint64_t) +FUNC_FUNC(max, long, long) +FUNC_FUNC(max, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC(max, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC(max, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC(max, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC(max, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC(max, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC(max, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC(max, short_float, opal_short_float_t) +#endif +FUNC_FUNC(max, float, float) +FUNC_FUNC(max, double, double) +FUNC_FUNC(max, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC(max, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC(max, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC(max, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC(max, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC(min, int8_t, int8_t) +FUNC_FUNC(min, uint8_t, uint8_t) +FUNC_FUNC(min, int16_t, int16_t) +FUNC_FUNC(min, uint16_t, uint16_t) +FUNC_FUNC(min, int32_t, int32_t) +FUNC_FUNC(min, uint32_t, uint32_t) +FUNC_FUNC(min, int64_t, int64_t) +FUNC_FUNC(min, uint64_t, uint64_t) +FUNC_FUNC(min, long, long) +FUNC_FUNC(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC(min, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC(min, short_float, opal_short_float_t) +#endif +FUNC_FUNC(min, float, float) +FUNC_FUNC(min, double, double) +FUNC_FUNC(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC(sum, int8_t, int8_t, +=) +OP_FUNC(sum, uint8_t, uint8_t, +=) +OP_FUNC(sum, int16_t, int16_t, +=) +OP_FUNC(sum, uint16_t, uint16_t, +=) +OP_FUNC(sum, int32_t, int32_t, +=) +OP_FUNC(sum, uint32_t, uint32_t, +=) +OP_FUNC(sum, int64_t, int64_t, +=) +OP_FUNC(sum, uint64_t, uint64_t, +=) +OP_FUNC(sum, long, long, +=) +OP_FUNC(sum, unsigned_long, unsigned long, +=) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC(sum, fortran_integer, ompi_fortran_integer_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC(sum, fortran_integer1, ompi_fortran_integer1_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC(sum, fortran_integer2, ompi_fortran_integer2_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC(sum, fortran_integer4, ompi_fortran_integer4_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC(sum, fortran_integer8, ompi_fortran_integer8_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC(sum, fortran_integer16, ompi_fortran_integer16_t, +=) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC(sum, short_float, short float, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC(sum, short_float, opal_short_float_t, +=) +#endif +OP_FUNC(sum, float, float, +=) +OP_FUNC(sum, double, double, +=) +OP_FUNC(sum, long_double, long double, +=) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC(sum, fortran_real, ompi_fortran_real_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC(sum, fortran_double_precision, 
ompi_fortran_double_precision_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC(sum, fortran_real2, ompi_fortran_real2_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC(sum, fortran_real4, ompi_fortran_real4_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC(sum, fortran_real8, ompi_fortran_real8_t, +=) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC(sum, fortran_real16, ompi_fortran_real16_t, +=) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC(sum, c_float_complex, float _Complex, +=) +OP_FUNC(sum, c_double_complex, double _Complex, +=) +OP_FUNC(sum, c_long_double_complex, long double _Complex, +=) +#endif // 0 + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC(prod, int8_t, int8_t, *=) +OP_FUNC(prod, uint8_t, uint8_t, *=) +OP_FUNC(prod, int16_t, int16_t, *=) +OP_FUNC(prod, uint16_t, uint16_t, *=) +OP_FUNC(prod, int32_t, int32_t, *=) +OP_FUNC(prod, uint32_t, uint32_t, *=) +OP_FUNC(prod, int64_t, int64_t, *=) +OP_FUNC(prod, uint64_t, uint64_t, *=) +OP_FUNC(prod, long, long, *=) +OP_FUNC(prod, unsigned_long, unsigned long, *=) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC(prod, fortran_integer, ompi_fortran_integer_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC(prod, fortran_integer1, ompi_fortran_integer1_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC(prod, fortran_integer2, ompi_fortran_integer2_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC(prod, fortran_integer4, ompi_fortran_integer4_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC(prod, fortran_integer8, ompi_fortran_integer8_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC(prod, fortran_integer16, ompi_fortran_integer16_t, *=) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC(prod, short_float, short float, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC(prod, short_float, opal_short_float_t, *=) +#endif +OP_FUNC(prod, float, float, *=) +OP_FUNC(prod, double, double, *=) +OP_FUNC(prod, long_double, long double, *=) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC(prod, fortran_real, ompi_fortran_real_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC(prod, fortran_double_precision, ompi_fortran_double_precision_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC(prod, fortran_real2, ompi_fortran_real2_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC(prod, fortran_real4, ompi_fortran_real4_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC(prod, fortran_real8, ompi_fortran_real8_t, *=) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC(prod, fortran_real16, ompi_fortran_real16_t, *=) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(prod, c_short_float_complex, short float _Complex, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC(prod, c_float_complex, float _Complex, *=) +OP_FUNC(prod, c_double_complex, double _Complex, *=) +OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) +#endif // 0 + +/************************************************************************* + * Logical AND + 
*************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC(land, int8_t, int8_t) +FUNC_FUNC(land, uint8_t, uint8_t) +FUNC_FUNC(land, int16_t, int16_t) +FUNC_FUNC(land, uint16_t, uint16_t) +FUNC_FUNC(land, int32_t, int32_t) +FUNC_FUNC(land, uint32_t, uint32_t) +FUNC_FUNC(land, int64_t, int64_t) +FUNC_FUNC(land, uint64_t, uint64_t) +FUNC_FUNC(land, long, long) +FUNC_FUNC(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC(lor, int8_t, int8_t) +FUNC_FUNC(lor, uint8_t, uint8_t) +FUNC_FUNC(lor, int16_t, int16_t) +FUNC_FUNC(lor, uint16_t, uint16_t) +FUNC_FUNC(lor, int32_t, int32_t) +FUNC_FUNC(lor, uint32_t, uint32_t) +FUNC_FUNC(lor, int64_t, int64_t) +FUNC_FUNC(lor, uint64_t, uint64_t) +FUNC_FUNC(lor, long, long) +FUNC_FUNC(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 1: 0)) +/* C integer */ +FUNC_FUNC(lxor, int8_t, int8_t) +FUNC_FUNC(lxor, uint8_t, uint8_t) +FUNC_FUNC(lxor, int16_t, int16_t) +FUNC_FUNC(lxor, uint16_t, uint16_t) +FUNC_FUNC(lxor, int32_t, int32_t) +FUNC_FUNC(lxor, uint32_t, uint32_t) +FUNC_FUNC(lxor, int64_t, int64_t) +FUNC_FUNC(lxor, uint64_t, uint64_t) +FUNC_FUNC(lxor, long, long) +FUNC_FUNC(lxor, unsigned_long, unsigned long) + + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC(band, int8_t, int8_t) +FUNC_FUNC(band, uint8_t, uint8_t) +FUNC_FUNC(band, int16_t, int16_t) +FUNC_FUNC(band, uint16_t, uint16_t) +FUNC_FUNC(band, int32_t, int32_t) +FUNC_FUNC(band, uint32_t, uint32_t) +FUNC_FUNC(band, int64_t, int64_t) +FUNC_FUNC(band, uint64_t, uint64_t) +FUNC_FUNC(band, long, long) +FUNC_FUNC(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC(band, byte, char) + 
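For reference, each 2-buffer FUNC_FUNC instantiation above produces a device kernel plus a thin host-side launcher. A minimal sketch of roughly what FUNC_FUNC(band, int32_t, int32_t) expands to, with the macro's launch geometry written out and current_func substituted for the bitwise-AND case:

__global__ void
ompi_op_cuda_2buff_band_int32_t_kernel(const int32_t *in, int32_t *inout, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) inout[i] = inout[i] & in[i];   /* current_func(a, b) = (a) & (b) */
}

static void
ompi_op_cuda_2buff_band_int32_t(const void *in, void *inout, int *count,
                                struct ompi_datatype_t **dtype,
                                struct ompi_op_cuda_module_1_0_0_t *module) {
    int n = *count;
    int threads = THREADS_PER_BLOCK;
    /* round up so a final partial block still covers the tail elements */
    int blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    ompi_op_cuda_2buff_band_int32_t_kernel<<<blocks, threads>>>((const int32_t *)in,
                                                                (int32_t *)inout, n);
}

Both buffers are expected to be device-accessible, and the launch is asynchronous on the host side, so completion has to be ensured (for example via the stream cached in the component) before the reduced buffer is consumed.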
+/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC(bor, int8_t, int8_t) +FUNC_FUNC(bor, uint8_t, uint8_t) +FUNC_FUNC(bor, int16_t, int16_t) +FUNC_FUNC(bor, uint16_t, uint16_t) +FUNC_FUNC(bor, int32_t, int32_t) +FUNC_FUNC(bor, uint32_t, uint32_t) +FUNC_FUNC(bor, int64_t, int64_t) +FUNC_FUNC(bor, uint64_t, uint64_t) +FUNC_FUNC(bor, long, long) +FUNC_FUNC(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C integer */ +FUNC_FUNC(bxor, int8_t, int8_t) +FUNC_FUNC(bxor, uint8_t, uint8_t) +FUNC_FUNC(bxor, int16_t, int16_t) +FUNC_FUNC(bxor, uint16_t, uint16_t) +FUNC_FUNC(bxor, int32_t, int32_t) +FUNC_FUNC(bxor, uint32_t, uint32_t) +FUNC_FUNC(bxor, int64_t, int64_t) +FUNC_FUNC(bxor, uint64_t, uint64_t) +FUNC_FUNC(bxor, long, long) +FUNC_FUNC(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC(bxor, byte, char) + +/************************************************************************* + * Min and max location "pair" datatypes + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +LOC_STRUCT(2real, ompi_fortran_real_t, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_STRUCT(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_STRUCT(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) +#endif +LOC_STRUCT(float_int, float, int) +LOC_STRUCT(double_int, double, int) +LOC_STRUCT(long_int, long, int) +LOC_STRUCT(2int, int, int) +LOC_STRUCT(short_int, short, int) +LOC_STRUCT(long_double_int, long double, int) +LOC_STRUCT(unsigned_long, unsigned long, int) + +/************************************************************************* + * Max location + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL 
+LOC_FUNC(maxloc, 2real, >)
+#endif
+#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION
+LOC_FUNC(maxloc, 2double_precision, >)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER
+LOC_FUNC(maxloc, 2integer, >)
+#endif
+LOC_FUNC(maxloc, float_int, >)
+LOC_FUNC(maxloc, double_int, >)
+LOC_FUNC(maxloc, long_int, >)
+LOC_FUNC(maxloc, 2int, >)
+LOC_FUNC(maxloc, short_int, >)
+LOC_FUNC(maxloc, long_double_int, >)
+
+/*************************************************************************
+ * Min location
+ *************************************************************************/
+
+#if OMPI_HAVE_FORTRAN_REAL
+LOC_FUNC(minloc, 2real, <)
+#endif
+#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION
+LOC_FUNC(minloc, 2double_precision, <)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER
+LOC_FUNC(minloc, 2integer, <)
+#endif
+LOC_FUNC(minloc, float_int, <)
+LOC_FUNC(minloc, double_int, <)
+LOC_FUNC(minloc, long_int, <)
+LOC_FUNC(minloc, 2int, <)
+LOC_FUNC(minloc, short_int, <)
+LOC_FUNC(minloc, long_double_int, <)
+
+
+
+/*
+ * This is a three buffer (2 input and 1 output) version of the reduction
+ * routines, needed for some optimizations.
+ */
+#define OP_FUNC_3BUF(name, type_name, type, op) \
+    __global__ void \
+    ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, type *out, int n) { \
+        int i = blockIdx.x*blockDim.x + threadIdx.x; \
+        if (i < n) out[i] = in1[i] op in2[i]; \
+    } \
+    void ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \
+                                                 struct ompi_datatype_t **dtype, \
+                                                 struct ompi_op_cuda_module_1_0_0_t *module) { \
+        int threads = THREADS_PER_BLOCK; \
+        int blocks = (*count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; \
+        type *out_ = (type*)out; \
+        const type *in1_ = (const type*)in1; \
+        const type *in2_ = (const type*)in2; \
+        int n = *count; \
+        ompi_op_cuda_3buff_##name##_##type_name##_kernel<<<blocks, threads>>>(in1_, in2_, out_, n); \
+    }
+
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for (out = op(in1, in2))
+ */
+#define FUNC_FUNC_3BUF(name, type_name, type) \
+    __global__ void \
+    ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, type *out, int n) { \
+        int i = blockIdx.x*blockDim.x + threadIdx.x; \
+        if (i < n) out[i] = current_func(in1[i], in2[i]); \
+    } \
+    static void \
+    ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \
+                                            struct ompi_datatype_t **dtype, \
+                                            struct ompi_op_cuda_module_1_0_0_t *module) { \
+        int threads = THREADS_PER_BLOCK; \
+        int blocks = (*count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; \
+        type *out_ = (type*)out; \
+        const type *in1_ = (const type*)in1; \
+        const type *in2_ = (const type*)in2; \
+        int n = *count; \
+        ompi_op_cuda_3buff_##name##_##type_name##_kernel<<<blocks, threads>>>(in1_, in2_, out_, n); \
+    }
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for minloc and maxloc
+ */
+/*
+#define LOC_STRUCT(type_name, type1, type2) \
+    typedef struct { \
+        type1 v; \
+        type2 k; \
+    } ompi_op_predefined_##type_name##_t;
+*/
+
+#define LOC_FUNC_3BUF(name, type_name, op) \
+    __global__ void \
+    ompi_op_cuda_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in1, \
+                                                     const ompi_op_predefined_##type_name##_t *in2, \
+                                                     ompi_op_predefined_##type_name##_t *out, \
+                                                     int n) \
+    { \
+        int i = blockIdx.x*blockDim.x + threadIdx.x; \
+        if (i < n) { \
+            const ompi_op_predefined_##type_name##_t *a1 = &in1[i]; \
+            const ompi_op_predefined_##type_name##_t *a2 = &in2[i]; \
+            ompi_op_predefined_##type_name##_t *b = &out[i]; \
+            if (a1->v op a2->v) { \
+                b->v = a1->v; \
+                b->k = a1->k; \
+            } else if (a1->v == a2->v) { \
+                b->v = a1->v; \
+                b->k = (a2->k < a1->k ? a2->k : a1->k); \
+            } else { \
+                b->v = a2->v; \
+                b->k = a2->k; \
+            } \
+        } \
+    } \
+    static void \
+    ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \
+                                            struct ompi_datatype_t **dtype, \
+                                            struct ompi_op_cuda_module_1_0_0_t *module) \
+    { \
+        int threads = THREADS_PER_BLOCK; \
+        int blocks = (*count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; \
+        int n = *count; \
+        const ompi_op_predefined_##type_name##_t *a1 = (const ompi_op_predefined_##type_name##_t*) in1; \
+        const ompi_op_predefined_##type_name##_t *a2 = (const ompi_op_predefined_##type_name##_t*) in2; \
+        ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \
+        ompi_op_cuda_3buff_##name##_##type_name##_kernel<<<blocks, threads>>>(a1, a2, b, n); \
+    }
+
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+
+#undef current_func
+#define current_func(a, b) ((a) > (b) ?
(a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF(max, int8_t, int8_t) +FUNC_FUNC_3BUF(max, uint8_t, uint8_t) +FUNC_FUNC_3BUF(max, int16_t, int16_t) +FUNC_FUNC_3BUF(max, uint16_t, uint16_t) +FUNC_FUNC_3BUF(max, int32_t, int32_t) +FUNC_FUNC_3BUF(max, uint32_t, uint32_t) +FUNC_FUNC_3BUF(max, int64_t, int64_t) +FUNC_FUNC_3BUF(max, uint64_t, uint64_t) +FUNC_FUNC_3BUF(max, long, long) +FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(max, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF(max, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF(max, float, float) +FUNC_FUNC_3BUF(max, double, double) +FUNC_FUNC_3BUF(max, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF(min, int8_t, int8_t) +FUNC_FUNC_3BUF(min, uint8_t, uint8_t) +FUNC_FUNC_3BUF(min, int16_t, int16_t) +FUNC_FUNC_3BUF(min, uint16_t, uint16_t) +FUNC_FUNC_3BUF(min, int32_t, int32_t) +FUNC_FUNC_3BUF(min, uint32_t, uint32_t) +FUNC_FUNC_3BUF(min, int64_t, int64_t) +FUNC_FUNC_3BUF(min, uint64_t, uint64_t) +FUNC_FUNC_3BUF(min, long, long) +FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF(min, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF(min, float, float) +FUNC_FUNC_3BUF(min, double, double) +FUNC_FUNC_3BUF(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF(sum, int8_t, int8_t, +) +OP_FUNC_3BUF(sum, uint8_t, uint8_t, +) +OP_FUNC_3BUF(sum, int16_t, int16_t, +) +OP_FUNC_3BUF(sum, uint16_t, uint16_t, +) +OP_FUNC_3BUF(sum, int32_t, int32_t, +) +OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) +OP_FUNC_3BUF(sum, int64_t, int64_t, +) +OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) +OP_FUNC_3BUF(sum, long, long, +) +OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF(sum, fortran_integer8, ompi_fortran_integer8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t, +) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF(sum, short_float, short float, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF(sum, short_float, opal_short_float_t, +) +#endif +OP_FUNC_3BUF(sum, float, float, +) +OP_FUNC_3BUF(sum, double, double, +) 
+OP_FUNC_3BUF(sum, long_double, long double, +) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t, +) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t, +) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF(sum, c_short_float_complex, short float _Complex, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF(sum, c_float_complex, float _Complex, +) +OP_FUNC_3BUF(sum, c_double_complex, double _Complex, +) +OP_FUNC_3BUF(sum, c_long_double_complex, long double _Complex, +) +#endif // 0 + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF(prod, int8_t, int8_t, *) +OP_FUNC_3BUF(prod, uint8_t, uint8_t, *) +OP_FUNC_3BUF(prod, int16_t, int16_t, *) +OP_FUNC_3BUF(prod, uint16_t, uint16_t, *) +OP_FUNC_3BUF(prod, int32_t, int32_t, *) +OP_FUNC_3BUF(prod, uint32_t, uint32_t, *) +OP_FUNC_3BUF(prod, int64_t, int64_t, *) +OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) +OP_FUNC_3BUF(prod, long, long, *) +OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t, *) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF(prod, short_float, short float, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF(prod, short_float, opal_short_float_t, *) +#endif +OP_FUNC_3BUF(prod, float, float, *) +OP_FUNC_3BUF(prod, double, double, *) +OP_FUNC_3BUF(prod, long_double, long double, *) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t, *) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF(prod, fortran_real2, ompi_fortran_real2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF(prod, fortran_real8, ompi_fortran_real8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF(prod, fortran_real16, ompi_fortran_real16_t, *) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) 
+COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) +OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) +OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC_3BUF(land, int8_t, int8_t) +FUNC_FUNC_3BUF(land, uint8_t, uint8_t) +FUNC_FUNC_3BUF(land, int16_t, int16_t) +FUNC_FUNC_3BUF(land, uint16_t, uint16_t) +FUNC_FUNC_3BUF(land, int32_t, int32_t) +FUNC_FUNC_3BUF(land, uint32_t, uint32_t) +FUNC_FUNC_3BUF(land, int64_t, int64_t) +FUNC_FUNC_3BUF(land, uint64_t, uint64_t) +FUNC_FUNC_3BUF(land, long, long) +FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC_3BUF(lor, int8_t, int8_t) +FUNC_FUNC_3BUF(lor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lor, int16_t, int16_t) +FUNC_FUNC_3BUF(lor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lor, int32_t, int32_t) +FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lor, int64_t, int64_t) +FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lor, long, long) +FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 
1: 0)) +/* C integer */ +FUNC_FUNC_3BUF(lxor, int8_t, int8_t) +FUNC_FUNC_3BUF(lxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lxor, int16_t, int16_t) +FUNC_FUNC_3BUF(lxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lxor, int32_t, int32_t) +FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lxor, int64_t, int64_t) +FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lxor, long, long) +FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC_3BUF(band, int8_t, int8_t) +FUNC_FUNC_3BUF(band, uint8_t, uint8_t) +FUNC_FUNC_3BUF(band, int16_t, int16_t) +FUNC_FUNC_3BUF(band, uint16_t, uint16_t) +FUNC_FUNC_3BUF(band, int32_t, int32_t) +FUNC_FUNC_3BUF(band, uint32_t, uint32_t) +FUNC_FUNC_3BUF(band, int64_t, int64_t) +FUNC_FUNC_3BUF(band, uint64_t, uint64_t) +FUNC_FUNC_3BUF(band, long, long) +FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC_3BUF(bor, int8_t, int8_t) +FUNC_FUNC_3BUF(bor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bor, int16_t, int16_t) +FUNC_FUNC_3BUF(bor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bor, int32_t, int32_t) +FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bor, int64_t, int64_t) +FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bor, long, long) +FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C 
integer */ +FUNC_FUNC_3BUF(bxor, int8_t, int8_t) +FUNC_FUNC_3BUF(bxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bxor, int16_t, int16_t) +FUNC_FUNC_3BUF(bxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bxor, int32_t, int32_t) +FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bxor, int64_t, int64_t) +FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bxor, long, long) +FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(bxor, byte, char) + +/************************************************************************* + * Min and max location "pair" datatypes + *************************************************************************/ + +/* +#if OMPI_HAVE_FORTRAN_REAL +LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) +#endif +LOC_STRUCT_3BUF(float_int, float, int) +LOC_STRUCT_3BUF(double_int, double, int) +LOC_STRUCT_3BUF(long_int, long, int) +LOC_STRUCT_3BUF(2int, int, int) +LOC_STRUCT_3BUF(short_int, short, int) +LOC_STRUCT_3BUF(long_double_int, long double, int) +*/ + +/************************************************************************* + * Max location + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(maxloc, 2real, >) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(maxloc, 2double_precision, >) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(maxloc, 2integer, >) +#endif +LOC_FUNC_3BUF(maxloc, float_int, >) +LOC_FUNC_3BUF(maxloc, double_int, >) +LOC_FUNC_3BUF(maxloc, long_int, >) +LOC_FUNC_3BUF(maxloc, 2int, >) +LOC_FUNC_3BUF(maxloc, short_int, >) +LOC_FUNC_3BUF(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(minloc, 2real, <) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(minloc, 2double_precision, <) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(minloc, 2integer, <) +#endif +LOC_FUNC_3BUF(minloc, float_int, <) +LOC_FUNC_3BUF(minloc, double_int, <) +LOC_FUNC_3BUF(minloc, long_int, <) +LOC_FUNC_3BUF(minloc, 2int, <) +LOC_FUNC_3BUF(minloc, short_int, <) +LOC_FUNC_3BUF(minloc, long_double_int, <) + + +/* + * Helpful defines, because there's soooo many names! + * + * **NOTE** These #define's used to be strictly ordered but the use of + * designated initializers removed this restrictions. When adding new + * operators ALWAYS use a designated initializer! 
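The LOC_FUNC_3BUF entries above reduce (value, index) pairs rather than scalars. For reference, a minimal host-side C sketch of the comparison those generated functions perform follows; the pair layout (fields v and k) mirrors the predefined "loc" types used in this framework, and the sketch is illustrative only, not the generated code.

    /* Illustration: semantics of an MPI_MAXLOC-style pair reduction.
     * Pair layout assumed to match the float_int predefined type. */
    typedef struct { float v; int k; } pair_float_int_t;

    static void maxloc_float_int(const pair_float_int_t *in,
                                 pair_float_int_t *inout, int n)
    {
        for (int i = 0; i < n; ++i) {
            if (in[i].v > inout[i].v) {               /* larger value wins */
                inout[i] = in[i];
            } else if (in[i].v == inout[i].v &&       /* tie: keep the smaller index */
                       in[i].k < inout[i].k) {
                inout[i].k = in[i].k;
            }
        }
    }

MPI_MINLOC behaves identically with the value comparison reversed.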
+ */ + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_CUDA_TYPE_INT8_T] = ompi_op_cuda_##ftype##_##name##_int8_t, \ + [OMPI_OP_CUDA_TYPE_UINT8_T] = ompi_op_cuda_##ftype##_##name##_uint8_t, \ + [OMPI_OP_CUDA_TYPE_INT16_T] = ompi_op_cuda_##ftype##_##name##_int16_t, \ + [OMPI_OP_CUDA_TYPE_UINT16_T] = ompi_op_cuda_##ftype##_##name##_uint16_t, \ + [OMPI_OP_CUDA_TYPE_INT32_T] = ompi_op_cuda_##ftype##_##name##_int32_t, \ + [OMPI_OP_CUDA_TYPE_UINT32_T] = ompi_op_cuda_##ftype##_##name##_uint32_t, \ + [OMPI_OP_CUDA_TYPE_INT64_T] = ompi_op_cuda_##ftype##_##name##_int64_t, \ + [OMPI_OP_CUDA_TYPE_LONG] = ompi_op_cuda_##ftype##_##name##_long, \ + [OMPI_OP_CUDA_TYPE_UNSIGNED_LONG] = ompi_op_cuda_##ftype##_##name##_unsigned_long, \ + [OMPI_OP_CUDA_TYPE_UINT64_T] = ompi_op_cuda_##ftype##_##name##_uint64_t + +/** All the Fortran integers ********************************************/ + +#if OMPI_HAVE_FORTRAN_INTEGER +#define FORTRAN_INTEGER_PLAIN(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_integer +#else +#define FORTRAN_INTEGER_PLAIN(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +#define FORTRAN_INTEGER1(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_integer1 +#else +#define FORTRAN_INTEGER1(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +#define FORTRAN_INTEGER2(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_integer2 +#else +#define FORTRAN_INTEGER2(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +#define FORTRAN_INTEGER4(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_integer4 +#else +#define FORTRAN_INTEGER4(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +#define FORTRAN_INTEGER8(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_integer8 +#else +#define FORTRAN_INTEGER8(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +#define FORTRAN_INTEGER16(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_integer16 +#else +#define FORTRAN_INTEGER16(name, ftype) NULL +#endif + +#define FORTRAN_INTEGER(name, ftype) \ + [OMPI_OP_CUDA_TYPE_INTEGER] = FORTRAN_INTEGER_PLAIN(name, ftype), \ + [OMPI_OP_CUDA_TYPE_INTEGER1] = FORTRAN_INTEGER1(name, ftype), \ + [OMPI_OP_CUDA_TYPE_INTEGER2] = FORTRAN_INTEGER2(name, ftype), \ + [OMPI_OP_CUDA_TYPE_INTEGER4] = FORTRAN_INTEGER4(name, ftype), \ + [OMPI_OP_CUDA_TYPE_INTEGER8] = FORTRAN_INTEGER8(name, ftype), \ + [OMPI_OP_CUDA_TYPE_INTEGER16] = FORTRAN_INTEGER16(name, ftype) + +/** All the Fortran reals ***********************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +#define FLOATING_POINT_FORTRAN_REAL_PLAIN(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_real +#else +#define FLOATING_POINT_FORTRAN_REAL_PLAIN(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +#define FLOATING_POINT_FORTRAN_REAL2(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_real2 +#else +#define FLOATING_POINT_FORTRAN_REAL2(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +#define FLOATING_POINT_FORTRAN_REAL4(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_real4 +#else +#define FLOATING_POINT_FORTRAN_REAL4(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +#define FLOATING_POINT_FORTRAN_REAL8(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_real8 +#else +#define FLOATING_POINT_FORTRAN_REAL8(name, ftype) NULL +#endif +/* If: + - we have fortran REAL*16, *and* + - fortran REAL*16 matches the bit representation of the + corresponding C type + Only then do we put in function pointers for REAL*16 reductions. 
+ Otherwise, just put in NULL. */ +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +#define FLOATING_POINT_FORTRAN_REAL16(name, ftype) ompi_op_cuda_##ftype##_##name##_fortran_real16 +#else +#define FLOATING_POINT_FORTRAN_REAL16(name, ftype) NULL +#endif + +#define FLOATING_POINT_FORTRAN_REAL(name, ftype) \ + [OMPI_OP_CUDA_TYPE_REAL] = FLOATING_POINT_FORTRAN_REAL_PLAIN(name, ftype), \ + [OMPI_OP_CUDA_TYPE_REAL2] = FLOATING_POINT_FORTRAN_REAL2(name, ftype), \ + [OMPI_OP_CUDA_TYPE_REAL4] = FLOATING_POINT_FORTRAN_REAL4(name, ftype), \ + [OMPI_OP_CUDA_TYPE_REAL8] = FLOATING_POINT_FORTRAN_REAL8(name, ftype), \ + [OMPI_OP_CUDA_TYPE_REAL16] = FLOATING_POINT_FORTRAN_REAL16(name, ftype) + +/** Fortran double precision ********************************************/ + +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +#define FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype) \ + ompi_op_cuda_##ftype##_##name##_fortran_double_precision +#else +#define FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype) NULL +#endif + +/** Floating point, including all the Fortran reals *********************/ + +#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) +#define SHORT_FLOAT(name, ftype) ompi_op_cuda_##ftype##_##name##_short_float +#else +#define SHORT_FLOAT(name, ftype) NULL +#endif +#define FLOAT(name, ftype) ompi_op_cuda_##ftype##_##name##_float +#define DOUBLE(name, ftype) ompi_op_cuda_##ftype##_##name##_double +#define LONG_DOUBLE(name, ftype) ompi_op_cuda_##ftype##_##name##_long_double + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_CUDA_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ + [OMPI_OP_CUDA_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_CUDA_TYPE_DOUBLE] = DOUBLE(name, ftype), \ + FLOATING_POINT_FORTRAN_REAL(name, ftype), \ + [OMPI_OP_CUDA_TYPE_DOUBLE_PRECISION] = FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype), \ + [OMPI_OP_CUDA_TYPE_LONG_DOUBLE] = LONG_DOUBLE(name, ftype) + +/** Fortran logical *****************************************************/ + +#if OMPI_HAVE_FORTRAN_LOGICAL +#define FORTRAN_LOGICAL(name, ftype) \ + ompi_op_cuda_##ftype##_##name##_fortran_logical /* OMPI_OP_CUDA_TYPE_LOGICAL */ +#else +#define FORTRAN_LOGICAL(name, ftype) NULL +#endif + +#define LOGICAL(name, ftype) \ + [OMPI_OP_CUDA_TYPE_LOGICAL] = FORTRAN_LOGICAL(name, ftype), \ + [OMPI_OP_CUDA_TYPE_BOOL] = ompi_op_cuda_##ftype##_##name##_bool + +/** Complex *****************************************************/ +#if 0 + +#if defined(HAVE_SHORT_FLOAT__COMPLEX) || defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +#define SHORT_FLOAT_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_short_float_complex +#else +#define SHORT_FLOAT_COMPLEX(name, ftype) NULL +#endif +#define FLOAT_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_float_complex +#define DOUBLE_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_double_complex +#define LONG_DOUBLE_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_long_double_complex + +#define COMPLEX(name, ftype) \ + [OMPI_OP_CUDA_TYPE_C_SHORT_FLOAT_COMPLEX] = SHORT_FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_CUDA_TYPE_C_FLOAT_COMPLEX] = FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_CUDA_TYPE_C_DOUBLE_COMPLEX] = DOUBLE_COMPLEX(name, ftype), \ + [OMPI_OP_CUDA_TYPE_C_LONG_DOUBLE_COMPLEX] = LONG_DOUBLE_COMPLEX(name, ftype) + +#endif // 0 + +/** Byte ****************************************************************/ + +#define BYTE(name, ftype) \ + [OMPI_OP_CUDA_TYPE_BYTE] = ompi_op_cuda_##ftype##_##name##_byte + +/** Fortran complex 
*****************************************************/ +/** Fortran "2" types ***************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +#define TWOLOC_FORTRAN_2REAL(name, ftype) ompi_op_cuda_##ftype##_##name##_2real +#else +#define TWOLOC_FORTRAN_2REAL(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +#define TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype) ompi_op_cuda_##ftype##_##name##_2double_precision +#else +#define TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +#define TWOLOC_FORTRAN_2INTEGER(name, ftype) ompi_op_cuda_##ftype##_##name##_2integer +#else +#define TWOLOC_FORTRAN_2INTEGER(name, ftype) NULL +#endif + +/** All "2" types *******************************************************/ + +#define TWOLOC(name, ftype) \ + [OMPI_OP_CUDA_TYPE_2REAL] = TWOLOC_FORTRAN_2REAL(name, ftype), \ + [OMPI_OP_CUDA_TYPE_2DOUBLE_PRECISION] = TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype), \ + [OMPI_OP_CUDA_TYPE_2INTEGER] = TWOLOC_FORTRAN_2INTEGER(name, ftype), \ + [OMPI_OP_CUDA_TYPE_FLOAT_INT] = ompi_op_cuda_##ftype##_##name##_float_int, \ + [OMPI_OP_CUDA_TYPE_DOUBLE_INT] = ompi_op_cuda_##ftype##_##name##_double_int, \ + [OMPI_OP_CUDA_TYPE_LONG_INT] = ompi_op_cuda_##ftype##_##name##_long_int, \ + [OMPI_OP_CUDA_TYPE_2INT] = ompi_op_cuda_##ftype##_##name##_2int, \ + [OMPI_OP_CUDA_TYPE_SHORT_INT] = ompi_op_cuda_##ftype##_##name##_short_int, \ + [OMPI_OP_CUDA_TYPE_LONG_DOUBLE_INT] = ompi_op_cuda_##ftype##_##name##_long_double_int + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_cuda_handler_fn_t ompi_op_cuda_functions[OMPI_OP_CUDA_FORTRAN_OP_MAX][OMPI_OP_CUDA_TYPE_MAX] = + { + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_CUDA_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_CUDA_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FORTRAN_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_CUDA_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FORTRAN_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_CUDA_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FORTRAN_INTEGER(sum, 2buff), + FLOATING_POINT(sum, 2buff), + NULL, + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_CUDA_FORTRAN_PROD] = { + C_INTEGER(prod, 2buff), + FORTRAN_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), + NULL, + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_CUDA_FORTRAN_LAND] = { + C_INTEGER(land, 2buff), + LOGICAL(land, 2buff), + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_CUDA_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + FORTRAN_INTEGER(band, 2buff), + BYTE(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_CUDA_FORTRAN_LOR] = { + C_INTEGER(lor, 2buff), + LOGICAL(lor, 2buff), + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_CUDA_FORTRAN_BOR] = { + C_INTEGER(bor, 2buff), + FORTRAN_INTEGER(bor, 2buff), + BYTE(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_CUDA_FORTRAN_LXOR] = { + C_INTEGER(lxor, 2buff), + LOGICAL(lxor, 2buff), + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_CUDA_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + FORTRAN_INTEGER(bxor, 2buff), + BYTE(bxor, 2buff), + }, + /* Corresponds to MPI_MAXLOC */ + [OMPI_OP_CUDA_FORTRAN_MAXLOC] = { + 
TWOLOC(maxloc, 2buff), + }, + /* Corresponds to MPI_MINLOC */ + [OMPI_OP_CUDA_FORTRAN_MINLOC] = { + TWOLOC(minloc, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_CUDA_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + + }; + + +ompi_op_base_3buff_handler_fn_t ompi_op_base_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = + { + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FORTRAN_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FORTRAN_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FORTRAN_INTEGER(sum, 3buff), + FLOATING_POINT(sum, 3buff), + NULL, + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 3buff), + FORTRAN_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), + NULL, + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + C_INTEGER(land, 3buff), + LOGICAL(land, 3buff), + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 3buff), + FORTRAN_INTEGER(band, 3buff), + BYTE(band, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + C_INTEGER(lor, 3buff), + LOGICAL(lor, 3buff), + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 3buff), + FORTRAN_INTEGER(bor, 3buff), + BYTE(bor, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + C_INTEGER(lxor, 3buff), + LOGICAL(lxor, 3buff), + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 3buff), + FORTRAN_INTEGER(bxor, 3buff), + BYTE(bxor, 3buff), + }, + /* Corresponds to MPI_MAXLOC */ + [OMPI_OP_BASE_FORTRAN_MAXLOC] = { + TWOLOC(maxloc, 3buff), + }, + /* Corresponds to MPI_MINLOC */ + [OMPI_OP_BASE_FORTRAN_MINLOC] = { + TWOLOC(minloc, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, + }; diff --git a/ompi/op/op.c b/ompi/op/op.c index 3977fa8b97b..45abed1d2fc 100644 --- a/ompi/op/op.c +++ b/ompi/op/op.c @@ -475,6 +475,7 @@ static void ompi_op_construct(ompi_op_t *new_op) new_op->o_3buff_intrinsic.fns[i] = NULL; new_op->o_3buff_intrinsic.modules[i] = NULL; } + new_op->o_device_op = NULL; } @@ -506,4 +507,22 @@ static void ompi_op_destruct(ompi_op_t *op) op->o_3buff_intrinsic.modules[i] = NULL; } } + + if (op->o_device_op != NULL) { + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + if( NULL != op->o_device_op->o_intrisic.modules[i] ) { + OBJ_RELEASE(op->o_device_op->o_intrisic.modules[i]); + op->o_device_op->o_intrisic->modules[i] = NULL; + } + if( NULL != op->o_device_op->o_3buff_intrisic.modules[i] ) { + OBJ_RELEASE(op->o_device_op->o_3buff_intrisic.modules[i]); + op->o_device_op->o_3buff_intrisic->modules[i] = NULL; + } + } + } + if 
(op->o_device_op) { + OBJ_RELEASE(op->do_stream); + op->o_device_op->do_stream = NULL; + } + free(op->o_device_op); } diff --git a/ompi/op/op.h b/ompi/op/op.h index 3aa95be7b90..3ff6c3aac47 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -122,6 +122,15 @@ enum ompi_op_type { OMPI_OP_REPLACE, OMPI_OP_NUM_OF_TYPES }; + +/* device op information */ +struct ompi_device_op_t { + opal_accelerator_stream_t *do_stream; + ompi_op_base_op_fns_t do_intrisic; + ompi_op_base_op_3buff_fns_t do_3buff_intrisic; +}; +typedef struct ompi_device_op_t ompi_device_op_t; + /** * Back-end type of MPI_Op */ @@ -167,6 +176,10 @@ struct ompi_op_t { /** 3-buffer functions, which is only for intrinsic ops. No need for the C/C++/Fortran user-defined functions. */ ompi_op_base_op_3buff_fns_t o_3buff_intrinsic; + + /** device functions, only for intrinsic ops. + Provided if device support is detected. */ + ompi_device_op_t *o_device_op; }; /** @@ -560,6 +573,24 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, * :-) */ + bool use_device_op = false; + int source_dev_id, target_dev_id; + uint64_t source_flags, target_flags; + /* check if either of the buffers is on a device and if so make sure we can + * access handle it properly */ + if (opal_accelerator.check_addr(source, &source_dev_id, &source_flags) > 0 || + opal_accelerator.check_addr(target, &target_dev_id, &target_flags) > 0) { + if (ompi_datatype_is_predefined(dtype) && + source_dev_id == target_dev_id && + 0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC) && + NULL == op->o_device_intrisic) { + use_device_op = true; + } else { + /* TODO: can we be more graceful here? */ + abort(); + } + } + /* For intrinsics, we also pass the corresponding op module */ if (0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC)) { int dtype_id; @@ -569,9 +600,18 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, } else { dtype_id = ompi_op_ddt_map[dtype->id]; } - op->o_func.intrinsic.fns[dtype_id](source, target, - &count, &dtype, - op->o_func.intrinsic.modules[dtype_id]); + if (use_device_op) { + if (NULL == op->o_device_intrisic) { + abort(); // TODO: be more graceful! 
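Note that the device branch here dereferences op->o_device_intrisic, a field that is not declared anywhere: ompi_op_t gains o_device_op above, and its table members are spelled do_intrisic/do_3buff_intrisic in op.h but do_intrinsic/do_3buff_intrinsic in the op-select changes of the following patch. A compressed sketch of the dispatch this branch appears to be aiming for, written against o_device_op and the do_intrinsic spelling (an assumption about where the naming converges, not the literal patch content):

    /* Sketch only: intended device dispatch in ompi_op_reduce(). */
    if (use_device_op) {
        if (NULL == op->o_device_op) {
            abort();   /* no device-capable op module was selected */
        }
        op->o_device_op->do_intrinsic.fns[dtype_id](source, target, &count, &dtype,
            op->o_device_op->do_intrinsic.modules[dtype_id]);
    } else {
        op->o_func.intrinsic.fns[dtype_id](source, target, &count, &dtype,
            op->o_func.intrinsic.modules[dtype_id]);
    }

The guard that sets use_device_op earlier also tests NULL == op->o_device_intrisic; given how the flag is consumed here, "device functions are available" (NULL !=) reads as the intended condition.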
+ } + op->o_device_intrisic->intrinsic.fns[dtype_id](source, target, + &count, &dtype, + op->o_device_intrisic->intrinsic.modules[dtype_id]); + } else { + op->o_func.intrinsic.fns[dtype_id](source, target, + &count, &dtype, + op->o_func.intrinsic.modules[dtype_id]); + } return; } From b7e6f895f6663b1f5471f9fc513900ab6ef443d3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 14 Mar 2023 15:24:05 -0400 Subject: [PATCH 02/74] First working version of CUDA op support Signed-off-by: Joseph Schuchart --- ompi/mca/op/base/op_base_frame.c | 2 + ompi/mca/op/base/op_base_op_select.c | 58 +- ompi/mca/op/cuda/Makefile.am | 34 +- ompi/mca/op/cuda/configure.m4 | 6 +- ompi/mca/op/cuda/op_cuda.h | 35 +- ompi/mca/op/cuda/op_cuda_component.c | 297 ++--- ..._cuda_functions.cu => op_cuda_functions.c} | 561 +++++---- ompi/mca/op/cuda/op_cuda_impl.cu | 1023 +++++++++++++++++ ompi/mca/op/cuda/op_cuda_impl.h | 915 +++++++++++++++ ompi/mca/op/op.h | 2 + ompi/op/op.c | 19 +- ompi/op/op.h | 20 +- opal/mca/accelerator/cuda/accelerator_cuda.c | 29 +- 13 files changed, 2485 insertions(+), 516 deletions(-) rename ompi/mca/op/cuda/{op_cuda_functions.cu => op_cuda_functions.c} (75%) create mode 100644 ompi/mca/op/cuda/op_cuda_impl.cu create mode 100644 ompi/mca/op/cuda/op_cuda_impl.h diff --git a/ompi/mca/op/base/op_base_frame.c b/ompi/mca/op/base/op_base_frame.c index 90167300851..9cde5589694 100644 --- a/ompi/mca/op/base/op_base_frame.c +++ b/ompi/mca/op/base/op_base_frame.c @@ -42,6 +42,7 @@ static void module_constructor(ompi_op_base_module_t *m) { m->opm_enable = NULL; m->opm_op = NULL; + m->opm_device_enabled = false; memset(&(m->opm_fns), 0, sizeof(m->opm_fns)); memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns)); } @@ -50,6 +51,7 @@ static void module_constructor_1_0_0(ompi_op_base_module_1_0_0_t *m) { m->opm_enable = NULL; m->opm_op = NULL; + m->opm_device_enabled = false; memset(&(m->opm_fns), 0, sizeof(m->opm_fns)); memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns)); } diff --git a/ompi/mca/op/base/op_base_op_select.c b/ompi/mca/op/base/op_base_op_select.c index 53754ce5668..5b26df1a0ca 100644 --- a/ompi/mca/op/base/op_base_op_select.c +++ b/ompi/mca/op/base/op_base_op_select.c @@ -152,22 +152,50 @@ int ompi_op_base_op_select(ompi_op_t *op) } /* Copy over the non-NULL pointers */ - for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - /* 2-buffer variants */ - if (NULL != avail->ao_module->opm_fns[i]) { - OBJ_RELEASE(op->o_func.intrinsic.modules[i]); - op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i]; - op->o_func.intrinsic.modules[i] = avail->ao_module; - OBJ_RETAIN(avail->ao_module); + if (avail->ao_module->opm_device_enabled) { + if (NULL == op->o_device_op) { + op->o_device_op = malloc(sizeof(*op->o_device_op)); } - - /* 3-buffer variants */ - if (NULL != avail->ao_module->opm_3buff_fns[i]) { - OBJ_RELEASE(op->o_func.intrinsic.modules[i]); - op->o_3buff_intrinsic.fns[i] = - avail->ao_module->opm_3buff_fns[i]; - op->o_3buff_intrinsic.modules[i] = avail->ao_module; - OBJ_RETAIN(avail->ao_module); + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + /* 2-buffer variants */ + if (NULL != avail->ao_module->opm_fns[i]) { + if (NULL != op->o_device_op->do_intrinsic.modules[i]) { + OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]); + } + op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_fns[i]; + op->o_device_op->do_intrinsic.modules[i] = avail->ao_module; + OBJ_RETAIN(avail->ao_module); + } + + /* 3-buffer variants */ + if (NULL != avail->ao_module->opm_3buff_fns[i]) { + if 
(NULL != op->o_device_op->do_3buff_intrinsic.modules[i]) { + OBJ_RELEASE(op->o_device_op->do_3buff_intrinsic.modules[i]); + } + op->o_device_op->do_3buff_intrinsic.fns[i] = + avail->ao_module->opm_3buff_fns[i]; + op->o_device_op->do_3buff_intrinsic.modules[i] = avail->ao_module; + OBJ_RETAIN(avail->ao_module); + } + } + } else { + for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + /* 2-buffer variants */ + if (NULL != avail->ao_module->opm_fns[i]) { + OBJ_RELEASE(op->o_func.intrinsic.modules[i]); + op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i]; + op->o_func.intrinsic.modules[i] = avail->ao_module; + OBJ_RETAIN(avail->ao_module); + } + + /* 3-buffer variants */ + if (NULL != avail->ao_module->opm_3buff_fns[i]) { + OBJ_RELEASE(op->o_3buff_intrinsic.modules[i]); + op->o_3buff_intrinsic.fns[i] = + avail->ao_module->opm_3buff_fns[i]; + op->o_3buff_intrinsic.modules[i] = avail->ao_module; + OBJ_RETAIN(avail->ao_module); + } } } diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index 0e844271d67..f96d8b8a896 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -1,9 +1,7 @@ # -# Copyright (c) 2019-2023 The University of Tennessee and The University +# Copyright (c) 2023 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. -# Copyright (c) 2020-2021 Research Organization for Information Science -# and Technology (RIST). All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -21,8 +19,17 @@ AM_CPPFLAGS = $(common_cuda_CPPFLAGS) -sources = op_cuda_component.c op_cuda.h -sources_extended = op_cuda_functions.c +sources = op_cuda_component.c op_cuda.h op_cuda_functions.c op_cuda_impl.h +#sources_extended = op_cuda_functions.cu +cu_sources = op_cuda_impl.cu + +NVCC = nvcc -allow-unsupported-compiler + +.cu.l$(OBJEXT): + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(NVCC) -prefer-non-pic $(NVCCFLAGS) -Wc,-Xcompiler,-fPIC,-g -c $< + +# -o $($@.o:.lo) # Open MPI components can be compiled two ways: # @@ -44,7 +51,7 @@ if MCA_BUILD_ompi_op_cuda_DSO component_install = mca_op_cuda.la else component_install = -component_noinst = libmca_cuda_avx.la +component_noinst = libmca_op_cuda.la endif # Specific information for DSO builds. @@ -52,12 +59,15 @@ endif # The DSO should install itself in $(ompilibdir) (by default, # $prefix/lib/openmpi). +CUDADIR=/nfs/apps/spacks/2023-01-01/opt/spack/linux-centos7-x86_64/gcc-9.5.0/cuda-11.8.0-u2modnncfevx54ibr5dy27sxkirwsf7f + mcacomponentdir = $(ompilibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_op_cuda_la_SOURCES = $(sources) -mca_op_cuda_la_LIBADD = $(specialized_op_libs) -mca_op_cuda_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la - +mca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) +mca_op_cuda_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(accelerator_cuda_LIBS) -L$(CUDADIR)/lib64 -lcudart +EXTRA_mca_op_cuda_la_SOURCES = $(cu_sources) # Specific information for static builds. 
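One detail in the op_base_op_select.c change above: op->o_device_op is allocated with malloc() and its .modules[i] slots are then tested for NULL before being released, so the table is read uninitialized the first time a device-enabled module is selected. A zero-initializing allocation, sketched below, would make those checks well defined:

    /* Sketch: create the device-op table zero-initialized so the NULL checks
     * on do_intrinsic.modules[i] / do_3buff_intrinsic.modules[i] are valid. */
    if (NULL == op->o_device_op) {
        op->o_device_op = calloc(1, sizeof(*op->o_device_op));
        if (NULL == op->o_device_op) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }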
# @@ -66,6 +76,8 @@ mca_op_cuda_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LI noinst_LTLIBRARIES = $(component_noinst) libmca_op_cuda_la_SOURCES = $(sources) -libmca_op_cuda_la_LIBADD = $(specialized_op_libs) -libmca_op_cuda_la_LDFLAGS = -module -avoid-version +libmca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) +libmca_op_cuda_la_LDFLAGS = -module -avoid-version\ + $(accelerator_cuda_LIBS) -L$(CUDADIR)/lib64 -lcudart +EXTRA_libmca_op_cuda_la_SOURCES = $(cu_sources) diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4 index 7ea9c31f0dc..9c5c4794fba 100644 --- a/ompi/mca/op/cuda/configure.m4 +++ b/ompi/mca/op/cuda/configure.m4 @@ -29,8 +29,8 @@ AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[ [$1], [$2]) - AC_SUBST([accelerator_cuda_CPPFLAGS]) - AC_SUBST([accelerator_cuda_LDFLAGS]) - AC_SUBST([accelerator_cuda_LIBS]) + AC_SUBST([op_cuda_CPPFLAGS]) + AC_SUBST([op_cuda_LDFLAGS]) + AC_SUBST([op_cuda_LIBS]) ])dnl diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index b21223813c0..bbc16d26b25 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -9,8 +9,8 @@ * $HEADER$ */ -#ifndef MCA_OP_AVX_EXPORT_H -#define MCA_OP_AVX_EXPORT_H +#ifndef MCA_OP_CUDA_EXPORT_H +#define MCA_OP_CUDA_EXPORT_H #include "ompi_config.h" @@ -18,9 +18,29 @@ #include "opal/class/opal_object.h" #include "ompi/mca/op/op.h" +#include "ompi/runtime/mpiruntime.h" + +#include +#include BEGIN_C_DECLS + +#define xstr(x) #x +#define str(x) xstr(x) + +#define CHECK(fn, args) \ + do { \ + cudaError_t err = fn args; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "%s:%d: %s failed at line: %s: %s\n", \ + __FILE__, __LINE__, str(fn), cudaGetErrorName(err), \ + cudaGetErrorString(err)); \ + ompi_mpi_abort(MPI_COMM_WORLD, 1); \ + } \ + } while (0) + + /** * Derive a struct from the base op component struct, allowing us to * cache some component-specific information on our well-known @@ -31,7 +51,11 @@ typedef struct { ompi_op_base_component_1_0_0_t super; /* a stream on which to schedule kernel calls */ - opal_accelerator_stream_t *stream; + CUstream cu_stream; + CUcontext *cu_ctx; + int *cu_max_threads_per_block; + CUdevice *cu_devices; + int cu_num_devices; } ompi_op_cuda_component_t; /** @@ -45,6 +69,9 @@ typedef struct { OMPI_DECLSPEC extern ompi_op_cuda_component_t mca_op_cuda_component; +OMPI_DECLSPEC extern +ompi_op_base_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + END_C_DECLS -#endif /* MCA_OP_AVX_EXPORT_H */ +#endif /* MCA_OP_CUDA_EXPORT_H */ diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index a2f01a373e2..d61f99b06e8 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -14,7 +14,7 @@ /** @file * - * This is the "avx" component source code. + * This is the "cuda" op component source code. 
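The CHECK() macro introduced in op_cuda.h above wraps the CUDA driver-API (cu*) calls made throughout this component, but it stores the result in a cudaError_t and formats it with the runtime-API cudaGetErrorName()/cudaGetErrorString(); CUresult and cudaError_t are distinct enums, so the reported strings can be misleading. A driver-API-consistent variant is sketched below (a sketch, not part of the patch); it uses cuGetErrorName()/cuGetErrorString() together with the xstr()/str() helpers defined alongside CHECK() and assumes <stdio.h> is available:

    /* Sketch: CHECK() variant matched to the driver API this component calls. */
    #define CHECK_CU(fn, args)                                                \
        do {                                                                  \
            CUresult rc_ = fn args;                                           \
            if (CUDA_SUCCESS != rc_) {                                        \
                const char *name_ = NULL, *msg_ = NULL;                       \
                cuGetErrorName(rc_, &name_);                                  \
                cuGetErrorString(rc_, &msg_);                                 \
                fprintf(stderr, "%s:%d: %s failed: %s: %s\n",                 \
                        __FILE__, __LINE__, str(fn),                          \
                        name_ ? name_ : "unknown", msg_ ? msg_ : "unknown");  \
                ompi_mpi_abort(MPI_COMM_WORLD, 1);                            \
            }                                                                 \
        } while (0)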
* */ @@ -26,134 +26,47 @@ #include "ompi/op/op.h" #include "ompi/mca/op/op.h" #include "ompi/mca/op/base/base.h" -#include "ompi/mca/op/avx/op_avx.h" +#include "ompi/mca/op/cuda/op_cuda.h" -static int avx_component_open(void); -static int avx_component_close(void); -static int avx_component_init_query(bool enable_progress_threads, +#include + +static int cuda_component_open(void); +static int cuda_component_close(void); +static int cuda_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple); static struct ompi_op_base_module_1_0_0_t * - avx_component_op_query(struct ompi_op_t *op, int *priority); -static int avx_component_register(void); - -static mca_base_var_enum_value_flag_t avx_support_flags[] = { - { .flag = 0x001, .string = "SSE" }, - { .flag = 0x002, .string = "SSE2" }, - { .flag = 0x004, .string = "SSE3" }, - { .flag = 0x008, .string = "SSE4.1" }, - { .flag = 0x010, .string = "AVX" }, - { .flag = 0x020, .string = "AVX2" }, - { .flag = 0x100, .string = "AVX512F" }, - { .flag = 0x200, .string = "AVX512BW" }, - { .flag = 0, .string = NULL }, -}; - -/** - * A slightly modified code from - * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family - */ -#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300) - -#include - -static uint32_t has_intel_AVX_features(void) -{ - uint32_t flags = 0; - - flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0; - flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0; - flags |= _may_i_use_cpu_feature(_FEATURE_AVX2) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0; - flags |= _may_i_use_cpu_feature(_FEATURE_AVX) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0; - flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0; - flags |= _may_i_use_cpu_feature(_FEATURE_SSE3) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0; - flags |= _may_i_use_cpu_feature(_FEATURE_SSE2) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0; - flags |= _may_i_use_cpu_feature(_FEATURE_SSE) ? 
OMPI_OP_AVX_HAS_SSE_FLAG : 0; - return flags; -} -#else /* non-Intel compiler */ -#include - -#if defined(_MSC_VER) -#include -#endif - -static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) -{ -#if defined(_MSC_VER) - __cpuidex(abcd, eax, ecx); -#else - uint32_t ebx = 0, edx = 0; -#if defined( __i386__ ) && defined ( __PIC__ ) - /* in case of PIC under 32-bit EBX cannot be clobbered */ - __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), -#else - __asm__ ( "cpuid" : "+b" (ebx), -#endif /* defined( __i386__ ) && defined ( __PIC__ ) */ - "+a" (eax), "+c" (ecx), "=d" (edx) ); - abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; -#endif -} - -static uint32_t has_intel_AVX_features(void) -{ - /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */ - const uint32_t avx512f_mask = (1U << 16); // AVX512F (EAX = 7, ECX = 0) : EBX - const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW (EAX = 7, ECX = 0) : EBX - const uint32_t avx2_mask = (1U << 5); // AVX2 (EAX = 7, ECX = 0) : EBX - const uint32_t avx_mask = (1U << 28); // AVX (EAX = 1, ECX = 0) : ECX - const uint32_t sse4_1_mask = (1U << 19); // SSE4.1 (EAX = 1, ECX = 0) : ECX - const uint32_t sse3_mask = (1U << 0); // SSE3 (EAX = 1, ECX = 0) : ECX - const uint32_t sse2_mask = (1U << 26); // SSE2 (EAX = 1, ECX = 0) : EDX - const uint32_t sse_mask = (1U << 15); // SSE (EAX = 1, ECX = 0) : EDX - uint32_t flags = 0, abcd[4]; + cuda_component_op_query(struct ompi_op_t *op, int *priority); +static int cuda_component_register(void); - run_cpuid( 1, 0, abcd ); - flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0; - flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0; - flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0; - flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0; - flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0; -#if defined(__APPLE__) - uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */ - // OS supports extended processor state management ? - if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask ) - return 0; -#endif /* defined(__APPLE__) */ - - run_cpuid( 7, 0, abcd ); - flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0; - flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0; - flags |= (abcd[1] & avx2_mask) ? 
OMPI_OP_AVX_HAS_AVX2_FLAG : 0; - return flags; -} -#endif /* non-Intel compiler */ - -ompi_op_avx_component_t mca_op_avx_component = { +ompi_op_cuda_component_t mca_op_cuda_component = { { .opc_version = { OMPI_OP_BASE_VERSION_1_0_0, - .mca_component_name = "avx", + .mca_component_name = "cuda", MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, OMPI_RELEASE_VERSION), - .mca_open_component = avx_component_open, - .mca_close_component = avx_component_close, - .mca_register_component_params = avx_component_register, + .mca_open_component = cuda_component_open, + .mca_close_component = cuda_component_close, + .mca_register_component_params = cuda_component_register, }, .opc_data = { /* The component is checkpoint ready */ MCA_BASE_METADATA_PARAM_CHECKPOINT }, - .opc_init_query = avx_component_init_query, - .opc_op_query = avx_component_op_query, + .opc_init_query = cuda_component_init_query, + .opc_op_query = cuda_component_op_query, }, + .cu_max_threads_per_block = NULL, + .cu_devices = NULL, + .cu_num_devices = 0, }; /* * Component open */ -static int avx_component_open(void) +static int cuda_component_open(void) { /* We checked the flags during register, so if they are set to * zero either the architecture is not suitable or the user disabled @@ -172,14 +85,16 @@ static int avx_component_open(void) /* * Component close */ -static int avx_component_close(void) +static int cuda_component_close(void) { - /* If avx was opened successfully, close it (i.e., release any - resources that may have been allocated on this component). - Note that _component_close() will always be called at the end - of the process, so it may have been after any/all of the other - component functions have been invoked (and possibly even after - modules have been created and/or destroyed). */ + if (mca_op_cuda_component.cu_num_devices > 0) { + CHECK(cuStreamDestroy, (mca_op_cuda_component.cu_stream)); + free(mca_op_cuda_component.cu_max_threads_per_block); + mca_op_cuda_component.cu_max_threads_per_block = NULL; + free(mca_op_cuda_component.cu_devices); + mca_op_cuda_component.cu_devices = NULL; + mca_op_cuda_component.cu_num_devices = 0; + } return OMPI_SUCCESS; } @@ -188,142 +103,68 @@ static int avx_component_close(void) * Register MCA params. */ static int -avx_component_register(void) +cuda_component_register(void) { - mca_op_avx_component.supported = - mca_op_avx_component.flags = has_intel_AVX_features(); - - // MCA var enum flag for conveniently seeing SSE/MMX/AVX support - // values - mca_base_var_enum_flag_t *new_enum_flag = NULL; - (void) mca_base_var_enum_create_flag("op_avx_support_flags", - avx_support_flags, &new_enum_flag); - - (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version, - "capabilities", - "Level of SSE/MMX/AVX support available in the current environment", - MCA_BASE_VAR_TYPE_INT, - &(new_enum_flag->super), 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_CONSTANT, - &mca_op_avx_component.supported); - - (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version, - "support", - "Level of SSE/MMX/AVX support to be used, capped by the local architecture capabilities", - MCA_BASE_VAR_TYPE_INT, - &(new_enum_flag->super), 0, 0, - OPAL_INFO_LVL_4, - MCA_BASE_VAR_SCOPE_LOCAL, - &mca_op_avx_component.flags); - OBJ_RELEASE(new_enum_flag); - - mca_op_avx_component.flags &= mca_op_avx_component.supported; + /* TODO: add mca paramters */ return OMPI_SUCCESS; } + /* * Query whether this component wants to be used in this process. 
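cuda_component_register() above is still a stub ("TODO: add mca parameters"). When parameters are added they can follow the mca_base_component_var_register() pattern visible in the removed AVX code; the sketch below registers one illustrative parameter, where the parameter name "device" and the component field cu_device are assumptions made for the example, not fields this patch defines:

    /* Sketch: one possible MCA parameter for the cuda op component.
     * mca_op_cuda_component.cu_device is hypothetical. */
    static int cuda_component_register(void)
    {
        mca_op_cuda_component.cu_device = -1;   /* -1: choose the device automatically */
        (void) mca_base_component_var_register(&mca_op_cuda_component.super.opc_version,
                                               "device",
                                               "CUDA device to run reduction kernels on (-1: automatic)",
                                               MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                               OPAL_INFO_LVL_4,
                                               MCA_BASE_VAR_SCOPE_LOCAL,
                                               &mca_op_cuda_component.cu_device);
        return OMPI_SUCCESS;
    }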
*/ static int -avx_component_init_query(bool enable_progress_threads, +cuda_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple) { - if( 0 == mca_op_avx_component.flags ) - return OMPI_ERR_NOT_SUPPORTED; + int num_devices; + int prio_lo, prio_hi; + cuInit(0); + CHECK(cuDeviceGetCount, (&num_devices)); + mca_op_cuda_component.cu_num_devices = num_devices; + mca_op_cuda_component.cu_devices = (CUdevice*)malloc(num_devices*sizeof(CUdevice)); + mca_op_cuda_component.cu_ctx = (CUcontext*)malloc(num_devices*sizeof(CUcontext)); + mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int)); + for (int i = 0; i < num_devices; ++i) { + CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i)); + CHECK(cuCtxCreate, (&mca_op_cuda_component.cu_ctx[i], + CU_CTX_SCHED_YIELD, + mca_op_cuda_component.cu_devices[i])); + mca_op_cuda_component.cu_max_threads_per_block[i] = 512; + // TODO: this call fails, why?! + //CHECK(cuDeviceGetAttribute, (&mca_op_cuda_component.cu_max_threads_per_block[i], + // CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + // mca_op_cuda_component.cu_devices[i])); + } + + CHECK(cuCtxGetStreamPriorityRange, (&prio_lo, &prio_hi)); + CHECK(cuStreamCreateWithPriority, (&mca_op_cuda_component.cu_stream, CU_STREAM_NON_BLOCKING, prio_hi)); + return OMPI_SUCCESS; } -#if OMPI_MCA_OP_HAVE_AVX512 - extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; - extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; -#endif -#if OMPI_MCA_OP_HAVE_AVX2 - extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; - extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; -#endif -#if OMPI_MCA_OP_HAVE_AVX - extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; - extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; -#endif /* * Query whether this component can be used for a specific op */ static struct ompi_op_base_module_1_0_0_t* -avx_component_op_query(struct ompi_op_t *op, int *priority) +cuda_component_op_query(struct ompi_op_t *op, int *priority) { ompi_op_base_module_t *module = NULL; - /* Sanity check -- although the framework should never invoke the - _component_op_query() on non-intrinsic MPI_Op's, we'll put a - check here just to be sure. 
*/ - if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { - return NULL; - } - switch (op->o_f_to_c_index) { - case OMPI_OP_BASE_FORTRAN_MAX: - case OMPI_OP_BASE_FORTRAN_MIN: - case OMPI_OP_BASE_FORTRAN_SUM: - case OMPI_OP_BASE_FORTRAN_PROD: - case OMPI_OP_BASE_FORTRAN_BOR: - case OMPI_OP_BASE_FORTRAN_BAND: - case OMPI_OP_BASE_FORTRAN_BXOR: - module = OBJ_NEW(ompi_op_base_module_t); - for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { -#if OMPI_MCA_OP_HAVE_AVX512 - if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) { - module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i]; - module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i]; - } -#endif -#if OMPI_MCA_OP_HAVE_AVX2 - if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) { - if( NULL == module->opm_fns[i] ) { - module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i]; - } - if( NULL == module->opm_3buff_fns[i] ) { - module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i]; - } - } -#endif -#if OMPI_MCA_OP_HAVE_AVX - if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) { - if( NULL == module->opm_fns[i] ) { - module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i]; - } - if( NULL == module->opm_3buff_fns[i] ) { - module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i]; - } - } -#endif - if( NULL != module->opm_fns[i] ) { - OBJ_RETAIN(module); - } - if( NULL != module->opm_3buff_fns[i] ) { - OBJ_RETAIN(module); - } + module = OBJ_NEW(ompi_op_base_module_t); + module->opm_device_enabled = true; + for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_cuda_functions[op->o_f_to_c_index][i]; + //module->opm_3buff_fns[i] = ompi_op_cuda_3buff_functions[op->o_f_to_c_index][i]; + + if( NULL != module->opm_fns[i] ) { + OBJ_RETAIN(module); + } + if( NULL != module->opm_3buff_fns[i] ) { + OBJ_RETAIN(module); } - break; - case OMPI_OP_BASE_FORTRAN_LAND: - case OMPI_OP_BASE_FORTRAN_LOR: - case OMPI_OP_BASE_FORTRAN_LXOR: - case OMPI_OP_BASE_FORTRAN_MAXLOC: - case OMPI_OP_BASE_FORTRAN_MINLOC: - case OMPI_OP_BASE_FORTRAN_REPLACE: - default: - break; - } - /* If we got a module from above, we'll return it. Otherwise, - we'll return NULL, indicating that this component does not want - to be considered for selection for this MPI_Op. Note that the - functions each returned a *avx* component pointer - (vs. a *base* component pointer -- where an *avx* component - is a base component plus some other module-specific cached - information), so we have to cast it to the right pointer type - before returning. 
*/ - if (NULL != module) { - *priority = 50; } + *priority = 50; return (ompi_op_base_module_1_0_0_t *) module; } diff --git a/ompi/mca/op/cuda/op_cuda_functions.cu b/ompi/mca/op/cuda/op_cuda_functions.c similarity index 75% rename from ompi/mca/op/cuda/op_cuda_functions.cu rename to ompi/mca/op/cuda/op_cuda_functions.c index ad1741ffaa6..4ef46be783c 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.cu +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -18,49 +18,130 @@ #endif #include "opal/util/output.h" + #include "ompi/op/op.h" #include "ompi/mca/op/op.h" #include "ompi/mca/op/base/base.h" -#include "ompi/mca/op/avx/op_avx.h" +#include "ompi/mca/op/cuda/op_cuda.h" +#include "opal/mca/accelerator/accelerator.h" + +#include "ompi/mca/op/cuda/op_cuda.h" +#include "ompi/mca/op/cuda/op_cuda_impl.h" + + +static inline void device_op_pre(const void *orig_source, + void *orig_target, + int count, + struct ompi_datatype_t *dtype, + void **source, + int *source_device, + void **target, + int *target_device, + int *threads_per_block, + int *device) +{ + uint64_t target_flags = -1, source_flags = -1; + int target_rc, source_rc; + + *target = orig_target; + *source = (void*)orig_source; + + target_rc = opal_accelerator.check_addr(*target, target_device, &target_flags); + source_rc = opal_accelerator.check_addr(*source, source_device, &source_flags); + *device = *target_device; + + if (0 == target_rc && 0 == source_rc) { + /* no buffers are on any device, select device 0 */ + *device = 0; + } -#define THREADS_PER_BLOCK 512 + /* swap contexts */ + CHECK(cuCtxPushCurrent, (mca_op_cuda_component.cu_ctx[*device])); + + if (0 == target_rc || 0 == source_rc || *target_device != *source_device) { + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + if (0 == target_rc) { + // allocate memory on the device for the target buffer + CUdeviceptr dptr; + CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); + CHECK(cuMemcpyHtoDAsync, (dptr, *target, nbytes, mca_op_cuda_component.cu_stream)); + *target = (void*)dptr; + *target_device = -1; // mark target device as host + } + + if (0 == source_rc || *device != *source_device) { + // allocate memory on the device for the source buffer + CUdeviceptr dptr; + CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); + if (0 == source_rc) { + /* copy from host to device */ + CHECK(cuMemcpyHtoDAsync, (dptr, *source, nbytes, mca_op_cuda_component.cu_stream)); + } else { + /* copy from one device to another device */ + /* TODO: does this actually work? Can we enable P2P? 
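Peer access is not automatic: without it the device-to-device copy below is typically staged through host memory by the driver. A sketch of enabling P2P first (an assumption about the eventual fix, using the contexts and devices set up in cuda_component_init_query) would sit just before the cuMemcpyDtoDAsync call:

    /* Sketch only: allow the current context (cu_ctx[*device]) to access
     * memory owned by the source device before the DtoD copy. */
    int can_access_peer = 0;
    CHECK(cuDeviceCanAccessPeer, (&can_access_peer,
                                  mca_op_cuda_component.cu_devices[*device],
                                  mca_op_cuda_component.cu_devices[*source_device]));
    if (can_access_peer) {
        CUresult rc = cuCtxEnablePeerAccess(mca_op_cuda_component.cu_ctx[*source_device], 0);
        if (CUDA_SUCCESS != rc && CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED != rc) {
            can_access_peer = 0;   /* fall back to staging through the host */
        }
    }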
*/ + CHECK(cuMemcpyDtoDAsync, (dptr, (CUdeviceptr)*source, nbytes, mca_op_cuda_component.cu_stream)); + } + } + } + *threads_per_block = mca_op_cuda_component.cu_max_threads_per_block[*device]; +} + +static inline void device_op_post(void *orig_target, + int count, + struct ompi_datatype_t *dtype, + void *source, + int source_device, + void *target, + int target_device, + int device) +{ + if (-1 == target_device) { + + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + CHECK(cuMemcpyDtoHAsync, (orig_target, (CUdeviceptr)target, nbytes, mca_op_cuda_component.cu_stream)); + + CHECK(cuMemFreeAsync, ((CUdeviceptr)target, mca_op_cuda_component.cu_stream)); + } -#define OP_FUNC(name, type_name, type, op) \ - __global__ void \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ - int i = blockIdx.x*blockDim.x + threadIdx.x; \ - if (i < n) inout[i] = inout[i] op in[i]; \ - } \ - void ompi_op_cuda_2buff_##name##_##type_name##(const void *in, void *inout, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_cuda_module_1_0_0_t *module) { \ - int threads = THREADS_PER_BLOCK; \ - int blocks = *count / THREADS_PER_BLOCK; \ - type *inout_ = (type*)inout; \ - const type *in_ = (const type*)in; \ - int n = *count; \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in_, inout_, n); \ + if (source_device != device) { + CHECK(cuMemFreeAsync, ((CUdeviceptr)source, mca_op_cuda_component.cu_stream)); } + /* wait for all scheduled operations to complete */ + CHECK(cuStreamSynchronize, (mca_op_cuda_component.cu_stream)); -#define FUNC_FUNC(name, type_name, type) \ - __global__ void \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ - int i = blockIdx.x*blockDim.x + threadIdx.x; \ - if (i < n) inout[i] = current_func(inout[i], in[i]); \ - } \ - static void \ - ompi_op_cuda_2buff_##name##_##type_name##(const void *in, void *inout, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_cuda_module_1_0_0_t *module) { \ - int threads = THREADS_PER_BLOCK; \ - int blocks = *count / THREADS_PER_BLOCK; \ - type *inout_ = (type*)inout; \ - const type *in_ = (const type*)in; \ - int n = *count; \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel<>(in_, inout_, n); \ + /* restore the context */ + CUcontext ctx; + CHECK(cuCtxPopCurrent, (&ctx)); +} + +#define FUNC(name, type_name, type) \ + static \ + void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + int threads_per_block; \ + int source_device, target_device, device; \ + type *source, *target; \ + int n = *count; \ + device_op_pre(in, inout, n, *dtype, (void**)&source, &source_device, (void**)&target, &target_device, \ + &threads_per_block, &device); \ + CUstream *stream = &mca_op_cuda_component.cu_stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, *stream); \ + device_op_post(inout, n, *dtype, source, source_device, target, target_device, device); \ } +#define OP_FUNC(name, type_name, type, op, ...) FUNC(name, __VA_ARGS__##type_name, __VA_ARGS__##type) + +/* reuse the macro above, no work is actually done so we don't care about the func */ +#define FUNC_FUNC(name, type_name, type, ...) FUNC(name, __VA_ARGS__##type_name, __VA_ARGS__##type) + /* * Since all the functions in this file are essentially identical, we * use a macro to substitute in names and types. 
The core operation @@ -68,43 +149,51 @@ * * This macro is for minloc and maxloc */ -#define LOC_STRUCT(type_name, type1, type2) \ - typedef struct { \ - type1 v; \ - type2 k; \ - } ompi_op_predefined_##type_name##_t; +#define LOC_FUNC(name, type_name, op) FUNC(name, type_name, ompi_op_predefined_##type_name##_t) -#define LOC_FUNC(name, type_name, op) \ - __global__ void \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in, \ - ompi_op_predefined_##type_name##_t *inout, \ - int n) \ - { \ - int i = blockIdx.x*blockDim.x + threadIdx.x; \ - if (i < n) { \ - const ompi_op_predefined_##type_name##_t *a = &in[i]; \ - ompi_op_predefined_##type_name##_t *b = &inout[i]; \ - if (a->v op b->v) { \ - b->v = a->v; \ - b->k = a->k; \ - } else if (a->v == b->v) { \ - b->k = (b->k < a->k ? b->k : a->k); \ - } \ - } \ - } \ - static void \ - ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_cuda_module_1_0_0_t *module) \ - { \ - int i; \ - int threads = THREADS_PER_BLOCK; \ - int blocks = *count / THREADS_PER_BLOCK; \ - const ompi_op_predefined_##type_name##_t *a = (const ompi_op_predefined_##type_name##_t*) in; \ - ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel<>(a, b, n); \ +/* Dispatch Fortran types to C types */ +#define FORT_INT_FUNC(name, type_name, type) \ + static \ + void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + \ + _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ + switch(sizeof(type)) { \ + case sizeof(int8_t): \ + ompi_op_cuda_2buff_##name##_int8_t(in, inout, count, dtype, module); \ + break; \ + case sizeof(int16_t): \ + ompi_op_cuda_2buff_##name##_int16_t(in, inout, count, dtype, module); \ + break; \ + case sizeof(int32_t): \ + ompi_op_cuda_2buff_##name##_int32_t(in, inout, count, dtype, module); \ + break; \ + case sizeof(int64_t): \ + ompi_op_cuda_2buff_##name##_int64_t(in, inout, count, dtype, module); \ + break; \ + } \ } +/* Dispatch Fortran types to C types */ +#define FORT_FLOAT_FUNC(name, type_name, type) \ + static \ + void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ + switch(sizeof(type)) { \ + case sizeof(float): \ + ompi_op_cuda_2buff_##name##_float(in, inout, count, dtype, module); \ + break; \ + case sizeof(double): \ + ompi_op_cuda_2buff_##name##_double(in, inout, count, dtype, module); \ + break; \ + case sizeof(long double): \ + ompi_op_cuda_2buff_##name##_long_double(in, inout, count, dtype, module); \ + break; \ + } \ + } /************************************************************************* * Max @@ -126,49 +215,52 @@ FUNC_FUNC(max, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC(max, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC(max, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC(max, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC(max, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC(max, fortran_integer2, ompi_fortran_integer2_t) 
+FORT_INT_FUNC(max, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC(max, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC(max, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC(max, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC(max, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC(max, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC(max, fortran_integer16, ompi_fortran_integer16_t) #endif + +#if 0 /* Floating point */ #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC(max, short_float, short float) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) FUNC_FUNC(max, short_float, opal_short_float_t) #endif +#endif // 0 FUNC_FUNC(max, float, float) FUNC_FUNC(max, double, double) FUNC_FUNC(max, long_double, long double) #if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC(max, fortran_real, ompi_fortran_real_t) +FORT_FLOAT_FUNC(max, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC(max, fortran_double_precision, ompi_fortran_double_precision_t) +FORT_FLOAT_FUNC(max, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC(max, fortran_real2, ompi_fortran_real2_t) +FORT_FLOAT_FUNC(max, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC(max, fortran_real4, ompi_fortran_real4_t) +FORT_FLOAT_FUNC(max, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC(max, fortran_real8, ompi_fortran_real8_t) +FORT_FLOAT_FUNC(max, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC(max, fortran_real16, ompi_fortran_real16_t) +FORT_FLOAT_FUNC(max, fortran_real16, ompi_fortran_real16_t) #endif @@ -192,49 +284,53 @@ FUNC_FUNC(min, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC(min, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC(min, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC(min, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC(min, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC(min, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC(min, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC(min, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC(min, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC(min, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC(min, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC(min, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC(min, fortran_integer16, ompi_fortran_integer16_t) #endif + +#if 0 /* Floating point */ #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC(min, short_float, short float) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) FUNC_FUNC(min, short_float, opal_short_float_t) #endif +#endif // 0 + FUNC_FUNC(min, float, float) FUNC_FUNC(min, double, double) FUNC_FUNC(min, long_double, long double) #if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC(min, fortran_real, ompi_fortran_real_t) +FORT_FLOAT_FUNC(min, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC(min, fortran_double_precision, ompi_fortran_double_precision_t) +FORT_FLOAT_FUNC(min, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if 
OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC(min, fortran_real2, ompi_fortran_real2_t) +FORT_FLOAT_FUNC(min, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC(min, fortran_real4, ompi_fortran_real4_t) +FORT_FLOAT_FUNC(min, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC(min, fortran_real8, ompi_fortran_real8_t) +FORT_FLOAT_FUNC(min, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC(min, fortran_real16, ompi_fortran_real16_t) +FORT_FLOAT_FUNC(min, fortran_real16, ompi_fortran_real16_t) #endif /************************************************************************* @@ -255,49 +351,53 @@ OP_FUNC(sum, unsigned_long, unsigned long, +=) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC(sum, fortran_integer, ompi_fortran_integer_t, +=) +FORT_INT_FUNC(sum, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC(sum, fortran_integer1, ompi_fortran_integer1_t, +=) +FORT_INT_FUNC(sum, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC(sum, fortran_integer2, ompi_fortran_integer2_t, +=) +FORT_INT_FUNC(sum, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC(sum, fortran_integer4, ompi_fortran_integer4_t, +=) +FORT_INT_FUNC(sum, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC(sum, fortran_integer8, ompi_fortran_integer8_t, +=) +FORT_INT_FUNC(sum, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC(sum, fortran_integer16, ompi_fortran_integer16_t, +=) +FORT_INT_FUNC(sum, fortran_integer16, ompi_fortran_integer16_t) #endif + +#if 0 /* Floating point */ #if defined(HAVE_SHORT_FLOAT) OP_FUNC(sum, short_float, short float, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) OP_FUNC(sum, short_float, opal_short_float_t, +=) #endif +#endif // 0 + OP_FUNC(sum, float, float, +=) OP_FUNC(sum, double, double, +=) OP_FUNC(sum, long_double, long double, +=) #if OMPI_HAVE_FORTRAN_REAL -OP_FUNC(sum, fortran_real, ompi_fortran_real_t, +=) +FORT_FLOAT_FUNC(sum, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC(sum, fortran_double_precision, ompi_fortran_double_precision_t, +=) +FORT_FLOAT_FUNC(sum, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC(sum, fortran_real2, ompi_fortran_real2_t, +=) +FORT_FLOAT_FUNC(sum, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC(sum, fortran_real4, ompi_fortran_real4_t, +=) +FORT_FLOAT_FUNC(sum, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC(sum, fortran_real8, ompi_fortran_real8_t, +=) +FORT_FLOAT_FUNC(sum, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC(sum, fortran_real16, ompi_fortran_real16_t, +=) +FORT_FLOAT_FUNC(sum, fortran_real16, ompi_fortran_real16_t) #endif /* Complex */ #if 0 @@ -329,49 +429,53 @@ OP_FUNC(prod, unsigned_long, unsigned long, *=) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC(prod, fortran_integer, ompi_fortran_integer_t, *=) +FORT_INT_FUNC(prod, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC(prod, fortran_integer1, ompi_fortran_integer1_t, *=) +FORT_INT_FUNC(prod, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC(prod, fortran_integer2, 
ompi_fortran_integer2_t, *=) +FORT_INT_FUNC(prod, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC(prod, fortran_integer4, ompi_fortran_integer4_t, *=) +FORT_INT_FUNC(prod, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC(prod, fortran_integer8, ompi_fortran_integer8_t, *=) +FORT_INT_FUNC(prod, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC(prod, fortran_integer16, ompi_fortran_integer16_t, *=) +FORT_INT_FUNC(prod, fortran_integer16, ompi_fortran_integer16_t) #endif /* Floating point */ + +#if 0 #if defined(HAVE_SHORT_FLOAT) OP_FUNC(prod, short_float, short float, *=) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) OP_FUNC(prod, short_float, opal_short_float_t, *=) #endif +#endif // 0 + OP_FUNC(prod, float, float, *=) OP_FUNC(prod, double, double, *=) OP_FUNC(prod, long_double, long double, *=) #if OMPI_HAVE_FORTRAN_REAL -OP_FUNC(prod, fortran_real, ompi_fortran_real_t, *=) +FORT_FLOAT_FUNC(prod, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC(prod, fortran_double_precision, ompi_fortran_double_precision_t, *=) +FORT_FLOAT_FUNC(prod, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC(prod, fortran_real2, ompi_fortran_real2_t, *=) +FORT_FLOAT_FUNC(prod, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC(prod, fortran_real4, ompi_fortran_real4_t, *=) +FORT_FLOAT_FUNC(prod, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC(prod, fortran_real8, ompi_fortran_real8_t, *=) +FORT_FLOAT_FUNC(prod, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC(prod, fortran_real16, ompi_fortran_real16_t, *=) +FORT_FLOAT_FUNC(prod, fortran_real16, ompi_fortran_real16_t) #endif /* Complex */ #if 0 @@ -405,7 +509,7 @@ FUNC_FUNC(land, unsigned_long, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC(land, fortran_logical, ompi_fortran_logical_t) +FORT_INT_FUNC(land, fortran_logical, ompi_fortran_logical_t) #endif /* C++ bool */ FUNC_FUNC(land, bool, bool) @@ -430,7 +534,7 @@ FUNC_FUNC(lor, unsigned_long, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC(lor, fortran_logical, ompi_fortran_logical_t) +FORT_INT_FUNC(lor, fortran_logical, ompi_fortran_logical_t) #endif /* C++ bool */ FUNC_FUNC(lor, bool, bool) @@ -456,7 +560,7 @@ FUNC_FUNC(lxor, unsigned_long, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC(lxor, fortran_logical, ompi_fortran_logical_t) +FORT_INT_FUNC(lxor, fortran_logical, ompi_fortran_logical_t) #endif /* C++ bool */ FUNC_FUNC(lxor, bool, bool) @@ -481,22 +585,22 @@ FUNC_FUNC(band, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC(band, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC(band, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC(band, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC(band, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC(band, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC(band, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC(band, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC(band, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC(band, 
fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC(band, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC(band, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC(band, fortran_integer16, ompi_fortran_integer16_t) #endif /* Byte */ FUNC_FUNC(band, byte, char) @@ -521,22 +625,22 @@ FUNC_FUNC(bor, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC(bor, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC(bor, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC(bor, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC(bor, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC(bor, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC(bor, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC(bor, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC(bor, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC(bor, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC(bor, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC(bor, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC(bor, fortran_integer16, ompi_fortran_integer16_t) #endif /* Byte */ FUNC_FUNC(bor, byte, char) @@ -561,51 +665,31 @@ FUNC_FUNC(bxor, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC(bxor, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC(bxor, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC(bxor, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC(bxor, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC(bxor, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC(bxor, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC(bxor, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC(bxor, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC(bxor, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC(bxor, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC(bxor, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC(bxor, fortran_integer16, ompi_fortran_integer16_t) #endif /* Byte */ FUNC_FUNC(bxor, byte, char) -/************************************************************************* - * Min and max location "pair" datatypes - *************************************************************************/ - -#if OMPI_HAVE_FORTRAN_REAL -LOC_STRUCT(2real, ompi_fortran_real_t, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_STRUCT(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_STRUCT(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) -#endif -LOC_STRUCT(float_int, float, int) -LOC_STRUCT(double_int, double, int) -LOC_STRUCT(long_int, long, int) -LOC_STRUCT(2int, int, int) -LOC_STRUCT(short_int, short, int) -LOC_STRUCT(long_double_int, long double, int) -LOC_STRUCT(unsigned_long, unsigned long, int) - /************************************************************************* * Max location *************************************************************************/ +#if 0 #if OMPI_HAVE_FORTRAN_REAL LOC_FUNC(maxloc, 2real, >) #endif 
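/*
 * For reference, the "pair" datatypes consumed by the LOC_FUNC invocations
 * above are generated by the LOC_STRUCT macro (declared in op_cuda_impl.h,
 * added further down in this patch).  A minimal, hand-expanded sketch of the
 * float_int case -- illustrative only, not the literal preprocessor output:
 */
typedef struct {
    float v;   /* value compared by maxloc/minloc */
    int   k;   /* index/rank carried along with the winning value */
} ompi_op_predefined_float_int_t;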
@@ -615,6 +699,7 @@ LOC_FUNC(maxloc, 2double_precision, >) #if OMPI_HAVE_FORTRAN_INTEGER LOC_FUNC(maxloc, 2integer, >) #endif +#endif // 0 LOC_FUNC(maxloc, float_int, >) LOC_FUNC(maxloc, double_int, >) LOC_FUNC(maxloc, long_int, >) @@ -625,7 +710,7 @@ LOC_FUNC(maxloc, long_double_int, >) /************************************************************************* * Min location *************************************************************************/ - +#if 0 #if OMPI_HAVE_FORTRAN_REAL LOC_FUNC(minloc, 2real, <) #endif @@ -635,6 +720,7 @@ LOC_FUNC(minloc, 2double_precision, <) #if OMPI_HAVE_FORTRAN_INTEGER LOC_FUNC(minloc, 2integer, <) #endif +#endif // 0 LOC_FUNC(minloc, float_int, <) LOC_FUNC(minloc, double_int, <) LOC_FUNC(minloc, long_int, <) @@ -643,27 +729,28 @@ LOC_FUNC(minloc, short_int, <) LOC_FUNC(minloc, long_double_int, <) - +#if 0 /* * This is a three buffer (2 input and 1 output) version of the reduction * routines, needed for some optimizations. */ #define OP_FUNC(name, type_name, type, op) \ - __global__ void \ + __device__ void \ ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type* in2, type *out, int n) { \ int i = blockIdx.x*blockDim.x + threadIdx.x; \ if (i < n) out[i] = in1[i] op in2[i]; \ } \ void ompi_op_cuda_3buff_##name##_##type_name##(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_cuda_module_1_0_0_t *module) { \ + struct ompi_op_base_module_1_0_0_t *module) { \ int threads = THREADS_PER_BLOCK; \ int blocks = *count / THREADS_PER_BLOCK; \ type *out_ = (type*)out; \ const type *in1_ = (const type*)in1; \ const type *in2_ = (const type*)in2; \ int n = *count; \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel<<>>(in1_, int2_, out_, n); \ + CUstream *stream = &mca_op_cuda_component.cu_stream; \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel<<>>(in1_, int2_, out_, n); \ } @@ -675,7 +762,7 @@ LOC_FUNC(minloc, long_double_int, <) * This macro is for (out = op(in1, in2)) */ #define FUNC_FUNC(name, type_name, type) \ - __global__ void \ + __device__ void \ ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, type *out, int n) { \ int i = blockIdx.x*blockDim.x + threadIdx.x; \ if (i < n) out[i] = current_func(in1[i], in2[i]); \ @@ -683,14 +770,15 @@ LOC_FUNC(minloc, long_double_int, <) static void \ ompi_op_cuda_3buff_##name##_##type_name##(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_cuda_module_1_0_0_t *module) { \ + struct ompi_op_base_module_1_0_0_t *module) { \ int threads = THREADS_PER_BLOCK; \ int blocks = *count / THREADS_PER_BLOCK; \ type *out_ = (type*)out; \ const type *in1_ = (const type*)in1; \ const type *in2_ = (const type*)in2; \ int n = *count; \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel<>(in1_, in2_, out_, n); \ + CUstream *stream = &mca_op_cuda_component.cu_stream; \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel<>(in1_, in2_, out_, n); \ } /* @@ -709,7 +797,7 @@ LOC_FUNC(minloc, long_double_int, <) */ #define LOC_FUNC(name, type_name, op) \ - __global__ void \ + __device__ void \ ompi_op_cuda_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in1, \ const ompi_op_predefined_##type_name##_t *in2, \ ompi_op_predefined_##type_name##_t *out, \ @@ -743,7 +831,8 @@ LOC_FUNC(minloc, long_double_int, <) const ompi_op_predefined_##type_name##_t *a1 = (const ompi_op_predefined_##type_name##_t*) in1; \ const 
ompi_op_predefined_##type_name##_t *a2 = (const ompi_op_predefined_##type_name##_t*) in2; \ ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel<>(a1, a2, b, n); \ + CUstream *stream = &mca_op_cuda_component.cu_stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel<>(a1, a2, b, n); \ } @@ -1282,7 +1371,7 @@ LOC_FUNC_3BUF(minloc, long_int, <) LOC_FUNC_3BUF(minloc, 2int, <) LOC_FUNC_3BUF(minloc, short_int, <) LOC_FUNC_3BUF(minloc, long_double_int, <) - +#endif // 0 /* * Helpful defines, because there's soooo many names! @@ -1294,16 +1383,16 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) /** C integer ***********************************************************/ #define C_INTEGER(name, ftype) \ - [OMPI_OP_CUDA_TYPE_INT8_T] = ompi_op_cuda_##ftype##_##name##_int8_t, \ - [OMPI_OP_CUDA_TYPE_UINT8_T] = ompi_op_cuda_##ftype##_##name##_uint8_t, \ - [OMPI_OP_CUDA_TYPE_INT16_T] = ompi_op_cuda_##ftype##_##name##_int16_t, \ - [OMPI_OP_CUDA_TYPE_UINT16_T] = ompi_op_cuda_##ftype##_##name##_uint16_t, \ - [OMPI_OP_CUDA_TYPE_INT32_T] = ompi_op_cuda_##ftype##_##name##_int32_t, \ - [OMPI_OP_CUDA_TYPE_UINT32_T] = ompi_op_cuda_##ftype##_##name##_uint32_t, \ - [OMPI_OP_CUDA_TYPE_INT64_T] = ompi_op_cuda_##ftype##_##name##_int64_t, \ - [OMPI_OP_CUDA_TYPE_LONG] = ompi_op_cuda_##ftype##_##name##_long, \ - [OMPI_OP_CUDA_TYPE_UNSIGNED_LONG] = ompi_op_cuda_##ftype##_##name##_unsigned_long, \ - [OMPI_OP_CUDA_TYPE_UINT64_T] = ompi_op_cuda_##ftype##_##name##_uint64_t + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_cuda_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_cuda_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_cuda_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_cuda_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_cuda_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_cuda_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_cuda_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_LONG] = ompi_op_cuda_##ftype##_##name##_long, \ + [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_cuda_##ftype##_##name##_unsigned_long, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_cuda_##ftype##_##name##_uint64_t /** All the Fortran integers ********************************************/ @@ -1339,12 +1428,12 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) #endif #define FORTRAN_INTEGER(name, ftype) \ - [OMPI_OP_CUDA_TYPE_INTEGER] = FORTRAN_INTEGER_PLAIN(name, ftype), \ - [OMPI_OP_CUDA_TYPE_INTEGER1] = FORTRAN_INTEGER1(name, ftype), \ - [OMPI_OP_CUDA_TYPE_INTEGER2] = FORTRAN_INTEGER2(name, ftype), \ - [OMPI_OP_CUDA_TYPE_INTEGER4] = FORTRAN_INTEGER4(name, ftype), \ - [OMPI_OP_CUDA_TYPE_INTEGER8] = FORTRAN_INTEGER8(name, ftype), \ - [OMPI_OP_CUDA_TYPE_INTEGER16] = FORTRAN_INTEGER16(name, ftype) + [OMPI_OP_BASE_TYPE_INTEGER] = FORTRAN_INTEGER_PLAIN(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER1] = FORTRAN_INTEGER1(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER2] = FORTRAN_INTEGER2(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER4] = FORTRAN_INTEGER4(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER8] = FORTRAN_INTEGER8(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER16] = FORTRAN_INTEGER16(name, ftype) /** All the Fortran reals ***********************************************/ @@ -1381,11 +1470,11 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) #endif #define FLOATING_POINT_FORTRAN_REAL(name, ftype) \ - [OMPI_OP_CUDA_TYPE_REAL] = FLOATING_POINT_FORTRAN_REAL_PLAIN(name, 
ftype), \ - [OMPI_OP_CUDA_TYPE_REAL2] = FLOATING_POINT_FORTRAN_REAL2(name, ftype), \ - [OMPI_OP_CUDA_TYPE_REAL4] = FLOATING_POINT_FORTRAN_REAL4(name, ftype), \ - [OMPI_OP_CUDA_TYPE_REAL8] = FLOATING_POINT_FORTRAN_REAL8(name, ftype), \ - [OMPI_OP_CUDA_TYPE_REAL16] = FLOATING_POINT_FORTRAN_REAL16(name, ftype) + [OMPI_OP_BASE_TYPE_REAL] = FLOATING_POINT_FORTRAN_REAL_PLAIN(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL2] = FLOATING_POINT_FORTRAN_REAL2(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL4] = FLOATING_POINT_FORTRAN_REAL4(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL8] = FLOATING_POINT_FORTRAN_REAL8(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL16] = FLOATING_POINT_FORTRAN_REAL16(name, ftype) /** Fortran double precision ********************************************/ @@ -1398,22 +1487,22 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) /** Floating point, including all the Fortran reals *********************/ -#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) -#define SHORT_FLOAT(name, ftype) ompi_op_cuda_##ftype##_##name##_short_float -#else +//#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) +//#define SHORT_FLOAT(name, ftype) ompi_op_cuda_##ftype##_##name##_short_float +//#else #define SHORT_FLOAT(name, ftype) NULL -#endif +//#endif #define FLOAT(name, ftype) ompi_op_cuda_##ftype##_##name##_float #define DOUBLE(name, ftype) ompi_op_cuda_##ftype##_##name##_double #define LONG_DOUBLE(name, ftype) ompi_op_cuda_##ftype##_##name##_long_double #define FLOATING_POINT(name, ftype) \ - [OMPI_OP_CUDA_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ - [OMPI_OP_CUDA_TYPE_FLOAT] = FLOAT(name, ftype), \ - [OMPI_OP_CUDA_TYPE_DOUBLE] = DOUBLE(name, ftype), \ + [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype), \ FLOATING_POINT_FORTRAN_REAL(name, ftype), \ - [OMPI_OP_CUDA_TYPE_DOUBLE_PRECISION] = FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype), \ - [OMPI_OP_CUDA_TYPE_LONG_DOUBLE] = LONG_DOUBLE(name, ftype) + [OMPI_OP_BASE_TYPE_DOUBLE_PRECISION] = FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype), \ + [OMPI_OP_BASE_TYPE_LONG_DOUBLE] = LONG_DOUBLE(name, ftype) /** Fortran logical *****************************************************/ @@ -1425,8 +1514,8 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) #endif #define LOGICAL(name, ftype) \ - [OMPI_OP_CUDA_TYPE_LOGICAL] = FORTRAN_LOGICAL(name, ftype), \ - [OMPI_OP_CUDA_TYPE_BOOL] = ompi_op_cuda_##ftype##_##name##_bool + [OMPI_OP_BASE_TYPE_LOGICAL] = FORTRAN_LOGICAL(name, ftype), \ + [OMPI_OP_BASE_TYPE_BOOL] = ompi_op_cuda_##ftype##_##name##_bool /** Complex *****************************************************/ #if 0 @@ -1439,6 +1528,12 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) #define FLOAT_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_float_complex #define DOUBLE_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_double_complex #define LONG_DOUBLE_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_long_double_complex +#else +#define SHORT_FLOAT_COMPLEX(name, ftype) NULL +#define FLOAT_COMPLEX(name, ftype) NULL +#define DOUBLE_COMPLEX(name, ftype) NULL +#define LONG_DOUBLE_COMPLEX(name, ftype) NULL +#endif // 0 #define COMPLEX(name, ftype) \ [OMPI_OP_CUDA_TYPE_C_SHORT_FLOAT_COMPLEX] = SHORT_FLOAT_COMPLEX(name, ftype), \ @@ -1446,28 +1541,27 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) [OMPI_OP_CUDA_TYPE_C_DOUBLE_COMPLEX] = DOUBLE_COMPLEX(name, ftype), \ [OMPI_OP_CUDA_TYPE_C_LONG_DOUBLE_COMPLEX] = 
LONG_DOUBLE_COMPLEX(name, ftype) -#endif // 0 - /** Byte ****************************************************************/ #define BYTE(name, ftype) \ - [OMPI_OP_CUDA_TYPE_BYTE] = ompi_op_cuda_##ftype##_##name##_byte + [OMPI_OP_BASE_TYPE_BYTE] = ompi_op_cuda_##ftype##_##name##_byte /** Fortran complex *****************************************************/ /** Fortran "2" types ***************************************************/ -#if OMPI_HAVE_FORTRAN_REAL -#define TWOLOC_FORTRAN_2REAL(name, ftype) ompi_op_cuda_##ftype##_##name##_2real +#if OMPI_HAVE_FORTRAN_REAL && OMPI_SIZEOF_FLOAT == OMPI_SIZEOF_FORTRAN_REAL +#define TWOLOC_FORTRAN_2REAL(name, ftype) ompi_op_cuda_##ftype##_##name##_2double_precision #else #define TWOLOC_FORTRAN_2REAL(name, ftype) NULL #endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION + +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION && OMPI_SIZEOF_DOUBLE == OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION #define TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype) ompi_op_cuda_##ftype##_##name##_2double_precision #else #define TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype) NULL #endif -#if OMPI_HAVE_FORTRAN_INTEGER -#define TWOLOC_FORTRAN_2INTEGER(name, ftype) ompi_op_cuda_##ftype##_##name##_2integer +#if OMPI_HAVE_FORTRAN_INTEGER && OMPI_SIZEOF_INT == OMPI_SIZEOF_FORTRAN_INTEGER +#define TWOLOC_FORTRAN_2INTEGER(name, ftype) ompi_op_cuda_##ftype##_##name##_2int #else #define TWOLOC_FORTRAN_2INTEGER(name, ftype) NULL #endif @@ -1475,15 +1569,15 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) /** All "2" types *******************************************************/ #define TWOLOC(name, ftype) \ - [OMPI_OP_CUDA_TYPE_2REAL] = TWOLOC_FORTRAN_2REAL(name, ftype), \ - [OMPI_OP_CUDA_TYPE_2DOUBLE_PRECISION] = TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype), \ - [OMPI_OP_CUDA_TYPE_2INTEGER] = TWOLOC_FORTRAN_2INTEGER(name, ftype), \ - [OMPI_OP_CUDA_TYPE_FLOAT_INT] = ompi_op_cuda_##ftype##_##name##_float_int, \ - [OMPI_OP_CUDA_TYPE_DOUBLE_INT] = ompi_op_cuda_##ftype##_##name##_double_int, \ - [OMPI_OP_CUDA_TYPE_LONG_INT] = ompi_op_cuda_##ftype##_##name##_long_int, \ - [OMPI_OP_CUDA_TYPE_2INT] = ompi_op_cuda_##ftype##_##name##_2int, \ - [OMPI_OP_CUDA_TYPE_SHORT_INT] = ompi_op_cuda_##ftype##_##name##_short_int, \ - [OMPI_OP_CUDA_TYPE_LONG_DOUBLE_INT] = ompi_op_cuda_##ftype##_##name##_long_double_int + [OMPI_OP_BASE_TYPE_2REAL] = TWOLOC_FORTRAN_2REAL(name, ftype), \ + [OMPI_OP_BASE_TYPE_2DOUBLE_PRECISION] = TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype), \ + [OMPI_OP_BASE_TYPE_2INTEGER] = TWOLOC_FORTRAN_2INTEGER(name, ftype), \ + [OMPI_OP_BASE_TYPE_FLOAT_INT] = ompi_op_cuda_##ftype##_##name##_float_int, \ + [OMPI_OP_BASE_TYPE_DOUBLE_INT] = ompi_op_cuda_##ftype##_##name##_double_int, \ + [OMPI_OP_BASE_TYPE_LONG_INT] = ompi_op_cuda_##ftype##_##name##_long_int, \ + [OMPI_OP_BASE_TYPE_2INT] = ompi_op_cuda_##ftype##_##name##_2int, \ + [OMPI_OP_BASE_TYPE_SHORT_INT] = ompi_op_cuda_##ftype##_##name##_short_int, \ + [OMPI_OP_BASE_TYPE_LONG_DOUBLE_INT] = ompi_op_cuda_##ftype##_##name##_long_double_int /* * MPI_OP_NULL @@ -1495,82 +1589,82 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) -ompi_op_cuda_handler_fn_t ompi_op_cuda_functions[OMPI_OP_CUDA_FORTRAN_OP_MAX][OMPI_OP_CUDA_TYPE_MAX] = +ompi_op_base_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = { /* Corresponds to MPI_OP_NULL */ - [OMPI_OP_CUDA_FORTRAN_NULL] = { + [OMPI_OP_BASE_FORTRAN_NULL] = { /* Leaving this empty puts in NULL for all 
entries */ NULL, }, /* Corresponds to MPI_MAX */ - [OMPI_OP_CUDA_FORTRAN_MAX] = { + [OMPI_OP_BASE_FORTRAN_MAX] = { C_INTEGER(max, 2buff), FORTRAN_INTEGER(max, 2buff), FLOATING_POINT(max, 2buff), }, /* Corresponds to MPI_MIN */ - [OMPI_OP_CUDA_FORTRAN_MIN] = { + [OMPI_OP_BASE_FORTRAN_MIN] = { C_INTEGER(min, 2buff), FORTRAN_INTEGER(min, 2buff), FLOATING_POINT(min, 2buff), }, /* Corresponds to MPI_SUM */ - [OMPI_OP_CUDA_FORTRAN_SUM] = { + [OMPI_OP_BASE_FORTRAN_SUM] = { C_INTEGER(sum, 2buff), FORTRAN_INTEGER(sum, 2buff), FLOATING_POINT(sum, 2buff), NULL, }, /* Corresponds to MPI_PROD */ - [OMPI_OP_CUDA_FORTRAN_PROD] = { + [OMPI_OP_BASE_FORTRAN_PROD] = { C_INTEGER(prod, 2buff), FORTRAN_INTEGER(prod, 2buff), FLOATING_POINT(prod, 2buff), NULL, }, /* Corresponds to MPI_LAND */ - [OMPI_OP_CUDA_FORTRAN_LAND] = { + [OMPI_OP_BASE_FORTRAN_LAND] = { C_INTEGER(land, 2buff), LOGICAL(land, 2buff), }, /* Corresponds to MPI_BAND */ - [OMPI_OP_CUDA_FORTRAN_BAND] = { + [OMPI_OP_BASE_FORTRAN_BAND] = { C_INTEGER(band, 2buff), FORTRAN_INTEGER(band, 2buff), BYTE(band, 2buff), }, /* Corresponds to MPI_LOR */ - [OMPI_OP_CUDA_FORTRAN_LOR] = { + [OMPI_OP_BASE_FORTRAN_LOR] = { C_INTEGER(lor, 2buff), LOGICAL(lor, 2buff), }, /* Corresponds to MPI_BOR */ - [OMPI_OP_CUDA_FORTRAN_BOR] = { + [OMPI_OP_BASE_FORTRAN_BOR] = { C_INTEGER(bor, 2buff), FORTRAN_INTEGER(bor, 2buff), BYTE(bor, 2buff), }, /* Corresponds to MPI_LXOR */ - [OMPI_OP_CUDA_FORTRAN_LXOR] = { + [OMPI_OP_BASE_FORTRAN_LXOR] = { C_INTEGER(lxor, 2buff), LOGICAL(lxor, 2buff), }, /* Corresponds to MPI_BXOR */ - [OMPI_OP_CUDA_FORTRAN_BXOR] = { + [OMPI_OP_BASE_FORTRAN_BXOR] = { C_INTEGER(bxor, 2buff), FORTRAN_INTEGER(bxor, 2buff), BYTE(bxor, 2buff), }, /* Corresponds to MPI_MAXLOC */ - [OMPI_OP_CUDA_FORTRAN_MAXLOC] = { + [OMPI_OP_BASE_FORTRAN_MAXLOC] = { TWOLOC(maxloc, 2buff), }, /* Corresponds to MPI_MINLOC */ - [OMPI_OP_CUDA_FORTRAN_MINLOC] = { + [OMPI_OP_BASE_FORTRAN_MINLOC] = { TWOLOC(minloc, 2buff), }, /* Corresponds to MPI_REPLACE */ - [OMPI_OP_CUDA_FORTRAN_REPLACE] = { + [OMPI_OP_BASE_FORTRAN_REPLACE] = { /* (MPI_ACCUMULATE is handled differently than the other reductions, so just zero out its function implementations here to ensure that users don't invoke @@ -1581,7 +1675,7 @@ ompi_op_cuda_handler_fn_t ompi_op_cuda_functions[OMPI_OP_CUDA_FORTRAN_OP_MAX][OM }; - +#if 0 ompi_op_base_3buff_handler_fn_t ompi_op_base_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = { /* Corresponds to MPI_OP_NULL */ @@ -1666,3 +1760,4 @@ ompi_op_base_3buff_handler_fn_t ompi_op_base_3buff_functions[OMPI_OP_BASE_FORTRA NULL, }, }; +#endif // 0 \ No newline at end of file diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu new file mode 100644 index 00000000000..ce4677b9047 --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -0,0 +1,1023 @@ +/* + * Copyright (c) 2019-2021 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include + +#include "op_cuda_impl.h" + +/* TODO: missing support for + * - short float (conditional on whether short float is available) + * - complex + * - 3buff implementation + */ + +#define THREADS_PER_BLOCK 512 + +#define OP_FUNC(name, type_name, type, op) \ + static __global__ void \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n; i += stride) { \ + inout[i] = inout[i] op in[i]; \ + } \ + } \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + CUstream stream) { \ + int threads = threads_per_block; \ + int blocks = (count + threads-1) / threads; \ + int n = count; \ + CUstream s = stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + } + + +#define FUNC_FUNC(name, type_name, type) \ + static __global__ void \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n; i += stride) { \ + inout[i] = current_func(inout[i], in[i]); \ + } \ + } \ + void \ + ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + CUstream stream) { \ + int threads = threads_per_block; \ + int blocks = (count + threads-1) / threads; \ + int n = count; \ + CUstream s = stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + } + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for minloc and maxloc + */ + +#define LOC_FUNC(name, type_name, op) \ + static __global__ void \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in, \ + ompi_op_predefined_##type_name##_t *inout, \ + int n) \ + { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n; i += stride) { \ + const ompi_op_predefined_##type_name##_t *a = &in[i]; \ + ompi_op_predefined_##type_name##_t *b = &inout[i]; \ + if (a->v op b->v) { \ + b->v = a->v; \ + b->k = a->k; \ + } else if (a->v == b->v) { \ + b->k = (b->k < a->k ? b->k : a->k); \ + } \ + } \ + } \ + void \ + ompi_op_cuda_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ + int threads_per_block, \ + CUstream stream) { \ + int threads = threads_per_block; \ + int blocks = (count + threads-1) / threads; \ + CUstream s = stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(a, b, count); \ + } + +/************************************************************************* + * Max + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC(max, int8_t, int8_t) +FUNC_FUNC(max, uint8_t, uint8_t) +FUNC_FUNC(max, int16_t, int16_t) +FUNC_FUNC(max, uint16_t, uint16_t) +FUNC_FUNC(max, int32_t, int32_t) +FUNC_FUNC(max, uint32_t, uint32_t) +FUNC_FUNC(max, int64_t, int64_t) +FUNC_FUNC(max, uint64_t, uint64_t) +FUNC_FUNC(max, long, long) +FUNC_FUNC(max, unsigned_long, unsigned long) + +FUNC_FUNC(max, float, float) +FUNC_FUNC(max, double, double) +FUNC_FUNC(max, long_double, long double) + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) +/* C integer */ +FUNC_FUNC(min, int8_t, int8_t) +FUNC_FUNC(min, uint8_t, uint8_t) +FUNC_FUNC(min, int16_t, int16_t) +FUNC_FUNC(min, uint16_t, uint16_t) +FUNC_FUNC(min, int32_t, int32_t) +FUNC_FUNC(min, uint32_t, uint32_t) +FUNC_FUNC(min, int64_t, int64_t) +FUNC_FUNC(min, uint64_t, uint64_t) +FUNC_FUNC(min, long, long) +FUNC_FUNC(min, unsigned_long, unsigned long) + + +FUNC_FUNC(min, float, float) +FUNC_FUNC(min, double, double) +FUNC_FUNC(min, long_double, long double) + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC(sum, int8_t, int8_t, +=) +OP_FUNC(sum, uint8_t, uint8_t, +=) +OP_FUNC(sum, int16_t, int16_t, +=) +OP_FUNC(sum, uint16_t, uint16_t, +=) +OP_FUNC(sum, int32_t, int32_t, +=) +OP_FUNC(sum, uint32_t, uint32_t, +=) +OP_FUNC(sum, int64_t, int64_t, +=) +OP_FUNC(sum, uint64_t, uint64_t, +=) +OP_FUNC(sum, long, long, +=) +OP_FUNC(sum, unsigned_long, unsigned long, +=) + +OP_FUNC(sum, float, float, +=) +OP_FUNC(sum, double, double, +=) +OP_FUNC(sum, long_double, long double, +=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +#endif +#endif // 0 +#undef current_func +#define current_func(a, b) (cuCmulf(a,b)) +FUNC_FUNC(sum, c_float_complex, cuFloatComplex) +#undef current_func +#define current_func(a, b) (cuCmul(a,b)) +FUNC_FUNC(sum, c_double_complex, cuDoubleComplex) +//OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC(prod, int8_t, int8_t, *=) +OP_FUNC(prod, uint8_t, uint8_t, *=) +OP_FUNC(prod, int16_t, int16_t, *=) +OP_FUNC(prod, uint16_t, uint16_t, *=) +OP_FUNC(prod, int32_t, int32_t, *=) +OP_FUNC(prod, uint32_t, uint32_t, *=) +OP_FUNC(prod, int64_t, int64_t, *=) +OP_FUNC(prod, uint64_t, uint64_t, *=) +OP_FUNC(prod, long, long, *=) +OP_FUNC(prod, unsigned_long, unsigned long, *=) + +OP_FUNC(prod, float, float, *=) +OP_FUNC(prod, double, double, *=) +OP_FUNC(prod, long_double, long double, *=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(prod, c_short_float_complex, short float _Complex, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC(prod, c_float_complex, float _Complex, *=) +OP_FUNC(prod, c_double_complex, double _Complex, *=) +OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) +#endif // 0 + 
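/*
 * To make the macro-generated code above concrete, here is a hand-expanded
 * sketch of FUNC_FUNC(max, int32_t, int32_t): a grid-stride device kernel plus
 * the host-side wrapper that launches it on the caller-supplied CUstream.
 * Illustrative only -- the real functions are produced by the preprocessor,
 * and the includes already pulled in by this file are assumed.
 */
static __global__ void
ompi_op_cuda_2buff_max_int32_t_kernel(const int32_t *in, int32_t *inout, int n)
{
    const int index  = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        /* current_func for max: keep the larger of the two operands */
        inout[i] = (inout[i] > in[i]) ? inout[i] : in[i];
    }
}

void ompi_op_cuda_2buff_max_int32_t_submit(const int32_t *in, int32_t *inout,
                                           int count, int threads_per_block,
                                           CUstream stream)
{
    int threads = threads_per_block;
    int blocks  = (count + threads - 1) / threads;   /* round up: one thread per element */
    ompi_op_cuda_2buff_max_int32_t_kernel<<<blocks, threads, 0, stream>>>(in, inout, count);
}
/*
 * Note that MPI_SUM over complex operands is element-wise addition, so the
 * complex entries in the Sum section above would be expected to define
 * current_func with cuCaddf/cuCadd rather than the cuCmulf/cuCmul shown there.
 */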
+/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC(land, int8_t, int8_t) +FUNC_FUNC(land, uint8_t, uint8_t) +FUNC_FUNC(land, int16_t, int16_t) +FUNC_FUNC(land, uint16_t, uint16_t) +FUNC_FUNC(land, int32_t, int32_t) +FUNC_FUNC(land, uint32_t, uint32_t) +FUNC_FUNC(land, int64_t, int64_t) +FUNC_FUNC(land, uint64_t, uint64_t) +FUNC_FUNC(land, long, long) +FUNC_FUNC(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC(lor, int8_t, int8_t) +FUNC_FUNC(lor, uint8_t, uint8_t) +FUNC_FUNC(lor, int16_t, int16_t) +FUNC_FUNC(lor, uint16_t, uint16_t) +FUNC_FUNC(lor, int32_t, int32_t) +FUNC_FUNC(lor, uint32_t, uint32_t) +FUNC_FUNC(lor, int64_t, int64_t) +FUNC_FUNC(lor, uint64_t, uint64_t) +FUNC_FUNC(lor, long, long) +FUNC_FUNC(lor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 1: 0)) +/* C integer */ +FUNC_FUNC(lxor, int8_t, int8_t) +FUNC_FUNC(lxor, uint8_t, uint8_t) +FUNC_FUNC(lxor, int16_t, int16_t) +FUNC_FUNC(lxor, uint16_t, uint16_t) +FUNC_FUNC(lxor, int32_t, int32_t) +FUNC_FUNC(lxor, uint32_t, uint32_t) +FUNC_FUNC(lxor, int64_t, int64_t) +FUNC_FUNC(lxor, uint64_t, uint64_t) +FUNC_FUNC(lxor, long, long) +FUNC_FUNC(lxor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC(band, int8_t, int8_t) +FUNC_FUNC(band, uint8_t, uint8_t) +FUNC_FUNC(band, int16_t, int16_t) +FUNC_FUNC(band, uint16_t, uint16_t) +FUNC_FUNC(band, int32_t, int32_t) +FUNC_FUNC(band, uint32_t, uint32_t) +FUNC_FUNC(band, int64_t, int64_t) +FUNC_FUNC(band, uint64_t, uint64_t) +FUNC_FUNC(band, long, long) +FUNC_FUNC(band, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC(bor, int8_t, int8_t) +FUNC_FUNC(bor, uint8_t, uint8_t) +FUNC_FUNC(bor, int16_t, int16_t) +FUNC_FUNC(bor, uint16_t, uint16_t) +FUNC_FUNC(bor, int32_t, int32_t) +FUNC_FUNC(bor, uint32_t, uint32_t) +FUNC_FUNC(bor, int64_t, int64_t) +FUNC_FUNC(bor, uint64_t, uint64_t) +FUNC_FUNC(bor, long, long) +FUNC_FUNC(bor, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + 
*************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C integer */ +FUNC_FUNC(bxor, int8_t, int8_t) +FUNC_FUNC(bxor, uint8_t, uint8_t) +FUNC_FUNC(bxor, int16_t, int16_t) +FUNC_FUNC(bxor, uint16_t, uint16_t) +FUNC_FUNC(bxor, int32_t, int32_t) +FUNC_FUNC(bxor, uint32_t, uint32_t) +FUNC_FUNC(bxor, int64_t, int64_t) +FUNC_FUNC(bxor, uint64_t, uint64_t) +FUNC_FUNC(bxor, long, long) +FUNC_FUNC(bxor, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC(bxor, byte, char) + +/************************************************************************* + * Max location + *************************************************************************/ + +LOC_FUNC(maxloc, float_int, >) +LOC_FUNC(maxloc, double_int, >) +LOC_FUNC(maxloc, long_int, >) +LOC_FUNC(maxloc, 2int, >) +LOC_FUNC(maxloc, short_int, >) +LOC_FUNC(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +LOC_FUNC(minloc, float_int, <) +LOC_FUNC(minloc, double_int, <) +LOC_FUNC(minloc, long_int, <) +LOC_FUNC(minloc, 2int, <) +LOC_FUNC(minloc, short_int, <) +LOC_FUNC(minloc, long_double_int, <) + + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define OP_FUNC_3BUF(name, type_name, type, op) \ + static __global__ void \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type* in2, \ + type *out, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n; i += stride) { \ + out[i] = in1[i] op in2[i]; \ + } \ + } \ + void ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \ + type *out, int count, \ + int threads_per_block, \ + CUstream stream) { \ + int threads = threads_per_block; \ + int blocks = (count+threads-1) / threads; \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel<<>>(in1, in2, out, count); \ + } + + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out = op(in1, in2)) + */ +#define FUNC_FUNC_3BUF(name, type_name, type) \ + static __global__ void \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, \ + type *out, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n; i += stride) { \ + out[i] = current_func(in1[i], in2[i]); \ + } \ + } \ + void \ + ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \ + type *out, int count, \ + int threads_per_block, \ + CUstream stream) { \ + int threads = threads_per_block; \ + int blocks = (count+threads-1) / threads; \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel<<>>(in1, in2, out, count); \ + } + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. 
+ * + * This macro is for minloc and maxloc + */ +/* +#define LOC_STRUCT(type_name, type1, type2) \ + typedef struct { \ + type1 v; \ + type2 k; \ + } ompi_op_predefined_##type_name##_t; +*/ + +#define LOC_FUNC_3BUF(name, type_name, op) \ + static __global__ void \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in1, \ + const ompi_op_predefined_##type_name##_t *in2, \ + ompi_op_predefined_##type_name##_t *out, \ + int n) \ + { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n; i += stride) { \ + const ompi_op_predefined_##type_name##_t *a1 = &in1[i]; \ + const ompi_op_predefined_##type_name##_t *a2 = &in2[i]; \ + ompi_op_predefined_##type_name##_t *b = &out[i]; \ + if (a1->v op a2->v) { \ + b->v = a1->v; \ + b->k = a1->k; \ + } else if (a1->v == a2->v) { \ + b->v = a1->v; \ + b->k = (a2->k < a1->k ? a2->k : a1->k); \ + } else { \ + b->v = a2->v; \ + b->k = a2->k; \ + } \ + } \ + } \ + void \ + ompi_op_cuda_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *in1, \ + const ompi_op_predefined_##type_name##_t *in2, \ + ompi_op_predefined_##type_name##_t *out, \ + int count, \ + int threads_per_block, \ + CUstream stream) \ + { \ + int threads = threads_per_block; \ + int blocks = (count+threads-1) / threads; \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel<<>>(in1, in2, out, count); \ + } + + +/************************************************************************* + * Max + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF(max, int8_t, int8_t) +FUNC_FUNC_3BUF(max, uint8_t, uint8_t) +FUNC_FUNC_3BUF(max, int16_t, int16_t) +FUNC_FUNC_3BUF(max, uint16_t, uint16_t) +FUNC_FUNC_3BUF(max, int32_t, int32_t) +FUNC_FUNC_3BUF(max, uint32_t, uint32_t) +FUNC_FUNC_3BUF(max, int64_t, int64_t) +FUNC_FUNC_3BUF(max, uint64_t, uint64_t) +FUNC_FUNC_3BUF(max, long, long) +FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(max, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF(max, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF(max, float, float) +FUNC_FUNC_3BUF(max, double, double) +FUNC_FUNC_3BUF(max, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 
+FUNC_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF(min, int8_t, int8_t) +FUNC_FUNC_3BUF(min, uint8_t, uint8_t) +FUNC_FUNC_3BUF(min, int16_t, int16_t) +FUNC_FUNC_3BUF(min, uint16_t, uint16_t) +FUNC_FUNC_3BUF(min, int32_t, int32_t) +FUNC_FUNC_3BUF(min, uint32_t, uint32_t) +FUNC_FUNC_3BUF(min, int64_t, int64_t) +FUNC_FUNC_3BUF(min, uint64_t, uint64_t) +FUNC_FUNC_3BUF(min, long, long) +FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF(min, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF(min, float, float) +FUNC_FUNC_3BUF(min, double, double) +FUNC_FUNC_3BUF(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF(sum, int8_t, int8_t, +) +OP_FUNC_3BUF(sum, uint8_t, uint8_t, +) +OP_FUNC_3BUF(sum, int16_t, int16_t, +) +OP_FUNC_3BUF(sum, uint16_t, uint16_t, +) +OP_FUNC_3BUF(sum, int32_t, int32_t, +) +OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) +OP_FUNC_3BUF(sum, int64_t, int64_t, +) +OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) +OP_FUNC_3BUF(sum, long, long, +) +OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF(sum, fortran_integer8, 
ompi_fortran_integer8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t, +) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF(sum, short_float, short float, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF(sum, short_float, opal_short_float_t, +) +#endif +OP_FUNC_3BUF(sum, float, float, +) +OP_FUNC_3BUF(sum, double, double, +) +OP_FUNC_3BUF(sum, long_double, long double, +) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t, +) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t, +) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF(sum, c_short_float_complex, short float _Complex, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +#endif // 0 +#undef current_func +#define current_func(a, b) (cuCmulf(a,b)) +FUNC_FUNC_3BUF(sum, c_float_complex, cuFloatComplex) +#undef current_func +#define current_func(a, b) (cuCmul(a,b)) +FUNC_FUNC_3BUF(sum, c_double_complex, cuDoubleComplex) +//OP_FUNC_3BUF(sum, c_long_double_complex, cuLongDoubleComplex, +) + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF(prod, int8_t, int8_t, *) +OP_FUNC_3BUF(prod, uint8_t, uint8_t, *) +OP_FUNC_3BUF(prod, int16_t, int16_t, *) +OP_FUNC_3BUF(prod, uint16_t, uint16_t, *) +OP_FUNC_3BUF(prod, int32_t, int32_t, *) +OP_FUNC_3BUF(prod, uint32_t, uint32_t, *) +OP_FUNC_3BUF(prod, int64_t, int64_t, *) +OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) +OP_FUNC_3BUF(prod, long, long, *) +OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t, *) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF(prod, short_float, short float, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF(prod, short_float, opal_short_float_t, *) +#endif +OP_FUNC_3BUF(prod, float, float, *) +OP_FUNC_3BUF(prod, double, double, *) +OP_FUNC_3BUF(prod, long_double, long double, *) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t, *) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF(prod, 
fortran_real2, ompi_fortran_real2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF(prod, fortran_real8, ompi_fortran_real8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF(prod, fortran_real16, ompi_fortran_real16_t, *) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) +OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) +OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC_3BUF(land, int8_t, int8_t) +FUNC_FUNC_3BUF(land, uint8_t, uint8_t) +FUNC_FUNC_3BUF(land, int16_t, int16_t) +FUNC_FUNC_3BUF(land, uint16_t, uint16_t) +FUNC_FUNC_3BUF(land, int32_t, int32_t) +FUNC_FUNC_3BUF(land, uint32_t, uint32_t) +FUNC_FUNC_3BUF(land, int64_t, int64_t) +FUNC_FUNC_3BUF(land, uint64_t, uint64_t) +FUNC_FUNC_3BUF(land, long, long) +FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC_3BUF(lor, int8_t, int8_t) +FUNC_FUNC_3BUF(lor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lor, int16_t, int16_t) +FUNC_FUNC_3BUF(lor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lor, int32_t, int32_t) +FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lor, int64_t, int64_t) +FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lor, long, long) +FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 
1: 0)) +/* C integer */ +FUNC_FUNC_3BUF(lxor, int8_t, int8_t) +FUNC_FUNC_3BUF(lxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lxor, int16_t, int16_t) +FUNC_FUNC_3BUF(lxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lxor, int32_t, int32_t) +FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lxor, int64_t, int64_t) +FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lxor, long, long) +FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC_3BUF(band, int8_t, int8_t) +FUNC_FUNC_3BUF(band, uint8_t, uint8_t) +FUNC_FUNC_3BUF(band, int16_t, int16_t) +FUNC_FUNC_3BUF(band, uint16_t, uint16_t) +FUNC_FUNC_3BUF(band, int32_t, int32_t) +FUNC_FUNC_3BUF(band, uint32_t, uint32_t) +FUNC_FUNC_3BUF(band, int64_t, int64_t) +FUNC_FUNC_3BUF(band, uint64_t, uint64_t) +FUNC_FUNC_3BUF(band, long, long) +FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC_3BUF(bor, int8_t, int8_t) +FUNC_FUNC_3BUF(bor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bor, int16_t, int16_t) +FUNC_FUNC_3BUF(bor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bor, int32_t, int32_t) +FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bor, int64_t, int64_t) +FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bor, long, long) +FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C 
integer */ +FUNC_FUNC_3BUF(bxor, int8_t, int8_t) +FUNC_FUNC_3BUF(bxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bxor, int16_t, int16_t) +FUNC_FUNC_3BUF(bxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bxor, int32_t, int32_t) +FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bxor, int64_t, int64_t) +FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bxor, long, long) +FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(bxor, byte, char) + +/************************************************************************* + * Min and max location "pair" datatypes + *************************************************************************/ + +/* +#if OMPI_HAVE_FORTRAN_REAL +LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) +#endif +LOC_STRUCT_3BUF(float_int, float, int) +LOC_STRUCT_3BUF(double_int, double, int) +LOC_STRUCT_3BUF(long_int, long, int) +LOC_STRUCT_3BUF(2int, int, int) +LOC_STRUCT_3BUF(short_int, short, int) +LOC_STRUCT_3BUF(long_double_int, long double, int) +*/ + +/************************************************************************* + * Max location + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(maxloc, 2real, >) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(maxloc, 2double_precision, >) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(maxloc, 2integer, >) +#endif +LOC_FUNC_3BUF(maxloc, float_int, >) +LOC_FUNC_3BUF(maxloc, double_int, >) +LOC_FUNC_3BUF(maxloc, long_int, >) +LOC_FUNC_3BUF(maxloc, 2int, >) +LOC_FUNC_3BUF(maxloc, short_int, >) +LOC_FUNC_3BUF(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(minloc, 2real, <) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(minloc, 2double_precision, <) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(minloc, 2integer, <) +#endif +LOC_FUNC_3BUF(minloc, float_int, <) +LOC_FUNC_3BUF(minloc, double_int, <) +LOC_FUNC_3BUF(minloc, long_int, <) +LOC_FUNC_3BUF(minloc, 2int, <) +LOC_FUNC_3BUF(minloc, short_int, <) +LOC_FUNC_3BUF(minloc, long_double_int, <) diff --git a/ompi/mca/op/cuda/op_cuda_impl.h b/ompi/mca/op/cuda/op_cuda_impl.h new file mode 100644 index 00000000000..e9b755305e8 --- /dev/null +++ b/ompi/mca/op/cuda/op_cuda_impl.h @@ -0,0 +1,915 @@ +/* + * Copyright (c) 2019-2021 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include +#include + +#ifndef BEGIN_C_DECLS +#if defined(c_plusplus) || defined(__cplusplus) +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else +# define BEGIN_C_DECLS /* empty */ +# define END_C_DECLS /* empty */ +#endif +#endif + +BEGIN_C_DECLS + +#define OP_FUNC_SIG(name, type_name, type, op) \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + CUstream stream); + +#define FUNC_FUNC_SIG(name, type_name, type) \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + CUstream stream); + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for minloc and maxloc + */ +#define LOC_STRUCT(type_name, type1, type2) \ + typedef struct { \ + type1 v; \ + type2 k; \ + } ompi_op_predefined_##type_name##_t; + +#define LOC_FUNC_SIG(name, type_name, op) \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ + int threads_per_block, \ + CUstream stream); + +/************************************************************************* + * Max + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_SIG(max, int8_t, int8_t) +FUNC_FUNC_SIG(max, uint8_t, uint8_t) +FUNC_FUNC_SIG(max, int16_t, int16_t) +FUNC_FUNC_SIG(max, uint16_t, uint16_t) +FUNC_FUNC_SIG(max, int32_t, int32_t) +FUNC_FUNC_SIG(max, uint32_t, uint32_t) +FUNC_FUNC_SIG(max, int64_t, int64_t) +FUNC_FUNC_SIG(max, uint64_t, uint64_t) +FUNC_FUNC_SIG(max, long, long) +FUNC_FUNC_SIG(max, unsigned_long, unsigned long) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_SIG(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_SIG(max, short_float, opal_short_float_t) +#endif +#endif // 0 + +FUNC_FUNC_SIG(max, float, float) +FUNC_FUNC_SIG(max, double, double) +FUNC_FUNC_SIG(max, long_double, long double) + +/************************************************************************* + * Min + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_SIG(min, int8_t, int8_t) +FUNC_FUNC_SIG(min, uint8_t, uint8_t) +FUNC_FUNC_SIG(min, int16_t, int16_t) +FUNC_FUNC_SIG(min, uint16_t, uint16_t) +FUNC_FUNC_SIG(min, int32_t, int32_t) +FUNC_FUNC_SIG(min, uint32_t, uint32_t) +FUNC_FUNC_SIG(min, int64_t, int64_t) +FUNC_FUNC_SIG(min, uint64_t, uint64_t) +FUNC_FUNC_SIG(min, long, long) +FUNC_FUNC_SIG(min, unsigned_long, unsigned long) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_SIG(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_SIG(min, short_float, opal_short_float_t) +#endif +#endif // 0 + +FUNC_FUNC_SIG(min, float, float) +FUNC_FUNC_SIG(min, double, double) +FUNC_FUNC_SIG(min, long_double, long double) + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_SIG(sum, int8_t, 
int8_t, +=) +OP_FUNC_SIG(sum, uint8_t, uint8_t, +=) +OP_FUNC_SIG(sum, int16_t, int16_t, +=) +OP_FUNC_SIG(sum, uint16_t, uint16_t, +=) +OP_FUNC_SIG(sum, int32_t, int32_t, +=) +OP_FUNC_SIG(sum, uint32_t, uint32_t, +=) +OP_FUNC_SIG(sum, int64_t, int64_t, +=) +OP_FUNC_SIG(sum, uint64_t, uint64_t, +=) +OP_FUNC_SIG(sum, long, long, +=) +OP_FUNC_SIG(sum, unsigned_long, unsigned long, +=) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_SIG(sum, short_float, short float, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_SIG(sum, short_float, opal_short_float_t, +=) +#endif +#endif // 0 + +OP_FUNC_SIG(sum, float, float, +=) +OP_FUNC_SIG(sum, double, double, +=) +OP_FUNC_SIG(sum, long_double, long double, +=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_SIG(sum, c_short_float_complex, short float _Complex, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +#endif +#endif // 0 +FUNC_FUNC_SIG(sum, c_float_complex, cuFloatComplex) +FUNC_FUNC_SIG(sum, c_double_complex, cuDoubleComplex) +//OP_FUNC_SIG(sum, c_float_complex, float _Complex, +=) +//OP_FUNC_SIG(sum, c_double_complex, double _Complex, +=) +//OP_FUNC_SIG(sum, c_long_double_complex, long double _Complex, +=) + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC_SIG(prod, int8_t, int8_t, *=) +OP_FUNC_SIG(prod, uint8_t, uint8_t, *=) +OP_FUNC_SIG(prod, int16_t, int16_t, *=) +OP_FUNC_SIG(prod, uint16_t, uint16_t, *=) +OP_FUNC_SIG(prod, int32_t, int32_t, *=) +OP_FUNC_SIG(prod, uint32_t, uint32_t, *=) +OP_FUNC_SIG(prod, int64_t, int64_t, *=) +OP_FUNC_SIG(prod, uint64_t, uint64_t, *=) +OP_FUNC_SIG(prod, long, long, *=) +OP_FUNC_SIG(prod, unsigned_long, unsigned long, *=) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_SIG(prod, short_float, short float, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_SIG(prod, short_float, opal_short_float_t, *=) +#endif +#endif // 0 + +OP_FUNC_SIG(prod, float, float, *=) +OP_FUNC_SIG(prod, double, double, *=) +OP_FUNC_SIG(prod, long_double, long double, *=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_SIG(prod, c_short_float_complex, short float _Complex, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_SIG(prod, c_float_complex, float _Complex, *=) +OP_FUNC_SIG(prod, c_double_complex, double _Complex, *=) +OP_FUNC_SIG(prod, c_long_double_complex, long double _Complex, *=) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC_SIG(land, int8_t, int8_t) +FUNC_FUNC_SIG(land, uint8_t, uint8_t) +FUNC_FUNC_SIG(land, int16_t, int16_t) +FUNC_FUNC_SIG(land, uint16_t, uint16_t) +FUNC_FUNC_SIG(land, int32_t, int32_t) +FUNC_FUNC_SIG(land, uint32_t, uint32_t) +FUNC_FUNC_SIG(land, int64_t, int64_t) +FUNC_FUNC_SIG(land, uint64_t, uint64_t) +FUNC_FUNC_SIG(land, long, long) +FUNC_FUNC_SIG(land, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC_SIG(land, bool, bool) + +/************************************************************************* + * Logical OR + 
*************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC_SIG(lor, int8_t, int8_t) +FUNC_FUNC_SIG(lor, uint8_t, uint8_t) +FUNC_FUNC_SIG(lor, int16_t, int16_t) +FUNC_FUNC_SIG(lor, uint16_t, uint16_t) +FUNC_FUNC_SIG(lor, int32_t, int32_t) +FUNC_FUNC_SIG(lor, uint32_t, uint32_t) +FUNC_FUNC_SIG(lor, int64_t, int64_t) +FUNC_FUNC_SIG(lor, uint64_t, uint64_t) +FUNC_FUNC_SIG(lor, long, long) +FUNC_FUNC_SIG(lor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC_SIG(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 1: 0)) +/* C integer */ +FUNC_FUNC_SIG(lxor, int8_t, int8_t) +FUNC_FUNC_SIG(lxor, uint8_t, uint8_t) +FUNC_FUNC_SIG(lxor, int16_t, int16_t) +FUNC_FUNC_SIG(lxor, uint16_t, uint16_t) +FUNC_FUNC_SIG(lxor, int32_t, int32_t) +FUNC_FUNC_SIG(lxor, uint32_t, uint32_t) +FUNC_FUNC_SIG(lxor, int64_t, int64_t) +FUNC_FUNC_SIG(lxor, uint64_t, uint64_t) +FUNC_FUNC_SIG(lxor, long, long) +FUNC_FUNC_SIG(lxor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC_SIG(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC_SIG(band, int8_t, int8_t) +FUNC_FUNC_SIG(band, uint8_t, uint8_t) +FUNC_FUNC_SIG(band, int16_t, int16_t) +FUNC_FUNC_SIG(band, uint16_t, uint16_t) +FUNC_FUNC_SIG(band, int32_t, int32_t) +FUNC_FUNC_SIG(band, uint32_t, uint32_t) +FUNC_FUNC_SIG(band, int64_t, int64_t) +FUNC_FUNC_SIG(band, uint64_t, uint64_t) +FUNC_FUNC_SIG(band, long, long) +FUNC_FUNC_SIG(band, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC_SIG(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC_SIG(bor, int8_t, int8_t) +FUNC_FUNC_SIG(bor, uint8_t, uint8_t) +FUNC_FUNC_SIG(bor, int16_t, int16_t) +FUNC_FUNC_SIG(bor, uint16_t, uint16_t) +FUNC_FUNC_SIG(bor, int32_t, int32_t) +FUNC_FUNC_SIG(bor, uint32_t, uint32_t) +FUNC_FUNC_SIG(bor, int64_t, int64_t) +FUNC_FUNC_SIG(bor, uint64_t, uint64_t) +FUNC_FUNC_SIG(bor, long, long) +FUNC_FUNC_SIG(bor, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC_SIG(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C integer */ +FUNC_FUNC_SIG(bxor, int8_t, int8_t) +FUNC_FUNC_SIG(bxor, uint8_t, uint8_t) +FUNC_FUNC_SIG(bxor, int16_t, int16_t) +FUNC_FUNC_SIG(bxor, uint16_t, uint16_t) +FUNC_FUNC_SIG(bxor, int32_t, int32_t) +FUNC_FUNC_SIG(bxor, uint32_t, uint32_t) +FUNC_FUNC_SIG(bxor, int64_t, int64_t) +FUNC_FUNC_SIG(bxor, uint64_t, uint64_t) +FUNC_FUNC_SIG(bxor, long, long) +FUNC_FUNC_SIG(bxor, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC_SIG(bxor, byte, char) + +/************************************************************************* + * Min and max location "pair" datatypes + 
*************************************************************************/ + +LOC_STRUCT(float_int, float, int) +LOC_STRUCT(double_int, double, int) +LOC_STRUCT(long_int, long, int) +LOC_STRUCT(2int, int, int) +LOC_STRUCT(short_int, short, int) +LOC_STRUCT(long_double_int, long double, int) +LOC_STRUCT(unsigned_long, unsigned long, int) +/* compat types for Fortran */ +LOC_STRUCT(2real, float, float) +LOC_STRUCT(2double_precision, double, double) + +/************************************************************************* + * Max location + *************************************************************************/ + +LOC_FUNC_SIG(maxloc, float_int, >) +LOC_FUNC_SIG(maxloc, double_int, >) +LOC_FUNC_SIG(maxloc, long_int, >) +LOC_FUNC_SIG(maxloc, 2int, >) +LOC_FUNC_SIG(maxloc, short_int, >) +LOC_FUNC_SIG(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +LOC_FUNC_SIG(minloc, float_int, <) +LOC_FUNC_SIG(minloc, double_int, <) +LOC_FUNC_SIG(minloc, long_int, <) +LOC_FUNC_SIG(minloc, 2int, <) +LOC_FUNC_SIG(minloc, short_int, <) +LOC_FUNC_SIG(minloc, long_double_int, <) + + + +#define OP_FUNC_3BUF_SIG(name, type_name, type, op) \ + void ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, \ + const type *in2, \ + type *inout, \ + int count, \ + int threads_per_block, \ + CUstream stream); + +#define FUNC_FUNC_3BUF_SIG(name, type_name, type) \ + void ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, \ + const type *in2, \ + type *inout, \ + int count, \ + int threads_per_block, \ + CUstream stream); + +#define LOC_FUNC_3BUF_SIG(name, type_name, op) \ + void ompi_op_cuda_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a1, \ + const ompi_op_predefined_##type_name##_t *a2, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ + int threads_per_block, \ + CUstream stream); + + +/************************************************************************* + * Max + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(max, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(max, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(max, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(max, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(max, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(max, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(max, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(max, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(max, long, long) +FUNC_FUNC_3BUF_SIG(max, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(max, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(max, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(max, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(max, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(max, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(max, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF_SIG(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF_SIG(max, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF_SIG(max, float, float) +FUNC_FUNC_3BUF_SIG(max, double, double) +FUNC_FUNC_3BUF_SIG(max, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF_SIG(max, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF_SIG(max, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF_SIG(max, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF_SIG(max, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF_SIG(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF_SIG(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(min, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(min, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(min, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(min, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(min, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(min, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(min, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(min, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(min, long, long) +FUNC_FUNC_3BUF_SIG(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(min, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF_SIG(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF_SIG(min, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF_SIG(min, float, float) +FUNC_FUNC_3BUF_SIG(min, double, double) +FUNC_FUNC_3BUF_SIG(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF_SIG(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF_SIG(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF_SIG(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF_SIG(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF_SIG(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF_SIG(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF_SIG(sum, int8_t, int8_t, +) +OP_FUNC_3BUF_SIG(sum, uint8_t, uint8_t, +) +OP_FUNC_3BUF_SIG(sum, int16_t, int16_t, +) +OP_FUNC_3BUF_SIG(sum, uint16_t, uint16_t, +) +OP_FUNC_3BUF_SIG(sum, int32_t, int32_t, +) +OP_FUNC_3BUF_SIG(sum, uint32_t, uint32_t, +) +OP_FUNC_3BUF_SIG(sum, int64_t, int64_t, +) +OP_FUNC_3BUF_SIG(sum, uint64_t, uint64_t, +) +OP_FUNC_3BUF_SIG(sum, long, long, +) +OP_FUNC_3BUF_SIG(sum, unsigned_long, unsigned long, +) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF_SIG(sum, fortran_integer, ompi_fortran_integer_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF_SIG(sum, fortran_integer1, ompi_fortran_integer1_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF_SIG(sum, fortran_integer2, ompi_fortran_integer2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF_SIG(sum, fortran_integer4, ompi_fortran_integer4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF_SIG(sum, fortran_integer8, ompi_fortran_integer8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC_3BUF_SIG(sum, fortran_integer16, ompi_fortran_integer16_t, +) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF_SIG(sum, short_float, short float, +) 
+#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF_SIG(sum, short_float, opal_short_float_t, +) +#endif +OP_FUNC_3BUF_SIG(sum, float, float, +) +OP_FUNC_3BUF_SIG(sum, double, double, +) +OP_FUNC_3BUF_SIG(sum, long_double, long double, +) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF_SIG(sum, fortran_real, ompi_fortran_real_t, +) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF_SIG(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF_SIG(sum, fortran_real2, ompi_fortran_real2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF_SIG(sum, fortran_real4, ompi_fortran_real4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF_SIG(sum, fortran_real8, ompi_fortran_real8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF_SIG(sum, fortran_real16, ompi_fortran_real16_t, +) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF_SIG(sum, c_short_float_complex, short float _Complex, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF_SIG(sum, c_float_complex, float _Complex, +) +OP_FUNC_3BUF_SIG(sum, c_double_complex, double _Complex, +) +OP_FUNC_3BUF_SIG(sum, c_long_double_complex, long double _Complex, +) +#endif // 0 + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF_SIG(prod, int8_t, int8_t, *) +OP_FUNC_3BUF_SIG(prod, uint8_t, uint8_t, *) +OP_FUNC_3BUF_SIG(prod, int16_t, int16_t, *) +OP_FUNC_3BUF_SIG(prod, uint16_t, uint16_t, *) +OP_FUNC_3BUF_SIG(prod, int32_t, int32_t, *) +OP_FUNC_3BUF_SIG(prod, uint32_t, uint32_t, *) +OP_FUNC_3BUF_SIG(prod, int64_t, int64_t, *) +OP_FUNC_3BUF_SIG(prod, uint64_t, uint64_t, *) +OP_FUNC_3BUF_SIG(prod, long, long, *) +OP_FUNC_3BUF_SIG(prod, unsigned_long, unsigned long, *) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF_SIG(prod, fortran_integer, ompi_fortran_integer_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF_SIG(prod, fortran_integer1, ompi_fortran_integer1_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF_SIG(prod, fortran_integer2, ompi_fortran_integer2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF_SIG(prod, fortran_integer4, ompi_fortran_integer4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF_SIG(prod, fortran_integer8, ompi_fortran_integer8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC_3BUF_SIG(prod, fortran_integer16, ompi_fortran_integer16_t, *) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF_SIG(prod, short_float, short float, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF_SIG(prod, short_float, opal_short_float_t, *) +#endif +OP_FUNC_3BUF_SIG(prod, float, float, *) +OP_FUNC_3BUF_SIG(prod, double, double, *) +OP_FUNC_3BUF_SIG(prod, long_double, long double, *) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF_SIG(prod, fortran_real, ompi_fortran_real_t, *) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF_SIG(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF_SIG(prod, fortran_real2, ompi_fortran_real2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF_SIG(prod, fortran_real4, ompi_fortran_real4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF_SIG(prod, fortran_real8, 
ompi_fortran_real8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF_SIG(prod, fortran_real16, ompi_fortran_real16_t, *) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF_SIG(prod, c_short_float_complex, short float _Complex, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF_SIG(prod, c_float_complex, float _Complex, *) +OP_FUNC_3BUF_SIG(prod, c_double_complex, double _Complex, *) +OP_FUNC_3BUF_SIG(prod, c_long_double_complex, long double _Complex, *) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(land, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(land, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(land, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(land, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(land, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(land, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(land, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(land, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(land, long, long) +FUNC_FUNC_3BUF_SIG(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF_SIG(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF_SIG(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(lor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(lor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(lor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(lor, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(lor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(lor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(lor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(lor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(lor, long, long) +FUNC_FUNC_3BUF_SIG(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF_SIG(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF_SIG(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 
1: 0)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(lxor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(lxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(lxor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(lxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(lxor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(lxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(lxor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(lxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(lxor, long, long) +FUNC_FUNC_3BUF_SIG(lxor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF_SIG(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF_SIG(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(band, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(band, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(band, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(band, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(band, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(band, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(band, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(band, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(band, long, long) +FUNC_FUNC_3BUF_SIG(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF_SIG(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(bor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(bor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(bor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(bor, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(bor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(bor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(bor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(bor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(bor, long, long) +FUNC_FUNC_3BUF_SIG(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF_SIG(bor, byte, char) + 
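
The 3-buffer declarations above are host-callable launch wrappers only; their definitions are generated by the corresponding macros in op_cuda_impl.cu, which this series adds but which is not quoted in these hunks. As a rough illustration of the convention, a declaration such as FUNC_FUNC_3BUF_SIG(bor, int32_t, int32_t) pairs the wrapper signature shown above with an elementwise device kernel along the following lines. This is a hedged sketch only: the kernel name, the grid-stride loop, and the block-count computation are illustrative assumptions, not the actual generated code.

    /* Sketch of one generated 3-buffer op: inout[i] = in1[i] | in2[i].
     * Kernel name and launch geometry are assumptions for illustration. */
    #include <stdint.h>
    #include <cuda.h>

    __global__ void bor_int32_kernel(const int32_t *in1, const int32_t *in2,
                                     int32_t *inout, int n)
    {
        /* grid-stride loop so any launch geometry covers all n elements */
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
             i += blockDim.x * gridDim.x) {
            inout[i] = in1[i] | in2[i];
        }
    }

    /* matches FUNC_FUNC_3BUF_SIG(bor, int32_t, int32_t) declared above */
    void ompi_op_cuda_3buff_bor_int32_t_submit(const int32_t *in1, const int32_t *in2,
                                               int32_t *inout, int count,
                                               int threads_per_block, CUstream stream)
    {
        int nblocks = (count + threads_per_block - 1) / threads_per_block;
        /* enqueue on the caller-provided stream; no synchronization here */
        bor_int32_kernel<<<nblocks, threads_per_block, 0, stream>>>(in1, in2, inout, count);
    }
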
+/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C integer */ +FUNC_FUNC_3BUF_SIG(bxor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(bxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(bxor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(bxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(bxor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(bxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(bxor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(bxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(bxor, long, long) +FUNC_FUNC_3BUF_SIG(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF_SIG(bxor, byte, char) + +/************************************************************************* + * Max location + *************************************************************************/ + +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF_SIG(maxloc, 2real, >) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF_SIG(maxloc, 2double_precision, >) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF_SIG(maxloc, 2integer, >) +#endif +#endif // 0 +LOC_FUNC_3BUF_SIG(maxloc, float_int, >) +LOC_FUNC_3BUF_SIG(maxloc, double_int, >) +LOC_FUNC_3BUF_SIG(maxloc, long_int, >) +LOC_FUNC_3BUF_SIG(maxloc, 2int, >) +LOC_FUNC_3BUF_SIG(maxloc, short_int, >) +LOC_FUNC_3BUF_SIG(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF_SIG(minloc, 2real, <) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF_SIG(minloc, 2double_precision, <) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF_SIG(minloc, 2integer, <) +#endif +#endif // 0 +LOC_FUNC_3BUF_SIG(minloc, float_int, <) +LOC_FUNC_3BUF_SIG(minloc, double_int, <) +LOC_FUNC_3BUF_SIG(minloc, long_int, <) +LOC_FUNC_3BUF_SIG(minloc, 2int, <) +LOC_FUNC_3BUF_SIG(minloc, short_int, <) +LOC_FUNC_3BUF_SIG(minloc, long_double_int, <) + +END_C_DECLS diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h index 34d26376ab9..3da811788e6 100644 --- a/ompi/mca/op/op.h +++ b/ompi/mca/op/op.h @@ -376,6 +376,8 @@ typedef struct ompi_op_base_module_1_0_0_t { is being used for */ struct ompi_op_t *opm_op; + bool opm_device_enabled; + /** Function pointers for all the different datatypes to be used with the MPI_Op that this module is used with */ ompi_op_base_handler_fn_1_0_0_t opm_fns[OMPI_OP_BASE_TYPE_MAX]; diff --git a/ompi/op/op.c b/ompi/op/op.c index 45abed1d2fc..a75d6b33d5b 100644 --- a/ompi/op/op.c +++ b/ompi/op/op.c @@ -510,19 +510,16 @@ static void ompi_op_destruct(ompi_op_t *op) if (op->o_device_op != NULL) { for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - if( NULL != 
op->o_device_op->o_intrisic.modules[i] ) { - OBJ_RELEASE(op->o_device_op->o_intrisic.modules[i]); - op->o_device_op->o_intrisic->modules[i] = NULL; + if( NULL != op->o_device_op->do_intrinsic.modules[i] ) { + OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]); + op->o_device_op->do_intrinsic.modules[i] = NULL; } - if( NULL != op->o_device_op->o_3buff_intrisic.modules[i] ) { - OBJ_RELEASE(op->o_device_op->o_3buff_intrisic.modules[i]); - op->o_device_op->o_3buff_intrisic->modules[i] = NULL; + if( NULL != op->o_device_op->do_3buff_intrinsic.modules[i] ) { + OBJ_RELEASE(op->o_device_op->do_3buff_intrinsic.modules[i]); + op->o_device_op->do_3buff_intrinsic.modules[i] = NULL; } } + free(op->o_device_op); + op->o_device_op = NULL; } - if (op->o_device_op) { - OBJ_RELEASE(op->do_stream); - op->o_device_op->do_stream = NULL; - } - free(op->o_device_op); } diff --git a/ompi/op/op.h b/ompi/op/op.h index 3ff6c3aac47..08b3927991b 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -126,8 +126,8 @@ enum ompi_op_type { /* device op information */ struct ompi_device_op_t { opal_accelerator_stream_t *do_stream; - ompi_op_base_op_fns_t do_intrisic; - ompi_op_base_op_3buff_fns_t do_3buff_intrisic; + ompi_op_base_op_fns_t do_intrinsic; + ompi_op_base_op_3buff_fns_t do_3buff_intrinsic; }; typedef struct ompi_device_op_t ompi_device_op_t; @@ -576,14 +576,14 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, bool use_device_op = false; int source_dev_id, target_dev_id; uint64_t source_flags, target_flags; + int target_check_addr = opal_accelerator.check_addr(target, &target_dev_id, &target_flags); + int source_check_addr = opal_accelerator.check_addr(source, &source_dev_id, &source_flags); /* check if either of the buffers is on a device and if so make sure we can * access handle it properly */ - if (opal_accelerator.check_addr(source, &source_dev_id, &source_flags) > 0 || - opal_accelerator.check_addr(target, &target_dev_id, &target_flags) > 0) { + if (target_check_addr > 0 || source_check_addr > 0) { if (ompi_datatype_is_predefined(dtype) && - source_dev_id == target_dev_id && 0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC) && - NULL == op->o_device_intrisic) { + NULL != op->o_device_op) { use_device_op = true; } else { /* TODO: can we be more graceful here? */ @@ -601,12 +601,12 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, dtype_id = ompi_op_ddt_map[dtype->id]; } if (use_device_op) { - if (NULL == op->o_device_intrisic) { + if (NULL == op->o_device_op) { abort(); // TODO: be more graceful! 
} - op->o_device_intrisic->intrinsic.fns[dtype_id](source, target, - &count, &dtype, - op->o_device_intrisic->intrinsic.modules[dtype_id]); + op->o_device_op->do_intrinsic.fns[dtype_id](source, target, + &count, &dtype, + op->o_device_op->do_intrinsic.modules[dtype_id]); } else { op->o_func.intrinsic.fns[dtype_id](source, target, &count, &dtype, diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 49d181a0b00..60e432841c0 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -77,6 +77,26 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_buffer_id }; +static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { + /* query the device from the context */ + int dev_id = -1; + CUdevice ptr_dev; + int num_devices; + cuCtxPushCurrent(mem_ctx); + cuCtxGetDevice(&ptr_dev); + cuDeviceGetCount(&num_devices); + for (int i = 0; i < num_devices; ++i) { + CUdevice dev; + cuDeviceGet(&dev, i); + if (dev == ptr_dev) { + dev_id = i; + break; + } + } + cuCtxPopCurrent(&mem_ctx); + return dev_id; +} + static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags) { CUresult result; @@ -125,6 +145,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (0 == mem_type) { /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ return 0; + } else { + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -140,6 +163,10 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (CU_MEMORYTYPE_HOST == mem_type) { /* Host memory, nothing to do here */ return 0; + } else { + result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -187,7 +214,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } } - /* WORKAROUND - They are times when the above code determines a pice of memory + /* WORKAROUND - There are times when the above code determines a pice of memory * is GPU memory, but it actually is not. That has been seen on multi-GPU systems * with 6 or 8 GPUs on them. Therefore, we will do this extra check. Note if we * made it this far, then the assumption at this point is we have GPU memory. 
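
The device path in ompi_op_reduce above calls do_intrinsic.fns[dtype_id] with the same handler signature as the host path. The per-type glue that turns that call into a kernel launch lives in op_cuda_functions.c, which appears only in fragments in the later hunks. A minimal sketch of its expected shape follows, assuming a default block size of 512 and omitting the device_op_pre()/device_op_post() buffer staging shown in the later op_cuda_functions.c hunks; only the _submit entry point and mca_op_cuda_component.cu_stream are taken from the series itself, the other names are illustrative.

    /* Hedged sketch of a generated 2-buffer handler in op_cuda_functions.c. */
    #include <stdint.h>
    #include <cuda.h>
    #include "ompi/mca/op/cuda/op_cuda.h"      /* mca_op_cuda_component */
    #include "ompi/mca/op/cuda/op_cuda_impl.h" /* ..._submit declarations */

    static void ompi_op_cuda_2buff_sum_int32_t(const void *in, void *out,
                                               int *count,
                                               struct ompi_datatype_t **dtype,
                                               struct ompi_op_base_module_1_0_0_t *module)
    {
        (void)dtype; (void)module;                 /* type is fixed by the table slot */
        int threads_per_block = 512;               /* assumed default */
        CUstream stream = mca_op_cuda_component.cu_stream;

        /* enqueue the elementwise sum kernel declared in op_cuda_impl.h */
        ompi_op_cuda_2buff_sum_int32_t_submit((const int32_t *)in, (int32_t *)out,
                                              *count, threads_per_block, stream);

        /* ompi_op_reduce is synchronous, so wait for the kernel to finish */
        cuStreamSynchronize(stream);
    }
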
From 164388aa5100447d452eea3b6720199526619497 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 14 Mar 2023 15:28:45 -0400 Subject: [PATCH 03/74] Update copyright header Signed-off-by: Joseph Schuchart --- ompi/mca/op/base/op_base_frame.c | 2 +- ompi/mca/op/base/op_base_op_select.c | 2 +- ompi/mca/op/cuda/configure.m4 | 2 +- ompi/mca/op/cuda/op_cuda.h | 2 +- ompi/mca/op/cuda/op_cuda_component.c | 2 +- ompi/mca/op/cuda/op_cuda_functions.c | 2 +- ompi/mca/op/cuda/op_cuda_impl.cu | 2 +- ompi/mca/op/cuda/op_cuda_impl.h | 2 +- ompi/op/op.h | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ompi/mca/op/base/op_base_frame.c b/ompi/mca/op/base/op_base_frame.c index 9cde5589694..1a7d6dc1320 100644 --- a/ompi/mca/op/base/op_base_frame.c +++ b/ompi/mca/op/base/op_base_frame.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/op/base/op_base_op_select.c b/ompi/mca/op/base/op_base_op_select.c index 5b26df1a0ca..a5040d1bdca 100644 --- a/ompi/mca/op/base/op_base_op_select.c +++ b/ompi/mca/op/base/op_base_op_select.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4 index 9c5c4794fba..91617ba4ecb 100644 --- a/ompi/mca/op/cuda/configure.m4 +++ b/ompi/mca/op/cuda/configure.m4 @@ -1,7 +1,7 @@ # -*- shell-script -*- # # Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. -# Copyright (c) 2013 The University of Tennessee and The University +# Copyright (c) 2023 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2022 Amazon.com, Inc. or its affiliates. diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index bbc16d26b25..cff3bbb55ef 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 The University of Tennessee and The University + * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT$ diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index d61f99b06e8..738b1a5284d 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 The University of Tennessee and The University + * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2020 Research Organization for Information Science diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 4ef46be783c..96b58fcf97c 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 The University of Tennessee and The University + * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2020 Research Organization for Information Science diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index ce4677b9047..2045d6a4aaa 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 The University of Tennessee and The University + * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2020 Research Organization for Information Science diff --git a/ompi/mca/op/cuda/op_cuda_impl.h b/ompi/mca/op/cuda/op_cuda_impl.h index e9b755305e8..9be5ec8b9f3 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.h +++ b/ompi/mca/op/cuda/op_cuda_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 The University of Tennessee and The University + * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2020 Research Organization for Information Science diff --git a/ompi/op/op.h b/ompi/op/op.h index 08b3927991b..a080153f400 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, From d8110ace270b4c0180bc58794ae295eb2a1204d0 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 14 Mar 2023 18:01:19 -0400 Subject: [PATCH 04/74] Fix minor bugs to get osu_allreduce working Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 87 +++++++++++++++++----- ompi/mca/coll/basic/coll_basic_allreduce.c | 20 ++++- ompi/mca/op/cuda/op_cuda_functions.c | 16 +++- 3 files changed, 97 insertions(+), 26 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 30ab0a4f869..ef94817c8a2 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -42,6 +42,51 @@ #include "coll_base_topo.h" #include "coll_base_util.h" +#include "opal/mca/accelerator/accelerator.h" + +/* returns a pointer to memory in the same memory domain as ptr, and the device. + * If memory is allocated on the host, device will be set to -1. 
*/ +static inline +void* allocate_tmpbuf(const void *sendbuf, const void *recvbuf, size_t size, int *device) { + void *res = NULL; + uint64_t flags; + *device = -1; + /* if the recvbuf is on the device we take that device */ + if (NULL != recvbuf && 0 < opal_accelerator.check_addr(recvbuf, device, &flags)) { + //printf("Allocating temporary buffer on device %d\n", *device); + if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { + /* fall back to the host */ + res = NULL; + *device = -1; + } + } else if (MPI_IN_PLACE != sendbuf && NULL != sendbuf && + 0 < opal_accelerator.check_addr(sendbuf, device, &flags)) { + /* send buffer is on a device so try to allocate memory there */ + //printf("Allocating temporary buffer on device %d\n", *device); + if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { + /* fall back to the host */ + res = NULL; + *device = -1; + } + } + + if (NULL == res) { + //printf("Allocating temporary buffer on host\n"); + res = malloc(size); + } + return res; +} + +static inline +void free_tmpbuf(void *tmpbuf, int device) { + if (-1 == device) { + free(tmpbuf); + } else if (NULL != tmpbuf) { + opal_accelerator.mem_release(device, tmpbuf); + } +} + + /* * ompi_coll_base_allreduce_intra_nonoverlapping * @@ -140,6 +185,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf_free = NULL, *inplacebuf; + int inplacebuf_dev; ptrdiff_t span, gap = 0; size = ompi_comm_size(comm); @@ -159,7 +205,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Allocate and initialize temporary send buffer */ span = opal_datatype_span(&dtype->super, count, &gap); - inplacebuf_free = (char*) malloc(span); + inplacebuf_free = allocate_tmpbuf(sbuf, rbuf, span, &inplacebuf_dev); if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } inplacebuf = inplacebuf_free - gap; @@ -265,14 +311,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (ret < 0) { line = __LINE__; goto error_hndl; } } - if (NULL != inplacebuf_free) free(inplacebuf_free); + free_tmpbuf(inplacebuf_free, inplacebuf_dev); return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); (void)line; // silence compiler warning - if (NULL != inplacebuf_free) free(inplacebuf_free); + free_tmpbuf(inplacebuf_free, inplacebuf_dev); return ret; } @@ -349,6 +395,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, { int ret, line, rank, size, k, recv_from, send_to, block_count, inbi; int early_segcount, late_segcount, split_rank, max_segcount; + int inbuf_dev[2] = {-1, -1}; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; ptrdiff_t true_lb, true_extent, lb, extent; @@ -399,11 +446,11 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, max_segcount = early_segcount; max_real_segsize = true_extent + (max_segcount - 1) * extent; - - inbuf[0] = (char*)malloc(max_real_segsize); + /* we don't care about where the send buffer is */ + inbuf[0] = allocate_tmpbuf(NULL, rbuf, max_real_segsize, &inbuf_dev[0]); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = (char*)malloc(max_real_segsize); + inbuf[1] = allocate_tmpbuf(NULL, 
rbuf, max_real_segsize, &inbuf_dev[1]); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -523,8 +570,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, } - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + free_tmpbuf(inbuf[0], inbuf_dev[0]); + free_tmpbuf(inbuf[1], inbuf_dev[1]); return MPI_SUCCESS; @@ -533,8 +580,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + free_tmpbuf(inbuf[0], inbuf_dev[0]); + free_tmpbuf(inbuf[1], inbuf_dev[1]); return ret; } @@ -628,6 +675,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int int ret, line, rank, size, k, recv_from, send_to; int early_blockcount, late_blockcount, split_rank; int segcount, max_segcount, num_phases, phase, block_count, inbi; + int inbuf_dev[2] = {-1, -1}; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; ptrdiff_t block_offset, max_real_segsize; @@ -688,10 +736,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); /* Allocate and initialize temporary buffers */ - inbuf[0] = (char*)malloc(max_real_segsize); + inbuf[0] = allocate_tmpbuf(NULL, rbuf, max_real_segsize, &inbuf_dev[0]); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = (char*)malloc(max_real_segsize); + inbuf[1] = allocate_tmpbuf(NULL, rbuf, max_real_segsize, &inbuf_dev[1]); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -843,8 +891,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int } - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + free_tmpbuf(inbuf[0], inbuf_dev[0]); + free_tmpbuf(inbuf[1], inbuf_dev[1]); return MPI_SUCCESS; @@ -853,8 +901,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + free_tmpbuf(inbuf[0], inbuf_dev[0]); + free_tmpbuf(inbuf[1], inbuf_dev[1]); return ret; } @@ -976,6 +1024,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( mca_coll_base_module_t *module) { int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL; + int tmp_buf_dev = -1; int comm_size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); @@ -1006,7 +1055,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* Temporary buffer for receiving messages */ char *tmp_buf = NULL; - char *tmp_buf_raw = (char *)malloc(dsize); + char *tmp_buf_raw = allocate_tmpbuf(NULL, rbuf, dsize, &tmp_buf_dev); if (NULL == tmp_buf_raw) return OMPI_ERR_OUT_OF_RESOURCE; tmp_buf = tmp_buf_raw - gap; @@ -1234,8 +1283,8 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( } cleanup_and_return: - if (NULL != tmp_buf_raw) - free(tmp_buf_raw); + + free_tmpbuf(tmp_buf_raw, tmp_buf_dev); if (NULL != rindex) free(rindex); if (NULL != sindex) diff --git a/ompi/mca/coll/basic/coll_basic_allreduce.c b/ompi/mca/coll/basic/coll_basic_allreduce.c index bc855726208..d9bcd54758a 100644 --- a/ompi/mca/coll/basic/coll_basic_allreduce.c +++ 
b/ompi/mca/coll/basic/coll_basic_allreduce.c @@ -32,6 +32,8 @@ #include "coll_basic.h" #include "ompi/mca/pml/pml.h" +#include "opal/mca/accelerator/accelerator.h" + /* * allreduce_intra @@ -82,10 +84,11 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int err, i, rank, root = 0, rsize, line; + int err, i, rank, root = 0, rsize, line, rbuf_dev; ptrdiff_t extent, dsize, gap; char *tmpbuf = NULL, *pml_buffer = NULL; ompi_request_t **reqs = NULL; + bool rbuf_on_device = false; rank = ompi_comm_rank(comm); rsize = ompi_comm_remote_size(comm); @@ -105,8 +108,15 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count, return OMPI_ERROR; } dsize = opal_datatype_span(&dtype->super, count, &gap); - tmpbuf = (char *) malloc(dsize); - if (NULL == tmpbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; } + if (opal_accelerator.check_addr(rbuf, &rbuf_dev, NULL) > 0 && rbuf_dev >= 0) { + if (OPAL_SUCCESS != opal_accelerator.mem_alloc(rbuf_dev, &tmpbuf, dsize)) { + err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; + } + rbuf_on_device = true; + } else { + tmpbuf = (char *) malloc(dsize); + if (NULL == tmpbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; } + } pml_buffer = tmpbuf - gap; if (rsize > 1) { @@ -188,7 +198,9 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count, (void)line; // silence compiler warning ompi_coll_base_free_reqs(reqs, rsize - 1); } - if (NULL != tmpbuf) { + if (rbuf_on_device) { + opal_accelerator.mem_release(rbuf_dev, tmpbuf); + } else { free(tmpbuf); } diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 96b58fcf97c..172a04d2724 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -50,9 +50,14 @@ static inline void device_op_pre(const void *orig_source, source_rc = opal_accelerator.check_addr(*source, source_device, &source_flags); *device = *target_device; + //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", + // target_rc, *target_device, source_rc, *source_device, *device); + if (0 == target_rc && 0 == source_rc) { /* no buffers are on any device, select device 0 */ *device = 0; + } else if (*target_device == -1) { + *device = *source_device; } /* swap contexts */ @@ -66,8 +71,9 @@ static inline void device_op_pre(const void *orig_source, if (0 == target_rc) { // allocate memory on the device for the target buffer CUdeviceptr dptr; + //printf("copying target from device %d to host\n", *target_device); CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); - CHECK(cuMemcpyHtoDAsync, (dptr, *target, nbytes, mca_op_cuda_component.cu_stream)); + CHECK(cuMemcpyHtoDAsync, (dptr, orig_target, nbytes, mca_op_cuda_component.cu_stream)); *target = (void*)dptr; *target_device = -1; // mark target device as host } @@ -75,14 +81,18 @@ static inline void device_op_pre(const void *orig_source, if (0 == source_rc || *device != *source_device) { // allocate memory on the device for the source buffer CUdeviceptr dptr; + //printf("allocating source on device %d\n", *device); CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); + *source = (void*)dptr; if (0 == source_rc) { /* copy from host to device */ - CHECK(cuMemcpyHtoDAsync, (dptr, *source, nbytes, mca_op_cuda_component.cu_stream)); + //printf("copying source from host to device %d\n", *device); + 
CHECK(cuMemcpyHtoDAsync, (dptr, orig_source, nbytes, mca_op_cuda_component.cu_stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ - CHECK(cuMemcpyDtoDAsync, (dptr, (CUdeviceptr)*source, nbytes, mca_op_cuda_component.cu_stream)); + //printf("attempting cross-device copy for source\n"); + CHECK(cuMemcpyDtoDAsync, (dptr, (CUdeviceptr)orig_source, nbytes, mca_op_cuda_component.cu_stream)); } } } From f6091271b956839489db1fb214703d38b13d9b2d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 15 Mar 2023 15:56:36 -0400 Subject: [PATCH 05/74] cuMemAllocAsync is supported since CUDA 11.2.0 Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_functions.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 172a04d2724..4ba8338397c 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -72,7 +72,11 @@ static inline void device_op_pre(const void *orig_source, // allocate memory on the device for the target buffer CUdeviceptr dptr; //printf("copying target from device %d to host\n", *target_device); +#if CUDA_VERSION >= 11020 CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); +#else // CUDA_VERSION >= 11020 + CHECK(cuMemAlloc, (&dptr, nbytes)); +#endif // CUDA_VERSION >= 11020 CHECK(cuMemcpyHtoDAsync, (dptr, orig_target, nbytes, mca_op_cuda_component.cu_stream)); *target = (void*)dptr; *target_device = -1; // mark target device as host @@ -82,7 +86,11 @@ static inline void device_op_pre(const void *orig_source, // allocate memory on the device for the source buffer CUdeviceptr dptr; //printf("allocating source on device %d\n", *device); +#if CUDA_VERSION >= 11020 CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); +#else // CUDA_VERSION >= 11020 + CHECK(cuMemAlloc, (&dptr, nbytes)); +#endif // CUDA_VERSION >= 11020 *source = (void*)dptr; if (0 == source_rc) { /* copy from host to device */ @@ -115,17 +123,31 @@ static inline void device_op_post(void *orig_target, nbytes *= count; CHECK(cuMemcpyDtoHAsync, (orig_target, (CUdeviceptr)target, nbytes, mca_op_cuda_component.cu_stream)); + } +#if CUDA_VERSION >= 11020 + /* cuMemFreeAsync is supported from CUDA 11.2.0 upwards */ + if (-1 == target_device) { CHECK(cuMemFreeAsync, ((CUdeviceptr)target, mca_op_cuda_component.cu_stream)); } - if (source_device != device) { CHECK(cuMemFreeAsync, ((CUdeviceptr)source, mca_op_cuda_component.cu_stream)); } +#endif // CUDA_VERSION >= 11020 /* wait for all scheduled operations to complete */ CHECK(cuStreamSynchronize, (mca_op_cuda_component.cu_stream)); +#if CUDA_VERSION < 11020 + /* cuMemFreeAsync is supported from CUDA 11.2.0 upwards */ + if (-1 == target_device) { + CHECK(cuMemFree, ((CUdeviceptr)target)); + } + if (source_device != device) { + CHECK(cuMemFree, ((CUdeviceptr)source)); + } +#endif // CUDA_VERSION < 11020 + /* restore the context */ CUcontext ctx; CHECK(cuCtxPopCurrent, (&ctx)); From 8ae3dacb66d256b7870b18d8e8ce5476f8046068 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 15 Mar 2023 15:57:49 -0400 Subject: [PATCH 06/74] coll/base/allreduce: Condition device allocation on op/dtype support Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 79 +++++------------------- ompi/mca/coll/base/coll_base_util.h | 69 +++++++++++++++++++++ ompi/op/op.h | 37 ++++++++++- 3 files changed, 122 
insertions(+), 63 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index ef94817c8a2..1dd0ceee7d9 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -42,51 +42,6 @@ #include "coll_base_topo.h" #include "coll_base_util.h" -#include "opal/mca/accelerator/accelerator.h" - -/* returns a pointer to memory in the same memory domain as ptr, and the device. - * If memory is allocated on the host, device will be set to -1. */ -static inline -void* allocate_tmpbuf(const void *sendbuf, const void *recvbuf, size_t size, int *device) { - void *res = NULL; - uint64_t flags; - *device = -1; - /* if the recvbuf is on the device we take that device */ - if (NULL != recvbuf && 0 < opal_accelerator.check_addr(recvbuf, device, &flags)) { - //printf("Allocating temporary buffer on device %d\n", *device); - if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { - /* fall back to the host */ - res = NULL; - *device = -1; - } - } else if (MPI_IN_PLACE != sendbuf && NULL != sendbuf && - 0 < opal_accelerator.check_addr(sendbuf, device, &flags)) { - /* send buffer is on a device so try to allocate memory there */ - //printf("Allocating temporary buffer on device %d\n", *device); - if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { - /* fall back to the host */ - res = NULL; - *device = -1; - } - } - - if (NULL == res) { - //printf("Allocating temporary buffer on host\n"); - res = malloc(size); - } - return res; -} - -static inline -void free_tmpbuf(void *tmpbuf, int device) { - if (-1 == device) { - free(tmpbuf); - } else if (NULL != tmpbuf) { - opal_accelerator.mem_release(device, tmpbuf); - } -} - - /* * ompi_coll_base_allreduce_intra_nonoverlapping * @@ -205,7 +160,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Allocate and initialize temporary send buffer */ span = opal_datatype_span(&dtype->super, count, &gap); - inplacebuf_free = allocate_tmpbuf(sbuf, rbuf, span, &inplacebuf_dev); + inplacebuf_free = ompi_coll_base_allocate_op_tmpbuf(sbuf, rbuf, span, op, dtype, &inplacebuf_dev); if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } inplacebuf = inplacebuf_free - gap; @@ -311,14 +266,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (ret < 0) { line = __LINE__; goto error_hndl; } } - free_tmpbuf(inplacebuf_free, inplacebuf_dev); + ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev); return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); (void)line; // silence compiler warning - free_tmpbuf(inplacebuf_free, inplacebuf_dev); + ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev); return ret; } @@ -447,10 +402,10 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, max_real_segsize = true_extent + (max_segcount - 1) * extent; /* we don't care about where the send buffer is */ - inbuf[0] = allocate_tmpbuf(NULL, rbuf, max_real_segsize, &inbuf_dev[0]); + inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0]); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = allocate_tmpbuf(NULL, rbuf, max_real_segsize, &inbuf_dev[1]); + inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[1]); if (NULL == 
inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -570,8 +525,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, } - free_tmpbuf(inbuf[0], inbuf_dev[0]); - free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); return MPI_SUCCESS; @@ -580,8 +535,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - free_tmpbuf(inbuf[0], inbuf_dev[0]); - free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); return ret; } @@ -736,10 +691,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); /* Allocate and initialize temporary buffers */ - inbuf[0] = allocate_tmpbuf(NULL, rbuf, max_real_segsize, &inbuf_dev[0]); + inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0]); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = allocate_tmpbuf(NULL, rbuf, max_real_segsize, &inbuf_dev[1]); + inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[1]); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -891,8 +846,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int } - free_tmpbuf(inbuf[0], inbuf_dev[0]); - free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); return MPI_SUCCESS; @@ -901,8 +856,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - free_tmpbuf(inbuf[0], inbuf_dev[0]); - free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); return ret; } @@ -1055,7 +1010,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* Temporary buffer for receiving messages */ char *tmp_buf = NULL; - char *tmp_buf_raw = allocate_tmpbuf(NULL, rbuf, dsize, &tmp_buf_dev); + char *tmp_buf_raw = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, dsize, op, dtype, &tmp_buf_dev); if (NULL == tmp_buf_raw) return OMPI_ERR_OUT_OF_RESOURCE; tmp_buf = tmp_buf_raw - gap; @@ -1284,7 +1239,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( cleanup_and_return: - free_tmpbuf(tmp_buf_raw, tmp_buf_dev); + ompi_coll_base_free_tmpbuf(tmp_buf_raw, tmp_buf_dev); if (NULL != rindex) free(rindex); if (NULL != sindex) diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 6982c0fb4f3..e841d08f0b2 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -31,6 +31,7 @@ #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/op/op.h" #include "ompi/mca/pml/pml.h" +#include "opal/mca/accelerator/accelerator.h" BEGIN_C_DECLS @@ -203,5 +204,73 @@ int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expecte const char* mca_coll_base_colltype_to_str(int collid); int mca_coll_base_name_to_colltype(const char* name); +/* device/host memory allocation functions */ + +/** + * Returns a pointer to memory in 
the same memory domain as the receive or send buffer. + * Device memory is allocated if either the receive buffer or the send buffer are + * located on the device and if the op supports on-device reductions on the datatype. + * If memory is allocated on the host, device will be set to -1. + */ +static inline +void* ompi_coll_base_allocate_op_tmpbuf( + const void *sendbuf, const void *recvbuf, size_t size, + const struct ompi_op_t *op, const struct ompi_datatype_t *dtype, + int *device) +{ + void *res = NULL; + uint64_t flags; + *device = -1; + if ((NULL == op && NULL == dtype) || ompi_op_supports_device(op, dtype)) { + /* if the recvbuf is on the device we take that device */ + if (NULL != recvbuf && 0 < opal_accelerator.check_addr(recvbuf, device, &flags)) { + if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { + /* fall back to the host */ + res = NULL; + *device = -1; + } + } else if (MPI_IN_PLACE != sendbuf && NULL != sendbuf && + 0 < opal_accelerator.check_addr(sendbuf, device, &flags)) { + /* send buffer is on a device so try to allocate memory there */ + if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { + /* fall back to the host */ + res = NULL; + *device = -1; + } + } + } + + if (NULL == res) { + res = malloc(size); + } + return res; +} + +/** + * Like ompi_coll_base_allocate_op_tmpbuf but without checking op-datatype + * device compatibility. + */ +static inline +void* ompi_coll_base_allocate_tmpbuf( + const void *sendbuf, const void *recvbuf, + size_t size, int *device) +{ + return ompi_coll_base_allocate_op_tmpbuf(sendbuf, recvbuf, size, NULL, NULL, device); +} + +/** + * Frees memory allocated through ompi_coll_base_allocate_op_tmpbuf + * or ompi_coll_base_allocate_tmpbuf. + */ +static inline +void ompi_coll_base_free_tmpbuf(void *tmpbuf, int device) { + if (-1 == device) { + free(tmpbuf); + } else if (NULL != tmpbuf) { + opal_accelerator.mem_release(device, tmpbuf); + } +} + + END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/op/op.h b/ompi/op/op.h index a080153f400..8b2f2270552 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -389,7 +389,7 @@ OMPI_DECLSPEC void ompi_op_set_java_callback(ompi_op_t *op, void *jnienv, * this function is provided to hide the internal structure field * names. 
*/ -static inline bool ompi_op_is_intrinsic(ompi_op_t * op) +static inline bool ompi_op_is_intrinsic(const ompi_op_t * op) { return (bool) (0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC)); } @@ -483,6 +483,41 @@ static inline bool ompi_op_is_valid(ompi_op_t * op, ompi_datatype_t * ddt, } + +/** + * Check to see if an op supports device execution on the given datatype + * + * @param op The op to check + * @param ddt The datatype to check + * + * @returns true If the op supports devices on that datatype + * @returns false If the op does not support devices on that datatype + * + */ +static inline bool ompi_op_supports_device(const ompi_op_t * op, const ompi_datatype_t * ddt) +{ + /* Check: + - non-intrinsic ddt's cannot be invoked on intrinsic op's + - if intrinsic ddt invoked on intrinsic op: + - ensure the datatype is defined in the op map + - ensure we have a function pointer for that combination + */ + if (ompi_op_is_intrinsic(op)) { + if (ompi_datatype_is_predefined(ddt)) { + /* Intrinsic ddt on intrinsic op */ + if (NULL == op->o_device_op || + -1 == ompi_op_ddt_map[ddt->id] || + NULL == op->o_device_op->do_intrinsic.fns[ompi_op_ddt_map[ddt->id]]) { + return false; + } + } + } + + /* op supports device for the given datatype */ + return true; +} + + /** * Perform a reduction operation. * From 655948fd9d753f0fdeda7c7827f6bcbd2ff96bfa Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sun, 19 Mar 2023 11:32:17 -0400 Subject: [PATCH 07/74] Make sure the device op callbacks are zero-initialized Signed-off-by: Joseph Schuchart --- ompi/mca/op/base/op_base_op_select.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/op/base/op_base_op_select.c b/ompi/mca/op/base/op_base_op_select.c index a5040d1bdca..09652a8ac32 100644 --- a/ompi/mca/op/base/op_base_op_select.c +++ b/ompi/mca/op/base/op_base_op_select.c @@ -154,7 +154,7 @@ int ompi_op_base_op_select(ompi_op_t *op) /* Copy over the non-NULL pointers */ if (avail->ao_module->opm_device_enabled) { if (NULL == op->o_device_op) { - op->o_device_op = malloc(sizeof(*op->o_device_op)); + op->o_device_op = calloc(1, sizeof(*op->o_device_op)); } for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { /* 2-buffer variants */ From 7cdc828b49b4483f591170a02168269a7a2c2743 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sun, 19 Mar 2023 11:33:07 -0400 Subject: [PATCH 08/74] Be more graceful when creating a context and stream Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_component.c | 35 ++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index 738b1a5284d..4100f3d1f5d 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -88,7 +88,7 @@ static int cuda_component_open(void) static int cuda_component_close(void) { if (mca_op_cuda_component.cu_num_devices > 0) { - CHECK(cuStreamDestroy, (mca_op_cuda_component.cu_stream)); + cuStreamDestroy(mca_op_cuda_component.cu_stream); free(mca_op_cuda_component.cu_max_threads_per_block); mca_op_cuda_component.cu_max_threads_per_block = NULL; free(mca_op_cuda_component.cu_devices); @@ -119,7 +119,9 @@ cuda_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple) { int num_devices; + int rc; int prio_lo, prio_hi; + memset(&mca_op_cuda_component, sizeof(mca_op_cuda_component)); cuInit(0); CHECK(cuDeviceGetCount, (&num_devices)); mca_op_cuda_component.cu_num_devices = num_devices; @@ -128,19 +130,28 @@ 
cuda_component_init_query(bool enable_progress_threads, mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int)); for (int i = 0; i < num_devices; ++i) { CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i)); - CHECK(cuCtxCreate, (&mca_op_cuda_component.cu_ctx[i], - CU_CTX_SCHED_YIELD, - mca_op_cuda_component.cu_devices[i])); - mca_op_cuda_component.cu_max_threads_per_block[i] = 512; - // TODO: this call fails, why?! - //CHECK(cuDeviceGetAttribute, (&mca_op_cuda_component.cu_max_threads_per_block[i], - // CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - // mca_op_cuda_component.cu_devices[i])); + rc = cuCtxCreate(&mca_op_cuda_component.cu_ctx[i], + 0, mca_op_cuda_component.cu_devices[i]); + if (CUDA_SUCCESS != rc) { + CHECK(cuDevicePrimaryCtxRetain, + (&mca_op_cuda_component.cu_ctx[i], mca_op_cuda_component.cu_devices[i])); + } + rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_threads_per_block[i], + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + mca_op_cuda_component.cu_devices[i]); + if (CUDA_SUCCESS != rc) { + /* fall-back to value that should work on every device */ + mca_op_cuda_component.cu_max_threads_per_block[i] = 512; + } } - CHECK(cuCtxGetStreamPriorityRange, (&prio_lo, &prio_hi)); - CHECK(cuStreamCreateWithPriority, (&mca_op_cuda_component.cu_stream, CU_STREAM_NON_BLOCKING, prio_hi)); - + /* try to create a high-priority stream */ + rc = cuCtxGetStreamPriorityRange(&prio_lo, &prio_hi); + if (CUDA_SUCCESS != rc) { + cuStreamCreateWithPriority(&mca_op_cuda_component.cu_stream, CU_STREAM_NON_BLOCKING, prio_hi); + } else { + mca_op_cuda_component.cu_stream = 0; + } return OMPI_SUCCESS; } From bdb16a173b337b1d3d19f3d59ef505aed60ba73c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 31 Mar 2023 13:13:49 -0400 Subject: [PATCH 09/74] fix wrong call to memset Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_component.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index 4100f3d1f5d..26eae61929e 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -121,7 +121,7 @@ cuda_component_init_query(bool enable_progress_threads, int num_devices; int rc; int prio_lo, prio_hi; - memset(&mca_op_cuda_component, sizeof(mca_op_cuda_component)); + memset(&mca_op_cuda_component, 0, sizeof(mca_op_cuda_component)); cuInit(0); CHECK(cuDeviceGetCount, (&num_devices)); mca_op_cuda_component.cu_num_devices = num_devices; From 5934f436eb5f01581f8684afb696d9071a781712 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 31 Mar 2023 13:16:54 -0400 Subject: [PATCH 10/74] Add detector for cudart Signed-off-by: Joseph Schuchart --- config/opal_check_cudart.m4 | 120 +++++++++++++++++++++++++ opal/mca/accelerator/cuda/configure.m4 | 5 ++ 2 files changed, 125 insertions(+) create mode 100644 config/opal_check_cudart.m4 diff --git a/config/opal_check_cudart.m4 b/config/opal_check_cudart.m4 new file mode 100644 index 00000000000..0e3fced8065 --- /dev/null +++ b/config/opal_check_cudart.m4 @@ -0,0 +1,120 @@ +dnl -*- autoconf -*- +dnl +dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana +dnl University Research and Technology +dnl Corporation. All rights reserved. +dnl Copyright (c) 2004-2005 The University of Tennessee and The University +dnl of Tennessee Research Foundation. All rights +dnl reserved. 
+dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +dnl University of Stuttgart. All rights reserved. +dnl Copyright (c) 2004-2005 The Regents of the University of California. +dnl All rights reserved. +dnl Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. +dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. +dnl Copyright (c) 2009 IBM Corporation. All rights reserved. +dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights +dnl reserved. +dnl Copyright (c) 2009-2011 Oak Ridge National Labs. All rights reserved. +dnl Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. +dnl Copyright (c) 2015 Research Organization for Information Science +dnl and Technology (RIST). All rights reserved. +dnl Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + + +# OPAL_CHECK_CUDART(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if CUDA runtime library support can be found. sets prefix_{CPPFLAGS, +# LDFLAGS, LIBS} as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found + +# +# Check for CUDA support +# +AC_DEFUN([OPAL_CHECK_CUDART],[ +OPAL_VAR_SCOPE_PUSH([cudart_save_CPPFLAGS cudart_save_LDFLAGS cudart_save_LIBS]) + +cudart_save_CPPFLAGS="$CPPFLAGS" +cudart_save_LDFLAGS="$LDFLAGS" +cudart_save_LIBS="$LIBS" + +# +# Check to see if the user provided paths for CUDART +# +AC_ARG_WITH([cudart], + [AS_HELP_STRING([--with-cudart=DIR], + [Path to the CUDA runtime library and header files])]) +AC_MSG_CHECKING([if --with-cudart is set]) +AC_ARG_WITH([cudart-libdir], + [AS_HELP_STRING([--with-cudart-libdir=DIR], + [Search for CUDA runtime libraries in DIR])]) + +#################################### +#### Check for CUDA runtime library +#################################### +AS_IF([test "x$with_cudart" != "xno" || test "x$with_cudart" = "x"], + [opal_check_cudart_happy=no + AC_MSG_RESULT([not set (--with-cudart=$with_cudart)])], + [AS_IF([test ! 
-d "$with_cudart"], + [AC_MSG_RESULT([not found]) + AC_MSG_WARN([Directory $with_cudart not found])] + [AS_IF([test "x`ls $with_cudart/include/cuda_runtime.h 2> /dev/null`" = "x"] + [AC_MSG_RESULT([not found]) + AC_MSG_WARN([Could not find cuda_runtime.h in $with_cudart/include])] + [opal_check_cudart_happy=yes + opal_cudart_incdir="$with_cudart/include"])])]) + +AS_IF([test "$opal_check_cudart_happy" = "no" && test "$with_cudart" != "no"], + [AC_PATH_PROG([nvcc_bin], [nvcc], ["not-found"]) + AS_IF([test "$nvcc_bin" = "not-found"], + [AC_MSG_WARN([Could not find nvcc binary])], + [nvcc_dirname=`AS_DIRNAME([$nvcc_bin])` + with_cudart=$nvcc_dirname/../ + opal_cudart_incdir=$nvcc_dirname/../include + opal_check_cudart_happy=yes]) + ] + []) + +AS_IF([test x"$with_cudart_libdir" = "x"], + [with_cudart_libdir=$with_cudart/lib64/] + []) + +AS_IF([test "$opal_check_cudart_happy" = "yes"], + [OAC_CHECK_PACKAGE([cudart], + [$1], + [cuda_runtime.h], + [cudart], + [cudaMalloc], + [opal_check_cudart_happy="yes"], + [opal_check_cudart_happy="no"])], + []) + + +AC_MSG_CHECKING([if have cuda runtime library support]) +if test "$opal_check_cudart_happy" = "yes"; then + AC_MSG_RESULT([yes (-I$opal_cudart_incdir)]) + CUDART_SUPPORT=1 + common_cudart_CPPFLAGS="-I$opal_cudart_incdir" + AC_SUBST([common_cudart_CPPFLAGS]) +else + AC_MSG_RESULT([no]) + CUDART_SUPPORT=0 +fi + + +OPAL_SUMMARY_ADD([Accelerators], [CUDART support], [], [$opal_check_cudart_happy]) +AM_CONDITIONAL([OPAL_cudart_support], [test "x$CUDART_SUPPORT" = "x1"]) +AC_DEFINE_UNQUOTED([OPAL_CUDART_SUPPORT],$CUDART_SUPPORT, + [Whether we have cuda runtime library support]) + +CPPFLAGS=${cudart_save_CPPFLAGS} +LDFLAGS=${cudart_save_LDFLAGS} +LIBS=${cudart_save_LIBS} +OPAL_VAR_SCOPE_POP +])dnl diff --git a/opal/mca/accelerator/cuda/configure.m4 b/opal/mca/accelerator/cuda/configure.m4 index aa67623c8b2..2792d52c840 100644 --- a/opal/mca/accelerator/cuda/configure.m4 +++ b/opal/mca/accelerator/cuda/configure.m4 @@ -24,6 +24,7 @@ AC_DEFUN([MCA_opal_accelerator_cuda_CONFIG],[ AC_CONFIG_FILES([opal/mca/accelerator/cuda/Makefile]) OPAL_CHECK_CUDA([accelerator_cuda]) + OPAL_CHECK_CUDART([accelerator_cudart]) AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], @@ -33,4 +34,8 @@ AC_DEFUN([MCA_opal_accelerator_cuda_CONFIG],[ AC_SUBST([accelerator_cuda_LDFLAGS]) AC_SUBST([accelerator_cuda_LIBS]) + AC_SUBST([accelerator_cudart_CPPFLAGS]) + AC_SUBST([accelerator_cudart_LDFLAGS]) + AC_SUBST([accelerator_cudart_LIBS]) + ])dnl From c2c3d0e3f6003190fd80ac1da7847d60878aaac5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 5 Apr 2023 16:29:51 -0400 Subject: [PATCH 11/74] Add CUDA stream-based allocator and memory pools Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda.h | 2 + ompi/mca/op/cuda/op_cuda_component.c | 11 +- ompi/mca/op/cuda/op_cuda_functions.c | 60 +++----- opal/mca/accelerator/accelerator.h | 61 ++++++++ .../accelerator/base/accelerator_base_frame.c | 6 + opal/mca/accelerator/cuda/Makefile.am | 10 +- opal/mca/accelerator/cuda/accelerator_cuda.c | 134 +++++++++++++++++- opal/mca/accelerator/cuda/accelerator_cuda.h | 11 ++ .../cuda/accelerator_cuda_component.c | 75 +++++++++- .../cuda/help-accelerator-cuda.txt | 7 + .../null/accelerator_null_component.c | 35 ++++- 11 files changed, 357 insertions(+), 55 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index cff3bbb55ef..94794da4967 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -50,9 +50,11 @@ typedef struct { /** The base op 
component struct */ ompi_op_base_component_1_0_0_t super; +#if 0 /* a stream on which to schedule kernel calls */ CUstream cu_stream; CUcontext *cu_ctx; +#endif // 0 int *cu_max_threads_per_block; CUdevice *cu_devices; int cu_num_devices; diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index 26eae61929e..5190e98f50f 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -79,6 +79,7 @@ static int cuda_component_open(void) * component won't even be shown in ompi_info output (which is * probably not what you want). */ + printf("op cuda_component_open\n"); return OMPI_SUCCESS; } @@ -88,7 +89,7 @@ static int cuda_component_open(void) static int cuda_component_close(void) { if (mca_op_cuda_component.cu_num_devices > 0) { - cuStreamDestroy(mca_op_cuda_component.cu_stream); + //cuStreamDestroy(mca_op_cuda_component.cu_stream); free(mca_op_cuda_component.cu_max_threads_per_block); mca_op_cuda_component.cu_max_threads_per_block = NULL; free(mca_op_cuda_component.cu_devices); @@ -126,16 +127,20 @@ cuda_component_init_query(bool enable_progress_threads, CHECK(cuDeviceGetCount, (&num_devices)); mca_op_cuda_component.cu_num_devices = num_devices; mca_op_cuda_component.cu_devices = (CUdevice*)malloc(num_devices*sizeof(CUdevice)); +#if 0 mca_op_cuda_component.cu_ctx = (CUcontext*)malloc(num_devices*sizeof(CUcontext)); +#endif // 0 mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int)); for (int i = 0; i < num_devices; ++i) { CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i)); +#if 0 rc = cuCtxCreate(&mca_op_cuda_component.cu_ctx[i], 0, mca_op_cuda_component.cu_devices[i]); if (CUDA_SUCCESS != rc) { CHECK(cuDevicePrimaryCtxRetain, (&mca_op_cuda_component.cu_ctx[i], mca_op_cuda_component.cu_devices[i])); } +#endif // 0 rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_threads_per_block[i], CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, mca_op_cuda_component.cu_devices[i]); @@ -145,6 +150,7 @@ cuda_component_init_query(bool enable_progress_threads, } } +#if 0 /* try to create a high-priority stream */ rc = cuCtxGetStreamPriorityRange(&prio_lo, &prio_hi); if (CUDA_SUCCESS != rc) { @@ -152,6 +158,8 @@ cuda_component_init_query(bool enable_progress_threads, } else { mca_op_cuda_component.cu_stream = 0; } +#endif // 0 + printf("op cuda_component_init_query\n"); return OMPI_SUCCESS; } @@ -177,5 +185,6 @@ cuda_component_op_query(struct ompi_op_t *op, int *priority) } } *priority = 50; + printf("op cuda_component_query\n"); return (ompi_op_base_module_1_0_0_t *) module; } diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 4ba8338397c..63978b72f8e 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -60,8 +60,10 @@ static inline void device_op_pre(const void *orig_source, *device = *source_device; } +#if 0 /* swap contexts */ CHECK(cuCtxPushCurrent, (mca_op_cuda_component.cu_ctx[*device])); +#endif // 0 if (0 == target_rc || 0 == source_rc || *target_device != *source_device) { size_t nbytes; @@ -70,37 +72,26 @@ static inline void device_op_pre(const void *orig_source, if (0 == target_rc) { // allocate memory on the device for the target buffer - CUdeviceptr dptr; //printf("copying target from device %d to host\n", *target_device); -#if CUDA_VERSION >= 11020 - CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); -#else // CUDA_VERSION >= 11020 - CHECK(cuMemAlloc, (&dptr, nbytes)); 
-#endif // CUDA_VERSION >= 11020 - CHECK(cuMemcpyHtoDAsync, (dptr, orig_target, nbytes, mca_op_cuda_component.cu_stream)); - *target = (void*)dptr; + opal_accelerator.mem_alloc_stream(*device, target, nbytes, opal_accelerator.default_stream); + //CHECK(cuMemAllocAsync, (&dptr, nbytes, (CUstream*)opal_accelerator.default_stream->stream)); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*target, orig_target, nbytes, *(CUstream*)opal_accelerator.default_stream->stream)); *target_device = -1; // mark target device as host } if (0 == source_rc || *device != *source_device) { // allocate memory on the device for the source buffer - CUdeviceptr dptr; //printf("allocating source on device %d\n", *device); -#if CUDA_VERSION >= 11020 - CHECK(cuMemAllocAsync, (&dptr, nbytes, mca_op_cuda_component.cu_stream)); -#else // CUDA_VERSION >= 11020 - CHECK(cuMemAlloc, (&dptr, nbytes)); -#endif // CUDA_VERSION >= 11020 - *source = (void*)dptr; + opal_accelerator.mem_alloc_stream(*device, source, nbytes, opal_accelerator.default_stream); if (0 == source_rc) { /* copy from host to device */ //printf("copying source from host to device %d\n", *device); - CHECK(cuMemcpyHtoDAsync, (dptr, orig_source, nbytes, mca_op_cuda_component.cu_stream)); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source, orig_source, nbytes, *(CUstream*)opal_accelerator.default_stream->stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ //printf("attempting cross-device copy for source\n"); - CHECK(cuMemcpyDtoDAsync, (dptr, (CUdeviceptr)orig_source, nbytes, mca_op_cuda_component.cu_stream)); + CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source, (CUdeviceptr)orig_source, nbytes, *(CUstream*)opal_accelerator.default_stream->stream)); } } } @@ -122,35 +113,22 @@ static inline void device_op_post(void *orig_target, ompi_datatype_type_size(dtype, &nbytes); nbytes *= count; - CHECK(cuMemcpyDtoHAsync, (orig_target, (CUdeviceptr)target, nbytes, mca_op_cuda_component.cu_stream)); + CHECK(cuMemcpyDtoHAsync, (orig_target, (CUdeviceptr)target, nbytes, *(CUstream *)opal_accelerator.default_stream->stream)); } -#if CUDA_VERSION >= 11020 /* cuMemFreeAsync is supported from CUDA 11.2.0 upwards */ if (-1 == target_device) { - CHECK(cuMemFreeAsync, ((CUdeviceptr)target, mca_op_cuda_component.cu_stream)); + opal_accelerator.mem_release_stream(device, target, opal_accelerator.default_stream); + //CHECK(cuMemFreeAsync, ((CUdeviceptr)target, mca_op_cuda_component.cu_stream)); } if (source_device != device) { - CHECK(cuMemFreeAsync, ((CUdeviceptr)source, mca_op_cuda_component.cu_stream)); + opal_accelerator.mem_release_stream(device, source, opal_accelerator.default_stream); + //CHECK(cuMemFreeAsync, ((CUdeviceptr)source, mca_op_cuda_component.cu_stream)); } -#endif // CUDA_VERSION >= 11020 /* wait for all scheduled operations to complete */ - CHECK(cuStreamSynchronize, (mca_op_cuda_component.cu_stream)); - -#if CUDA_VERSION < 11020 - /* cuMemFreeAsync is supported from CUDA 11.2.0 upwards */ - if (-1 == target_device) { - CHECK(cuMemFree, ((CUdeviceptr)target)); - } - if (source_device != device) { - CHECK(cuMemFree, ((CUdeviceptr)source)); - } -#endif // CUDA_VERSION < 11020 - - /* restore the context */ - CUcontext ctx; - CHECK(cuCtxPopCurrent, (&ctx)); + //CHECK(cuStreamSynchronize, (mca_op_cuda_component.cu_stream)); + opal_accelerator.wait_stream(opal_accelerator.default_stream); } #define FUNC(name, type_name, type) \ @@ -164,7 +142,7 @@ static inline void device_op_post(void *orig_target, int n = 
*count; \ device_op_pre(in, inout, n, *dtype, (void**)&source, &source_device, (void**)&target, &target_device, \ &threads_per_block, &device); \ - CUstream *stream = &mca_op_cuda_component.cu_stream; \ + CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ ompi_op_cuda_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, *stream); \ device_op_post(inout, n, *dtype, source, source_device, target, target_device, device); \ } @@ -781,7 +759,7 @@ LOC_FUNC(minloc, long_double_int, <) const type *in1_ = (const type*)in1; \ const type *in2_ = (const type*)in2; \ int n = *count; \ - CUstream *stream = &mca_op_cuda_component.cu_stream; \ + CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ ompi_op_cuda_3buff_##name##_##type_name##_kernel<<>>(in1_, int2_, out_, n); \ } @@ -809,7 +787,7 @@ LOC_FUNC(minloc, long_double_int, <) const type *in1_ = (const type*)in1; \ const type *in2_ = (const type*)in2; \ int n = *count; \ - CUstream *stream = &mca_op_cuda_component.cu_stream; \ + CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ ompi_op_cuda_3buff_##name##_##type_name##_kernel<>(in1_, in2_, out_, n); \ } @@ -863,7 +841,7 @@ LOC_FUNC(minloc, long_double_int, <) const ompi_op_predefined_##type_name##_t *a1 = (const ompi_op_predefined_##type_name##_t*) in1; \ const ompi_op_predefined_##type_name##_t *a2 = (const ompi_op_predefined_##type_name##_t*) in2; \ ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \ - CUstream *stream = &mca_op_cuda_component.cu_stream; \ + CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ ompi_op_cuda_2buff_##name##_##type_name##_kernel<>(a1, a2, b, n); \ } diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index efc951377ca..d25bfaea857 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -129,6 +129,14 @@ struct opal_accelerator_event_t { typedef struct opal_accelerator_event_t opal_accelerator_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_event_t); +struct opal_accelerator_mempool_t { + opal_object_t super; + /* Memory pool object */ + void *mempool; +}; +typedef struct opal_accelerator_event_t opal_accelerator_mempool_t; +OBJ_CLASS_DECLARATION(opal_accelerator_mempool_t); + /** * Check whether a pointer belongs to an accelerator or not. * interfaces @@ -303,6 +311,44 @@ typedef int (*opal_accelerator_base_module_mem_alloc_fn_t)( typedef int (*opal_accelerator_base_module_mem_release_fn_t)( int dev_id, void *ptr); +/** + * Allocates size bytes memory from the device and sets ptr to the + * pointer of the allocated memory. The memory is not initialized. + * The allocation request is placed into the stream object. + * Any use of the memory must succeed the completion of this + * operation on the stream. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[OUT] ptr Returns pointer to allocated memory + * @param[IN] size Size of memory to allocate + * @param[IN] stream Stream into which to insert the allocation request + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_alloc_stream_fn_t)( + int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream); + +/** + * Frees the memory space pointed to by ptr which has been returned by + * a previous call to an opal_accelerator_base_module_mem_alloc_stream_fn_t(). 
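+ * Illustrative usage sketch (dev, ptr, size and stream are placeholder names; the calls are the module hooks added in this patch): opal_accelerator.mem_alloc_stream(dev, &ptr, size, stream); ... enqueue the work that uses ptr on the same stream ... opal_accelerator.mem_release_stream(dev, ptr, stream); opal_accelerator.wait_stream(stream);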
+ * If the function is called on a ptr that has already been freed, + * undefined behavior occurs. If ptr is NULL, no operation is performed, + * and the function returns OPAL_SUCCESS. + * The release of the memory will be inserted into the stream and occurs after + * all previous operations have completed. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] ptr Pointer to free + * @param[IN] stream Stream into which to insert the free operation + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_release_stream_fn_t)( + int dev_id, void *ptr, opal_accelerator_stream_t *stream); + + /** * Retrieves the base address and/or size of a memory allocation of the * device. @@ -394,10 +440,21 @@ typedef int (*opal_accelerator_base_module_device_can_access_peer_fn_t)( typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)( int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +/** + * Wait for the completion of all operations inserted into the stream. + * + * @param[IN] stram The stream to wait for. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_wait_stream_fn_t)(opal_accelerator_stream_t *stream); + /* * the standard public API data structure */ typedef struct { + /* default stream pointer */ + opal_accelerator_stream_t *default_stream; /* accelerator function table */ opal_accelerator_base_module_check_addr_fn_t check_addr; @@ -412,6 +469,8 @@ typedef struct { opal_accelerator_base_module_mem_alloc_fn_t mem_alloc; opal_accelerator_base_module_mem_release_fn_t mem_release; + opal_accelerator_base_module_mem_alloc_stream_fn_t mem_alloc_stream; + opal_accelerator_base_module_mem_release_stream_fn_t mem_release_stream; opal_accelerator_base_module_get_address_range_fn_t get_address_range; opal_accelerator_base_module_host_register_fn_t host_register; @@ -422,6 +481,8 @@ typedef struct { opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer; opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id; + + opal_accelerator_base_module_wait_stream_fn_t wait_stream; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/base/accelerator_base_frame.c b/opal/mca/accelerator/base/accelerator_base_frame.c index fcaf86be94e..0721772bf1e 100644 --- a/opal/mca/accelerator/base/accelerator_base_frame.c +++ b/opal/mca/accelerator/base/accelerator_base_frame.c @@ -57,6 +57,12 @@ OBJ_CLASS_INSTANCE( NULL, NULL); +OBJ_CLASS_INSTANCE( + opal_accelerator_mempool_t, + opal_object_t, + NULL, + NULL); + MCA_BASE_FRAMEWORK_DECLARE(opal, accelerator, "OPAL Accelerator Framework", opal_accelerator_base_frame_register, opal_accelerator_base_frame_open, opal_accelerator_base_frame_close, mca_accelerator_base_static_components, diff --git a/opal/mca/accelerator/cuda/Makefile.am b/opal/mca/accelerator/cuda/Makefile.am index 5646890bab3..6f19b62cb63 100644 --- a/opal/mca/accelerator/cuda/Makefile.am +++ b/opal/mca/accelerator/cuda/Makefile.am @@ -34,11 +34,13 @@ mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_accelerator_cuda_la_SOURCES = $(sources) -mca_accelerator_cuda_la_LDFLAGS = -module -avoid-version +mca_accelerator_cuda_la_LDFLAGS = -module -avoid-version \ + $(accelerator_cuda_LDFLAGS) $(accelerator_cudart_LDFLAGS) mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ - $(accelerator_cuda_LIBS) + $(accelerator_cuda_LIBS) 
$(accelerator_cudart_LIBS) noinst_LTLIBRARIES = $(component_noinst) libmca_accelerator_cuda_la_SOURCES =$(sources) -libmca_accelerator_cuda_la_LDFLAGS = -module -avoid-version -libmca_accelerator_cuda_la_LIBADD = $(accelerator_cuda_LIBS) +libmca_accelerator_cuda_la_LDFLAGS = -module -avoid-version \ + $(accelerator_cuda_LDFLAGS) $(accelerator_cudart_LDFLAGS) +libmca_accelerator_cuda_la_LIBADD = $(accelerator_cuda_LIBS) $(accelerator_cudart_LIBS) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 60e432841c0..03020583224 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -16,6 +16,7 @@ #include "opal_config.h" #include +#include #include "accelerator_cuda.h" #include "opal/mca/accelerator/base/base.h" @@ -38,6 +39,8 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, opal_accelerator_transfer_type_t type); static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_cuda_mem_release(int dev_id, void *ptr); +static int accelerator_cuda_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream); +static int accelerator_cuda_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -50,8 +53,12 @@ static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int d static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream); + opal_accelerator_base_module_t opal_accelerator_cuda_module = { + &opal_accelerator_cuda_default_stream.base, + accelerator_cuda_check_addr, accelerator_cuda_create_stream, @@ -65,6 +72,8 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_memmove, accelerator_cuda_mem_alloc, accelerator_cuda_mem_release, + accelerator_cuda_mem_alloc_stream, + accelerator_cuda_mem_release_stream, accelerator_cuda_get_address_range, accelerator_cuda_host_register, @@ -74,7 +83,9 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_device_pci_attr, accelerator_cuda_device_can_access_peer, - accelerator_cuda_get_buffer_id + accelerator_cuda_get_buffer_id, + + accelerator_cuda_wait_stream }; static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { @@ -495,14 +506,35 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) return OPAL_ERR_BAD_PARAM; } - if (size > 0) { - result = cuMemAlloc((CUdeviceptr *) ptr, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, - OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; +#if CUDA_VERSION >= 11020 + /* Try to allocate the memory from a memory pool, if available */ + /* get the default pool */ + cudaMemPool_t mpool; + result = cudaDeviceGetDefaultMemPool(&mpool, dev_id); + if (cudaSuccess == result) { + result = cudaMallocFromPoolAsync(ptr, size, mpool, opal_accelerator_cuda_alloc_stream); + if (cudaSuccess == result) { + /* this is a blocking function, so wait for the allocation to happen */ + result = cuStreamSynchronize(opal_accelerator_cuda_alloc_stream); + if (cudaSuccess == result) { + return OPAL_SUCCESS; + } } } + if (cudaErrorNotSupported != result) { + opal_show_help("help-accelerator-cuda.txt", 
"cudaMallocFromPoolAsync failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + /* fall-back to regular allocation */ +#endif // CUDA_VERSION >= 11020 + + result = cuMemAlloc((CUdeviceptr *) ptr, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } return 0; } @@ -700,3 +732,91 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc } return OPAL_SUCCESS; } + + +static int accelerator_cuda_mem_alloc_stream( + int dev_id, + void **addr, + size_t size, + opal_accelerator_stream_t *stream) +{ +#if CUDA_VERSION >= 11020 + cudaError_t result; + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == stream || NULL == addr || 0 == size) { + return OPAL_ERR_BAD_PARAM; + } + + /* Try to allocate the memory from a memory pool, if available */ + /* get the default pool */ + cudaMemPool_t mpool; + result = cudaDeviceGetDefaultMemPool(&mpool, dev_id); + if (cudaSuccess == result) { + result = cudaMallocFromPoolAsync(addr, size, mpool, *(cudaStream_t*)stream->stream); + if (cudaSuccess == result) { + return OPAL_SUCCESS; + } + } + if (cudaErrorNotSupported != result) { + opal_show_help("help-accelerator-cuda.txt", "cudaMallocFromPoolAsync failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + /* fall-back to regular stream allocation */ + + result = cudaMallocAsync(addr, size, *(cudaStream_t*)stream->stream); + if (OPAL_UNLIKELY(cudaSuccess != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +#else + return accelerator_cuda_mem_alloc(dev_id, addr, size); +#endif // CUDA_VERSION >= 11020 +} + + +static int accelerator_cuda_mem_release_stream( + int dev_id, + void *addr, + opal_accelerator_stream_t *stream) +{ +#if CUDA_VERSION >= 11020 + cudaError_t result; + + if (NULL == stream || NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = cudaFreeAsync(addr, *(cudaStream_t*)stream->stream); + if (OPAL_UNLIKELY(cudaSuccess != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +#else + /* wait for everything on the device to complete */ + accelerator_cuda_wait_stream(stream); + return accelerator_cuda_mem_release(dev_id, addr); +#endif // CUDA_VERSION >= 11020 +} + + +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream) +{ + CUresult result; + result = cuStreamSynchronize(*(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} \ No newline at end of file diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 694a4192231..7aff2530973 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -15,6 +15,7 @@ #include "opal_config.h" #include +#include #include "opal/mca/accelerator/accelerator.h" #include "opal/mca/threads/mutex.h" @@ -37,14 +38,24 @@ struct opal_accelerator_cuda_event_t { typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; 
OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); +struct opal_accelerator_cuda_mempool_t { + opal_accelerator_mempool_t base; +}; +typedef struct opal_accelerator_cuda_mempool_t opal_accelerator_cuda_mempool_t; +OBJ_CLASS_DECLARATION(opal_accelerator_cuda_mempool_t); + /* Declare extern variables, defined in accelerator_cuda_component.c */ OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; +OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_alloc_stream; +OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream; OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_component; OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module; +OPAL_DECLSPEC extern cudaMemPool_t *opal_accelerator_cuda_mempools; + OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); END_C_DECLS diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index d880ee5dca8..9115997ffe2 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -35,12 +35,16 @@ /* Define global variables, used in accelerator_cuda.c */ CUstream opal_accelerator_cuda_memcpy_stream = NULL; +CUstream opal_accelerator_cuda_alloc_stream = NULL; +opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream; opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; /* Initialization lock for delayed cuda initialization */ static opal_mutex_t accelerator_cuda_init_lock; static bool accelerator_cuda_init_complete = false; +cudaMemPool_t *opal_accelerator_cuda_mempools; + #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) @@ -122,6 +126,7 @@ static int accelerator_cuda_component_register(void) int opal_accelerator_cuda_delayed_init() { int result = OPAL_SUCCESS; + int prio_lo, prio_hi; CUcontext cuContext; /* Double checked locking to avoid having to @@ -159,13 +164,39 @@ int opal_accelerator_cuda_delayed_init() goto out; } + /* Create stream used for blocking allocations from the device memory pool */ + result = cuStreamCreate(&opal_accelerator_cuda_alloc_stream, 0); + if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, + OPAL_PROC_MY_HOSTNAME, result); + goto out; + } + + /* Create a default stream to be used by various components. + * We try to create a high-priority stream and fall back to a regular stream. + */ + CUstream *default_stream = malloc(sizeof(CUstream)); + result = cuCtxGetStreamPriorityRange(&prio_lo, &prio_hi); + if (CUDA_SUCCESS == result) { + result = cuStreamCreateWithPriority(default_stream, + CU_STREAM_NON_BLOCKING, prio_hi); + } else { + result = cuStreamCreate(default_stream, 0); + } + if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, + OPAL_PROC_MY_HOSTNAME, result); + goto out; + } + OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_cuda_stream_t); + opal_accelerator_cuda_default_stream.base.stream = default_stream; + result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { /* If registering the memory fails, print a message and continue. * This is not a fatal error.
*/ opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister during init failed", true, &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, result, "checkmem"); - } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuMemHostRegister OK on test region"); @@ -216,8 +247,50 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) { cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); } + if ((NULL != opal_accelerator_cuda_alloc_stream) && ctx_ok) { + cuStreamDestroy(opal_accelerator_cuda_alloc_stream); + } + if ((NULL != opal_accelerator_cuda_default_stream.base.stream) && ctx_ok) { + cuStreamDestroy(opal_accelerator_cuda_default_stream.base.stream); + } + OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); OBJ_DESTRUCT(&accelerator_cuda_init_lock); return; } + +#if 0 +static int opal_acclerator_cuda_init_mempools() { + cudaError_t result; + int num_devices; + cuDeviceGetCount(&num_devices); + if (num_devices == 0) { + return OPAL_SUCCESS; + } + opal_accelerator_cuda_mempools = malloc(num_devices*sizeof(*mp)); + cudaMemPoolProps pp; + memset(&pp, 0, sizeof(pp)); + pp.allocType = cudaMemAllocationTypePinned; + pp.handleTypes = cudaMemHandleTypeNone; + pp.location.id = devidx; + pp.location.type = cudaMemLocationTypeDevice; + for (int i = 0; i < num_devices; ++i) { + + result = cudaMemPoolCreate(mp, &pp); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cudaMemPoolCreate failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + + } + *mpool = OBJ_NEW(opal_accelerator_cuda_mempool_t); + if (NULL == *mpool) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + mpool->mpool = mp; + + return OPAL_SUCCESS; +} +#endif \ No newline at end of file diff --git a/opal/mca/accelerator/cuda/help-accelerator-cuda.txt b/opal/mca/accelerator/cuda/help-accelerator-cuda.txt index 2cf7a14bf5d..750344e3a78 100644 --- a/opal/mca/accelerator/cuda/help-accelerator-cuda.txt +++ b/opal/mca/accelerator/cuda/help-accelerator-cuda.txt @@ -262,3 +262,10 @@ Check the cuda.h file for what the return value means. A call to allocate memory within the CUDA support failed. This is an unrecoverable error and will cause the program to abort. Hostname: %s +# +[cudaMemPoolCreate failed] +The call to cudaMemPoolCreate failed. This is highly unusual and should +not happen. Please report this error to the Open MPI developers. + Hostname: %s + cudaMemPoolCreate return value: %d +Check the cuda_runtime_api.h file for what the return value means. 
\ No newline at end of file diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 4a0d307497b..93729822bf7 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -53,6 +53,8 @@ static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_null_mem_release(int dev_id, void *ptr); +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t* stream); +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); static int accelerator_null_host_register(int dev_id, void *ptr, size_t size); @@ -64,6 +66,8 @@ static int accelerator_null_device_can_access_peer(int *access, int dev1, int de static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream); + /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -104,6 +108,8 @@ opal_accelerator_null_component_t mca_accelerator_null_component = {{ opal_accelerator_base_module_t opal_accelerator_null_module = { + NULL, + accelerator_null_check_addr, accelerator_null_create_stream, @@ -117,6 +123,8 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_memmove, accelerator_null_mem_alloc, accelerator_null_mem_release, + accelerator_null_mem_alloc_stream, + accelerator_null_mem_release_stream, accelerator_null_get_address_range, accelerator_null_host_register, @@ -126,7 +134,9 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_get_device_pci_attr, accelerator_null_device_can_access_peer, - accelerator_null_get_buffer_id + accelerator_null_get_buffer_id, + + accelerator_null_wait_stream }; static int accelerator_null_open(void) @@ -216,6 +226,23 @@ static int accelerator_null_mem_release(int dev_id, void *ptr) return OPAL_SUCCESS; } +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream) +{ + (void)stream; + *ptr = malloc(size); + return OPAL_SUCCESS; +} + +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, + opal_accelerator_stream_t *stream) +{ + (void)stream; + free(ptr); + return OPAL_SUCCESS; +} + + static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -251,3 +278,9 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc { return OPAL_ERR_NOT_IMPLEMENTED; } + + +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream) +{ + return OPAL_SUCCESS; +} \ No newline at end of file From 5df449c3523d417915c7c982ace1bab83525c912 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 6 Apr 2023 13:11:12 -0400 Subject: [PATCH 12/74] Don't memset the CUDA op component, we need the version Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_component.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index 5190e98f50f..3e2f9c3e8b3 100644 --- 
a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -122,7 +122,7 @@ cuda_component_init_query(bool enable_progress_threads, int num_devices; int rc; int prio_lo, prio_hi; - memset(&mca_op_cuda_component, 0, sizeof(mca_op_cuda_component)); + //memset(&mca_op_cuda_component, 0, sizeof(mca_op_cuda_component)); cuInit(0); CHECK(cuDeviceGetCount, (&num_devices)); mca_op_cuda_component.cu_num_devices = num_devices; @@ -185,6 +185,5 @@ cuda_component_op_query(struct ompi_op_t *op, int *priority) } } *priority = 50; - printf("op cuda_component_query\n"); return (ompi_op_base_module_1_0_0_t *) module; } From 812d068406c5de0a690ab67393c1de19ac369138 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 6 Apr 2023 13:11:56 -0400 Subject: [PATCH 13/74] Set the memory pool release threshold Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/cuda/accelerator_cuda_component.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index 9115997ffe2..416f7970704 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -191,6 +191,11 @@ int opal_accelerator_cuda_delayed_init() OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_cuda_stream_t); opal_accelerator_cuda_default_stream.base.stream = default_stream; + cudaMemPool_t mpool; + cuuint64_t threshold = 1*1024*1024; + cudaDeviceGetDefaultMemPool(&mpool, 0); + cudaMemPoolSetAttribute(mpool, cudaMemPoolAttrReleaseThreshold, &threshold); + result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { /* If registering the memory fails, print a message and continue. 
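Note on the hunk above: a CUDA stream-ordered memory pool has a release threshold of 0 by default, so unused pool memory is returned to the driver at the next synchronization point and each cudaMallocFromPoolAsync/cudaFreeAsync pair ends up going back to the driver. Raising cudaMemPoolAttrReleaseThreshold lets the default pool keep that many bytes cached across synchronizations. A minimal sketch of the configuration this patch applies, using only standard CUDA runtime calls, with error checking added and extended to all visible devices (illustrative only; the patch itself configures device 0 and does not check return codes):

    #include <cuda_runtime.h>
    #include <cuda.h>   /* for cuuint64_t */

    static void set_default_pool_release_threshold(void)
    {
        int ndev = 0;
        cudaGetDeviceCount(&ndev);
        for (int dev = 0; dev < ndev; ++dev) {
            cudaMemPool_t pool;
            if (cudaSuccess == cudaDeviceGetDefaultMemPool(&pool, dev)) {
                /* keep up to 1 MiB cached in the pool across synchronizations */
                cuuint64_t threshold = 1 * 1024 * 1024;
                cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
            }
        }
    }
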
From a688c8432ed9d7c91dbb71ed7b293f16f2be8b97 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 11 Apr 2023 11:19:48 -0400 Subject: [PATCH 14/74] Implement device-compatible allocator to cache coll temporaries Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 34 +-- ompi/mca/coll/base/coll_base_frame.c | 14 ++ ompi/mca/coll/base/coll_base_functions.h | 5 + ompi/mca/coll/base/coll_base_util.c | 45 ++++ ompi/mca/coll/base/coll_base_util.h | 29 ++- opal/mca/accelerator/accelerator.h | 4 + opal/mca/accelerator/cuda/accelerator_cuda.c | 16 +- opal/mca/accelerator/cuda/accelerator_cuda.h | 8 +- .../cuda/accelerator_cuda_component.c | 39 +--- .../null/accelerator_null_component.c | 11 +- opal/mca/allocator/devicebucket/Makefile.am | 49 +++++ .../devicebucket/allocator_devicebucket.c | 138 +++++++++++++ .../allocator_devicebucket_alloc.c | 195 ++++++++++++++++++ .../allocator_devicebucket_alloc.h | 190 +++++++++++++++++ .../help-mca-allocator-devicebucket.txt | 19 ++ opal/mca/allocator/devicebucket/owner.txt | 7 + 16 files changed, 727 insertions(+), 76 deletions(-) create mode 100644 opal/mca/allocator/devicebucket/Makefile.am create mode 100644 opal/mca/allocator/devicebucket/allocator_devicebucket.c create mode 100644 opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c create mode 100644 opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.h create mode 100644 opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt create mode 100644 opal/mca/allocator/devicebucket/owner.txt diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 1dd0ceee7d9..104da625bba 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -160,7 +160,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Allocate and initialize temporary send buffer */ span = opal_datatype_span(&dtype->super, count, &gap); - inplacebuf_free = ompi_coll_base_allocate_op_tmpbuf(sbuf, rbuf, span, op, dtype, &inplacebuf_dev); + inplacebuf_free = ompi_coll_base_allocate_op_tmpbuf(sbuf, rbuf, span, op, dtype, &inplacebuf_dev, module); if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } inplacebuf = inplacebuf_free - gap; @@ -266,14 +266,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (ret < 0) { line = __LINE__; goto error_hndl; } } - ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev); + ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev, module); return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); (void)line; // silence compiler warning - ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev); + ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev, module); return ret; } @@ -402,10 +402,10 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, max_real_segsize = true_extent + (max_segcount - 1) * extent; /* we don't care about where the send buffer is */ - inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0]); + inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0], module); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, 
dtype, &inbuf_dev[1]); + inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[1], module); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -525,8 +525,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, } - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); return MPI_SUCCESS; @@ -535,8 +535,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); return ret; } @@ -691,10 +691,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); /* Allocate and initialize temporary buffers */ - inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0]); + inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0], module); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[1]); + inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[1], module); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -846,8 +846,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int } - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); return MPI_SUCCESS; @@ -856,8 +856,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0]); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1]); + ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); + ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); return ret; } @@ -1010,7 +1010,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* Temporary buffer for receiving messages */ char *tmp_buf = NULL; - char *tmp_buf_raw = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, dsize, op, dtype, &tmp_buf_dev); + char *tmp_buf_raw = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, dsize, op, dtype, &tmp_buf_dev, module); if (NULL == tmp_buf_raw) return OMPI_ERR_OUT_OF_RESOURCE; tmp_buf = tmp_buf_raw - gap; @@ -1239,7 +1239,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( cleanup_and_return: - ompi_coll_base_free_tmpbuf(tmp_buf_raw, tmp_buf_dev); + ompi_coll_base_free_tmpbuf(tmp_buf_raw, tmp_buf_dev, module); if (NULL != rindex) free(rindex); if (NULL != sindex) diff --git a/ompi/mca/coll/base/coll_base_frame.c b/ompi/mca/coll/base/coll_base_frame.c index 5bb6fe38ace..f7b4fa572d9 100644 --- a/ompi/mca/coll/base/coll_base_frame.c +++ b/ompi/mca/coll/base/coll_base_frame.c @@ 
-30,6 +30,7 @@ #include "ompi/mca/mca.h" #include "opal/util/output.h" #include "opal/mca/base/base.h" +#include "opal/mca/accelerator/accelerator.h" #include "ompi/mca/coll/coll.h" @@ -70,6 +71,7 @@ static void coll_base_comm_construct(mca_coll_base_comm_t *data) { memset ((char *) data + sizeof (data->super), 0, sizeof (*data) - sizeof (data->super)); + data->device_allocators = NULL; } static void @@ -108,6 +110,18 @@ coll_base_comm_destruct(mca_coll_base_comm_t *data) if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */ ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree); } + + if (NULL != data->device_allocators) { + int num_devices; + opal_accelerator.num_devices(&num_devices); + for (int i = 0; i < num_devices; ++i) { + if (NULL != data->device_allocators[i]) { + data->device_allocators[i]->alc_finalize(data->device_allocators[i]); + } + } + free(data->device_allocators); + data->device_allocators = NULL; + } } OBJ_CLASS_INSTANCE(mca_coll_base_comm_t, opal_object_t, diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index 1c73d01d37e..eca2502493d 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -40,6 +40,8 @@ /* need to include our own topo prototypes so we can malloc data on the comm correctly */ #include "coll_base_topo.h" +#include "opal/mca/allocator/allocator.h" + /* some fixed value index vars to simplify certain operations */ typedef enum COLLTYPE { ALLGATHER = 0, /* 0 */ @@ -514,6 +516,9 @@ struct mca_coll_base_comm_t { /* in-order binary tree (root of the in-order binary tree is rank 0) */ ompi_coll_tree_t *cached_in_order_bintree; + + /* pointer to per-device memory cache */ + mca_allocator_base_module_t **device_allocators; }; typedef struct mca_coll_base_comm_t mca_coll_base_comm_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t); diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 9465ef95c62..e6b0e74f958 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -30,6 +30,7 @@ #include "ompi/mca/pml/pml.h" #include "coll_base_util.h" #include "coll_base_functions.h" +#include "opal/mca/allocator/base/base.h" #include int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount, @@ -602,3 +603,47 @@ const char* mca_coll_base_colltype_to_str(int collid) } return colltype_translation_table[collid]; } + +static void* ompi_coll_base_device_allocate_cb(void *ctx, size_t *size) { + int dev_id = (intptr_t)ctx; + void *ptr; + opal_accelerator.mem_alloc(dev_id, &ptr, *size); + return ptr; +} + +static void ompi_coll_base_device_release_cb(void *ctx, void* ptr) { + int dev_id = (intptr_t)ctx; + opal_accelerator.mem_release(dev_id, ptr); +} + +void *ompi_coll_base_allocate_on_device(int device, size_t size, + mca_coll_base_module_t *module) +{ + mca_allocator_base_module_t *allocator_module; + if (NULL == module->base_data->device_allocators) { + int num_dev; + opal_accelerator.num_devices(&num_dev); + module->base_data->device_allocators = calloc(num_dev, sizeof(mca_allocator_base_module_t *)); + } + //printf("allocators %p module %p\n", module->base_data->device_allocators, module->base_data->device_allocators[device]); + if (NULL == (allocator_module = module->base_data->device_allocators[device])) { + mca_allocator_base_component_t *allocator_component; + allocator_component = mca_allocator_component_lookup("devicebucket"); + 
assert(allocator_component != NULL); + allocator_module = allocator_component->allocator_init(false, ompi_coll_base_device_allocate_cb, + ompi_coll_base_device_release_cb, + (void*)(intptr_t)device); + assert(allocator_module != NULL); + module->base_data->device_allocators[device] = allocator_module; + } + //printf("allocator_module %p\n", allocator_module); + return allocator_module->alc_alloc(allocator_module, size, 0); +} + +void ompi_coll_base_free_on_device(int device, void *ptr, mca_coll_base_module_t *module) +{ + mca_allocator_base_module_t *allocator_module; + assert(NULL != module->base_data->device_allocators); + allocator_module = module->base_data->device_allocators[device]; + allocator_module->alc_free(allocator_module, ptr); +} \ No newline at end of file diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index e841d08f0b2..25c5d79d143 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -206,6 +206,12 @@ int mca_coll_base_name_to_colltype(const char* name); /* device/host memory allocation functions */ + +void *ompi_coll_base_allocate_on_device(int device, size_t size, + mca_coll_base_module_t *module); + +void ompi_coll_base_free_on_device(int device, void *ptr, mca_coll_base_module_t *module); + /** * Returns a pointer to memory in the same memory domain as the receive or send buffer. * Device memory is allocated if either the receive buffer or the send buffer are @@ -216,7 +222,7 @@ static inline void* ompi_coll_base_allocate_op_tmpbuf( const void *sendbuf, const void *recvbuf, size_t size, const struct ompi_op_t *op, const struct ompi_datatype_t *dtype, - int *device) + int *device, mca_coll_base_module_t *module) { void *res = NULL; uint64_t flags; @@ -224,17 +230,18 @@ void* ompi_coll_base_allocate_op_tmpbuf( if ((NULL == op && NULL == dtype) || ompi_op_supports_device(op, dtype)) { /* if the recvbuf is on the device we take that device */ if (NULL != recvbuf && 0 < opal_accelerator.check_addr(recvbuf, device, &flags)) { - if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { - /* fall back to the host */ - res = NULL; + /* allocate cache on demand */ + res = ompi_coll_base_allocate_on_device(*device, size, module); + if (NULL == res) { + // fallback to host *device = -1; } } else if (MPI_IN_PLACE != sendbuf && NULL != sendbuf && 0 < opal_accelerator.check_addr(sendbuf, device, &flags)) { /* send buffer is on a device so try to allocate memory there */ - if (OPAL_SUCCESS != opal_accelerator.mem_alloc(*device, &res, size)) { - /* fall back to the host */ - res = NULL; + res = ompi_coll_base_allocate_on_device(*device, size, module); + if (NULL == res) { + // fallback to host *device = -1; } } @@ -253,9 +260,9 @@ void* ompi_coll_base_allocate_op_tmpbuf( static inline void* ompi_coll_base_allocate_tmpbuf( const void *sendbuf, const void *recvbuf, - size_t size, int *device) + size_t size, int *device, mca_coll_base_module_t *module) { - return ompi_coll_base_allocate_op_tmpbuf(sendbuf, recvbuf, size, NULL, NULL, device); + return ompi_coll_base_allocate_op_tmpbuf(sendbuf, recvbuf, size, NULL, NULL, device, module); } /** @@ -263,11 +270,11 @@ void* ompi_coll_base_allocate_tmpbuf( * or ompi_coll_base_allocate_tmpbuf. 
*/ static inline -void ompi_coll_base_free_tmpbuf(void *tmpbuf, int device) { +void ompi_coll_base_free_tmpbuf(void *tmpbuf, int device, mca_coll_base_module_t *module) { if (-1 == device) { free(tmpbuf); } else if (NULL != tmpbuf) { - opal_accelerator.mem_release(device, tmpbuf); + ompi_coll_base_free_on_device(device, tmpbuf, module); } } diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index d25bfaea857..f631242f8a8 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -449,6 +449,8 @@ typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)( */ typedef int (*opal_accelerator_base_module_wait_stream_fn_t)(opal_accelerator_stream_t *stream); +typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_devices); + /* * the standard public API data structure */ @@ -483,6 +485,8 @@ typedef struct { opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id; opal_accelerator_base_module_wait_stream_fn_t wait_stream; + + opal_accelerator_base_module_get_num_devices_fn_t num_devices; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 03020583224..c8e22473f11 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -55,6 +55,8 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream); +static int accelerator_cuda_get_num_devices(int *num_devices); + opal_accelerator_base_module_t opal_accelerator_cuda_module = { &opal_accelerator_cuda_default_stream.base, @@ -85,18 +87,17 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_buffer_id, - accelerator_cuda_wait_stream + accelerator_cuda_wait_stream, + accelerator_cuda_get_num_devices }; static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { /* query the device from the context */ int dev_id = -1; CUdevice ptr_dev; - int num_devices; cuCtxPushCurrent(mem_ctx); cuCtxGetDevice(&ptr_dev); - cuDeviceGetCount(&num_devices); - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { CUdevice dev; cuDeviceGet(&dev, i); if (dev == ptr_dev) { @@ -819,4 +820,11 @@ static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream) return OPAL_ERROR; } return OPAL_SUCCESS; +} + + +static int accelerator_cuda_get_num_devices(int *num_devices) +{ + *num_devices = opal_accelerator_cuda_num_devices; + return OPAL_SUCCESS; } \ No newline at end of file diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 7aff2530973..a3c4d29acef 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -38,12 +38,6 @@ struct opal_accelerator_cuda_event_t { typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); -struct opal_accelerator_cuda_mempool_t { - opal_accelerator_mempool_t base; -}; -typedef struct opal_accelerator_cuda_mempool_t opal_accelerator_cuda_mempool_t; -OBJ_CLASS_DECLARATION(opal_accelerator_cuda_mempool_t); - /* Declare extern variables, defined in accelerator_cuda_component.c */ OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_alloc_stream; @@ -54,7 +48,7 @@ OPAL_DECLSPEC 
extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module; -OPAL_DECLSPEC extern cudaMemPool_t *opal_accelerator_cuda_mempools; +OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices; OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index 416f7970704..3f200bce55c 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -38,12 +38,12 @@ CUstream opal_accelerator_cuda_memcpy_stream = NULL; CUstream opal_accelerator_cuda_alloc_stream = NULL; opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream; opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; +int opal_accelerator_cuda_num_devices = 0; /* Initialization lock for delayed cuda initialization */ static opal_mutex_t accelerator_cuda_init_lock; static bool accelerator_cuda_init_complete = false; -cudaMemPool_t *opal_accelerator_cuda_mempool; #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) @@ -156,6 +156,8 @@ int opal_accelerator_cuda_delayed_init() opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded"); } + cuDeviceGetCount(&opal_accelerator_cuda_num_devices); + /* Create stream for use in cuMemcpyAsync synchronous copies */ result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { @@ -264,38 +266,3 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) OBJ_DESTRUCT(&accelerator_cuda_init_lock); return; } - -#if 0 -static int opal_acclerator_cuda_init_mempools() { - cudaError_t result; - int num_devices; - cuDeviceGetCount(&num_devices); - if (num_devices == 0) { - return OPAL_SUCCESS; - } - opal_accelerator_cuda_mempools = malloc(num_devices*sizeof(*mp)); - cudaMemPoolProps pp; - memset(&pp, 0, sizeof(pp)); - pp.allocType = cudaMemAllocationTypePinned; - pp.handleTypes = cudaMemHandleTypeNone; - pp.location.id = devidx; - pp.location.type = cudaMemLocationTypeDevice; - for (int i = 0; i < num_devices; ++i) { - - result = cudaMemPoolCreate(mp, &pp); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cudaMemPoolCreate failed", true, - OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } - - } - *mpool = OBJ_NEW(opal_accelerator_cuda_mempool_t); - if (NULL == *mpool) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - mpool->mpool = mp; - - return OPAL_SUCCESS; -} -#endif \ No newline at end of file diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 93729822bf7..06eb7bffd37 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -68,6 +68,8 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream); +static int accelerator_null_get_num_devices(int *num_devices); + /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -136,7 +138,8 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_get_buffer_id, - accelerator_null_wait_stream + accelerator_null_wait_stream, + accelerator_null_get_num_devices }; 
static int accelerator_null_open(void) @@ -283,4 +286,10 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream) { return OPAL_SUCCESS; +} + +static int accelerator_null_get_num_devices(int *num_devices) +{ + *num_devices = 0; + return OPAL_SUCCESS; } \ No newline at end of file diff --git a/opal/mca/allocator/devicebucket/Makefile.am b/opal/mca/allocator/devicebucket/Makefile.am new file mode 100644 index 00000000000..466aad2671a --- /dev/null +++ b/opal/mca/allocator/devicebucket/Makefile.am @@ -0,0 +1,49 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_opaldata_DATA = help-mca-allocator-devicebucket.txt + +sources = \ + allocator_devicebucket.c \ + allocator_devicebucket_alloc.c \ + allocator_devicebucket_alloc.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_opal_allocator_devicebucket_DSO +component_noinst = +component_install = mca_allocator_devicebucket.la +else +component_noinst = libmca_allocator_devicebucket.la +component_install = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_allocator_devicebucket_la_SOURCES = $(sources) +mca_allocator_devicebucket_la_LDFLAGS = -module -avoid-version +mca_allocator_devicebucket_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la + +noinst_LTLIBRARIES = $(component_noinst) +libmca_allocator_devicebucket_la_SOURCES = $(sources) +libmca_allocator_devicebucket_la_LDFLAGS = -module -avoid-version + diff --git a/opal/mca/allocator/devicebucket/allocator_devicebucket.c b/opal/mca/allocator/devicebucket/allocator_devicebucket.c new file mode 100644 index 00000000000..d5b4b0a9f54 --- /dev/null +++ b/opal/mca/allocator/devicebucket/allocator_devicebucket.c @@ -0,0 +1,138 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" +#include "opal/constants.h" +#include "opal/mca/allocator/allocator.h" +#include "opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.h" +#include "opal/mca/base/mca_base_var.h" + +OBJ_CLASS_INSTANCE(mca_allocator_devicebucket_chunk_t, opal_list_item_t, NULL, NULL); + +struct mca_allocator_base_module_t *mca_allocator_devicebucket_module_init( + bool enable_mpi_threads, mca_allocator_base_component_segment_alloc_fn_t segment_alloc, + mca_allocator_base_component_segment_free_fn_t segment_free, void *context); + +int mca_allocator_devicebucket_module_open(void); + +int mca_allocator_devicebucket_module_close(void); + +void *mca_allocator_devicebucket_alloc_wrapper(struct mca_allocator_base_module_t *allocator, size_t size, + size_t align); + +static size_t mca_allocator_min_cache_size; +static size_t mca_allocator_max_cache_size; + +int mca_allocator_devicebucket_finalize(struct mca_allocator_base_module_t *allocator) +{ + mca_allocator_devicebucket_t *mem_options = (mca_allocator_devicebucket_t *) allocator; + + mca_allocator_devicebucket_cleanup(allocator); + + OBJ_DESTRUCT(&mem_options->used_chunks); + + free(mem_options->buckets); + free(allocator); + + return (OPAL_SUCCESS); +} + +struct mca_allocator_base_module_t *mca_allocator_devicebucket_module_init( + bool enable_mpi_threads, mca_allocator_base_component_segment_alloc_fn_t segment_alloc, + mca_allocator_base_component_segment_free_fn_t segment_free, void *context) +{ + size_t alloc_size = sizeof(mca_allocator_devicebucket_t); + mca_allocator_devicebucket_t *retval; + mca_allocator_devicebucket_t *allocator = (mca_allocator_devicebucket_t *) malloc(alloc_size); + if (NULL == allocator) { + return NULL; + } + retval = mca_allocator_devicebucket_init((mca_allocator_base_module_t *) allocator, + mca_allocator_min_cache_size, mca_allocator_min_cache_size, + segment_alloc, segment_free); + if (NULL == retval) { + free(allocator); + return NULL; + } + allocator->super.alc_alloc = mca_allocator_devicebucket_alloc_wrapper; + //allocator->super.alc_realloc = mca_allocator_devicebucket_realloc; + allocator->super.alc_realloc = NULL; // not supported + allocator->super.alc_free = mca_allocator_devicebucket_free; + allocator->super.alc_compact = mca_allocator_devicebucket_cleanup; + allocator->super.alc_finalize = mca_allocator_devicebucket_finalize; + allocator->super.alc_context = context; + return (mca_allocator_base_module_t *) allocator; +} + +static int mca_allocator_devicebucket_module_register(void) +{ + mca_allocator_min_cache_size = 4*1024; // 4K + mca_allocator_max_cache_size = 1*1024*1024; // 1M + (void) mca_base_component_var_register(&mca_allocator_devicebucket_component.allocator_version, + "min_cache_size", "Minimum allocation cache size", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_allocator_min_cache_size); + + (void) mca_base_component_var_register(&mca_allocator_devicebucket_component.allocator_version, + "max_cache_size", + "Maximum allocation cache size. 
Larger allocations will not be cached.", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_allocator_max_cache_size); + return OPAL_SUCCESS; +} + +int mca_allocator_devicebucket_module_open(void) +{ + return OPAL_SUCCESS; +} + +int mca_allocator_devicebucket_module_close(void) +{ + return OPAL_SUCCESS; +} + +void *mca_allocator_devicebucket_alloc_wrapper(struct mca_allocator_base_module_t *allocator, size_t size, + size_t align) +{ + if (0 == align) { + return mca_allocator_devicebucket_alloc(allocator, size); + } + return mca_allocator_devicebucket_alloc_align(allocator, size, align); +} + +mca_allocator_base_component_t mca_allocator_devicebucket_component = { + + /* First, the mca_base_module_t struct containing meta information + about the module itself */ + + {MCA_ALLOCATOR_BASE_VERSION_2_0_0, + + "devicebucket", /* MCA module name */ + OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, OPAL_RELEASE_VERSION, + mca_allocator_devicebucket_module_open, /* module open */ + mca_allocator_devicebucket_module_close, /* module close */ + NULL, mca_allocator_devicebucket_module_register}, + {/* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT}, + mca_allocator_devicebucket_module_init}; diff --git a/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c new file mode 100644 index 00000000000..f3b6df4559c --- /dev/null +++ b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 IBM Corp., All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" +#include "opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.h" +#include "opal/constants.h" +#include "opal/util/show_help.h" + +/** + * The define controls the size in bytes of the 1st bucket and hence every one + * afterwards. + */ +#define MCA_ALLOCATOR_BUCKET_1_SIZE 8 +/** + * This is the number of left bit shifts from 1 needed to get to the number of + * bytes in the initial memory buckets + */ +#define MCA_ALLOCATOR_BUCKET_1_BITSHIFTS 3 + +static int max_devicebucket_idx; + +/* + * Initializes the mca_allocator_devicebucket_options_t data structure for the passed + * parameters. 
+ */ +mca_allocator_devicebucket_t * +mca_allocator_devicebucket_init(mca_allocator_base_module_t *mem, + size_t min_cache_size, size_t max_cache_size, + mca_allocator_base_component_segment_alloc_fn_t get_mem_funct, + mca_allocator_base_component_segment_free_fn_t free_mem_funct) +{ + mca_allocator_devicebucket_t *mem_options = (mca_allocator_devicebucket_t *) mem; + size_t size; + /* if a bad value is used for the number of buckets, default to 30 */ + int num_buckets = 1; + /* round min_cache_size down to pow2 */ + size = 1; + while (size < min_cache_size) { + size <<= 1; + } + min_cache_size = size; + while (size < max_cache_size) { + size <<= 1; + num_buckets++; + } + + max_devicebucket_idx = num_buckets - 1; + + /* initialize the array of buckets */ + size = sizeof(mca_allocator_devicebucket_bucket_t) * num_buckets; + mem_options->buckets = (mca_allocator_devicebucket_bucket_t *) malloc(size); + if (NULL == mem_options->buckets) { + return (NULL); + } + for (int i = 0; i < num_buckets; i++) { + OBJ_CONSTRUCT(&(mem_options->buckets[i].super), opal_lifo_t); + mem_options->buckets[i].size = (min_cache_size << i); + } + mem_options->num_buckets = num_buckets; + mem_options->get_mem_fn = get_mem_funct; + mem_options->free_mem_fn = free_mem_funct; + mem_options->min_cache_size = min_cache_size; + OBJ_CONSTRUCT(&mem_options->used_chunks, opal_hash_table_t); + opal_hash_table_init(&mem_options->used_chunks, 32); + OBJ_CONSTRUCT(&(mem_options->used_chunks_lock), opal_mutex_t); + return (mem_options); +} + +/* + * Accepts a request for memory in a specific region defined by the + * mca_allocator_devicebucket_options_t struct and returns a pointer to memory in that + * region or NULL if there was an error + * + */ +void *mca_allocator_devicebucket_alloc(mca_allocator_base_module_t *mem, size_t size) +{ + mca_allocator_devicebucket_t *mem_options = (mca_allocator_devicebucket_t *) mem; + /* initialize for the later bit shifts */ + int bucket_num = 0; + size_t bucket_size = mem_options->min_cache_size; + mca_allocator_devicebucket_chunk_t *chunk; + + /* figure out which bucket it will come from. 
*/ + while (size > bucket_size) { + bucket_num++; + bucket_size <<= 1; + } + + if (bucket_num >= mem_options->num_buckets) { + /* allocate directly */ + return mem_options->get_mem_fn(mem_options->super.alc_context, &bucket_size); + } + + /* see if there is already a free chunk */ + chunk = (mca_allocator_devicebucket_chunk_t *)opal_lifo_pop(&(mem_options->buckets[bucket_num].super)); + if (NULL == chunk) { + /* create a new allocation */ + chunk = OBJ_NEW(mca_allocator_devicebucket_chunk_t); + if (NULL == chunk) { + return NULL; + } + chunk->addr = mem_options->get_mem_fn(mem_options->super.alc_context, &bucket_size); + } + /* store the chunk in the hash table so we can find it during free */ + OPAL_THREAD_LOCK(&(mem_options->used_chunks_lock)); + opal_hash_table_set_value_uint64(&(mem_options->used_chunks), (uint64_t)chunk->addr, chunk); + OPAL_THREAD_UNLOCK(&(mem_options->used_chunks_lock)); + chunk->size = bucket_size; + return chunk->addr; +} + +/* + * allocates an aligned region of memory + */ +void *mca_allocator_devicebucket_alloc_align(mca_allocator_base_module_t *mem, size_t size, + size_t alignment) +{ + return mca_allocator_devicebucket_alloc(mem, size); +} + +/* + * function to reallocate the segment of memory + */ +void *mca_allocator_devicebucket_realloc(mca_allocator_base_module_t *mem, void *ptr, size_t size) +{ + mca_allocator_devicebucket_t *mem_options = (mca_allocator_devicebucket_t *) mem; + // TODO: do something nice here + return NULL; +} + +/* + * Frees the passed region of memory + * + */ +void mca_allocator_devicebucket_free(mca_allocator_base_module_t *mem, void *ptr) +{ + mca_allocator_devicebucket_t *mem_options = (mca_allocator_devicebucket_t *) mem; + size_t bucket_size = mem_options->min_cache_size; + size_t allocated_size; + int bucket_num = 0; + mca_allocator_devicebucket_chunk_t *chunk; + + OPAL_THREAD_LOCK(&(mem_options->used_chunks_lock)); + opal_hash_table_get_value_uint64(&(mem_options->used_chunks), (uint64_t)ptr, (void**)&chunk); + opal_hash_table_remove_value_uint64(&(mem_options->used_chunks), (uint64_t)ptr); + OPAL_THREAD_UNLOCK(&(mem_options->used_chunks_lock)); + size_t size = chunk->size; + + /* figure out which bucket to put the chunk into. */ + while (size > bucket_size) { + bucket_num++; + bucket_size <<= 1; + } + + /* push into lifo */ + opal_lifo_push(&(mem_options->buckets[bucket_num].super), &chunk->super); +} + +/* + * Frees all the memory from all the buckets back to the system. Note that + * this function only frees memory that was previously freed with + * mca_allocator_devicebucket_free(). 
+ * + */ +int mca_allocator_devicebucket_cleanup(mca_allocator_base_module_t *mem) +{ + mca_allocator_devicebucket_t *mem_options = (mca_allocator_devicebucket_t *) mem; + mca_allocator_devicebucket_chunk_t *chunk; + + for (int i = 0; i < mem_options->num_buckets; i++) { + while (NULL != (chunk = (mca_allocator_devicebucket_chunk_t *)opal_lifo_pop(&(mem_options->buckets[i].super)))) { + if (mem_options->free_mem_fn) { + mem_options->free_mem_fn(mem->alc_context, chunk->addr); + } + OBJ_RELEASE(chunk); + } + } + return OPAL_SUCCESS; +} diff --git a/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.h b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.h new file mode 100644 index 00000000000..b313bd91d6f --- /dev/null +++ b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.h @@ -0,0 +1,190 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * A generic memory bucket allocator. + **/ + +#ifndef ALLOCATOR_DEVICEBUCKET_ALLOC_H +#define ALLOCATOR_DEVICEBUCKET_ALLOC_H + +#include "opal_config.h" +#include "opal/mca/allocator/allocator.h" +#include "opal/mca/threads/mutex.h" +#include "opal/class/opal_lifo.h" +#include "opal/class/opal_hash_table.h" +#include +#include + +BEGIN_C_DECLS + +/** + * Structure for the header of each memory chunk + */ +struct mca_allocator_devicebucket_chunk_t { + opal_list_item_t super; + void *addr; // address + size_t size; +}; + +/** + * Typedef so we don't have to use struct + */ +typedef struct mca_allocator_devicebucket_chunk_t mca_allocator_devicebucket_chunk_t; + +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_allocator_devicebucket_chunk_t); + +struct mca_allocator_devicebucket_bucket_t { + opal_lifo_t super; + size_t size; +}; + +/** + * Typedef so we don't have to use struct + */ +typedef struct mca_allocator_devicebucket_bucket_t mca_allocator_devicebucket_bucket_t; + +/** + * Structure that holds the necessary information for each area of memory + */ +struct mca_allocator_devicebucket_t { + mca_allocator_base_module_t super; /**< makes this a child of class mca_allocator_t */ + mca_allocator_devicebucket_bucket_t *buckets; /**< the array of buckets */ + int num_buckets; /**< the number of buckets */ + opal_hash_table_t used_chunks; + opal_mutex_t used_chunks_lock; + size_t min_cache_size; + mca_allocator_base_component_segment_alloc_fn_t get_mem_fn; + /**< pointer to the function to get more memory */ + mca_allocator_base_component_segment_free_fn_t free_mem_fn; + /**< pointer to the function to free memory */ +}; +/** + * Typedef so we don't have to use struct + */ +typedef struct mca_allocator_devicebucket_t mca_allocator_devicebucket_t; + +/** + * Initializes the mca_allocator_devicebucket_options_t data structure for the passed + * parameters. 
+ * @param mem a pointer to the mca_allocator_t struct to be filled in + * @param num_buckets The number of buckets the allocator will use + * @param get_mem_funct A pointer to the function that the allocator + * will use to get more memory + * @param free_mem_funct A pointer to the function that the allocator + * will use to free memory + * + * @retval Pointer to the initialized mca_allocator_devicebucket_options_t structure + * @retval NULL if there was an error + */ +mca_allocator_devicebucket_t * +mca_allocator_devicebucket_init(mca_allocator_base_module_t *mem, + size_t min_cache_size, size_t max_cache_size, + mca_allocator_base_component_segment_alloc_fn_t get_mem_funct, + mca_allocator_base_component_segment_free_fn_t free_mem_funct); +/** + * Accepts a request for memory in a specific region defined by the + * mca_allocator_devicebucket_options_t struct and returns a pointer to memory in that + * region or NULL if there was an error + * + * @param mem A pointer to the appropriate struct for the area of memory. + * @param size The size of the requested area of memory + * + * @retval Pointer to the area of memory if the allocation was successful + * @retval NULL if the allocation was unsuccessful + */ +void *mca_allocator_devicebucket_alloc(mca_allocator_base_module_t *mem, size_t size); + +/** + * Accepts a request for memory in a specific region defined by the + * mca_allocator_devicebucket_options_t struct and aligned by the specified amount and + * returns a pointer to memory in that region or NULL if there was an error + * + * @param mem A pointer to the appropriate struct for the area of + * memory. + * @param size The size of the requested area of memory + * @param alignment The requested alignment of the new area of memory. This + * MUST be a power of 2. + * + * @retval Pointer to the area of memory if the allocation was successful + * @retval NULL if the allocation was unsuccessful + * + */ +void *mca_allocator_devicebucket_alloc_align(mca_allocator_base_module_t *mem, size_t size, + size_t alignment); + +/** + * Attempts to resize the passed region of memory into a larger or a smaller + * region. If it is unsuccessful, it will return NULL and the passed area of + * memory will be untouched. + * + * @param mem A pointer to the appropriate struct for the area of + * memory. + * @param size The size of the requested area of memory + * @param ptr A pointer to the region of memory to be resized + * + * @retval Pointer to the area of memory if the reallocation was successful + * @retval NULL if the allocation was unsuccessful + * + */ +void *mca_allocator_devicebucket_realloc(mca_allocator_base_module_t *mem, void *ptr, size_t size); + +/** + * Frees the passed region of memory + * + * @param mem A pointer to the appropriate struct for the area of + * memory. + * @param ptr A pointer to the region of memory to be freed + * + * @retval None + * + */ +void mca_allocator_devicebucket_free(mca_allocator_base_module_t *mem, void *ptr); + +/** + * Frees all the memory from all the buckets back to the system. Note that + * this function only frees memory that was previously freed with + * mca_allocator_devicebucket_free(). + * + * @param mem A pointer to the appropriate struct for the area of + * memory. + * + * @retval None + * + */ +int mca_allocator_devicebucket_cleanup(mca_allocator_base_module_t *mem); + +/** + * Cleanup all resources held by this allocator. + * + * @param mem A pointer to the appropriate struct for the area of + * memory. 
+ * + * @retval None + * + */ +int mca_allocator_devicebucket_finalize(mca_allocator_base_module_t *mem); + +OPAL_DECLSPEC extern mca_allocator_base_component_t mca_allocator_devicebucket_component; + +END_C_DECLS + +#endif /* ALLOCATOR_DEVICEBUCKET_ALLOC_H */ diff --git a/opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt b/opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt new file mode 100644 index 00000000000..27edbb92fa4 --- /dev/null +++ b/opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt @@ -0,0 +1,19 @@ +# -*- text -*- +# +# Copyright (c) 2021 IBM Corporation. All rights reserved +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's allocator bucket support +# +[buffer too large] +ERROR: Requested buffer size %zu exceeds limit of %zu +Consider setting "%s" to %d +# +[aligned buffer too large] +ERROR: Requested aligned buffer size %zu exceeds limit of %zu +Consider setting "%s" to %d +# diff --git a/opal/mca/allocator/devicebucket/owner.txt b/opal/mca/allocator/devicebucket/owner.txt new file mode 100644 index 00000000000..c47a2d510b1 --- /dev/null +++ b/opal/mca/allocator/devicebucket/owner.txt @@ -0,0 +1,7 @@ +# +# owner/status file +# owner: institution that is responsible for this package +# status: e.g. active, maintenance, unmaintained +# +owner: UTK +status: maintenance From bbd362d0481f996fec2ebc1e53036e6a5aba6313 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 12 Apr 2023 16:48:13 -0400 Subject: [PATCH 15/74] Fix devicebucket allocator for larger sizes Signed-off-by: Joseph Schuchart --- .../devicebucket/allocator_devicebucket.c | 3 +- .../allocator_devicebucket_alloc.c | 40 +++++++++++++------ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/opal/mca/allocator/devicebucket/allocator_devicebucket.c b/opal/mca/allocator/devicebucket/allocator_devicebucket.c index d5b4b0a9f54..17bd82b408e 100644 --- a/opal/mca/allocator/devicebucket/allocator_devicebucket.c +++ b/opal/mca/allocator/devicebucket/allocator_devicebucket.c @@ -67,7 +67,7 @@ struct mca_allocator_base_module_t *mca_allocator_devicebucket_module_init( return NULL; } retval = mca_allocator_devicebucket_init((mca_allocator_base_module_t *) allocator, - mca_allocator_min_cache_size, mca_allocator_min_cache_size, + mca_allocator_min_cache_size, mca_allocator_max_cache_size, segment_alloc, segment_free); if (NULL == retval) { free(allocator); @@ -115,6 +115,7 @@ int mca_allocator_devicebucket_module_close(void) void *mca_allocator_devicebucket_alloc_wrapper(struct mca_allocator_base_module_t *allocator, size_t size, size_t align) { + //printf("mca_allocator_devicebucket_alloc_wrapper size %zu align %zu\n", size, align); if (0 == align) { return mca_allocator_devicebucket_alloc(allocator, size); } diff --git a/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c index f3b6df4559c..cb38d50ff56 100644 --- a/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c +++ b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c @@ -59,7 +59,7 @@ mca_allocator_devicebucket_init(mca_allocator_base_module_t *mem, size <<= 1; num_buckets++; } - + //printf("min_cache_size %zu max_cache_size %zu num_buckets %d\n", min_cache_size, max_cache_size, num_buckets); max_devicebucket_idx = num_buckets - 1; /* initialize the array of buckets */ @@ -102,26 +102,30 @@ void 
*mca_allocator_devicebucket_alloc(mca_allocator_base_module_t *mem, size_t bucket_size <<= 1; } + //printf("mca_allocator_devicebucket_alloc checking bucket %d of %d for size %d\n", bucket_num, mem_options->num_buckets, bucket_size); if (bucket_num >= mem_options->num_buckets) { /* allocate directly */ - return mem_options->get_mem_fn(mem_options->super.alc_context, &bucket_size); - } - - /* see if there is already a free chunk */ - chunk = (mca_allocator_devicebucket_chunk_t *)opal_lifo_pop(&(mem_options->buckets[bucket_num].super)); - if (NULL == chunk) { - /* create a new allocation */ chunk = OBJ_NEW(mca_allocator_devicebucket_chunk_t); + chunk->addr = mem_options->get_mem_fn(mem_options->super.alc_context, &size); + chunk->size = size; + } else { + /* see if there is already a free chunk */ + chunk = (mca_allocator_devicebucket_chunk_t *)opal_lifo_pop(&(mem_options->buckets[bucket_num].super)); if (NULL == chunk) { - return NULL; + /* create a new allocation */ + chunk = OBJ_NEW(mca_allocator_devicebucket_chunk_t); + if (NULL == chunk) { + return NULL; + } + chunk->addr = mem_options->get_mem_fn(mem_options->super.alc_context, &bucket_size); + chunk->size = bucket_size; } - chunk->addr = mem_options->get_mem_fn(mem_options->super.alc_context, &bucket_size); } /* store the chunk in the hash table so we can find it during free */ OPAL_THREAD_LOCK(&(mem_options->used_chunks_lock)); opal_hash_table_set_value_uint64(&(mem_options->used_chunks), (uint64_t)chunk->addr, chunk); OPAL_THREAD_UNLOCK(&(mem_options->used_chunks_lock)); - chunk->size = bucket_size; + //printf("Allocated chunk %p for address %p\n", chunk, chunk->addr); return chunk->addr; } @@ -158,6 +162,11 @@ void mca_allocator_devicebucket_free(mca_allocator_base_module_t *mem, void *ptr OPAL_THREAD_LOCK(&(mem_options->used_chunks_lock)); opal_hash_table_get_value_uint64(&(mem_options->used_chunks), (uint64_t)ptr, (void**)&chunk); + if (NULL == chunk) { + printf("Couldn't find chunk for address %p\n", ptr); + OPAL_THREAD_UNLOCK(&(mem_options->used_chunks_lock)); + return; + } opal_hash_table_remove_value_uint64(&(mem_options->used_chunks), (uint64_t)ptr); OPAL_THREAD_UNLOCK(&(mem_options->used_chunks_lock)); size_t size = chunk->size; @@ -168,8 +177,13 @@ void mca_allocator_devicebucket_free(mca_allocator_base_module_t *mem, void *ptr bucket_size <<= 1; } - /* push into lifo */ - opal_lifo_push(&(mem_options->buckets[bucket_num].super), &chunk->super); + if (bucket_num > mem_options->num_buckets) { + mem_options->free_mem_fn(mem_options->super.alc_context, ptr); + OBJ_RELEASE(chunk); + } else { + /* push into lifo */ + opal_lifo_push(&(mem_options->buckets[bucket_num].super), &chunk->super); + } } /* From 1fd66365f8c7b71306546be3f26628ce5610a534 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 14 Apr 2023 01:28:09 -0400 Subject: [PATCH 16/74] Fix the RDMA fallback protocol selection. If the target process is unable to execute an RDMA operation it instructs the origin to change the communication protocol. When this happens, the origin must be informed to cancel all pending RDMA operations and release the rdma_frag. 
Signed-off-by: George Bosilca --- ompi/mca/pml/ob1/pml_ob1_accelerator.c | 3 ++- ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 2 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_accelerator.c b/ompi/mca/pml/ob1/pml_ob1_accelerator.c index 737560db302..5526b3b3cbd 100644 --- a/ompi/mca/pml/ob1/pml_ob1_accelerator.c +++ b/ompi/mca/pml/ob1/pml_ob1_accelerator.c @@ -475,7 +475,7 @@ int mca_pml_ob1_accelerator_need_buffers(void * rreq, * best thing, but this may go away if CUDA IPC is supported everywhere in the * future. */ void mca_pml_ob1_accelerator_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t flags, - ompi_proc_t* errproc, char* btlinfo) + ompi_proc_t* errproc, char* btlinfo) { mca_bml_base_endpoint_t* ep; int btl_verbose_stream = 0; @@ -502,6 +502,7 @@ void mca_pml_ob1_accelerator_add_ipc_support(struct mca_btl_base_module_t* btl, free(errhost); } ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_ACCELERATOR_GET; + break; } } } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index a3db1458938..695da3e81ee 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -1266,7 +1266,7 @@ void mca_pml_ob1_recv_frag_callback_cid (mca_btl_base_module_t* btl, ob1_hdr_ntoh (hdr, hdr->hdr_common.hdr_type); - /* NTH: this should be ok as as all BTLs create a dummy segment */ + /* NTH: this should be ok as all BTLs create a dummy segment */ segments->seg_len -= offsetof (mca_pml_ob1_ext_match_hdr_t, hdr_match); segments->seg_addr.pval = (void *) hdr_match; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index e67202868e8..8b87bf18256 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -393,7 +393,7 @@ static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *fr /* tell peer to fall back on send for this region */ rc = mca_pml_ob1_recv_request_ack_send(NULL, proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval, - recvreq, frag->rdma_offset, frag->rdma_length, false); + recvreq, frag->rdma_offset, frag->rdma_length, true); MCA_PML_OB1_RDMA_FRAG_RETURN(frag); return rc; } From f2f0f2d8feccb29335468d12efab5fc021f68910 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 14 Apr 2023 13:48:28 -0400 Subject: [PATCH 17/74] Stream-based reduction and ddt copy and 3buff cuda kernels, adopted for allreduce recursive doubling Signed-off-by: Joseph Schuchart --- ompi/datatype/ompi_datatype.h | 16 +- ompi/mca/coll/base/coll_base_allreduce.c | 79 ++++- ompi/mca/op/base/op_base_op_select.c | 1 + ompi/mca/op/cuda/op_cuda.h | 5 +- ompi/mca/op/cuda/op_cuda_component.c | 2 +- ompi/mca/op/cuda/op_cuda_functions.c | 309 ++++++++---------- ompi/mca/op/cuda/op_cuda_impl.h | 16 - ompi/mca/op/op.h | 52 +++ ompi/op/op.h | 104 +++++- opal/datatype/opal_datatype.h | 5 + opal/datatype/opal_datatype_copy.c | 39 ++- opal/datatype/opal_datatype_copy.h | 41 ++- opal/mca/accelerator/accelerator.h | 36 +- opal/mca/accelerator/cuda/accelerator_cuda.c | 50 ++- opal/mca/accelerator/cuda/accelerator_cuda.h | 2 +- .../cuda/accelerator_cuda_component.c | 19 +- .../null/accelerator_null_component.c | 20 +- 17 files changed, 545 insertions(+), 251 deletions(-) diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 0c77079b916..5069b7e90a5 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -275,8 +275,9 @@ 
ompi_datatype_set_element_count( const ompi_datatype_t* type, size_t count, size } static inline int32_t -ompi_datatype_copy_content_same_ddt( const ompi_datatype_t* type, size_t count, - char* pDestBuf, char* pSrcBuf ) +ompi_datatype_copy_content_same_ddt_stream( const ompi_datatype_t* type, size_t count, + char* pDestBuf, char* pSrcBuf, + opal_accelerator_stream_t *stream ) { int32_t length, rc; ptrdiff_t extent; @@ -285,8 +286,8 @@ ompi_datatype_copy_content_same_ddt( const ompi_datatype_t* type, size_t count, while( 0 != count ) { length = INT_MAX; if( ((size_t)length) > count ) length = (int32_t)count; - rc = opal_datatype_copy_content_same_ddt( &type->super, length, - pDestBuf, pSrcBuf ); + rc = opal_datatype_copy_content_same_ddt_stream( &type->super, length, + pDestBuf, pSrcBuf, stream ); if( 0 != rc ) return rc; pDestBuf += ((ptrdiff_t)length) * extent; pSrcBuf += ((ptrdiff_t)length) * extent; @@ -295,6 +296,13 @@ ompi_datatype_copy_content_same_ddt( const ompi_datatype_t* type, size_t count, return 0; } +static inline int32_t +ompi_datatype_copy_content_same_ddt( const ompi_datatype_t* type, size_t count, + char* pDestBuf, char* pSrcBuf ) +{ + return ompi_datatype_copy_content_same_ddt_stream(type, count, pDestBuf, pSrcBuf, NULL); +} + OMPI_DECLSPEC const ompi_datatype_t* ompi_datatype_match_size( int size, uint16_t datakind, uint16_t datalang ); /* diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 104da625bba..55c6a3203a3 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -164,15 +164,22 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } inplacebuf = inplacebuf_free - gap; + opal_accelerator_stream_t *stream; + opal_accelerator.get_default_stream(inplacebuf_dev, &stream); + + if (MPI_IN_PLACE == sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); + ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)rbuf, stream); if (ret < 0) { line = __LINE__; goto error_hndl; } + tmpsend = (char*) inplacebuf; } else { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf); + tmpsend = (char*) sbuf; +#if 0 + ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); if (ret < 0) { line = __LINE__; goto error_hndl; } +#endif // 0 } - tmpsend = (char*) inplacebuf; tmprecv = (char*) rbuf; /* Determine nearest power of two less than or equal to size */ @@ -189,6 +196,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, extra_ranks = size - adjsize; if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { + /* wait for tmpsend to be copied */ + opal_accelerator.wait_stream(stream); ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); @@ -199,8 +208,21 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + if (tmpsend == sbuf) { + tmpsend = inplacebuf; + /* tmpsend = tmprecv (op) sbuf */ + ompi_3buff_op_reduce_stream(op, sbuf, tmprecv, tmpsend, count, dtype, stream); + } else { + /* tmpsend = tmprecv (op) tmpsend */ + ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); + } +#if 0 + ret = 
ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); + if (ret < 0) { line = __LINE__; goto error_hndl; } + tmpsend = inplacebuf; /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); + ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); +#endif // 0 newrank = rank >> 1; } } else { @@ -219,6 +241,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, remote = (newremote < extra_ranks)? (newremote * 2 + 1):(newremote + extra_ranks); + /* wait for previous ops to complete */ + opal_accelerator.wait_stream(stream); /* Exchange the data */ ret = ompi_coll_base_sendrecv_actual(tmpsend, count, dtype, remote, MCA_COLL_BASE_TAG_ALLREDUCE, @@ -229,14 +253,41 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Apply operation */ if (rank < remote) { - /* tmprecv = tmpsend (op) tmprecv */ - ompi_op_reduce(op, tmpsend, tmprecv, count, dtype); - tmpswap = tmprecv; - tmprecv = tmpsend; - tmpsend = tmpswap; + /* TODO: use the 3buff variant here to avoid the copy */ + if (tmpsend == sbuf) { + /* tmprecv = sbuf (op) tmprecv */ + ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, stream); + /* send the current recv buffer, and use the tmp buffer to receive */ + tmpsend = tmprecv; + tmprecv = inplacebuf; +#if 0 + ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)rbuf, stream); + if (ret < 0) { line = __LINE__; goto error_hndl; } + tmprecv = inplacebuf; +#endif // 0 + } else { + /* tmprecv = tmpsend (op) tmprecv */ + ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, stream); + /* swap send and receive buffers */ + tmpswap = tmprecv; + tmprecv = tmpsend; + tmpsend = tmpswap; + } } else { - /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); + /* use the 3buff variant here to avoid the copy */ + if (tmpsend == sbuf) { + /* tmpsend = tmprecv (op) sbuf */ + tmpsend = inplacebuf; + ompi_3buff_op_reduce_stream(op, tmprecv, sbuf, tmpsend, count, dtype, stream); +#if 0 + ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); + if (ret < 0) { line = __LINE__; goto error_hndl; } + tmpsend = inplacebuf; +#endif // 0 + } else { + /* tmpsend = tmprecv (op) tmpsend */ + ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); + } } } @@ -253,6 +304,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } tmpsend = (char*)rbuf; } else { + /* wait for previous ops to complete */ + opal_accelerator.wait_stream(stream); ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank - 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); @@ -262,10 +315,12 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Ensure that the final result is in rbuf */ if (tmpsend != rbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, tmpsend); + ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, (char*)rbuf, tmpsend, stream); if (ret < 0) { line = __LINE__; goto error_hndl; } } + /* wait for previous ops to complete */ + opal_accelerator.wait_stream(stream); ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev, module); return MPI_SUCCESS; diff --git a/ompi/mca/op/base/op_base_op_select.c b/ompi/mca/op/base/op_base_op_select.c index 09652a8ac32..344b4250a6a 
100644 --- a/ompi/mca/op/base/op_base_op_select.c +++ b/ompi/mca/op/base/op_base_op_select.c @@ -162,6 +162,7 @@ int ompi_op_base_op_select(ompi_op_t *op) if (NULL != op->o_device_op->do_intrinsic.modules[i]) { OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]); } + // TODO: function signatures don't match, fix it! op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_fns[i]; op->o_device_op->do_intrinsic.modules[i] = avail->ao_module; OBJ_RETAIN(avail->ao_module); diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index 94794da4967..0298b64b561 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -72,7 +72,10 @@ OMPI_DECLSPEC extern ompi_op_cuda_component_t mca_op_cuda_component; OMPI_DECLSPEC extern -ompi_op_base_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +ompi_op_base_stream_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +OMPI_DECLSPEC extern +ompi_op_base_3buff_stream_handler_fn_t ompi_op_cuda_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; END_C_DECLS diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index 3e2f9c3e8b3..a377e477bc7 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -175,7 +175,7 @@ cuda_component_op_query(struct ompi_op_t *op, int *priority) module->opm_device_enabled = true; for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { module->opm_fns[i] = ompi_op_cuda_functions[op->o_f_to_c_index][i]; - //module->opm_3buff_fns[i] = ompi_op_cuda_3buff_functions[op->o_f_to_c_index][i]; + module->opm_3buff_fns[i] = ompi_op_cuda_3buff_functions[op->o_f_to_c_index][i]; if( NULL != module->opm_fns[i] ) { OBJ_RETAIN(module); diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 63978b72f8e..0873ea1fc33 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -29,43 +29,55 @@ #include "ompi/mca/op/cuda/op_cuda_impl.h" -static inline void device_op_pre(const void *orig_source, +static inline void device_op_pre(const void *orig_source1, + void **source1, + int *source1_device, + const void *orig_source2, + void **source2, + int *source2_device, void *orig_target, - int count, - struct ompi_datatype_t *dtype, - void **source, - int *source_device, void **target, int *target_device, + int count, + struct ompi_datatype_t *dtype, int *threads_per_block, - int *device) + int *device, + opal_accelerator_stream_t *stream) { - uint64_t target_flags = -1, source_flags = -1; - int target_rc, source_rc; + uint64_t target_flags = -1, source1_flags = -1, source2_flags = -1; + int target_rc, source1_rc, source2_rc = -1; *target = orig_target; - *source = (void*)orig_source; + *source1 = (void*)orig_source1; + if (NULL != orig_source2) { + *source2 = (void*)orig_source2; + } target_rc = opal_accelerator.check_addr(*target, target_device, &target_flags); - source_rc = opal_accelerator.check_addr(*source, source_device, &source_flags); + source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); *device = *target_device; + if (NULL != orig_source2) { + source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); + //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", + // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, 
*source2_device, *device); + } + //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", // target_rc, *target_device, source_rc, *source_device, *device); - if (0 == target_rc && 0 == source_rc) { + if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { /* no buffers are on any device, select device 0 */ *device = 0; } else if (*target_device == -1) { - *device = *source_device; + if (*source1_device == -1 && NULL != orig_source2) { + *device = *source2_device; + } else { + *device = *source1_device; + } } -#if 0 - /* swap contexts */ - CHECK(cuCtxPushCurrent, (mca_op_cuda_component.cu_ctx[*device])); -#endif // 0 - - if (0 == target_rc || 0 == source_rc || *target_device != *source_device) { + if (0 == target_rc || 0 == source1_rc || *target_device != *source1_device) { size_t nbytes; ompi_datatype_type_size(dtype, &nbytes); nbytes *= count; @@ -73,39 +85,61 @@ static inline void device_op_pre(const void *orig_source, if (0 == target_rc) { // allocate memory on the device for the target buffer //printf("copying target from device %d to host\n", *target_device); - opal_accelerator.mem_alloc_stream(*device, target, nbytes, opal_accelerator.default_stream); - //CHECK(cuMemAllocAsync, (&dptr, nbytes, (CUstream*)opal_accelerator.default_stream->stream)); - CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*target, orig_target, nbytes, *(CUstream*)opal_accelerator.default_stream->stream)); + opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*target, orig_target, nbytes, *(CUstream*)stream->stream)); *target_device = -1; // mark target device as host } - if (0 == source_rc || *device != *source_device) { + if (0 == source1_rc || *device != *source1_device) { // allocate memory on the device for the source buffer //printf("allocating source on device %d\n", *device); - opal_accelerator.mem_alloc_stream(*device, source, nbytes, opal_accelerator.default_stream); - if (0 == source_rc) { + opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); + if (0 == source1_rc) { /* copy from host to device */ //printf("copying source from host to device %d\n", *device); - CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source, orig_source, nbytes, *(CUstream*)opal_accelerator.default_stream->stream)); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source1, orig_source1, nbytes, *(CUstream*)stream->stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ //printf("attempting cross-device copy for source\n"); - CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source, (CUdeviceptr)orig_source, nbytes, *(CUstream*)opal_accelerator.default_stream->stream)); + CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source1, (CUdeviceptr)orig_source1, nbytes, *(CUstream*)stream->stream)); } } + + } + if (NULL != source2_device && *target_device != *source2_device) { + // allocate memory on the device for the source buffer + //printf("allocating source on device %d\n", *device); + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + opal_accelerator.mem_alloc_stream(*device, source2, nbytes, stream); + if (0 == source2_rc) { + /* copy from host to device */ + //printf("copying source from host to device %d\n", *device); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source2, orig_source2, nbytes, *(CUstream*)stream->stream)); + } else { + /* copy from one device to another device */ + /* TODO: does this actually work? Can we enable P2P? 
*/ + //printf("attempting cross-device copy for source\n"); + CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source2, (CUdeviceptr)orig_source2, nbytes, *(CUstream*)stream->stream)); + } } *threads_per_block = mca_op_cuda_component.cu_max_threads_per_block[*device]; } -static inline void device_op_post(void *orig_target, - int count, - struct ompi_datatype_t *dtype, - void *source, - int source_device, +static inline void device_op_post(void *source1, + int source1_device, + void *source2, + int source2_device, + void *orig_target, void *target, int target_device, - int device) + int count, + struct ompi_datatype_t *dtype, + int device, + opal_accelerator_stream_t *stream) { if (-1 == target_device) { @@ -113,38 +147,40 @@ static inline void device_op_post(void *orig_target, ompi_datatype_type_size(dtype, &nbytes); nbytes *= count; - CHECK(cuMemcpyDtoHAsync, (orig_target, (CUdeviceptr)target, nbytes, *(CUstream *)opal_accelerator.default_stream->stream)); + CHECK(cuMemcpyDtoHAsync, (orig_target, (CUdeviceptr)target, nbytes, *(CUstream *)stream->stream)); } - /* cuMemFreeAsync is supported from CUDA 11.2.0 upwards */ if (-1 == target_device) { - opal_accelerator.mem_release_stream(device, target, opal_accelerator.default_stream); + opal_accelerator.mem_release_stream(device, target, stream); //CHECK(cuMemFreeAsync, ((CUdeviceptr)target, mca_op_cuda_component.cu_stream)); } - if (source_device != device) { - opal_accelerator.mem_release_stream(device, source, opal_accelerator.default_stream); + if (source1_device != device) { + opal_accelerator.mem_release_stream(device, source1, stream); + //CHECK(cuMemFreeAsync, ((CUdeviceptr)source, mca_op_cuda_component.cu_stream)); + } + if (NULL != source2 && source2_device != device) { + opal_accelerator.mem_release_stream(device, source2, stream); //CHECK(cuMemFreeAsync, ((CUdeviceptr)source, mca_op_cuda_component.cu_stream)); } - - /* wait for all scheduled operations to complete */ - //CHECK(cuStreamSynchronize, (mca_op_cuda_component.cu_stream)); - opal_accelerator.wait_stream(opal_accelerator.default_stream); } -#define FUNC(name, type_name, type) \ - static \ - void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) { \ - int threads_per_block; \ - int source_device, target_device, device; \ - type *source, *target; \ - int n = *count; \ - device_op_pre(in, inout, n, *dtype, (void**)&source, &source_device, (void**)&target, &target_device, \ - &threads_per_block, &device); \ - CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ - ompi_op_cuda_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, *stream); \ - device_op_post(inout, n, *dtype, source, source_device, target, target_device, device); \ +#define FUNC(name, type_name, type) \ + static \ + void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + int threads_per_block; \ + int source_device, target_device, device; \ + type *source, *target; \ + int n = *count; \ + device_op_pre(in, (void**)&source, &source_device, NULL, NULL, NULL, \ + inout, (void**)&target, &target_device, \ + n, *dtype, \ + &threads_per_block, &device, stream); \ + CUstream *custream = (CUstream*)stream->stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, *custream); \ + 
device_op_post(source, source_device, NULL, -1, inout, target, target_device, n, *dtype, device, stream);\ } #define OP_FUNC(name, type_name, type, op, ...) FUNC(name, __VA_ARGS__##type_name, __VA_ARGS__##type) @@ -162,45 +198,47 @@ static inline void device_op_post(void *orig_target, #define LOC_FUNC(name, type_name, op) FUNC(name, type_name, ompi_op_predefined_##type_name##_t) /* Dispatch Fortran types to C types */ -#define FORT_INT_FUNC(name, type_name, type) \ - static \ - void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ - struct ompi_datatype_t **dtype, \ +#define FORT_INT_FUNC(name, type_name, type) \ + static \ + void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ \ _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ switch(sizeof(type)) { \ case sizeof(int8_t): \ - ompi_op_cuda_2buff_##name##_int8_t(in, inout, count, dtype, module); \ + ompi_op_cuda_2buff_##name##_int8_t(in, inout, count, dtype, stream, module); \ break; \ case sizeof(int16_t): \ - ompi_op_cuda_2buff_##name##_int16_t(in, inout, count, dtype, module); \ + ompi_op_cuda_2buff_##name##_int16_t(in, inout, count, dtype, stream, module); \ break; \ case sizeof(int32_t): \ - ompi_op_cuda_2buff_##name##_int32_t(in, inout, count, dtype, module); \ + ompi_op_cuda_2buff_##name##_int32_t(in, inout, count, dtype, stream, module); \ break; \ case sizeof(int64_t): \ - ompi_op_cuda_2buff_##name##_int64_t(in, inout, count, dtype, module); \ + ompi_op_cuda_2buff_##name##_int64_t(in, inout, count, dtype, stream, module); \ break; \ } \ } /* Dispatch Fortran types to C types */ -#define FORT_FLOAT_FUNC(name, type_name, type) \ - static \ - void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) { \ +#define FORT_FLOAT_FUNC(name, type_name, type) \ + static \ + void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ switch(sizeof(type)) { \ case sizeof(float): \ - ompi_op_cuda_2buff_##name##_float(in, inout, count, dtype, module); \ + ompi_op_cuda_2buff_##name##_float(in, inout, count, dtype, stream, module); \ break; \ case sizeof(double): \ - ompi_op_cuda_2buff_##name##_double(in, inout, count, dtype, module); \ + ompi_op_cuda_2buff_##name##_double(in, inout, count, dtype, stream, module); \ break; \ case sizeof(long double): \ - ompi_op_cuda_2buff_##name##_long_double(in, inout, count, dtype, module); \ + ompi_op_cuda_2buff_##name##_long_double(in, inout, count, dtype, stream, module); \ break; \ } \ } @@ -739,57 +777,36 @@ LOC_FUNC(minloc, short_int, <) LOC_FUNC(minloc, long_double_int, <) -#if 0 + /* * This is a three buffer (2 input and 1 output) version of the reduction * routines, needed for some optimizations. 
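With the optional second source in place, every generated handler follows the same three steps: device_op_pre() stages any host-resident buffer onto the selected device, the *_submit() helper launches the element-wise kernel on the caller's stream, and device_op_post() copies a staged result back and releases the temporaries on that same stream. Note that the trailing wait_stream() is gone, so synchronization is now the caller's responsibility. Expanding the FUNC macro for an illustrative sum/int32_t instantiation (the names are only an example) gives roughly:

    static void ompi_op_cuda_2buff_sum_int32_t(const void *in, void *inout, int *count,
                                               struct ompi_datatype_t **dtype,
                                               opal_accelerator_stream_t *stream,
                                               struct ompi_op_base_module_1_0_0_t *module)
    {
        int threads_per_block, source_device, target_device, device;
        int32_t *source, *target;
        int n = *count;

        /* stage host buffers onto the chosen device, pick the launch geometry */
        device_op_pre(in, (void**)&source, &source_device, NULL, NULL, NULL,
                      inout, (void**)&target, &target_device,
                      n, *dtype, &threads_per_block, &device, stream);

        /* launch the element-wise kernel on the caller's stream */
        CUstream *custream = (CUstream*)stream->stream;
        ompi_op_cuda_2buff_sum_int32_t_submit(source, target, n, threads_per_block, *custream);

        /* copy a staged result back to the host and release temporaries */
        device_op_post(source, source_device, NULL, -1, inout, target, target_device,
                       n, *dtype, device, stream);
    }
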
*/ -#define OP_FUNC(name, type_name, type, op) \ - __device__ void \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type* in2, type *out, int n) { \ - int i = blockIdx.x*blockDim.x + threadIdx.x; \ - if (i < n) out[i] = in1[i] op in2[i]; \ - } \ - void ompi_op_cuda_3buff_##name##_##type_name##(const void *in1, const void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) { \ - int threads = THREADS_PER_BLOCK; \ - int blocks = *count / THREADS_PER_BLOCK; \ - type *out_ = (type*)out; \ - const type *in1_ = (const type*)in1; \ - const type *in2_ = (const type*)in2; \ - int n = *count; \ - CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel<<>>(in1_, int2_, out_, n); \ +#define FUNC_3BUF(name, type_name, type) \ + static \ + void ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + int threads_per_block; \ + int source1_device, source2_device, target_device, device; \ + type *source1, *source2, *target; \ + int n = *count; \ + device_op_pre(in1, (void**)&source1, &source1_device, \ + in2, (void**)&source2, &source2_device, \ + out, (void**)&target, &target_device, \ + n, *dtype, \ + &threads_per_block, &device, stream); \ + CUstream *custream = (CUstream*)stream->stream; \ + ompi_op_cuda_3buff_##name##_##type_name##_submit(source1, source2, target, n, threads_per_block, *custream);\ + device_op_post(source1, source1_device, source2, source2_device, out, target, target_device, n, *dtype, device, stream);\ } -/* - * Since all the functions in this file are essentially identical, we - * use a macro to substitute in names and types. The core operation - * in all functions that use this macro is the same. - * - * This macro is for (out = op(in1, in2)) - */ -#define FUNC_FUNC(name, type_name, type) \ - __device__ void \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, type *out, int n) { \ - int i = blockIdx.x*blockDim.x + threadIdx.x; \ - if (i < n) out[i] = current_func(in1[i], in2[i]); \ - } \ - static void \ - ompi_op_cuda_3buff_##name##_##type_name##(const void *in1, const void *in2, void *out, int *count, \ - struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) { \ - int threads = THREADS_PER_BLOCK; \ - int blocks = *count / THREADS_PER_BLOCK; \ - type *out_ = (type*)out; \ - const type *in1_ = (const type*)in1; \ - const type *in2_ = (const type*)in2; \ - int n = *count; \ - CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel<>(in1_, in2_, out_, n); \ - } +#define OP_FUNC_3BUF(name, type_name, type, op, ...) FUNC_3BUF(name, __VA_ARGS__##type_name, __VA_ARGS__##type) + +/* reuse the macro above, no work is actually done so we don't care about the func */ +#define FUNC_FUNC_3BUF(name, type_name, type, ...) 
FUNC_3BUF(name, __VA_ARGS__##type_name, __VA_ARGS__##type) /* * Since all the functions in this file are essentially identical, we @@ -798,52 +815,7 @@ LOC_FUNC(minloc, long_double_int, <) * * This macro is for minloc and maxloc */ -/* -#define LOC_STRUCT(type_name, type1, type2) \ - typedef struct { \ - type1 v; \ - type2 k; \ - } ompi_op_predefined_##type_name##_t; -*/ - -#define LOC_FUNC(name, type_name, op) \ - __device__ void \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in1, \ - const ompi_op_predefined_##type_name##_t *in2, \ - ompi_op_predefined_##type_name##_t *out, \ - int n) \ - { \ - int i = blockIdx.x*blockDim.x + threadIdx.x; \ - if (i < n) { \ - const ompi_op_predefined_##type_name##_t *a1 = &in1[i]; \ - const ompi_op_predefined_##type_name##_t *a2 = &in2[i]; \ - ompi_op_predefined_##type_name##_t *b = &out[i]; \ - if (a1->v op a2->v) { \ - b->v = a1->v; \ - b->k = a1->k; \ - } else if (a1->v == a2->v) { \ - b->v = a1->v; \ - b->k = (a2->k < a1->k ? a2->k : a1->k); \ - } else { \ - b->v = a2->v; \ - b->k = a2->k; \ - } \ - } \ - } \ - static void \ - ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count,\ - struct ompi_datatype_t **dtype, \ - struct ompi_op_cuda_module_1_0_0_t *module) \ - { \ - int i; \ - int threads = THREADS_PER_BLOCK; \ - int blocks = *count / THREADS_PER_BLOCK; \ - const ompi_op_predefined_##type_name##_t *a1 = (const ompi_op_predefined_##type_name##_t*) in1; \ - const ompi_op_predefined_##type_name##_t *a2 = (const ompi_op_predefined_##type_name##_t*) in2; \ - ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \ - CUstream *stream = (CUstream*)opal_accelerator.default_stream->stream; \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel<>(a1, a2, b, n); \ - } +#define LOC_FUNC_3BUF(name, type_name, op) FUNC_3BUF(name, type_name, ompi_op_predefined_##type_name##_t) /************************************************************************* @@ -1345,7 +1317,7 @@ LOC_STRUCT_3BUF(long_double_int, long double, int) /************************************************************************* * Max location *************************************************************************/ - +#if 0 #if OMPI_HAVE_FORTRAN_REAL LOC_FUNC_3BUF(maxloc, 2real, >) #endif @@ -1355,6 +1327,7 @@ LOC_FUNC_3BUF(maxloc, 2double_precision, >) #if OMPI_HAVE_FORTRAN_INTEGER LOC_FUNC_3BUF(maxloc, 2integer, >) #endif +#endif // 0 LOC_FUNC_3BUF(maxloc, float_int, >) LOC_FUNC_3BUF(maxloc, double_int, >) LOC_FUNC_3BUF(maxloc, long_int, >) @@ -1365,7 +1338,7 @@ LOC_FUNC_3BUF(maxloc, long_double_int, >) /************************************************************************* * Min location *************************************************************************/ - +#if 0 #if OMPI_HAVE_FORTRAN_REAL LOC_FUNC_3BUF(minloc, 2real, <) #endif @@ -1375,13 +1348,13 @@ LOC_FUNC_3BUF(minloc, 2double_precision, <) #if OMPI_HAVE_FORTRAN_INTEGER LOC_FUNC_3BUF(minloc, 2integer, <) #endif +#endif // 0 LOC_FUNC_3BUF(minloc, float_int, <) LOC_FUNC_3BUF(minloc, double_int, <) LOC_FUNC_3BUF(minloc, long_int, <) LOC_FUNC_3BUF(minloc, 2int, <) LOC_FUNC_3BUF(minloc, short_int, <) LOC_FUNC_3BUF(minloc, long_double_int, <) -#endif // 0 /* * Helpful defines, because there's soooo many names! 
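The inline __device__ kernels that used to live in this file have moved behind the *_submit() helpers, which are implemented in the separate CUDA translation unit (op_cuda_impl) not shown in this excerpt. Based on the removed kernels above, a 3-buffer helper would look roughly like the following; the sum/int32_t names are purely illustrative and the launch geometry mirrors the threads_per_block value chosen in device_op_pre():

    __global__ void ompi_op_cuda_3buff_sum_int32_t_kernel(const int32_t *in1, const int32_t *in2,
                                                          int32_t *out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = in1[i] + in2[i];
    }

    void ompi_op_cuda_3buff_sum_int32_t_submit(const int32_t *in1, const int32_t *in2,
                                               int32_t *out, int n,
                                               int threads_per_block, CUstream stream)
    {
        int blocks = (n + threads_per_block - 1) / threads_per_block;
        if (blocks < 1) blocks = 1;   /* guard against a zero count */
        ompi_op_cuda_3buff_sum_int32_t_kernel<<<blocks, threads_per_block, 0, stream>>>(in1, in2, out, n);
    }
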
@@ -1599,7 +1572,7 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) -ompi_op_base_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +ompi_op_base_stream_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = { /* Corresponds to MPI_OP_NULL */ [OMPI_OP_BASE_FORTRAN_NULL] = { @@ -1685,8 +1658,7 @@ ompi_op_base_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OM }; -#if 0 -ompi_op_base_3buff_handler_fn_t ompi_op_base_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +ompi_op_base_3buff_stream_handler_fn_t ompi_op_cuda_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = { /* Corresponds to MPI_OP_NULL */ [OMPI_OP_BASE_FORTRAN_NULL] = { @@ -1769,5 +1741,4 @@ ompi_op_base_3buff_handler_fn_t ompi_op_base_3buff_functions[OMPI_OP_BASE_FORTRA ACCUMULATE */ NULL, }, - }; -#endif // 0 \ No newline at end of file + }; \ No newline at end of file diff --git a/ompi/mca/op/cuda/op_cuda_impl.h b/ompi/mca/op/cuda/op_cuda_impl.h index 9be5ec8b9f3..7ab95cd446b 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.h +++ b/ompi/mca/op/cuda/op_cuda_impl.h @@ -398,8 +398,6 @@ LOC_FUNC_SIG(minloc, long_double_int, <) * Max *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a) > (b) ? (a) : (b)) /* C integer */ FUNC_FUNC_3BUF_SIG(max, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(max, uint8_t, uint8_t) @@ -464,8 +462,6 @@ FUNC_FUNC_3BUF_SIG(max, fortran_real16, ompi_fortran_real16_t) * Min *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a) < (b) ? (a) : (b)) /* C integer */ FUNC_FUNC_3BUF_SIG(min, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(min, uint8_t, uint8_t) @@ -677,8 +673,6 @@ OP_FUNC_3BUF_SIG(prod, c_long_double_complex, long double _Complex, *) * Logical AND *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a) && (b)) /* C integer */ FUNC_FUNC_3BUF_SIG(land, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(land, uint8_t, uint8_t) @@ -702,8 +696,6 @@ FUNC_FUNC_3BUF_SIG(land, bool, bool) * Logical OR *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a) || (b)) /* C integer */ FUNC_FUNC_3BUF_SIG(lor, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(lor, uint8_t, uint8_t) @@ -727,8 +719,6 @@ FUNC_FUNC_3BUF_SIG(lor, bool, bool) * Logical XOR *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 
1: 0)) /* C integer */ FUNC_FUNC_3BUF_SIG(lxor, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(lxor, uint8_t, uint8_t) @@ -752,8 +742,6 @@ FUNC_FUNC_3BUF_SIG(lxor, bool, bool) * Bitwise AND *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a) & (b)) /* C integer */ FUNC_FUNC_3BUF_SIG(band, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(band, uint8_t, uint8_t) @@ -792,8 +780,6 @@ FUNC_FUNC_3BUF_SIG(band, byte, char) * Bitwise OR *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a) | (b)) /* C integer */ FUNC_FUNC_3BUF_SIG(bor, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(bor, uint8_t, uint8_t) @@ -832,8 +818,6 @@ FUNC_FUNC_3BUF_SIG(bor, byte, char) * Bitwise XOR *************************************************************************/ -#undef current_func -#define current_func(a, b) ((a) ^ (b)) /* C integer */ FUNC_FUNC_3BUF_SIG(bxor, int8_t, int8_t) FUNC_FUNC_3BUF_SIG(bxor, uint8_t, uint8_t) diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h index 3da811788e6..838daf6e765 100644 --- a/ompi/mca/op/op.h +++ b/ompi/mca/op/op.h @@ -85,6 +85,7 @@ #include "ompi_config.h" #include "opal/class/opal_object.h" +#include "opal/mca/accelerator/accelerator.h" #include "ompi/mca/mca.h" /* @@ -266,6 +267,21 @@ typedef void (*ompi_op_base_handler_fn_1_0_0_t)(const void *, void *, int *, typedef ompi_op_base_handler_fn_1_0_0_t ompi_op_base_handler_fn_t; +/** + * Typedef for 2-buffer op functions. + * + * We don't use MPI_User_function because this would create a + * confusing dependency loop between this file and mpi.h. So this is + * repeated code, but it's better this way (and this typedef will + * never change, so there's not much of a maintenance worry). + */ +typedef void (*ompi_op_base_stream_handler_fn_1_0_0_t)(const void *, void *, int *, + struct ompi_datatype_t **, + opal_accelerator_stream_t *stream, + struct ompi_op_base_module_1_0_0_t *); + +typedef ompi_op_base_stream_handler_fn_1_0_0_t ompi_op_base_stream_handler_fn_t; + /* * Typedef for 3-buffer (two input and one output) op functions. */ @@ -277,6 +293,18 @@ typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(const void *, typedef ompi_op_base_3buff_handler_fn_1_0_0_t ompi_op_base_3buff_handler_fn_t; +/* + * Typedef for 3-buffer (two input and one output) op functions. 
+ */ +typedef void (*ompi_op_base_3buff_stream_handler_fn_1_0_0_t)(const void *, + const void *, + void *, int *, + struct ompi_datatype_t **, + opal_accelerator_stream_t*, + struct ompi_op_base_module_1_0_0_t *); + +typedef ompi_op_base_3buff_stream_handler_fn_1_0_0_t ompi_op_base_3buff_stream_handler_fn_t; + /** * Op component initialization * @@ -406,6 +434,18 @@ typedef struct ompi_op_base_op_fns_1_0_0_t { typedef ompi_op_base_op_fns_1_0_0_t ompi_op_base_op_fns_t; +/** + * Struct that is used in op.h to hold all the function pointers and + * pointers to the corresopnding modules (so that we can properly + * RETAIN/RELEASE them) + */ +typedef struct ompi_op_base_op_stream_fns_1_0_0_t { + ompi_op_base_stream_handler_fn_1_0_0_t fns[OMPI_OP_BASE_TYPE_MAX]; + ompi_op_base_module_t *modules[OMPI_OP_BASE_TYPE_MAX]; +} ompi_op_base_op_stream_fns_1_0_0_t; + +typedef ompi_op_base_op_stream_fns_1_0_0_t ompi_op_base_op_stream_fns_t; + /** * Struct that is used in op.h to hold all the function pointers and * pointers to the corresopnding modules (so that we can properly @@ -418,6 +458,18 @@ typedef struct ompi_op_base_op_3buff_fns_1_0_0_t { typedef ompi_op_base_op_3buff_fns_1_0_0_t ompi_op_base_op_3buff_fns_t; +/** + * Struct that is used in op.h to hold all the function pointers and + * pointers to the corresopnding modules (so that we can properly + * RETAIN/RELEASE them) + */ +typedef struct ompi_op_base_op_3buff_stream_fns_1_0_0_t { + ompi_op_base_3buff_stream_handler_fn_1_0_0_t fns[OMPI_OP_BASE_TYPE_MAX]; + ompi_op_base_module_t *modules[OMPI_OP_BASE_TYPE_MAX]; +} ompi_op_base_op_3buff_stream_fns_1_0_0_t; + +typedef ompi_op_base_op_3buff_stream_fns_1_0_0_t ompi_op_base_op_3buff_stream_fns_t; + /* * Macro for use in modules that are of type op v2.0.0 */ diff --git a/ompi/op/op.h b/ompi/op/op.h index 8b2f2270552..7b726430e51 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -126,8 +126,8 @@ enum ompi_op_type { /* device op information */ struct ompi_device_op_t { opal_accelerator_stream_t *do_stream; - ompi_op_base_op_fns_t do_intrinsic; - ompi_op_base_op_3buff_fns_t do_3buff_intrinsic; + ompi_op_base_op_stream_fns_t do_intrinsic; + ompi_op_base_op_3buff_stream_fns_t do_3buff_intrinsic; }; typedef struct ompi_device_op_t ompi_device_op_t; @@ -548,9 +548,10 @@ static inline bool ompi_op_supports_device(const ompi_op_t * op, const ompi_data * optimization). If you give it an intrinsic op with a datatype that * is not defined to have that operation, it is likely to seg fault. */ -static inline void ompi_op_reduce(ompi_op_t * op, void *source, - void *target, size_t full_count, - ompi_datatype_t * dtype) +static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, + void *target, size_t full_count, + ompi_datatype_t * dtype, + opal_accelerator_stream_t *stream) { MPI_Fint f_dtype, f_count; int count = full_count; @@ -579,7 +580,7 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, } shift = done_count * ext; // Recurse one level in iterations of 'int' - ompi_op_reduce(op, (char*)source + shift, (char*)target + shift, iter_count, dtype); + ompi_op_reduce_stream(op, (char*)source + shift, (char*)target + shift, iter_count, dtype, stream); done_count += iter_count; } return; @@ -639,9 +640,18 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, if (NULL == op->o_device_op) { abort(); // TODO: be more graceful! 
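The stream-aware entry point and the legacy one differ only in who synchronizes: ompi_op_reduce_stream() enqueues the device op on the given stream and returns, while ompi_op_reduce() (kept below as a thin wrapper that passes a NULL stream) falls back to the device's default stream and waits on it, preserving the old synchronous behaviour. In short:

    /* legacy callers: synchronous, equivalent to passing stream == NULL */
    ompi_op_reduce(op, source, target, count, dtype);

    /* stream-aware callers decide when to drain the stream */
    ompi_op_reduce_stream(op, source, target, count, dtype, stream);
    /* ... enqueue further kernels or copies on 'stream' ... */
    opal_accelerator.wait_stream(stream);
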
} + opal_accelerator_stream_t *actual_stream = stream; + bool flush_stream = false; + if (NULL == stream) { + opal_accelerator.get_default_stream(target_dev_id, &actual_stream); + flush_stream = true; + } op->o_device_op->do_intrinsic.fns[dtype_id](source, target, - &count, &dtype, - op->o_device_op->do_intrinsic.modules[dtype_id]); + &count, &dtype, actual_stream, + op->o_device_op->do_intrinsic.modules[dtype_id]); + if (flush_stream) { + opal_accelerator.wait_stream(actual_stream); + } } else { op->o_func.intrinsic.fns[dtype_id](source, target, &count, &dtype, @@ -667,6 +677,13 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, return; } +static inline void ompi_op_reduce(ompi_op_t * op, void *source, + void *target, size_t full_count, + ompi_datatype_t * dtype) +{ + ompi_op_reduce_stream(op, source, target, full_count, dtype, NULL); +} + static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, void * restrict source2, void * restrict result, int count, struct ompi_datatype_t *dtype) { @@ -697,6 +714,77 @@ static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, v * * Otherwise, this function is the same as ompi_op_reduce. */ +static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, + void *source2, void *target, + int count, ompi_datatype_t * dtype, + opal_accelerator_stream_t *stream) +{ + void *restrict src1; + void *restrict src2; + void *restrict tgt; + src1 = source1; + src2 = source2; + tgt = target; + + if (OPAL_UNLIKELY(!ompi_op_is_intrinsic (op))) { + /* no 3buff variants for user-defined ops */ + ompi_3buff_op_user (op, src1, src2, tgt, count, dtype); + return; + } + + bool use_device_op = false; + int source1_dev_id, source2_dev_id, target_dev_id; + uint64_t source1_flags, source2_flags, target_flags; + int target_check_addr = opal_accelerator.check_addr(target, &target_dev_id, &target_flags); + int source1_check_addr = opal_accelerator.check_addr(source1, &source1_dev_id, &source1_flags); + int source2_check_addr = opal_accelerator.check_addr(source2, &source2_dev_id, &source2_flags); + /* check if either of the buffers is on a device and if so make sure we can + * access handle it properly */ + if (target_check_addr > 0 || source1_check_addr > 0 || source2_check_addr > 0) { + if (ompi_datatype_is_predefined(dtype) && + 0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC) && + NULL != op->o_device_op) { + use_device_op = true; + } else { + /* TODO: can we be more graceful here? */ + abort(); + } + } + + /* For intrinsics, we also pass the corresponding op module */ + if (0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC)) { + int dtype_id; + if (!ompi_datatype_is_predefined(dtype)) { + ompi_datatype_t *dt = ompi_datatype_get_single_predefined_type_from_args(dtype); + dtype_id = ompi_op_ddt_map[dt->id]; + } else { + dtype_id = ompi_op_ddt_map[dtype->id]; + } + if (use_device_op) { + if (NULL == op->o_device_op) { + abort(); // TODO: be more graceful! 
+ } + opal_accelerator_stream_t *actual_stream = stream; + bool flush_stream = false; + if (NULL == stream) { + opal_accelerator.get_default_stream(target_dev_id, &actual_stream); + flush_stream = true; + } + op->o_device_op->do_3buff_intrinsic.fns[dtype_id](source1, source2, target, + &count, &dtype, actual_stream, + op->o_device_op->do_3buff_intrinsic.modules[dtype_id]); + if (flush_stream) { + opal_accelerator.wait_stream(actual_stream); + } + } else { + op->o_3buff_intrinsic.fns[dtype_id](source1, source2, target, + &count, &dtype, + op->o_func.intrinsic.modules[dtype_id]); + } + } +} + + static inline void ompi_3buff_op_reduce(ompi_op_t * op, void *source1, void *source2, void *target, int count, ompi_datatype_t * dtype) diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 5f7fc53fa7d..375b0475fef 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -42,6 +42,7 @@ #include #include "opal/class/opal_object.h" +#include "opal/mca/accelerator/accelerator.h" BEGIN_C_DECLS @@ -309,6 +310,10 @@ OPAL_DECLSPEC int32_t opal_datatype_copy_content_same_ddt(const opal_datatype_t int32_t count, char *pDestBuf, char *pSrcBuf); +OPAL_DECLSPEC int32_t opal_datatype_copy_content_same_ddt_stream(const opal_datatype_t *datatype, int32_t count, + char *destination_base, char *source_base, + opal_accelerator_stream_t *stream); + OPAL_DECLSPEC int opal_datatype_compute_ptypes(opal_datatype_t *datatype); /* diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index e10ea97b1bb..d4d6189d3d9 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -55,7 +55,8 @@ } \ } while (0) -static void *opal_datatype_accelerator_memcpy(void *dest, const void *src, size_t size) +static void *opal_datatype_accelerator_memcpy(void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream) { int res; int dev_id; @@ -71,8 +72,13 @@ static void *opal_datatype_accelerator_memcpy(void *dest, const void *src, size_ 0 >= opal_accelerator.check_addr(src, &dev_id, &flags)) { return memcpy(dest, src, size); } - res = opal_accelerator.mem_copy(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, - dest, src, size, MCA_ACCELERATOR_TRANSFER_UNSPEC); + if (NULL != stream) { + res = opal_accelerator.mem_copy_async(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, + dest, src, size, stream, MCA_ACCELERATOR_TRANSFER_UNSPEC); + } else { + res = opal_accelerator.mem_copy(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, + dest, src, size, MCA_ACCELERATOR_TRANSFER_UNSPEC); + } if (OPAL_SUCCESS != res) { opal_output(0, "Error in accelerator memcpy"); abort(); @@ -80,7 +86,8 @@ static void *opal_datatype_accelerator_memcpy(void *dest, const void *src, size_ return dest; } -static void *opal_datatype_accelerator_memmove(void *dest, const void *src, size_t size) +static void *opal_datatype_accelerator_memmove(void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream) { int res; int dev_id; @@ -96,8 +103,13 @@ static void *opal_datatype_accelerator_memmove(void *dest, const void *src, size 0 >= opal_accelerator.check_addr(src, &dev_id, &flags)) { return memmove(dest, src, size); } - res = opal_accelerator.mem_move(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, - dest, src, size, MCA_ACCELERATOR_TRANSFER_UNSPEC); + if (NULL == stream) { + res = opal_accelerator.mem_move(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, + dest, src, size, 
MCA_ACCELERATOR_TRANSFER_UNSPEC); + } else { + res = opal_accelerator.mem_move_async(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, + dest, src, size, stream, MCA_ACCELERATOR_TRANSFER_UNSPEC); + } if (OPAL_SUCCESS != res) { opal_output(0, "Error in accelerator memmove"); abort(); @@ -121,11 +133,12 @@ static void *opal_datatype_accelerator_memmove(void *dest, const void *src, size #define MEM_OP opal_datatype_accelerator_memmove #include "opal_datatype_copy.h" -int32_t opal_datatype_copy_content_same_ddt(const opal_datatype_t *datatype, int32_t count, - char *destination_base, char *source_base) +int32_t opal_datatype_copy_content_same_ddt_stream(const opal_datatype_t *datatype, int32_t count, + char *destination_base, char *source_base, + opal_accelerator_stream_t *stream) { ptrdiff_t extent; - int32_t (*fct)(const opal_datatype_t *, int32_t, char *, char *); + int32_t (*fct)(const opal_datatype_t *, int32_t, char *, char *, opal_accelerator_stream_t*); DO_DEBUG(opal_output(0, "opal_datatype_copy_content_same_ddt( %p, %d, dst %p, src %p )\n", (void *) datatype, count, (void *) destination_base, @@ -157,5 +170,11 @@ int32_t opal_datatype_copy_content_same_ddt(const opal_datatype_t *datatype, int fct = overlap_accelerator_copy_content_same_ddt; } } - return fct(datatype, count, destination_base, source_base); + return fct(datatype, count, destination_base, source_base, stream); } + +int32_t opal_datatype_copy_content_same_ddt(const opal_datatype_t *datatype, int32_t count, + char *destination_base, char *source_base) +{ + return opal_datatype_copy_content_same_ddt_stream(datatype, count, destination_base, source_base, NULL); +} \ No newline at end of file diff --git a/opal/datatype/opal_datatype_copy.h b/opal/datatype/opal_datatype_copy.h index 1e10b03ed27..dba8de0baf1 100644 --- a/opal/datatype/opal_datatype_copy.h +++ b/opal/datatype/opal_datatype_copy.h @@ -44,7 +44,7 @@ static inline void _predefined_data(const dt_elem_desc_t *ELEM, const opal_datatype_t *DATATYPE, unsigned char *SOURCE_BASE, size_t TOTAL_COUNT, size_t COUNT, unsigned char *SOURCE, unsigned char *DESTINATION, - size_t *SPACE) + size_t *SPACE, opal_accelerator_stream_t *stream) { const ddt_elem_desc_t *_elem = &((ELEM)->elem); unsigned char *_source = (SOURCE) + _elem->disp; @@ -69,7 +69,7 @@ static inline void _predefined_data(const dt_elem_desc_t *ELEM, const opal_datat DO_DEBUG(opal_output(0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", STRINGIFY(MEM_OP_NAME), (void *) _destination, (void *) _source, do_now_bytes, *(SPACE) -_i * do_now_bytes);); - MEM_OP(_destination, _source, do_now_bytes); + MEM_OP(_destination, _source, do_now_bytes, stream); _destination += _elem->extent; _source += _elem->extent; } @@ -79,7 +79,7 @@ static inline void _predefined_data(const dt_elem_desc_t *ELEM, const opal_datat static inline void _contiguous_loop(const dt_elem_desc_t *ELEM, const opal_datatype_t *DATATYPE, unsigned char *SOURCE_BASE, size_t TOTAL_COUNT, size_t COUNT, unsigned char *SOURCE, unsigned char *DESTINATION, - size_t *SPACE) + size_t *SPACE, opal_accelerator_stream_t *stream) { ddt_loop_desc_t *_loop = (ddt_loop_desc_t *) (ELEM); ddt_endloop_desc_t *_end_loop = (ddt_endloop_desc_t *) ((ELEM) + _loop->items); @@ -91,7 +91,7 @@ static inline void _contiguous_loop(const dt_elem_desc_t *ELEM, const opal_datat _copy_loops *= _end_loop->size; OPAL_DATATYPE_SAFEGUARD_POINTER(_source, _copy_loops, (SOURCE_BASE), (DATATYPE), (TOTAL_COUNT)); - MEM_OP(_destination, _source, _copy_loops); + 
MEM_OP(_destination, _source, _copy_loops, stream); } else { for (size_t _i = 0; _i < _copy_loops; _i++) { OPAL_DATATYPE_SAFEGUARD_POINTER(_source, _end_loop->size, (SOURCE_BASE), (DATATYPE), @@ -100,7 +100,7 @@ static inline void _contiguous_loop(const dt_elem_desc_t *ELEM, const opal_datat "copy 3. %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", STRINGIFY(MEM_OP_NAME), (void *) _destination, (void *) _source, _end_loop->size, *(SPACE) -_i * _end_loop->size);); - MEM_OP(_destination, _source, _end_loop->size); + MEM_OP(_destination, _source, _end_loop->size, stream); _source += _loop->extent; _destination += _loop->extent; } @@ -110,7 +110,8 @@ static inline void _contiguous_loop(const dt_elem_desc_t *ELEM, const opal_datat } static inline int32_t _copy_content_same_ddt(const opal_datatype_t *datatype, int32_t count, - char *destination_base, char *source_base) + char *destination_base, char *source_base, + opal_accelerator_stream_t *stream) { dt_stack_t *pStack; /* pointer to the position on the stack */ int32_t stack_pos; /* index of the stack level */ @@ -148,13 +149,20 @@ static inline int32_t _copy_content_same_ddt(const opal_datatype_t *datatype, in DO_DEBUG(opal_output(0, "copy c1. %s( %p, %p, %lu ) => space %lu\n", STRINGIFY(MEM_OP_NAME), (void *) destination, (void *) source, (unsigned long) memop_chunk, (unsigned long) total_length);); - MEM_OP(destination, source, memop_chunk); + MEM_OP(destination, source, memop_chunk, stream); destination += memop_chunk; source += memop_chunk; total_length -= memop_chunk; } return 0; /* completed */ } + opal_accelerator_stream_t *actual_stream = stream; + bool flush_stream = false; + if (NULL == stream) { + /* TODO: figure out the stream */ + opal_accelerator.get_default_stream(0, &actual_stream); + flush_stream = true; + } for (pos_desc = 0; (int32_t) pos_desc < count; pos_desc++) { OPAL_DATATYPE_SAFEGUARD_POINTER(destination, datatype->size, (unsigned char *) destination_base, datatype, count); @@ -164,10 +172,13 @@ static inline int32_t _copy_content_same_ddt(const opal_datatype_t *datatype, in STRINGIFY(MEM_OP_NAME), (void *) destination, (void *) source, (unsigned long) datatype->size, (unsigned long) (iov_len_local - (pos_desc * datatype->size)));); - MEM_OP(destination, source, datatype->size); + MEM_OP(destination, source, datatype->size, actual_stream); destination += extent; source += extent; } + if (flush_stream) { + opal_accelerator.wait_stream(actual_stream); + } return 0; /* completed */ } @@ -185,11 +196,18 @@ static inline int32_t _copy_content_same_ddt(const opal_datatype_t *datatype, in UPDATE_INTERNAL_COUNTERS(description, 0, pElem, count_desc); + opal_accelerator_stream_t *actual_stream = stream; + bool flush_stream = false; + if (NULL == stream) { + /* TODO: figure out the stream */ + opal_accelerator.get_default_stream(0, &actual_stream); + flush_stream = true; + } while (1) { while (OPAL_LIKELY(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA)) { /* now here we have a basic datatype */ _predefined_data(pElem, datatype, (unsigned char *) source_base, count, count_desc, - source, destination, &iov_len_local); + source, destination, &iov_len_local, stream); pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS(description, pos_desc, pElem, count_desc); } @@ -202,6 +220,9 @@ static inline int32_t _copy_content_same_ddt(const opal_datatype_t *datatype, in if (--(pStack->count) == 0) { /* end of loop */ if (stack_pos == 0) { assert(iov_len_local == 0); + if (flush_stream) { + 
opal_accelerator.wait_stream(actual_stream); + } return 0; /* completed */ } stack_pos--; @@ -229,7 +250,7 @@ static inline int32_t _copy_content_same_ddt(const opal_datatype_t *datatype, in ptrdiff_t local_disp = (ptrdiff_t) source; if (pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) { _contiguous_loop(pElem, datatype, (unsigned char *) source_base, count, count_desc, - source, destination, &iov_len_local); + source, destination, &iov_len_local, actual_stream); pos_desc += pElem->loop.items + 1; goto update_loop_description; } diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index f631242f8a8..34c6e3147ad 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -137,6 +137,16 @@ struct opal_accelerator_mempool_t { typedef struct opal_accelerator_event_t opal_accelerator_mempool_t; OBJ_CLASS_DECLARATION(opal_accelerator_mempool_t); +/** + * Query the default stream. + * + * @param[OUT] stream Set to the default stream. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_get_default_stream_fn_t)( + int dev_id, opal_accelerator_stream_t **stream); + /** * Check whether a pointer belongs to an accelerator or not. * interfaces @@ -281,6 +291,29 @@ typedef int (*opal_accelerator_base_module_memmove_fn_t)( int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +/** + * Copies memory asynchronously from src to dest. Memory of dest and src + * may overlap. Optionally can specify the transfer type to + * avoid pointer detection for performance. The operations will be enqueued + * into the provided stream but are not guaranteed to be complete upon return. + * + * @param[IN] dest_dev_id Associated device to copy to or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] src_dev_id Associated device to copy from or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] dest Destination to copy memory to + * @param[IN] src Source to copy memory from + * @param[IN] size Size of memory to copy + * @param[IN] stream Stream to perform asynchronous move on + * @param[IN] type Transfer type field for performance + * Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC + * if caller is unsure of the transfer direction. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_memmove_async_fn_t)( + int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); /** * Allocates size bytes memory from the device and sets ptr to the * pointer of the allocated memory. The memory is not initialized. 
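On the OPAL side, the accelerator module no longer exposes a raw default-stream pointer; callers now query one per device through get_default_stream() and pair it with the new asynchronous memmove entry point. A minimal consumer sketch (dev_id, dest, src, and size are whatever the caller already has at hand; error handling omitted):

    opal_accelerator_stream_t *stream = NULL;
    if (OPAL_SUCCESS == opal_accelerator.get_default_stream(dev_id, &stream)) {
        opal_accelerator.mem_move_async(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID,
                                        dest, src, size, stream,
                                        MCA_ACCELERATOR_TRANSFER_UNSPEC);
        /* the move is only enqueued; drain the stream before the data is reused */
        opal_accelerator.wait_stream(stream);
    }
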
@@ -456,7 +489,7 @@ typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_device */ typedef struct { /* default stream pointer */ - opal_accelerator_stream_t *default_stream; + opal_accelerator_base_get_default_stream_fn_t get_default_stream; /* accelerator function table */ opal_accelerator_base_module_check_addr_fn_t check_addr; @@ -467,6 +500,7 @@ typedef struct { opal_accelerator_base_module_memcpy_async_fn_t mem_copy_async; opal_accelerator_base_module_memcpy_fn_t mem_copy; + opal_accelerator_base_module_memmove_async_fn_t mem_move_async; opal_accelerator_base_module_memmove_fn_t mem_move; opal_accelerator_base_module_mem_alloc_fn_t mem_alloc; diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index c8e22473f11..ce271ba3cb9 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -24,6 +24,7 @@ #include "opal/util/show_help.h" #include "opal/util/proc.h" /* Accelerator API's */ +static int accelerator_cuda_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags); static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream); @@ -35,6 +36,8 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size); @@ -59,7 +62,7 @@ static int accelerator_cuda_get_num_devices(int *num_devices); opal_accelerator_base_module_t opal_accelerator_cuda_module = { - &opal_accelerator_cuda_default_stream.base, + accelerator_cuda_get_default_stream, accelerator_cuda_check_addr, @@ -71,6 +74,7 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_memcpy_async, accelerator_cuda_memcpy, + accelerator_cuda_memmove_async, accelerator_cuda_memmove, accelerator_cuda_mem_alloc, accelerator_cuda_mem_release, @@ -250,6 +254,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return 1; } +static int accelerator_cuda_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) +{ + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + *stream = &opal_accelerator_cuda_default_stream; + return OPAL_SUCCESS; +} + static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream) { CUresult result; @@ -438,13 +452,13 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, * Additionally, cuMemcpy is not necessarily always synchronous. 
See: * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html * TODO: Add optimizations for type field */ - result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, *(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, size, result); return OPAL_ERROR; } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); + result = cuStreamSynchronize(*(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -453,11 +467,12 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } -static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, - opal_accelerator_transfer_type_t type) +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) { CUdeviceptr tmp; CUresult result; + void *ptr; int delayed_init = opal_accelerator_cuda_delayed_init(); if (OPAL_UNLIKELY(0 != delayed_init)) { @@ -468,29 +483,42 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERR_BAD_PARAM; } - result = cuMemAlloc(&tmp, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + result = accelerator_cuda_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) { return OPAL_ERROR; } - result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + tmp = (CUdeviceptr)ptr; + result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, result); return OPAL_ERROR; } - result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, size, result); return OPAL_ERROR; } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); + return accelerator_cuda_mem_release_stream(src_dev_id, ptr, stream); +} + +static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_transfer_type_t type) +{ + int ret; + CUresult result; + + ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream, type); + if (OPAL_SUCCESS != ret) { + return OPAL_ERROR; + } + result = accelerator_cuda_wait_stream(&opal_accelerator_cuda_memcpy_stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } - cuMemFree(tmp); return OPAL_SUCCESS; } diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index a3c4d29acef..dd09be2325d 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ 
b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -39,7 +39,7 @@ typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); /* Declare extern variables, defined in accelerator_cuda_component.c */ -OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; +OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream; OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_alloc_stream; OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream; OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index 3f200bce55c..e84beb4264e 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -34,9 +34,9 @@ #include "opal/sys/atomic.h" /* Define global variables, used in accelerator_cuda.c */ -CUstream opal_accelerator_cuda_memcpy_stream = NULL; +opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream = {0}; CUstream opal_accelerator_cuda_alloc_stream = NULL; -opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream; +opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream = {0}; opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; int opal_accelerator_cuda_num_devices = 0; @@ -159,12 +159,16 @@ int opal_accelerator_cuda_delayed_init() cuDeviceGetCount(&opal_accelerator_cuda_num_devices); /* Create stream for use in cuMemcpyAsync synchronous copies */ - result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); + CUstream memcpy_stream; + result = cuStreamCreate(&memcpy_stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); goto out; } + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_cuda_stream_t); + opal_accelerator_cuda_memcpy_stream.base.stream = malloc(sizeof(CUstream)); + *(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream = memcpy_stream; /* Create stream for use in cuMemcpyAsync synchronous copies */ result = cuStreamCreate(&opal_accelerator_cuda_alloc_stream, 0); @@ -220,6 +224,9 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) { OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t); OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t); + OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_stream_t); + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_stream_t); + /* First check if the support is enabled. In the case that the user has * turned it off, we do not need to continue with any CUDA specific * initialization. Do this after MCA parameter registration. 
*/ @@ -251,14 +258,14 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) if (CUDA_SUCCESS != result) { ctx_ok = 0; } - if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) { - cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); + if ((NULL != opal_accelerator_cuda_memcpy_stream.base.stream) && ctx_ok) { + OBJ_DESTRUCT(&opal_accelerator_cuda_memcpy_stream); } if ((NULL != opal_accelerator_cuda_alloc_stream) && ctx_ok) { cuStreamDestroy(opal_accelerator_cuda_alloc_stream); } if ((NULL != opal_accelerator_cuda_default_stream.base.stream) && ctx_ok) { - cuStreamDestroy(opal_accelerator_cuda_default_stream.base.stream); + OBJ_DESTRUCT(&opal_accelerator_cuda_default_stream); } diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 06eb7bffd37..b9b002e81ed 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -27,6 +27,8 @@ const char *opal_accelerator_null_component_version_string = "OPAL null accelerator MCA component version " OPAL_VERSION; +static opal_accelerator_stream_t default_stream; + /* * Component API functions */ @@ -37,8 +39,10 @@ static opal_accelerator_base_module_t* accelerator_null_init(void); static void accelerator_null_finalize(opal_accelerator_base_module_t* module); /* Accelerator API's */ +static int accelerator_null_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int accelerator_null_check_addr(const void *addr, int *dev_id, uint64_t *flags); +static int accelerator_null_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int accelerator_null_create_stream(int dev_id, opal_accelerator_stream_t **stream); static int accelerator_null_create_event(int dev_id, opal_accelerator_event_t **event); static int accelerator_null_record_event(int dev_id, opal_accelerator_event_t *event, opal_accelerator_stream_t *stream); @@ -48,6 +52,8 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); @@ -110,7 +116,7 @@ opal_accelerator_null_component_t mca_accelerator_null_component = {{ opal_accelerator_base_module_t opal_accelerator_null_module = { - NULL, + accelerator_null_get_default_stream, accelerator_null_check_addr, @@ -122,6 +128,7 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_memcpy_async, accelerator_null_memcpy, + accelerator_null_memmove_async, accelerator_null_memmove, accelerator_null_mem_alloc, accelerator_null_mem_release, @@ -174,6 +181,11 @@ static int accelerator_null_check_addr(const void *addr, int *dev_id, uint64_t * return 0; } +static int accelerator_null_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) +{ + *stream = &default_stream; + return OPAL_SUCCESS; +} static int accelerator_null_create_stream(int dev_id, opal_accelerator_stream_t **stream) { *stream = 
OBJ_NEW(opal_accelerator_stream_t); @@ -217,6 +229,12 @@ static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) +{ + memmove(dest, src, size); + return OPAL_SUCCESS; +} static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size) { *ptr = malloc(size); From 8f5b5037b78529b613dbaf09803c60c3b4725f5c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 19 Apr 2023 12:32:27 -0400 Subject: [PATCH 18/74] Remove extra copies from allreduce redscat and ring Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 109 +++++++++++++++-------- 1 file changed, 72 insertions(+), 37 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 55c6a3203a3..63b7c3ab710 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -216,13 +216,6 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* tmpsend = tmprecv (op) tmpsend */ ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); } -#if 0 - ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); - if (ret < 0) { line = __LINE__; goto error_hndl; } - tmpsend = inplacebuf; - /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); -#endif // 0 newrank = rank >> 1; } } else { @@ -241,6 +234,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, remote = (newremote < extra_ranks)? 
(newremote * 2 + 1):(newremote + extra_ranks); + bool have_next_iter = ((distance << 1) < adjsize); + /* wait for previous ops to complete to complete */ opal_accelerator.wait_stream(stream); /* Exchange the data */ @@ -253,40 +248,46 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Apply operation */ if (rank < remote) { - /* TODO: use the 3buff variant here to avoid the copy */ if (tmpsend == sbuf) { + /* special case: 1st iteration takes one input from the sbuf */ /* tmprecv = sbuf (op) tmprecv */ ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, stream); /* send the current recv buffer, and use the tmp buffer to receive */ tmpsend = tmprecv; tmprecv = inplacebuf; -#if 0 - ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)rbuf, stream); - if (ret < 0) { line = __LINE__; goto error_hndl; } - tmprecv = inplacebuf; -#endif // 0 - } else { + } else if (have_next_iter || tmprecv == recv) { + /* All iterations, and the last if tmprecv is the recv buffer */ /* tmprecv = tmpsend (op) tmprecv */ ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, stream); /* swap send and receive buffers */ tmpswap = tmprecv; tmprecv = tmpsend; tmpsend = tmpswap; + } else { + /* Last iteration if tmprecv is not the recv buffer, then tmpsend is */ + /* Make sure we reduce into the receive buffer + * tmpsend = tmprecv (op) tmpsend */ + ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); } } else { - /* use the 3buff variant here to avoid the copy */ if (tmpsend == sbuf) { + /* First iteration: use input from sbuf */ /* tmpsend = tmprecv (op) sbuf */ tmpsend = inplacebuf; - ompi_3buff_op_reduce_stream(op, tmprecv, sbuf, tmpsend, count, dtype, stream); -#if 0 - ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); - if (ret < 0) { line = __LINE__; goto error_hndl; } - tmpsend = inplacebuf; -#endif // 0 - } else { + if (have_next_iter || tmpsend == recv) { + ompi_3buff_op_reduce_stream(op, tmprecv, sbuf, tmpsend, count, dtype, stream); + } else { + ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, stream); + tmpsend = tmprecv; + } + } else if (have_next_iter || tmpsend == rbuf) { + /* All other iterations: reduce into tmpsend for next iteration */ /* tmpsend = tmprecv (op) tmpsend */ ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); + } else { + /* Last iteration: reduce into rbuf and set tmpsend to rbuf (needed at the end) */ + ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, stream); + tmpsend = tmprecv; } } } @@ -319,7 +320,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (ret < 0) { line = __LINE__; goto error_hndl; } } - /* wait for previous ops to complete to complete */ + /* wait for previous ops to complete */ opal_accelerator.wait_stream(stream); ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev, module); return MPI_SUCCESS; @@ -465,10 +466,13 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, } /* Handle MPI_IN_PLACE */ + bool use_sbuf = (MPI_IN_PLACE != sbuf); +#if 0 if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } +#endif // 0 /* Computation loop */ @@ -522,13 +526,19 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, /* Apply operation on previous block: result goes to rbuf rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) 
rbuf[prevblock] - */ + */ block_offset = ((prevblock < split_rank)? ((ptrdiff_t)prevblock * early_segcount) : ((ptrdiff_t)prevblock * late_segcount + split_rank)); block_count = ((prevblock < split_rank)? early_segcount : late_segcount); tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); + if (use_sbuf) { + /* tmprecv = inbuf[inbi ^ 0x1] (op) sbuf */ + ompi_3buff_op_reduce(op, inbuf[inbi ^ 0x1], sbuf, tmprecv, block_count, dtype); + } else { + /* tmprecv = inbuf[inbi ^ 0x1] (op) tmprecv */ + ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); + } /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to, @@ -1070,12 +1080,6 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( return OMPI_ERR_OUT_OF_RESOURCE; tmp_buf = tmp_buf_raw - gap; - if (sbuf != MPI_IN_PLACE) { - err = ompi_datatype_copy_content_same_ddt(dtype, count, (char *)rbuf, - (char *)sbuf); - if (MPI_SUCCESS != err) { goto cleanup_and_return; } - } - /* * Step 1. Reduce the number of processes to the nearest lower power of two * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes. @@ -1096,9 +1100,16 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( int vrank, step, wsize; int nprocs_rem = comm_size - nprocs_pof2; + opal_accelerator_stream_t *stream; + opal_accelerator.get_default_stream(tmp_buf_dev, &stream); + if (rank < 2 * nprocs_rem) { int count_lhalf = count / 2; int count_rhalf = count - count_lhalf; + const void *send_buf = sbuf; + if (MPI_IN_PLACE == sbuf) { + send_buf = rbuf; + } if (rank % 2 != 0) { /* @@ -1106,7 +1117,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( * Send the left half of the input vector to the left neighbor, * Recv the right half of the input vector from the left neighbor */ - err = ompi_coll_base_sendrecv(rbuf, count_lhalf, dtype, rank - 1, + err = ompi_coll_base_sendrecv(send_buf, count_lhalf, dtype, rank - 1, MCA_COLL_BASE_TAG_ALLREDUCE, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, count_rhalf, dtype, rank - 1, @@ -1114,9 +1125,20 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( MPI_STATUS_IGNORE, rank); if (MPI_SUCCESS != err) { goto cleanup_and_return; } - /* Reduce on the right half of the buffers (result in rbuf) */ - ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, - (char *)rbuf + count_lhalf * extent, count_rhalf, dtype); + /* Reduce on the right half of the buffers (result in rbuf) + * We're not using a stream here, the reduction will make sure that the result is available upon return */ + if (MPI_IN_PLACE == sbuf) { + /* rbuf = sbuf (op) tmp_buf */ + ompi_3buff_op_reduce(op, + (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + (char *)sbuf + (ptrdiff_t)count_lhalf * extent, + (char *)rbuf + count_lhalf * extent, + count_rhalf, dtype); + } else { + /* rbuf = rbuf (op) tmp_buf */ + ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + (char *)rbuf + count_lhalf * extent, count_rhalf, dtype); + } /* Send the right half to the left neighbor */ err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, @@ -1134,7 +1156,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( * Send the right half of the input vector to the right neighbor, * Recv the left half of the input vector from the right neighbor */ - err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + err = ompi_coll_base_sendrecv((char *)send_buf + (ptrdiff_t)count_lhalf * 
extent, count_rhalf, dtype, rank + 1, MCA_COLL_BASE_TAG_ALLREDUCE, tmp_buf, count_lhalf, dtype, rank + 1, @@ -1143,7 +1165,15 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Reduce on the right half of the buffers (result in rbuf) */ - ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype); + if (MPI_IN_PLACE == sbuf) { + /* rbuf = sbuf (op) tmp_buf */ + ompi_3buff_op_reduce_stream(op, sbuf, tmp_buf, rbuf, count_lhalf, dtype, stream); + + } else { + /* rbuf = rbuf (op) tmp_buf */ + ompi_op_reduce_stream(op, tmp_buf, rbuf, count_lhalf, dtype, stream); + } + /* Recv the right half from the right neighbor */ err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent, @@ -1152,12 +1182,17 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } + /* wait for the op to complete */ + opal_accelerator.wait_stream(stream); + vrank = rank / 2; } } else { /* rank >= 2 * nprocs_rem */ vrank = rank - nprocs_rem; } + /* At this point the input data has been accumulated into the rbuf */ + /* * Step 2. Reduce-scatter implemented with recursive vector halving and * recursive distance doubling. We have p' = 2^{\floor{\log_2 p}} From 1c68d17dfe6a35650ef61d985e81126011cd8100 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 19 Apr 2023 12:35:00 -0400 Subject: [PATCH 19/74] Allow ops and memcpy on managed memory from the host Signed-off-by: Joseph Schuchart --- ompi/op/op.h | 23 +++++++++++++++----- opal/datatype/opal_convertor.c | 2 ++ opal/mca/accelerator/cuda/accelerator_cuda.c | 12 ++++++++++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/ompi/op/op.h b/ompi/op/op.h index 7b726430e51..0d9ed54fc1b 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -622,8 +622,14 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, NULL != op->o_device_op) { use_device_op = true; } else { - /* TODO: can we be more graceful here? */ - abort(); + /* check whether we can access the memory from the host */ + if ((target_check_addr == 0 || (target_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && + (source_check_addr == 0 || (source_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { + /* nothing to be done, we won't need device-capable ops */ + } else { + fprintf(stderr, "3buff op: no suitable op module found for device memory!\n"); + abort(); + } } } @@ -742,12 +748,19 @@ static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, * access handle it properly */ if (target_check_addr > 0 || source1_check_addr > 0 || source2_check_addr > 0) { if (ompi_datatype_is_predefined(dtype) && - 0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC) && + op->o_flags & OMPI_OP_FLAGS_INTRINSIC && NULL != op->o_device_op) { use_device_op = true; } else { - /* TODO: can we be more graceful here? 
*/ - abort(); + /* check whether we can access the memory from the host */ + if ((target_check_addr == 0 || (target_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && + (source1_check_addr == 0 || (source1_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && + (source2_check_addr == 0 || (source2_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { + /* nothing to be done, we won't need device-capable ops */ + } else { + fprintf(stderr, "3buff op: no suitable op module found for device memory!\n"); + abort(); + } } } diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 8550683a60d..24d89b0a3ba 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -50,6 +50,8 @@ static void *opal_convertor_accelerator_memcpy(void *dest, const void *src, size int res; if (!(convertor->flags & CONVERTOR_ACCELERATOR)) { return MEMCPY(dest, src, size); + } else if (convertor->flags & CONVERTOR_ACCELERATOR_UNIFIED) { + return MEMCPY(dest, src, size); } res = opal_accelerator.mem_copy(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index ce271ba3cb9..cd028504a48 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -444,6 +444,8 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERR_BAD_PARAM; } +#if 0 + /* Async copy then synchronize is the default behavior as some applications * cannot utilize synchronous copies. In addition, host memory does not need * to be page-locked if an Async memory copy is done (It just makes it synchronous @@ -459,6 +461,8 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERROR; } result = cuStreamSynchronize(*(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream); +#endif 0 + result = cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); @@ -535,6 +539,14 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) return OPAL_ERR_BAD_PARAM; } + /* prefer managed memory */ + result = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); + if (cudaSuccess == result) { + return OPAL_SUCCESS; + } + + /* fall-back to discrete memory */ + #if CUDA_VERSION >= 11020 /* Try to allocate the memory from a memory pool, if available */ /* get the default pool */ From 70dde0f5be2cfc0d97012c2313ad54b025f2022c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 19 Apr 2023 12:35:29 -0400 Subject: [PATCH 20/74] reduce_local: add support for device memory Signed-off-by: Joseph Schuchart --- test/datatype/Makefile.am | 5 +- test/datatype/reduce_local.c | 527 ++++++++++++++++++++++++++--------- 2 files changed, 399 insertions(+), 133 deletions(-) diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 3d6fd3289b5..9ae81e3dff9 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -96,8 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) unpack_hetero_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la + reduce_local_SOURCES = reduce_local.c -reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) +reduce_local_CPPFLAGS= $(accelerator_cudart_CPPFLAGS) $(accelerator_cuda_CPPFLAGS) +reduce_local_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS)\ + $(accelerator_cuda_LDFLAGS) 
$(accelerator_cudart_LDFLAGS) reduce_local_LDADD = \ $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index 17259cd2b18..b412479a93a 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -20,6 +20,9 @@ #include #include +// TODO: detect through configure +#define HAVE_CUDA 1 + #include "mpi.h" #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" @@ -123,25 +126,25 @@ static int build_do_ops(char *optarg, int *do_ops) } /* clang-format off */ -#define MPI_OP_TEST(OPNAME, MPIOP, MPITYPE, TYPE, INBUF, INOUT_BUF, CHECK_BUF, COUNT, TYPE_PREFIX) \ +#define MPI_OP_TEST(OPNAME, MPIOP, MPITYPE, TYPE, INIT_INBUF, INBUF, INIT_INOUT_BUF, INOUT_BUF, CHECK_BUF, COUNT, TYPE_PREFIX) \ do { \ - const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \ - TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \ skip_op_type = 0; \ + allocator->memcpy(INBUF, INIT_INBUF, sizeof(TYPE) * (COUNT)); \ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \ duration[_k] = 0.0; \ for(int _r = repeats; _r > 0; _r--) { \ - memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ + allocator->memcpy(INOUT_BUF, INIT_INOUT_BUF, sizeof(TYPE) * (COUNT)); \ tstart = MPI_Wtime(); \ - MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ + MPI_Reduce_local(INBUF+_k, INOUT_BUF+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ tend = MPI_Wtime(); \ duration[_k] += (tend - tstart); \ if( check ) { \ + allocator->memcpy(CHECK_BUF, INOUT_BUF, sizeof(TYPE) * (COUNT)); \ for( i = 0; i < (COUNT)-_k; i++ ) { \ - if(((_p2+_k)[i]) == (((_p1+_k)[i]) OPNAME ((_p3+_k)[i]))) \ + if(((CHECK_BUF+_k)[i]) == (((INIT_INBUF+_k)[i]) OPNAME ((INIT_INOUT_BUF+_k)[i]))) \ continue; \ printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \ - _k, i, (_p1+_k)[i], (#OPNAME), (_p3+_k)[i], (_p2+_k)[i]); \ + _k, i, (INBUF+_k)[i], (#OPNAME), (INIT_INOUT_BUF+_k)[i], (INOUT_BUF+_k)[i]); \ correctness = 0; \ break; \ } \ @@ -151,22 +154,22 @@ do { \ goto check_and_continue; \ } while (0) -#define MPI_OP_MINMAX_TEST(OPNAME, MPIOP, MPITYPE, TYPE, INBUF, INOUT_BUF, CHECK_BUF, COUNT, TYPE_PREFIX) \ +#define MPI_OP_MINMAX_TEST(OPNAME, MPIOP, MPITYPE, TYPE, INIT_INBUF, INBUF, INIT_INOUT_BUF, INOUT_BUF, CHECK_BUF, COUNT, TYPE_PREFIX) \ do { \ - const TYPE *_p1 = ((TYPE*)(INBUF)), *_p3 = ((TYPE*)(CHECK_BUF)); \ - TYPE *_p2 = ((TYPE*)(INOUT_BUF)); \ skip_op_type = 0; \ + allocator->memcpy(INBUF, INIT_INBUF, sizeof(TYPE) * (COUNT)); \ for(int _k = 0; _k < min((COUNT), max_shift); +_k++ ) { \ duration[_k] = 0.0; \ for(int _r = repeats; _r > 0; _r--) { \ - memcpy(_p2, _p3, sizeof(TYPE) * (COUNT)); \ + allocator->memcpy(INOUT_BUF, INIT_INOUT_BUF, sizeof(TYPE) * (COUNT)); \ tstart = MPI_Wtime(); \ - MPI_Reduce_local(_p1+_k, _p2+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ + MPI_Reduce_local(INBUF+_k, INOUT_BUF+_k, (COUNT)-_k, (MPITYPE), (MPIOP)); \ tend = MPI_Wtime(); \ duration[_k] += (tend - tstart); \ if( check ) { \ + allocator->memcpy(CHECK_BUF, INOUT_BUF, sizeof(TYPE) * (COUNT)); \ for( i = 0; i < (COUNT); i++ ) { \ - TYPE _v1 = *(_p1+_k), _v2 = *(_p2+_k), _v3 = *(_p3+_k); \ + TYPE _v1 = *(INIT_INBUF+_k), _v2 = *(CHECK_BUF+_k), _v3 = *(INIT_INOUT_BUF+_k); \ if(_v2 == OPNAME(_v1, _v3)) \ continue; \ printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \ @@ -181,19 +184,100 @@ do { \ } while (0) /* clang-format on 
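+ *
+ * Note on the additions that follow: the test now routes all buffer handling
+ * through a small allocator vtable (init/allocate/memcpy/free/fini).
+ * Reference values are prepared in plain host buffers (init_in_buf,
+ * init_inout_buf) and staged in and out of the reduction buffers via
+ * allocator->memcpy, so the same MPI_OP_TEST loop can run against host
+ * memory or, with the new -d cuda option, against CUDA device memory.
+ * A hypothetical invocation (flags as drafted in this patch, not an
+ * established interface):
+ *
+ *   ./reduce_local -d cuda -o sum -l 1024 -u 1048576 -r 10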
*/ +static +void *host_allocate(size_t size, size_t align) { + void *ptr; + posix_memalign(&ptr, align, size); + return ptr; +} +static void host_free(void *ptr) { + free(ptr); +} +static void host_init(void) { + // nothing to do +} +static void host_fini(void) { + // nothing to do +} +static void* host_memcpy(void *dst, const void *src, size_t size) { + return memcpy(dst, src, size); +} + +typedef void*(*allocate_fn_t)(size_t, size_t); +typedef void*(*memcpy_fn_t)(void*, const void*, size_t); +typedef void(*free_fn_t)(void*); +typedef void(*init_fn_t)(void); +typedef void(*fini_fn_t)(void); + +enum ALLOCATOR_FLAGS { + ALLOCATOR_DISCRETE = 1, +}; + +typedef struct { + int flags; + init_fn_t init; + allocate_fn_t allocate; + memcpy_fn_t memcpy; + free_fn_t free; + fini_fn_t fini; +} allocator_t; + +static allocator_t host_allocator = { + .flags = 0, + .init = &host_init, + .allocate = &host_allocate, + .memcpy = &host_memcpy, + .free = &host_free, + .fini = &host_fini}; + +#ifdef HAVE_CUDA +#include +static void cuda_init() { + // nothing to be done +} +static void *cuda_allocate(size_t size, size_t align) { + (void)align; // ignored + void *ptr; + int err; + if (cudaSuccess != (err = cudaMalloc(&ptr, size))) { + fprintf(stderr, "cudaMalloc failed to allocate %zuB: %s", size, cudaGetErrorName(err)); + return NULL; + } + return ptr; +} +static void* cuda_memcpy(void *dst, const void *src, size_t size) { + cudaMemcpy(dst, src, size, cudaMemcpyDefault); + return dst; +} +static void cuda_free(void *ptr) { + cudaFree(ptr); +} +static void cuda_fini() { + // nothing to be done +} +static allocator_t cuda_allocator = { + .flags = ALLOCATOR_DISCRETE, + .init = &cuda_init, + .allocate = &cuda_allocate, + .memcpy = &cuda_memcpy, + .free = &cuda_free, + .fini = &cuda_fini}; +#endif + int main(int argc, char **argv) { - static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL; + static void *in_buf = NULL, *inout_buf = NULL, *inout_check_buf = NULL, *init_in_buf = NULL, *init_inout_buf = NULL; int count, type_size = 8, rank, size, provided, correctness = 1; int repeats = 1, i, c, op1_alignment = 0, res_alignment = 0; int max_shift = 4; double *duration, tstart, tend; + allocator_t *allocator = &host_allocator; bool check = true; char type[5] = "uifd", *op = "sum", *mpi_type; int lower = 1, upper = 1000000, skip_op_type; MPI_Op mpi_op; - while (-1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:vfh"))) { + while (-1 != (c = getopt(argc, argv, "l:u:r:t:o:i:s:n:1:2:d:vfh"))) { switch (c) { case 'l': lower = atoi(optarg); @@ -267,6 +351,21 @@ int main(int argc, char **argv) exit(-1); } break; + case 'd': + if (0 == strncmp("host", optarg, 4)) { + // default allocator + break; + } else +#ifdef HAVE_CUDA + if (0 == strncmp("cuda", optarg, 4)) { + allocator = &cuda_allocator; + break; + } else +#endif + { + fprintf(stderr, "Unsupported allocator: %s\n", optarg); + // fall-through + } case 'h': fprintf(stdout, "%s options are:\n" @@ -277,6 +376,11 @@ int main(int argc, char **argv) " -r : number of repetitions for each test\n" " -o : comma separated list of operations to execute among\n" " sum, min, max, prod, bor, bxor, band\n" + " -d : host" +#ifdef HAVE_CUDA + ", cuda" +#endif + "\n" " -i : shift on all buffers to check alignment\n" " -1 : (mis)alignment in elements for the first op\n" " -2 : (mis)alignment in elements for the result\n" @@ -291,13 +395,16 @@ int main(int argc, char **argv) if (!do_ops_built) { /* not yet done, take the default */ build_do_ops("all", do_ops); } - 
posix_memalign(&in_buf, 64, (upper + op1_alignment) * sizeof(double)); - posix_memalign(&inout_buf, 64, (upper + res_alignment) * sizeof(double)); - posix_memalign(&inout_check_buf, 64, upper * sizeof(double)); - duration = (double *) malloc(max_shift * sizeof(double)); - ompi_mpi_init(argc, argv, MPI_THREAD_SERIALIZED, &provided, false); + allocator->init(); + in_buf = allocator->allocate((upper + op1_alignment) * sizeof(double), 64); + inout_buf = allocator->allocate((upper + op1_alignment) * sizeof(double), 64); + init_in_buf = malloc((upper + op1_alignment) * sizeof(double)); + init_inout_buf = malloc((upper + op1_alignment) * sizeof(double)); + duration = (double *) malloc(max_shift * sizeof(double)); + inout_check_buf = malloc(upper * sizeof(double)); + rank = ompi_comm_rank(MPI_COMM_WORLD); (void) rank; size = ompi_comm_size(MPI_COMM_WORLD); @@ -318,39 +425,55 @@ int main(int argc, char **argv) + op1_alignment * sizeof(int8_t)), *inout_int8 = (int8_t *) ((char *) inout_buf + res_alignment * sizeof(int8_t)), - *inout_int8_for_check = (int8_t *) inout_check_buf; + *inout_int8_for_check = (int8_t *) inout_check_buf, + *init_inout_int8 = (int8_t *)init_inout_buf, + *init_in_int8 = (int8_t *)init_in_buf; for (i = 0; i < count; i++) { - in_int8[i] = 5; - inout_int8[i] = inout_int8_for_check[i] = -3; + init_in_int8[i] = 5; + init_inout_int8[i] = -3; } mpi_type = "MPI_INT8_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_INT8_T, int8_t, in_int8, inout_int8, + MPI_OP_TEST(+, mpi_op, MPI_INT8_T, int8_t, + init_in_int8, in_int8, + init_inout_int8, inout_int8, inout_int8_for_check, count, PRId8); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_INT8_T, int8_t, in_int8, inout_int8, + MPI_OP_TEST(|, mpi_op, MPI_INT8_T, int8_t, + init_in_int8, in_int8, + init_inout_int8, inout_int8, inout_int8_for_check, count, PRId8); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_INT8_T, int8_t, in_int8, inout_int8, + MPI_OP_TEST(^, mpi_op, MPI_INT8_T, int8_t, + init_in_int8, in_int8, + init_inout_int8, inout_int8, inout_int8_for_check, count, PRId8); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_INT8_T, int8_t, in_int8, inout_int8, + MPI_OP_TEST(*, mpi_op, MPI_INT8_T, int8_t, + init_in_int8, in_int8, + init_inout_int8, inout_int8, inout_int8_for_check, count, PRId8); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_INT8_T, int8_t, in_int8, inout_int8, + MPI_OP_TEST(&, mpi_op, MPI_INT8_T, int8_t, + init_in_int8, in_int8, + init_inout_int8, inout_int8, inout_int8_for_check, count, PRId8); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT8_T, int8_t, in_int8, inout_int8, + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT8_T, int8_t, + init_in_int8, in_int8, + init_inout_int8, inout_int8, inout_int8_for_check, count, PRId8); } if (0 == strcmp(op, "min")) { // intentionally reversed in and out - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT8_T, int8_t, in_int8, inout_int8, + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT8_T, int8_t, + init_in_int8, in_int8, + init_inout_int8, inout_int8, inout_int8_for_check, count, PRId8); } } @@ -359,40 +482,56 @@ int main(int argc, char **argv) + op1_alignment * sizeof(int16_t)), *inout_int16 = (int16_t *) ((char *) inout_buf + res_alignment * sizeof(int16_t)), - *inout_int16_for_check = (int16_t *) inout_check_buf; + *inout_int16_for_check = (int16_t *) inout_check_buf, + *init_inout_int16 = (int16_t *)init_inout_buf, + *init_in_int16 = (int16_t *)init_in_buf; for (i = 0; i < count; i++) { - in_int16[i] = 5; 
- inout_int16[i] = inout_int16_for_check[i] = -3; + init_in_int16[i] = 5; + init_inout_int16[i] = -3; } mpi_type = "MPI_INT16_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_INT16_T, int16_t, in_int16, inout_int16, + MPI_OP_TEST(+, mpi_op, MPI_INT16_T, int16_t, + init_in_int16, in_int16, + init_inout_int16, inout_int16, inout_int16_for_check, count, PRId16); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_INT16_T, int16_t, in_int16, inout_int16, + MPI_OP_TEST(|, mpi_op, MPI_INT16_T, int16_t, + init_in_int16, in_int16, + init_inout_int16, inout_int16, inout_int16_for_check, count, PRId16); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_INT16_T, int16_t, in_int16, inout_int16, + MPI_OP_TEST(^, mpi_op, MPI_INT16_T, int16_t, + init_in_int16, in_int16, + init_inout_int16, inout_int16, inout_int16_for_check, count, PRId16); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_INT16_T, int16_t, in_int16, inout_int16, + MPI_OP_TEST(*, mpi_op, MPI_INT16_T, int16_t, + init_in_int16, in_int16, + init_inout_int16, inout_int16, inout_int16_for_check, count, PRId16); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_INT16_T, int16_t, in_int16, inout_int16, + MPI_OP_TEST(&, mpi_op, MPI_INT16_T, int16_t, + init_in_int16, in_int16, + init_inout_int16, inout_int16, inout_int16_for_check, count, PRId16); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT16_T, int16_t, in_int16, - inout_int16, inout_int16_for_check, count, PRId16); + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT16_T, int16_t, + init_in_int16, in_int16, + init_inout_int16, inout_int16, + inout_int16_for_check, count, PRId16); } if (0 == strcmp(op, "min")) { // intentionally reversed in and out - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT16_T, int16_t, in_int16, - inout_int16, inout_int16_for_check, count, PRId16); + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT16_T, int16_t, + init_in_int16, in_int16, + init_inout_int16, inout_int16, + inout_int16_for_check, count, PRId16); } } if (32 == type_size) { @@ -400,40 +539,56 @@ int main(int argc, char **argv) + op1_alignment * sizeof(int32_t)), *inout_int32 = (int32_t *) ((char *) inout_buf + res_alignment * sizeof(int32_t)), - *inout_int32_for_check = (int32_t *) inout_check_buf; + *inout_int32_for_check = (int32_t *) inout_check_buf, + *init_inout_int32 = (int32_t *)init_inout_buf, + *init_in_int32 = (int32_t *)init_in_buf; for (i = 0; i < count; i++) { - in_int32[i] = 5; - inout_int32[i] = inout_int32_for_check[i] = 3; + init_in_int32[i] = 5; + init_inout_int32[i] = inout_int32_for_check[i] = 3; } mpi_type = "MPI_INT32_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_INT32_T, int32_t, in_int32, inout_int32, + MPI_OP_TEST(+, mpi_op, MPI_INT32_T, int32_t, + init_in_int32, in_int32, + init_inout_int32, inout_int32, inout_int32_for_check, count, PRId32); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_INT32_T, int32_t, in_int32, inout_int32, + MPI_OP_TEST(|, mpi_op, MPI_INT32_T, int32_t, + init_in_int32, in_int32, + init_inout_int32, inout_int32, inout_int32_for_check, count, PRId32); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_INT32_T, int32_t, in_int32, inout_int32, + MPI_OP_TEST(^, mpi_op, MPI_INT32_T, int32_t, + init_in_int32, in_int32, + init_inout_int32, inout_int32, inout_int32_for_check, count, PRId32); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_INT32_T, int32_t, in_int32, inout_int32, + MPI_OP_TEST(*, mpi_op, MPI_INT32_T, int32_t, + init_in_int32, in_int32, 
+ init_inout_int32, inout_int32, inout_int32_for_check, count, PRId32); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_INT32_T, int32_t, in_int32, inout_int32, + MPI_OP_TEST(&, mpi_op, MPI_INT32_T, int32_t, + init_in_int32, in_int32, + init_inout_int32, inout_int32, inout_int32_for_check, count, PRId32); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT32_T, int32_t, in_int32, - inout_int32, inout_int32_for_check, count, PRId32); + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT32_T, int32_t, + init_in_int32, in_int32, + init_inout_int32, inout_int32, + inout_int32_for_check, count, PRId32); } if (0 == strcmp(op, "min")) { // intentionally reversed in and out - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT32_T, int32_t, in_int32, - inout_int32, inout_int32_for_check, count, PRId32); + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT32_T, int32_t, + init_in_int32, in_int32, + init_inout_int32, inout_int32, + inout_int32_for_check, count, PRId32); } } if (64 == type_size) { @@ -441,40 +596,56 @@ int main(int argc, char **argv) + op1_alignment * sizeof(int64_t)), *inout_int64 = (int64_t *) ((char *) inout_buf + res_alignment * sizeof(int64_t)), - *inout_int64_for_check = (int64_t *) inout_check_buf; + *inout_int64_for_check = (int64_t *) inout_check_buf, + *init_inout_int64 = (int64_t *)init_inout_buf, + *init_in_int64 = (int64_t *)init_in_buf; for (i = 0; i < count; i++) { - in_int64[i] = 5; - inout_int64[i] = inout_int64_for_check[i] = 3; + init_in_int64[i] = 5; + init_inout_int64[i] = 3; } mpi_type = "MPI_INT64_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_INT64_T, int64_t, in_int64, inout_int64, + MPI_OP_TEST(+, mpi_op, MPI_INT64_T, int64_t, + init_in_int64, in_int64, + init_inout_int64, inout_int64, inout_int64_for_check, count, PRId64); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_INT64_T, int64_t, in_int64, inout_int64, + MPI_OP_TEST(|, mpi_op, MPI_INT64_T, int64_t, + init_in_int64, in_int64, + init_inout_int64, inout_int64, inout_int64_for_check, count, PRId64); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_INT64_T, int64_t, in_int64, inout_int64, + MPI_OP_TEST(^, mpi_op, MPI_INT64_T, int64_t, + init_in_int64, in_int64, + init_inout_int64, inout_int64, inout_int64_for_check, count, PRId64); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_INT64_T, int64_t, in_int64, inout_int64, + MPI_OP_TEST(*, mpi_op, MPI_INT64_T, int64_t, + init_in_int64, in_int64, + init_inout_int64, inout_int64, inout_int64_for_check, count, PRId64); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_INT64_T, int64_t, in_int64, inout_int64, + MPI_OP_TEST(&, mpi_op, MPI_INT64_T, int64_t, + init_in_int64, in_int64, + init_inout_int64, inout_int64, inout_int64_for_check, count, PRId64); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT64_T, int64_t, in_int64, - inout_int64, inout_int64_for_check, count, PRId64); + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_INT64_T, int64_t, + init_in_int64, in_int64, + init_inout_int64, inout_int64, + inout_int64_for_check, count, PRId64); } if (0 == strcmp(op, "min")) { // intentionally reversed in and out - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT64_T, int64_t, in_int64, - inout_int64, inout_int64_for_check, count, PRId64); + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_INT64_T, int64_t, + init_in_int64, in_int64, + init_inout_int64, inout_int64, + inout_int64_for_check, count, PRId64); } } } @@ -485,40 +656,56 @@ int main(int argc, char **argv) + op1_alignment * sizeof(uint8_t)), 
*inout_uint8 = (uint8_t *) ((char *) inout_buf + res_alignment * sizeof(uint8_t)), - *inout_uint8_for_check = (uint8_t *) inout_check_buf; + *inout_uint8_for_check = (uint8_t *) inout_check_buf, + *init_inout_uint8 = (uint8_t *)init_inout_buf, + *init_in_uint8 = (uint8_t *)init_in_buf; for (i = 0; i < count; i++) { - in_uint8[i] = 5; - inout_uint8[i] = inout_uint8_for_check[i] = 2; + init_in_uint8[i] = 5; + init_inout_uint8[i] = 2; } mpi_type = "MPI_UINT8_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_UINT8_T, uint8_t, in_uint8, inout_uint8, + MPI_OP_TEST(+, mpi_op, MPI_UINT8_T, uint8_t, + init_in_uint8, in_uint8, + init_inout_uint8, inout_uint8, inout_uint8_for_check, count, PRIu8); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_UINT8_T, uint8_t, in_uint8, inout_uint8, + MPI_OP_TEST(|, mpi_op, MPI_UINT8_T, uint8_t, + init_in_uint8, in_uint8, + init_inout_uint8, inout_uint8, inout_uint8_for_check, count, PRIu8); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_UINT8_T, uint8_t, in_uint8, inout_uint8, + MPI_OP_TEST(^, mpi_op, MPI_UINT8_T, uint8_t, + init_in_uint8, in_uint8, + init_inout_uint8, inout_uint8, inout_uint8_for_check, count, PRIu8); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_UINT8_T, uint8_t, in_uint8, inout_uint8, + MPI_OP_TEST(*, mpi_op, MPI_UINT8_T, uint8_t, + init_in_uint8, in_uint8, + init_inout_uint8, inout_uint8, inout_uint8_for_check, count, PRIu8); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_UINT8_T, uint8_t, in_uint8, inout_uint8, + MPI_OP_TEST(&, mpi_op, MPI_UINT8_T, uint8_t, + init_in_uint8, in_uint8, + init_inout_uint8, inout_uint8, inout_uint8_for_check, count, PRIu8); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT8_T, uint8_t, in_uint8, - inout_uint8, inout_uint8_for_check, count, PRIu8); + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT8_T, uint8_t, + init_in_uint8, in_uint8, + init_inout_uint8, inout_uint8, + inout_uint8_for_check, count, PRIu8); } if (0 == strcmp(op, "min")) { // intentionally reversed in and out - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT8_T, uint8_t, in_uint8, - inout_uint8, inout_uint8_for_check, count, PRIu8); + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT8_T, uint8_t, + init_in_uint8, in_uint8, + init_inout_uint8, inout_uint8, + inout_uint8_for_check, count, PRIu8); } } if (16 == type_size) { @@ -526,40 +713,56 @@ int main(int argc, char **argv) + op1_alignment * sizeof(uint16_t)), *inout_uint16 = (uint16_t *) ((char *) inout_buf + res_alignment * sizeof(uint16_t)), - *inout_uint16_for_check = (uint16_t *) inout_check_buf; + *inout_uint16_for_check = (uint16_t *) inout_check_buf, + *init_inout_uint16 = (uint16_t *)init_inout_buf, + *init_in_uint16 = (uint16_t *)init_in_buf; for (i = 0; i < count; i++) { - in_uint16[i] = 5; - inout_uint16[i] = inout_uint16_for_check[i] = 1234; + init_in_uint16[i] = 5; + init_inout_uint16[i] = 1234; } mpi_type = "MPI_UINT16_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_UINT16_T, uint16_t, in_uint16, inout_uint16, + MPI_OP_TEST(+, mpi_op, MPI_UINT16_T, uint16_t, + init_in_uint16, in_uint16, + init_inout_uint16, inout_uint16, inout_uint16_for_check, count, PRIu16); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_UINT16_T, uint16_t, in_uint16, inout_uint16, + MPI_OP_TEST(|, mpi_op, MPI_UINT16_T, uint16_t, + init_in_uint16, in_uint16, + init_inout_uint16, inout_uint16, inout_uint16_for_check, count, PRIu16); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_UINT16_T, uint16_t, 
in_uint16, inout_uint16, + MPI_OP_TEST(^, mpi_op, MPI_UINT16_T, uint16_t, + init_in_uint16, in_uint16, + init_inout_uint16, inout_uint16, inout_uint16_for_check, count, PRIu16); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_UINT16_T, uint16_t, in_uint16, inout_uint16, + MPI_OP_TEST(*, mpi_op, MPI_UINT16_T, uint16_t, + init_in_uint16, in_uint16, + init_inout_uint16, inout_uint16, inout_uint16_for_check, count, PRIu16); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_UINT16_T, uint16_t, in_uint16, inout_uint16, + MPI_OP_TEST(&, mpi_op, MPI_UINT16_T, uint16_t, + init_in_uint16, in_uint16, + init_inout_uint16, inout_uint16, inout_uint16_for_check, count, PRIu16); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT16_T, uint16_t, in_uint16, - inout_uint16, inout_uint16_for_check, count, PRIu16); + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT16_T, uint16_t, + init_in_uint16, in_uint16, + init_inout_uint16, inout_uint16, + inout_uint16_for_check, count, PRIu16); } if (0 == strcmp(op, "min")) { // intentionally reversed in and out - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT16_T, uint16_t, in_uint16, - inout_uint16, inout_uint16_for_check, count, PRIu16); + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT16_T, uint16_t, + init_in_uint16, in_uint16, + init_inout_uint16, inout_uint16, + inout_uint16_for_check, count, PRIu16); } } if (32 == type_size) { @@ -567,40 +770,56 @@ int main(int argc, char **argv) + op1_alignment * sizeof(uint32_t)), *inout_uint32 = (uint32_t *) ((char *) inout_buf + res_alignment * sizeof(uint32_t)), - *inout_uint32_for_check = (uint32_t *) inout_check_buf; + *inout_uint32_for_check = (uint32_t *) inout_check_buf, + *init_inout_uint32 = (uint32_t *)init_inout_buf, + *init_in_uint32 = (uint32_t *)init_in_buf; for (i = 0; i < count; i++) { - in_uint32[i] = 5; - inout_uint32[i] = inout_uint32_for_check[i] = 3; + init_in_uint32[i] = 5; + init_inout_uint32[i] = 3; } mpi_type = "MPI_UINT32_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_UINT32_T, uint32_t, in_uint32, inout_uint32, + MPI_OP_TEST(+, mpi_op, MPI_UINT32_T, uint32_t, + init_in_uint32, in_uint32, + init_inout_uint32, inout_uint32, inout_uint32_for_check, count, PRIu32); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_UINT32_T, uint32_t, in_uint32, inout_uint32, + MPI_OP_TEST(|, mpi_op, MPI_UINT32_T, uint32_t, + init_in_uint32, in_uint32, + init_inout_uint32, inout_uint32, inout_uint32_for_check, count, PRIu32); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_UINT32_T, uint32_t, in_uint32, inout_uint32, + MPI_OP_TEST(^, mpi_op, MPI_UINT32_T, uint32_t, + init_in_uint32, in_uint32, + init_inout_uint32, inout_uint32, inout_uint32_for_check, count, PRIu32); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_UINT32_T, uint32_t, in_uint32, inout_uint32, + MPI_OP_TEST(*, mpi_op, MPI_UINT32_T, uint32_t, + init_in_uint32, in_uint32, + init_inout_uint32, inout_uint32, inout_uint32_for_check, count, PRIu32); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_UINT32_T, uint32_t, in_uint32, inout_uint32, + MPI_OP_TEST(&, mpi_op, MPI_UINT32_T, uint32_t, + init_in_uint32, in_uint32, + init_inout_uint32, inout_uint32, inout_uint32_for_check, count, PRIu32); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT32_T, uint32_t, in_uint32, - inout_uint32, inout_uint32_for_check, count, PRIu32); + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT32_T, uint32_t, + init_in_uint32, in_uint32, + init_inout_uint32, inout_uint32, + 
inout_uint32_for_check, count, PRIu32); } if (0 == strcmp(op, "min")) { // intentionally reversed in and out - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT32_T, uint32_t, in_uint32, - inout_uint32, inout_uint32_for_check, count, PRIu32); + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT32_T, uint32_t, + init_in_uint32, in_uint32, + init_inout_uint32, inout_uint32, + inout_uint32_for_check, count, PRIu32); } } if (64 == type_size) { @@ -608,40 +827,56 @@ int main(int argc, char **argv) + op1_alignment * sizeof(uint64_t)), *inout_uint64 = (uint64_t *) ((char *) inout_buf + res_alignment * sizeof(uint64_t)), - *inout_uint64_for_check = (uint64_t *) inout_check_buf; + *inout_uint64_for_check = (uint64_t *) inout_check_buf, + *init_inout_uint64 = (uint64_t *)init_inout_buf, + *init_in_uint64 = (uint64_t *)init_in_buf; for (i = 0; i < count; i++) { - in_uint64[i] = 5; - inout_uint64[i] = inout_uint64_for_check[i] = 32433; + init_in_uint64[i] = 5; + init_inout_uint64[i] = 32433; } mpi_type = "MPI_UINT64_T"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_UINT64_T, uint64_t, in_uint64, inout_uint64, + MPI_OP_TEST(+, mpi_op, MPI_UINT64_T, uint64_t, + init_in_uint64, in_uint64, + init_inout_uint64, inout_uint64, inout_uint64_for_check, count, PRIu64); } if (0 == strcmp(op, "bor")) { - MPI_OP_TEST(|, mpi_op, MPI_UINT64_T, uint64_t, in_uint64, inout_uint64, + MPI_OP_TEST(|, mpi_op, MPI_UINT64_T, uint64_t, + init_in_uint64, in_uint64, + init_inout_uint64, inout_uint64, inout_uint64_for_check, count, PRIu64); } if (0 == strcmp(op, "bxor")) { - MPI_OP_TEST(^, mpi_op, MPI_UINT64_T, uint64_t, in_uint64, inout_uint64, + MPI_OP_TEST(^, mpi_op, MPI_UINT64_T, uint64_t, + init_in_uint64, in_uint64, + init_inout_uint64, inout_uint64, inout_uint64_for_check, count, PRIu64); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_UINT64_T, uint64_t, in_uint64, inout_uint64, + MPI_OP_TEST(*, mpi_op, MPI_UINT64_T, uint64_t, + init_in_uint64, in_uint64, + init_inout_uint64, inout_uint64, inout_uint64_for_check, count, PRIu64); } if (0 == strcmp(op, "band")) { - MPI_OP_TEST(&, mpi_op, MPI_UINT64_T, uint64_t, in_uint64, inout_uint64, + MPI_OP_TEST(&, mpi_op, MPI_UINT64_T, uint64_t, + init_in_uint64, in_uint64, + init_inout_uint64, inout_uint64, inout_uint64_for_check, count, PRIu64); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT64_T, uint64_t, in_uint64, - inout_uint64, inout_uint64_for_check, count, PRIu64); + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_UINT64_T, uint64_t, + init_in_uint64, in_uint64, + init_inout_uint64, inout_uint64, + inout_uint64_for_check, count, PRIu64); } if (0 == strcmp(op, "min")) { - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT64_T, uint64_t, in_uint64, - inout_uint64, inout_uint64_for_check, count, PRIu64); + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_UINT64_T, uint64_t, + init_in_uint64, in_uint64, + init_inout_uint64, inout_uint64, + inout_uint64_for_check, count, PRIu64); } } } @@ -650,27 +885,37 @@ int main(int argc, char **argv) float *in_float = (float *) ((char *) in_buf + op1_alignment * sizeof(float)), *inout_float = (float *) ((char *) inout_buf + res_alignment * sizeof(float)), - *inout_float_for_check = (float *) inout_check_buf; + *inout_float_for_check = (float *) inout_check_buf, + *init_inout_float = (float *)init_inout_buf, + *init_in_float = (float *)init_in_buf; for (i = 0; i < count; i++) { - in_float[i] = 1000.0 + 1; - inout_float[i] = inout_float_for_check[i] = 100.0 + 2; + init_in_float[i] = 1000.0 + 1; + init_inout_float[i] = 100.0 + 2; } mpi_type = 
"MPI_FLOAT"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_FLOAT, float, in_float, inout_float, + MPI_OP_TEST(+, mpi_op, MPI_FLOAT, float, + init_in_float, in_float, + init_inout_float, inout_float, inout_float_for_check, count, "f"); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_FLOAT, float, in_float, inout_float, + MPI_OP_TEST(*, mpi_op, MPI_FLOAT, float, + init_in_float, in_float, + init_inout_float, inout_float, inout_float_for_check, count, "f"); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_FLOAT, float, in_float, inout_float, + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_FLOAT, float, + init_in_float, in_float, + init_inout_float, inout_float, inout_float_for_check, count, "f"); } if (0 == strcmp(op, "min")) { - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_FLOAT, float, in_float, inout_float, + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_FLOAT, float, + init_in_float, in_float, + init_inout_float, inout_float, inout_float_for_check, count, "f"); } } @@ -680,27 +925,37 @@ int main(int argc, char **argv) + op1_alignment * sizeof(double)), *inout_double = (double *) ((char *) inout_buf + res_alignment * sizeof(double)), - *inout_double_for_check = (double *) inout_check_buf; + *inout_double_for_check = (double *) inout_check_buf, + *init_inout_double = (double *)init_inout_buf, + *init_in_double = (double *)init_in_buf; for (i = 0; i < count; i++) { - in_double[i] = 10.0 + 1; - inout_double[i] = inout_double_for_check[i] = 1.0 + 2; + init_in_double[i] = 10.0 + 1; + init_inout_double[i] = 1.0 + 2; } mpi_type = "MPI_DOUBLE"; if (0 == strcmp(op, "sum")) { - MPI_OP_TEST(+, mpi_op, MPI_DOUBLE, double, in_double, inout_double, + MPI_OP_TEST(+, mpi_op, MPI_DOUBLE, double, + init_in_double, in_double, + init_inout_double, inout_double, inout_double_for_check, count, "g"); } if (0 == strcmp(op, "prod")) { - MPI_OP_TEST(*, mpi_op, MPI_DOUBLE, double, in_double, inout_double, + MPI_OP_TEST(*, mpi_op, MPI_DOUBLE, double, + init_in_double, in_double, + init_inout_double, inout_double, inout_double_for_check, count, "f"); } if (0 == strcmp(op, "max")) { - MPI_OP_MINMAX_TEST(max, mpi_op, MPI_DOUBLE, double, in_double, inout_double, + MPI_OP_MINMAX_TEST(max, mpi_op, MPI_DOUBLE, double, + init_in_double, in_double, + init_inout_double, inout_double, inout_double_for_check, count, "f"); } if (0 == strcmp(op, "min")) { - MPI_OP_MINMAX_TEST(min, mpi_op, MPI_DOUBLE, double, in_double, inout_double, + MPI_OP_MINMAX_TEST(min, mpi_op, MPI_DOUBLE, double, + init_in_double, in_double, + init_inout_double, inout_double, inout_double_for_check, count, "f"); } } @@ -713,11 +968,19 @@ int main(int argc, char **argv) printf("\n"); } } - ompi_mpi_finalize(); - free(in_buf); - free(inout_buf); - free(inout_check_buf); + /* clean up allocator */ + allocator->free(in_buf); + allocator->free(inout_buf); + allocator->free(inout_check_buf); + allocator->fini(); + + if (allocator->flags & ALLOCATOR_DISCRETE) { + free(init_in_buf); + free(init_inout_buf); + } + + ompi_mpi_finalize(); return (0 == total_errors) ? 
0 : -1; } From e603bcc3a10cc4da8658bc07dfd287eccb802341 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 19 Apr 2023 12:37:23 -0400 Subject: [PATCH 21/74] Draft of ompi_op_select_device Signed-off-by: Joseph Schuchart --- ompi/op/op.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/ompi/op/op.h b/ompi/op/op.h index 0d9ed54fc1b..c48013c67b8 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -819,6 +819,34 @@ static inline void ompi_3buff_op_reduce(ompi_op_t * op, void *source1, } } + +#if 0 +/** + * Determine where the op can run most efficiently. Uses some heuristic based + * on information from opal_accelerator to determine whether it would be more + * efficient to run on a device or on the host. + * + * Either source or target can be NULL, in which case they will be ignored. + * + * Returns -1 for host, or the device number [0..NUMDEV-1] otherwise. + */ +static inline void ompi_op_select_device(ompi_op_t *op, const void *source, + const void *target, size_t count, + ompi_datatype_t *dtype, int *device) +{ + if (OPAL_LIKELY(ompi_op_is_intrinsic (op))) { + int source_dev_id = -1, target_dev_id = -1; + uint64_t source_flags, target_flags; + int target_check_addr = -1; + if (target != )opal_accelerator.check_addr(target, &target_dev_id, &target_flags); + int source_check_addr = opal_accelerator.check_addr(source, &source_dev_id, &source_flags); + if (target_ + } else { + *device = -1; + } +} +#endif // 0 + END_C_DECLS #endif /* OMPI_OP_H */ From 60dd44600608faca1f29b27fbf8487b64c18f011 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 19 Apr 2023 15:02:16 -0400 Subject: [PATCH 22/74] Second draft of ompi_op_select_device Signed-off-by: Joseph Schuchart --- ompi/op/op.h | 57 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/ompi/op/op.h b/ompi/op/op.h index c48013c67b8..566f18bc01d 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -820,7 +820,7 @@ static inline void ompi_3buff_op_reduce(ompi_op_t * op, void *source1, } -#if 0 +//#if 0 /** * Determine where the op can run most efficiently. Uses some heuristic based * on information from opal_accelerator to determine whether it would be more @@ -834,18 +834,55 @@ static inline void ompi_op_select_device(ompi_op_t *op, const void *source, const void *target, size_t count, ompi_datatype_t *dtype, int *device) { - if (OPAL_LIKELY(ompi_op_is_intrinsic (op))) { - int source_dev_id = -1, target_dev_id = -1; - uint64_t source_flags, target_flags; - int target_check_addr = -1; - if (target != )opal_accelerator.check_addr(target, &target_dev_id, &target_flags); - int source_check_addr = opal_accelerator.check_addr(source, &source_dev_id, &source_flags); - if (target_ - } else { + /* default to host */ + *device = -1; + if (!ompi_op_is_intrinsic (op)) { *device = -1; + return; + } + /* quick check: can we execute on both sides? */ + int dtype_id = ompi_op_ddt_map[dtype->id]; + if (NULL == op->o_device_op || NULL == op->o_device_op->do_intrinsic.fns[dtype_id]) { + /* not available on the gpu, must select host */ + return; } + + /* Penalty for accessing unified memory from the host + * TODO: how to determine this value? 
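+     *
+     * Illustrative arithmetic only, assuming the placeholder constants
+     * drafted below: the host model costs 1.0 per element with no startup
+     * cost, while the device costs 10000.0 up front plus 0.0001 per element.
+     * Ignoring transfer_cost and the unified-memory penalty, the device is
+     * selected once 1.0 * count > 10000.0 + 0.0001 * count, i.e. for counts
+     * above roughly 10000 / 0.9999, or about 10001 elements.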
*/ + const double host_unified_memory_penalty = 10; + + double host_startup_cost = 0.0; // host has no startup cost + double host_compute_cost = 1.0*count; // host reference 1.0 per element + double device_startup_cost = 10000.0; // to be filled below + double device_compute_cost = 0.0001*count; + double transfer_cost = 0.0; // summed up based on what has to be transferred + int source_dev_id = -1, target_dev_id = -1; + uint64_t source_flags = 0, target_flags = 0; + int target_check_addr = -1; + if (target != NULL) target_check_addr = opal_accelerator.check_addr(target, &target_dev_id, &target_flags); + int source_check_addr = -1; + if (source != NULL) source_check_addr = opal_accelerator.check_addr(source, &source_dev_id, &source_flags); + if (op->o_func.intrinsic.fns[dtype_id]) { + /* op not available on the host, must select a device */ + host_compute_cost = 1E12; + } else if ((target_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY) || (source_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) { + /* at least one buffer is on unified memory */ + host_compute_cost *= host_unified_memory_penalty; // reduced bandwidth + } else if (0 > source_check_addr && 0 > target_check_addr) { + /* both buffers are on the device, mark host as unusable */ + host_compute_cost = 1E12; + } else if (0 <= source_check_addr && 0 <= target_check_addr) { + /* both buffers are on the host, mark device as unusable */ + device_compute_cost = 1E12; + } + + /* select a device, or remain on the host */ + if ((host_startup_cost + host_compute_cost) > (device_startup_cost + device_compute_cost)) { + *device = (target_dev_id >= 0) ? target_dev_id : source_dev_id; + } + } -#endif // 0 +//#endif // 0 END_C_DECLS From c485ecf86de6e992fa719aa4728879dbea849610 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 27 Apr 2023 10:55:55 -0400 Subject: [PATCH 23/74] Fix undefined symbols in cuda op component Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_functions.c | 204 +++++++++++++++++---------- 1 file changed, 128 insertions(+), 76 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 0873ea1fc33..55dbd38a1a8 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -818,6 +818,53 @@ LOC_FUNC(minloc, long_double_int, <) #define LOC_FUNC_3BUF(name, type_name, op) FUNC_3BUF(name, type_name, ompi_op_predefined_##type_name##_t) +/* Dispatch Fortran types to C types */ +#define FORT_INT_FUNC_3BUF(name, type_name, type) \ + static \ + void ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + \ + _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ + switch(sizeof(type)) { \ + case sizeof(int8_t): \ + ompi_op_cuda_3buff_##name##_int8_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(int16_t): \ + ompi_op_cuda_3buff_##name##_int16_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(int32_t): \ + ompi_op_cuda_3buff_##name##_int32_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(int64_t): \ + ompi_op_cuda_3buff_##name##_int64_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + } \ + } + +/* Dispatch Fortran types to C types */ +#define FORT_FLOAT_FUNC_3BUF(name, type_name, type) \ + static \ + void ompi_op_cuda_3buff_##name##_##type_name(const void 
*in1, const void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ + switch(sizeof(type)) { \ + case sizeof(float): \ + ompi_op_cuda_3buff_##name##_float(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(double): \ + ompi_op_cuda_3buff_##name##_double(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(long double): \ + ompi_op_cuda_3buff_##name##_long_double(in1, in2, out, count, dtype, stream, module); \ + break; \ + } \ + } + + /************************************************************************* * Max *************************************************************************/ @@ -838,49 +885,48 @@ FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(max, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t) #endif /* Floating point */ +#if 0 #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC_3BUF(max, short_float, short float) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) FUNC_FUNC_3BUF(max, short_float, opal_short_float_t) #endif +#endif // 0 FUNC_FUNC_3BUF(max, float, float) FUNC_FUNC_3BUF(max, double, double) FUNC_FUNC_3BUF(max, long_double, long double) #if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t) +FORT_FLOAT_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t) +FORT_FLOAT_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t) +FORT_FLOAT_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t) +FORT_FLOAT_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) +FORT_FLOAT_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) +FORT_FLOAT_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) #endif @@ -904,49 +950,51 @@ FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(min, 
fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC_3BUF(min, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(min, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC_3BUF(min, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) #endif /* Floating point */ +#if 0 #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC_3BUF(min, short_float, short float) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) FUNC_FUNC_3BUF(min, short_float, opal_short_float_t) #endif +#endif // 0 FUNC_FUNC_3BUF(min, float, float) FUNC_FUNC_3BUF(min, double, double) FUNC_FUNC_3BUF(min, long_double, long double) #if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) +FORT_FLOAT_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) +FORT_FLOAT_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) +FORT_FLOAT_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) +FORT_FLOAT_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) +FORT_FLOAT_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) +FORT_FLOAT_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) #endif /************************************************************************* @@ -967,49 +1015,51 @@ OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t, +) +FORT_INT_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t, +) +FORT_INT_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t, +) +FORT_INT_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t, +) +FORT_INT_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF(sum, fortran_integer8, ompi_fortran_integer8_t, +) +FORT_INT_FUNC_3BUF(sum, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t, +) +FORT_INT_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t) #endif /* Floating point */ +#if 0 #if defined(HAVE_SHORT_FLOAT) OP_FUNC_3BUF(sum, short_float, short float, +) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) OP_FUNC_3BUF(sum, short_float, opal_short_float_t, +) 
#endif +#endif // 0 OP_FUNC_3BUF(sum, float, float, +) OP_FUNC_3BUF(sum, double, double, +) OP_FUNC_3BUF(sum, long_double, long double, +) #if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t, +) +FORT_FLOAT_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) +FORT_FLOAT_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t, +) +FORT_FLOAT_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t, +) +FORT_FLOAT_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t, +) +FORT_FLOAT_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t, +) +FORT_FLOAT_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t) #endif /* Complex */ #if 0 @@ -1041,49 +1091,51 @@ OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t, *) +FORT_INT_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t, *) +FORT_INT_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t, *) +FORT_INT_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t, *) +FORT_INT_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t, *) +FORT_INT_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t, *) +FORT_INT_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t) #endif /* Floating point */ +#if 0 #if defined(HAVE_SHORT_FLOAT) -OP_FUNC_3BUF(prod, short_float, short float, *) +FORT_FLOAT_FUNC_3BUF(prod, short_float, short float) #elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_FUNC_3BUF(prod, short_float, opal_short_float_t, *) +FORT_FLOAT_FUNC_3BUF(prod, short_float, opal_short_float_t) #endif +#endif // 0 OP_FUNC_3BUF(prod, float, float, *) OP_FUNC_3BUF(prod, double, double, *) OP_FUNC_3BUF(prod, long_double, long double, *) #if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t, *) +FORT_FLOAT_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t) #endif #if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) +FORT_FLOAT_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t) #endif #if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF(prod, fortran_real2, ompi_fortran_real2_t, *) +FORT_FLOAT_FUNC_3BUF(prod, fortran_real2, ompi_fortran_real2_t) #endif #if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t, *) +FORT_FLOAT_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t) #endif #if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF(prod, fortran_real8, 
ompi_fortran_real8_t, *) +FORT_FLOAT_FUNC_3BUF(prod, fortran_real8, ompi_fortran_real8_t) #endif #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF(prod, fortran_real16, ompi_fortran_real16_t, *) +FORT_FLOAT_FUNC_3BUF(prod, fortran_real16, ompi_fortran_real16_t) #endif /* Complex */ #if 0 @@ -1117,7 +1169,7 @@ FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) +FORT_INT_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) #endif /* C++ bool */ FUNC_FUNC_3BUF(land, bool, bool) @@ -1142,7 +1194,7 @@ FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) +FORT_INT_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) #endif /* C++ bool */ FUNC_FUNC_3BUF(lor, bool, bool) @@ -1167,7 +1219,7 @@ FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) +FORT_INT_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) #endif /* C++ bool */ FUNC_FUNC_3BUF(lxor, bool, bool) @@ -1192,25 +1244,25 @@ FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) #endif /* Byte */ -FUNC_FUNC_3BUF(band, byte, char) +FORT_INT_FUNC_3BUF(band, byte, char) /************************************************************************* * Bitwise OR @@ -1232,25 +1284,25 @@ FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) 
+FORT_INT_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) #endif /* Byte */ -FUNC_FUNC_3BUF(bor, byte, char) +FORT_INT_FUNC_3BUF(bor, byte, char) /************************************************************************* * Bitwise XOR @@ -1272,25 +1324,25 @@ FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(bxor, fortran_integer, ompi_fortran_integer_t) +FORT_INT_FUNC_3BUF(bxor, fortran_integer, ompi_fortran_integer_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) +FORT_INT_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) +FORT_INT_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) +FORT_INT_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) +FORT_INT_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) #endif #if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) +FORT_INT_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) #endif /* Byte */ -FUNC_FUNC_3BUF(bxor, byte, char) +FORT_INT_FUNC_3BUF(bxor, byte, char) /************************************************************************* * Min and max location "pair" datatypes From 793863c21160d8608dee624b6684f823b041d8eb Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 28 Apr 2023 09:26:25 -0400 Subject: [PATCH 24/74] Fix off-by-one error in device-bucket allocator Signed-off-by: Joseph Schuchart --- opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c index cb38d50ff56..46bafe4cdae 100644 --- a/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c +++ b/opal/mca/allocator/devicebucket/allocator_devicebucket_alloc.c @@ -177,7 +177,7 @@ void mca_allocator_devicebucket_free(mca_allocator_base_module_t *mem, void *ptr bucket_size <<= 1; } - if (bucket_num > mem_options->num_buckets) { + if (bucket_num >= mem_options->num_buckets) { mem_options->free_mem_fn(mem_options->super.alc_context, ptr); OBJ_RELEASE(chunk); } else { From d2e8677819a6a7038ac90b9d7db0fbca9d617265 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 28 Apr 2023 09:30:57 -0400 Subject: [PATCH 25/74] Heuristic to select op device based on element count Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 224 ++++++++++++++++------- ompi/mca/coll/base/coll_base_util.c | 16 +- ompi/mca/coll/base/coll_base_util.h | 37 +++- ompi/op/op.h | 32 +++- 4 files changed, 240 insertions(+), 69 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 63b7c3ab710..b263f900428 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -140,7 +140,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf_free = NULL, *inplacebuf; - int 
inplacebuf_dev; + char *recvbuf = NULL; ptrdiff_t span, gap = 0; size = ompi_comm_size(comm); @@ -158,34 +158,65 @@ return MPI_SUCCESS; } - /* Allocate and initialize temporary send buffer */ + /* get the device for sbuf and rbuf and where the op would like to execute */ + int sendbuf_dev, recvbuf_dev, op_dev; + ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); span = opal_datatype_span(&dtype->super, count, &gap); - inplacebuf_free = ompi_coll_base_allocate_op_tmpbuf(sbuf, rbuf, span, op, dtype, &inplacebuf_dev, module); + inplacebuf_free = ompi_coll_base_allocate_on_device(op_dev, span, module); if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } inplacebuf = inplacebuf_free - gap; + //printf("allreduce ring count %d sbuf_dev %d rbuf_dev %d op_dev %d\n", count, sendbuf_dev, recvbuf_dev, op_dev); - opal_accelerator_stream_t *stream; - opal_accelerator.get_default_stream(inplacebuf_dev, &stream); - + opal_accelerator_stream_t *stream = NULL; + if (op_dev >= 0) { + opal_accelerator.get_default_stream(op_dev, &stream); + } - if (MPI_IN_PLACE == sbuf) { + tmpsend = (char*) sbuf; + if (op_dev != recvbuf_dev) { + /* copy data to where the op wants it to be */ + if (MPI_IN_PLACE == sbuf) { + ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)rbuf, stream); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } else { + tmpsend = (char*) sbuf; + ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } + tmpsend = (char*) inplacebuf; + } else if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)rbuf, stream); if (ret < 0) { line = __LINE__; goto error_hndl; } tmpsend = (char*) inplacebuf; - } else { - tmpsend = (char*) sbuf; -#if 0 - ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); - if (ret < 0) { line = __LINE__; goto error_hndl; } -#endif // 0 } - tmprecv = (char*) rbuf; + /* Handle MPI_IN_PLACE */ + bool use_sbuf = (MPI_IN_PLACE != sbuf); + /* allocate temporary recv buffer if the tmpbuf above is on a different device than the rbuf */ + recvbuf = rbuf; + if (op_dev != recvbuf_dev) { + recvbuf = ompi_coll_base_allocate_on_device(op_dev, span, module); + if (use_sbuf) { + /* copy from sbuf */ + ompi_datatype_copy_content_same_ddt_stream(dtype, count, (char*)recvbuf, (char*)sbuf, stream); + } else { + /* copy from rbuf */ + ompi_datatype_copy_content_same_ddt_stream(dtype, count, (char*)recvbuf, (char*)rbuf, stream); + } + use_sbuf = false; + } + + tmprecv = (char*) recvbuf; /* Determine nearest power of two less than or equal to size */ adjsize = opal_next_poweroftwo (size); adjsize >>= 1; + /* wait for above copies to complete */ + if (NULL != stream) { + opal_accelerator.wait_stream(stream); + } + /* Handle non-power-of-two case: - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and sets new rank to -1.
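The hunk above decides which buffers the reduction steps read from (tmpsend) and accumulate into (tmprecv), depending on whether the op executes on the same device as the receive buffer and whether MPI_IN_PLACE was passed. A minimal host-only sketch of that decision follows; it mirrors the branch structure of the hunk but is not the Open MPI code itself: device ids are plain ints (-1 standing for the host), buffers are just labels, and the names stage/copy_input/shadow_recv are introduced only for this illustration.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the staging decision in the recursive-doubling hunk. */
static void stage(bool in_place, int op_dev, int recvbuf_dev)
{
    const char *tmpsend, *tmprecv;
    bool copy_input = false;   /* input (rbuf if in place, else sbuf) staged into inplacebuf */
    bool shadow_recv = false;  /* device-resident stand-in for rbuf allocated and seeded with the input */

    if (op_dev != recvbuf_dev) {
        /* op runs away from rbuf: stage the input and shadow the result buffer */
        copy_input = true;
        shadow_recv = true;
        tmpsend = "inplacebuf (op_dev)";
        tmprecv = "recvbuf shadow (op_dev)";
    } else if (in_place) {
        /* MPI_IN_PLACE: read out of a temporary copy, accumulate into rbuf */
        copy_input = true;
        tmpsend = "inplacebuf";
        tmprecv = "rbuf";
    } else {
        /* op and rbuf are on the same device and sbuf is usable directly */
        tmpsend = "sbuf";
        tmprecv = "rbuf";
    }
    printf("in_place=%d op_dev=%2d rbuf_dev=%2d: send from %-22s recv into %-22s copy_input=%d shadow=%d\n",
           in_place, op_dev, recvbuf_dev, tmpsend, tmprecv, copy_input, shadow_recv);
}

int main(void)
{
    stage(false, -1, -1);   /* host op, host buffers */
    stage(true,  -1, -1);   /* MPI_IN_PLACE, everything on the host */
    stage(false,  0,  0);   /* device op, rbuf already on device 0 */
    stage(false,  0, -1);   /* device op, rbuf on the host: staging needed */
    return 0;
}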
@@ -197,7 +228,6 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { /* wait for tmpsend to be copied */ - opal_accelerator.wait_stream(stream); ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); @@ -237,7 +267,9 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, bool have_next_iter = ((distance << 1) < adjsize); /* wait for previous ops to complete to complete */ - opal_accelerator.wait_stream(stream); + if (NULL != stream) { + opal_accelerator.wait_stream(stream); + } /* Exchange the data */ ret = ompi_coll_base_sendrecv_actual(tmpsend, count, dtype, remote, MCA_COLL_BASE_TAG_ALLREDUCE, @@ -255,7 +287,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* send the current recv buffer, and use the tmp buffer to receive */ tmpsend = tmprecv; tmprecv = inplacebuf; - } else if (have_next_iter || tmprecv == recv) { + } else if (have_next_iter || tmprecv == recvbuf) { /* All iterations, and the last if tmprecv is the recv buffer */ /* tmprecv = tmpsend (op) tmprecv */ ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, stream); @@ -274,13 +306,13 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* First iteration: use input from sbuf */ /* tmpsend = tmprecv (op) sbuf */ tmpsend = inplacebuf; - if (have_next_iter || tmpsend == recv) { + if (have_next_iter || tmpsend == recvbuf) { ompi_3buff_op_reduce_stream(op, tmprecv, sbuf, tmpsend, count, dtype, stream); } else { ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, stream); tmpsend = tmprecv; } - } else if (have_next_iter || tmpsend == rbuf) { + } else if (have_next_iter || tmpsend == recvbuf) { /* All other iterations: reduce into tmpsend for next iteration */ /* tmpsend = tmprecv (op) tmpsend */ ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); @@ -306,7 +338,9 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, tmpsend = (char*)rbuf; } else { /* wait for previous ops to complete to complete */ - opal_accelerator.wait_stream(stream); + if (NULL != stream) { + opal_accelerator.wait_stream(stream); + } ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank - 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); @@ -321,15 +355,25 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, } /* wait for previous ops to complete */ - opal_accelerator.wait_stream(stream); - ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev, module); + if (NULL != stream) { + opal_accelerator.wait_stream(stream); + } + ompi_coll_base_free_tmpbuf(inplacebuf_free, op_dev, module); + + if (op_dev != recvbuf_dev) { + ompi_coll_base_free_tmpbuf(recvbuf, op_dev, module); + } return MPI_SUCCESS; error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); (void)line; // silence compiler warning - ompi_coll_base_free_tmpbuf(inplacebuf_free, inplacebuf_dev, module); + ompi_coll_base_free_tmpbuf(inplacebuf_free, op_dev, module); + + if (op_dev != recvbuf_dev) { + ompi_coll_base_free_tmpbuf(recvbuf, op_dev, module); + } return ret; } @@ -406,9 +450,9 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, { int ret, line, rank, size, k, recv_from, send_to, block_count, inbi; int early_segcount, late_segcount, split_rank, 
max_segcount; - int inbuf_dev[2] = {-1, -1}; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; + void *recvbuf = NULL; ptrdiff_t true_lb, true_extent, lb, extent; ptrdiff_t block_offset, max_real_segsize; ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; @@ -457,16 +501,35 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, max_segcount = early_segcount; max_real_segsize = true_extent + (max_segcount - 1) * extent; - /* we don't care about where the send buffer is */ - inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0], module); - if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } + /* get the device for sbuf and rbuf and where the op would like to execute */ + int sendbuf_dev, recvbuf_dev, op_dev; + ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); if (size > 2) { - inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[1], module); - if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } + inbuf[0] = ompi_coll_base_allocate_on_device(op_dev, 2*max_real_segsize, module); + if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } + inbuf[1] = inbuf[0] + max_real_segsize; + } else { + inbuf[0] = ompi_coll_base_allocate_on_device(op_dev, max_real_segsize, module); + if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } } + //printf("allreduce ring count %d sbuf_dev %d rbuf_dev %d op_dev %d\n", count, sendbuf_dev, recvbuf_dev, op_dev); /* Handle MPI_IN_PLACE */ bool use_sbuf = (MPI_IN_PLACE != sbuf); + /* allocate temporary recv buffer if the tmpbuf above is on a different device than the rbuf */ + recvbuf = rbuf; + if (op_dev != recvbuf_dev) { + recvbuf = ompi_coll_base_allocate_on_device(op_dev, typelng*count, module); + if (use_sbuf) { + /* copy from rbuf */ + ompi_datatype_copy_content_same_ddt(dtype, count, (char*)recvbuf, (char*)sbuf); + } else { + /* copy from sbuf */ + ompi_datatype_copy_content_same_ddt(dtype, count, (char*)recvbuf, (char*)rbuf); + } + use_sbuf = false; + } + #if 0 if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); @@ -504,7 +567,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, ((ptrdiff_t)rank * (ptrdiff_t)early_segcount) : ((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank)); block_count = ((rank < split_rank)? early_segcount : late_segcount); - tmpsend = ((char*)rbuf) + block_offset * extent; + tmpsend = ((use_sbuf) ? ((char*)sbuf) : ((char*)recvbuf)) + block_offset * extent; ret = MCA_PML_CALL(send(tmpsend, block_count, dtype, send_to, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); @@ -531,10 +594,11 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, ((ptrdiff_t)prevblock * early_segcount) : ((ptrdiff_t)prevblock * late_segcount + split_rank)); block_count = ((prevblock < split_rank)? 
early_segcount : late_segcount); - tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; + tmprecv = ((char*)recvbuf) + (ptrdiff_t)block_offset * extent; if (use_sbuf) { + void *tmpsbuf = ((char*)sbuf) + (ptrdiff_t)block_offset * extent; /* tmprecv = inbuf[inbi ^ 0x1] (op) sbuf */ - ompi_3buff_op_reduce(op, inbuf[inbi ^ 0x1], sbuf, tmprecv, block_count, dtype); + ompi_3buff_op_reduce(op, inbuf[inbi ^ 0x1], tmpsbuf, tmprecv, block_count, dtype); } else { /* tmprecv = inbuf[inbi ^ 0x1] (op) tmprecv */ ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); @@ -558,7 +622,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, ((ptrdiff_t)recv_from * early_segcount) : ((ptrdiff_t)recv_from * late_segcount + split_rank)); block_count = ((recv_from < split_rank)? early_segcount : late_segcount); - tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; + tmprecv = ((char*)recvbuf) + (ptrdiff_t)block_offset * extent; ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype); /* Distribution loop - variation of ring allgather */ @@ -578,8 +642,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, block_count = ((send_data_from < split_rank)? early_segcount : late_segcount); - tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent; - tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent; + tmprecv = (char*)recvbuf + (ptrdiff_t)recv_block_offset * extent; + tmpsend = (char*)recvbuf + (ptrdiff_t)send_block_offset * extent; ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to, MCA_COLL_BASE_TAG_ALLREDUCE, @@ -587,11 +651,14 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE, rank); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl;} - } - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); + ompi_coll_base_free_tmpbuf(inbuf[0], op_dev, module); + if (recvbuf != rbuf) { + /* copy to final rbuf and release temporary recvbuf */ + ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)recvbuf); + ompi_coll_base_free_tmpbuf(recvbuf, op_dev, module); + } return MPI_SUCCESS; @@ -600,8 +667,12 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); + ompi_coll_base_free_tmpbuf(inbuf[0], op_dev, module); + if (NULL != recvbuf && recvbuf != rbuf) { + /* copy to final rbuf and release temporary recvbuf */ + ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)recvbuf); + ompi_coll_base_free_tmpbuf(recvbuf, op_dev, module); + } return ret; } @@ -756,10 +827,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); /* Allocate and initialize temporary buffers */ - inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, &inbuf_dev[0], module); + inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, count, dtype, &inbuf_dev[0], module); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, dtype, 
&inbuf_dev[1], module); + inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, count, dtype, &inbuf_dev[1], module); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } @@ -1044,7 +1115,6 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( mca_coll_base_module_t *module) { int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL; - int tmp_buf_dev = -1; int comm_size = ompi_comm_size(comm); int rank = ompi_comm_rank(comm); @@ -1073,13 +1143,29 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( ompi_datatype_get_extent(dtype, &lb, &extent); dsize = opal_datatype_span(&dtype->super, count, &gap); + /* get the device for sbuf and rbuf and where the op would like to execute */ + int sendbuf_dev, recvbuf_dev, op_dev; + ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); + /* Temporary buffer for receiving messages */ char *tmp_buf = NULL; - char *tmp_buf_raw = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, dsize, op, dtype, &tmp_buf_dev, module); + char *tmp_buf_raw = ompi_coll_base_allocate_on_device(op_dev, dsize, module); if (NULL == tmp_buf_raw) return OMPI_ERR_OUT_OF_RESOURCE; tmp_buf = tmp_buf_raw - gap; + char *recvbuf = rbuf; + if (op_dev != recvbuf_dev) { + recvbuf = ompi_coll_base_allocate_on_device(op_dev, dsize, module); + } + if (op_dev != sendbuf_dev && sbuf != MPI_IN_PLACE) { + /* move the data into the recvbuf and set sbuf to MPI_IN_PLACE */ + ompi_datatype_copy_content_same_ddt(dtype, count, (char*)recvbuf, (char*)sbuf); + sbuf = MPI_IN_PLACE; + } + + //printf("redscat: count %d sbuf %p dev %d recvbuf %p dev %d tmp_buf %p dev %d\n", count, sbuf, sendbuf_dev, recvbuf, recvbuf_dev, tmp_buf_raw, op_dev); + /* * Step 1. Reduce the number of processes to the nearest lower power of two * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes. 
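Step 1 of the reduce-scatter/allgather algorithm shrinks the group to the largest power of two p' = 2^{floor(log2 p)} and removes r = p - p' processes, as the comment above states; the following hunks use nprocs_rem = comm_size - nprocs_pof2 and pair up the first 2r ranks. A standalone sketch of that computation (the helper name lower_power_of_two is only for this illustration; elsewhere in the patch the same value is derived from opal_next_poweroftwo followed by a right shift):

#include <stdio.h>

/* Largest power of two not exceeding p. */
static int lower_power_of_two(int p)
{
    int pof2 = 1;
    while (2 * pof2 <= p) {
        pof2 *= 2;
    }
    return pof2;
}

int main(void)
{
    for (int p = 2; p <= 9; ++p) {
        int pof2 = lower_power_of_two(p);
        int rem  = p - pof2;
        if (rem > 0) {
            printf("p=%d  p'=%d  r=%d  (first %d ranks pair up; one of each pair drops out)\n",
                   p, pof2, rem, 2 * rem);
        } else {
            printf("p=%d  p'=%d  r=0  (already a power of two)\n", p, pof2);
        }
    }
    return 0;
}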
@@ -1100,15 +1186,17 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( int vrank, step, wsize; int nprocs_rem = comm_size - nprocs_pof2; - opal_accelerator_stream_t *stream; - opal_accelerator.get_default_stream(tmp_buf_dev, &stream); + opal_accelerator_stream_t *stream = NULL; + if (op_dev >= 0) { + opal_accelerator.get_default_stream(op_dev, &stream); + } if (rank < 2 * nprocs_rem) { int count_lhalf = count / 2; int count_rhalf = count - count_lhalf; const void *send_buf = sbuf; if (MPI_IN_PLACE == sbuf) { - send_buf = rbuf; + send_buf = recvbuf; } if (rank % 2 != 0) { @@ -1127,21 +1215,21 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* Reduce on the right half of the buffers (result in rbuf) * We're not using a stream here, the reduction will make sure that the result is available upon return */ - if (MPI_IN_PLACE == sbuf) { + if (MPI_IN_PLACE != sbuf) { /* rbuf = sbuf (op) tmp_buf */ ompi_3buff_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, (char *)sbuf + (ptrdiff_t)count_lhalf * extent, - (char *)rbuf + count_lhalf * extent, + (char *)recvbuf + count_lhalf * extent, count_rhalf, dtype); } else { /* rbuf = rbuf (op) tmp_buf */ ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, - (char *)rbuf + count_lhalf * extent, count_rhalf, dtype); + (char *)recvbuf + count_lhalf * extent, count_rhalf, dtype); } /* Send the right half to the left neighbor */ - err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + err = MCA_PML_CALL(send((char *)recvbuf + (ptrdiff_t)count_lhalf * extent, count_rhalf, dtype, rank - 1, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); @@ -1165,25 +1253,27 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Reduce on the right half of the buffers (result in rbuf) */ - if (MPI_IN_PLACE == sbuf) { + if (MPI_IN_PLACE != sbuf) { /* rbuf = sbuf (op) tmp_buf */ - ompi_3buff_op_reduce_stream(op, sbuf, tmp_buf, rbuf, count_lhalf, dtype, stream); + ompi_3buff_op_reduce_stream(op, sbuf, tmp_buf, recvbuf, count_lhalf, dtype, stream); } else { /* rbuf = rbuf (op) tmp_buf */ - ompi_op_reduce_stream(op, tmp_buf, rbuf, count_lhalf, dtype, stream); + ompi_op_reduce_stream(op, tmp_buf, recvbuf, count_lhalf, dtype, stream); } /* Recv the right half from the right neighbor */ - err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + err = MCA_PML_CALL(recv((char *)recvbuf + (ptrdiff_t)count_lhalf * extent, count_rhalf, dtype, rank + 1, MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* wait for the op to complete */ - opal_accelerator.wait_stream(stream); + if (NULL != stream) { + opal_accelerator.wait_stream(stream); + } vrank = rank / 2; } @@ -1249,7 +1339,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( } /* Send part of data from the rbuf, recv into the tmp_buf */ - err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)sindex[step] * extent, + err = ompi_coll_base_sendrecv((char *)recvbuf + (ptrdiff_t)sindex[step] * extent, scount[step], dtype, dest, MCA_COLL_BASE_TAG_ALLREDUCE, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, @@ -1260,7 +1350,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* Local reduce: rbuf[] = tmp_buf[] rbuf[] */ ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, - (char *)rbuf + (ptrdiff_t)rindex[step] * extent, + (char *)recvbuf + (ptrdiff_t)rindex[step] * extent, rcount[step], dtype); 
/* Move the current window to the received message */ @@ -1295,10 +1385,10 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( * Send rcount[step] elements from rbuf[rindex[step]...] * Recv scount[step] elements to rbuf[sindex[step]...] */ - err = ompi_coll_base_sendrecv((char *)rbuf + (ptrdiff_t)rindex[step] * extent, + err = ompi_coll_base_sendrecv((char *)recvbuf + (ptrdiff_t)rindex[step] * extent, rcount[step], dtype, dest, MCA_COLL_BASE_TAG_ALLREDUCE, - (char *)rbuf + (ptrdiff_t)sindex[step] * extent, + (char *)recvbuf + (ptrdiff_t)sindex[step] * extent, scount[step], dtype, dest, MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE, rank); @@ -1310,6 +1400,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* * Step 4. Send total result to excluded odd ranks. */ + bool recvbuf_need_copy = true; if (rank < 2 * nprocs_rem) { if (rank % 2 != 0) { /* Odd process -- recv result from rank - 1 */ @@ -1317,19 +1408,28 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (OMPI_SUCCESS != err) { goto cleanup_and_return; } + recvbuf_need_copy = false; } else { /* Even process -- send result to rank + 1 */ - err = MCA_PML_CALL(send(rbuf, count, dtype, rank + 1, + err = MCA_PML_CALL(send(recvbuf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } } } + if (recvbuf != rbuf) { + /* copy into final rbuf */ + if (recvbuf_need_copy) { + ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)recvbuf); + } + ompi_coll_base_free_tmpbuf(recvbuf, op_dev, module); + } + cleanup_and_return: - ompi_coll_base_free_tmpbuf(tmp_buf_raw, tmp_buf_dev, module); + ompi_coll_base_free_tmpbuf(tmp_buf_raw, op_dev, module); if (NULL != rindex) free(rindex); if (NULL != sindex) diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index e6b0e74f958..d18770f3a13 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -606,7 +606,7 @@ const char* mca_coll_base_colltype_to_str(int collid) static void* ompi_coll_base_device_allocate_cb(void *ctx, size_t *size) { int dev_id = (intptr_t)ctx; - void *ptr; + void *ptr = NULL; opal_accelerator.mem_alloc(dev_id, &ptr, *size); return ptr; } @@ -620,6 +620,10 @@ void *ompi_coll_base_allocate_on_device(int device, size_t size, mca_coll_base_module_t *module) { mca_allocator_base_module_t *allocator_module; + if (device < 0) { + return malloc(size); + } + if (NULL == module->base_data->device_allocators) { int num_dev; opal_accelerator.num_devices(&num_dev); @@ -643,7 +647,11 @@ void *ompi_coll_base_allocate_on_device(int device, size_t size, void ompi_coll_base_free_on_device(int device, void *ptr, mca_coll_base_module_t *module) { mca_allocator_base_module_t *allocator_module; - assert(NULL != module->base_data->device_allocators); - allocator_module = module->base_data->device_allocators[device]; - allocator_module->alc_free(allocator_module, ptr); + if (device < 0) { + free(ptr); + } else { + assert(NULL != module->base_data->device_allocators); + allocator_module = module->base_data->device_allocators[device]; + allocator_module->alc_free(allocator_module, ptr); + } } \ No newline at end of file diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 25c5d79d143..10d4cfcba17 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -212,6 +212,27 @@ void 
*ompi_coll_base_allocate_on_device(int device, size_t size, void ompi_coll_base_free_on_device(int device, void *ptr, mca_coll_base_module_t *module); + +static inline +void ompi_coll_base_select_device( + struct ompi_op_t *op, + const void *sendbuf, + const void *recvbuf, + size_t count, + struct ompi_datatype_t *dtype, + int *sendbuf_device, + int *recvbuf_device, + int *op_device) +{ + uint64_t sendbuf_flags, recvbuf_flags; + /* TODO: move this into ompi_op_select_device to save the extra lookups? */ + *recvbuf_device = -1; + *sendbuf_device = -1; + if (sendbuf != NULL && sendbuf != MPI_IN_PLACE) opal_accelerator.check_addr(sendbuf, sendbuf_device, &sendbuf_flags); + if (recvbuf != NULL) opal_accelerator.check_addr(recvbuf, recvbuf_device, &recvbuf_flags); + ompi_op_preferred_device(op, *recvbuf_device, *sendbuf_device, count, dtype, op_device); +} + /** * Returns a pointer to memory in the same memory domain as the receive or send buffer. * Device memory is allocated if either the receive buffer or the send buffer are @@ -221,12 +242,22 @@ void ompi_coll_base_free_on_device(int device, void *ptr, mca_coll_base_module_t static inline void* ompi_coll_base_allocate_op_tmpbuf( const void *sendbuf, const void *recvbuf, size_t size, - const struct ompi_op_t *op, const struct ompi_datatype_t *dtype, + struct ompi_op_t *op, size_t count, struct ompi_datatype_t *dtype, int *device, mca_coll_base_module_t *module) { void *res = NULL; uint64_t flags; *device = -1; + + ompi_op_select_device(op, sendbuf, recvbuf, count, dtype, device); + if (*device > -1) { + res = ompi_coll_base_allocate_on_device(*device, size, module); + if (NULL == res) { + // fallback to host + *device = -1; + } + } +#if 0 if ((NULL == op && NULL == dtype) || ompi_op_supports_device(op, dtype)) { /* if the recvbuf is on the device we take that device */ if (NULL != recvbuf && 0 < opal_accelerator.check_addr(recvbuf, device, &flags)) { @@ -246,6 +277,7 @@ void* ompi_coll_base_allocate_op_tmpbuf( } } } +#endif // 0 if (NULL == res) { res = malloc(size); @@ -253,6 +285,7 @@ void* ompi_coll_base_allocate_op_tmpbuf( return res; } +#if 0 /** * Like ompi_coll_base_allocate_op_tmpbuf but without checking op-datatype * device compatibility. @@ -264,7 +297,7 @@ void* ompi_coll_base_allocate_tmpbuf( { return ompi_coll_base_allocate_op_tmpbuf(sendbuf, recvbuf, size, NULL, NULL, device, module); } - +#endif // 0 /** * Frees memory allocated through ompi_coll_base_allocate_op_tmpbuf * or ompi_coll_base_allocate_tmpbuf. 
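The op.h changes in the next hunk add ompi_op_preferred_device(), a first cut at the element-count heuristic named in the commit message: it compares a host cost of 1.0 per element with no startup against a device cost of a 10000.0 startup plus 0.0001 per element (the startup constant is explicitly marked as a placeholder to be filled in later). With those numbers the device path is chosen once the count exceeds roughly 10,000 elements. A standalone sketch of that crossover check (prefer_device is an illustrative name, not the library function):

#include <stdio.h>

/* Evaluate the placeholder cost model from ompi_op_preferred_device. */
static int prefer_device(size_t count)
{
    double host_cost   = 0.0     + 1.0    * (double)count;   /* host: no startup, 1.0 per element */
    double device_cost = 10000.0 + 0.0001 * (double)count;   /* device: large startup, cheap per element */
    return host_cost > device_cost;
}

int main(void)
{
    size_t counts[] = { 1000, 10000, 10002, 100000 };
    for (size_t i = 0; i < sizeof(counts) / sizeof(counts[0]); ++i) {
        printf("count=%6zu -> %s\n", counts[i],
               prefer_device(counts[i]) ? "device" : "host");
    }
    return 0;
}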
diff --git a/ompi/op/op.h b/ompi/op/op.h index 566f18bc01d..9fb0b66203a 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -627,7 +627,7 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, (source_check_addr == 0 || (source_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { /* nothing to be done, we won't need device-capable ops */ } else { - fprintf(stderr, "3buff op: no suitable op module found for device memory!\n"); + fprintf(stderr, "op: no suitable op %s module for type %s found for device memory!\n", op->o_name, dtype->name); abort(); } } @@ -810,15 +810,43 @@ static inline void ompi_3buff_op_reduce(ompi_op_t * op, void *source1, tgt = target; if (OPAL_LIKELY(ompi_op_is_intrinsic (op))) { + ompi_3buff_op_reduce_stream(op, source1, source2, target, count, dtype, NULL); +#if 0 op->o_3buff_intrinsic.fns[ompi_op_ddt_map[dtype->id]](src1, src2, tgt, &count, &dtype, op->o_3buff_intrinsic.modules[ompi_op_ddt_map[dtype->id]]); +#endif // 0 } else { ompi_3buff_op_user (op, src1, src2, tgt, count, dtype); } } +static inline void ompi_op_preferred_device(ompi_op_t *op, int source_dev, + int target_dev, size_t count, + ompi_datatype_t *dtype, int *op_device) +{ + /* default to host */ + *op_device = -1; + if (!ompi_op_is_intrinsic (op)) { + return; + } + /* quick check: can we execute on both sides? */ + int dtype_id = ompi_op_ddt_map[dtype->id]; + if (NULL == op->o_device_op || NULL == op->o_device_op->do_intrinsic.fns[dtype_id]) { + /* not available on the gpu, must select host */ + return; + } + + double host_startup_cost = 0.0; // host has no startup cost + double host_compute_cost = 1.0*count; // host reference 1.0 per element + double device_startup_cost = 10000.0; // to be filled below + double device_compute_cost = 0.0001*count; + + if ((host_startup_cost + host_compute_cost) > (device_startup_cost + device_compute_cost)) { + *op_device = (target_dev >= 0) ? target_dev : source_dev; + } +} //#if 0 /** @@ -877,6 +905,8 @@ static inline void ompi_op_select_device(ompi_op_t *op, const void *source, } /* select a device, or remain on the host */ + //printf("ompi_op_select_device: host startup %f host compute %f device startup %f device compute %f\n", + // host_startup_cost, host_compute_cost, device_startup_cost, device_compute_cost); if ((host_startup_cost + host_compute_cost) > (device_startup_cost + device_compute_cost)) { *device = (target_dev_id >= 0) ? 
target_dev_id : source_dev_id; } From cd7e578de62108486c40ee55f7142ff698d01102 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 1 May 2023 22:52:05 -0400 Subject: [PATCH 26/74] init op_rocm, not compilable yet Signed-off-by: Phuong Nguyen --- ompi/mca/op/rocm/Makefile.am | 83 ++ ompi/mca/op/rocm/op_rocm.h | 82 ++ ompi/mca/op/rocm/op_rocm_component.c | 189 +++ ompi/mca/op/rocm/op_rocm_functions.c | 1796 ++++++++++++++++++++++++++ ompi/mca/op/rocm/op_rocm_impl.c | 1024 +++++++++++++++ ompi/mca/op/rocm/op_rocm_impl.h | 899 +++++++++++++ 6 files changed, 4073 insertions(+) create mode 100644 ompi/mca/op/rocm/Makefile.am create mode 100644 ompi/mca/op/rocm/op_rocm.h create mode 100644 ompi/mca/op/rocm/op_rocm_component.c create mode 100644 ompi/mca/op/rocm/op_rocm_functions.c create mode 100644 ompi/mca/op/rocm/op_rocm_impl.c create mode 100644 ompi/mca/op/rocm/op_rocm_impl.h diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am new file mode 100644 index 00000000000..61b31b2ee5d --- /dev/null +++ b/ompi/mca/op/rocm/Makefile.am @@ -0,0 +1,83 @@ +# +# Copyright (c) 2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This component provides support for offloading reduce ops to CUDA devices. +# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +AM_CPPFLAGS = $(common_rocm_CPPFLAGS) + +sources = op_rocm_component.c op_rocm.h op_rocm_functions.c op_rocm_impl.h +#sources_extended = op_rocm_functions.cu +rocm_sources = op_rocm_impl.c + +NVCC = nvcc -allow-unsupported-compiler + +.c.l$(OBJEXT): + $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(NVCC) -prefer-non-pic $(NVCCFLAGS) -Wc,-Xcompiler,-fPIC,-g -c $< + +# -o $($@.o:.lo) + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. + +if MCA_BUILD_ompi_op_rocm_DSO +component_install = mca_op_rocm.la +else +component_install = +component_noinst = libmca_op_rocm.la +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +#CUDADIR=/nfs/apps/spacks/2023-01-01/opt/spack/linux-centos7-x86_64/gcc-9.5.0/rocm-11.8.0-u2modnncfevx54ibr5dy27sxkirwsf7f + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_rocm_la_SOURCES = $(sources) +mca_op_rocm_la_LIBADD = $(rocm_sources:.c=.lo) +mca_op_rocm_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(accelerator_rocm_LIBS) -L$(CUDADIR)/lib64 -lrocmrt +EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) + +# Specific information for static builds. 
+# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. + +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_rocm_la_SOURCES = $(sources) +libmca_op_rocm_la_LIBADD = $(rocm_sources:.c=.lo) +libmca_op_rocm_la_LDFLAGS = -module -avoid-version\ + $(accelerator_rocm_LIBS) -L$(CUDADIR)/lib64 -lrocmrt +EXTRA_libmca_op_rocm_la_SOURCES = $(rocm_sources) + diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h new file mode 100644 index 00000000000..0773a519aff --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_CUDA_EXPORT_H +#define MCA_OP_CUDA_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" +#include "ompi/runtime/mpiruntime.h" + +#include +#include + +BEGIN_C_DECLS + + +#define xstr(x) #x +#define str(x) xstr(x) + +#define CHECK(fn, args) \ + do { \ + hipError_t err = fn args; \ + if (err != hipSuccess) { \ + fprintf(stderr, "%s:%d: %s failed at line: %s: %s\n", \ + __FILE__, __LINE__, str(fn), hipGetErrorName(err), \ + hipGetErrorString(err)); \ + ompi_mpi_abort(MPI_COMM_WORLD, 1); \ + } \ + } while (0) + + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. + */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + +#if 0 + /* a stream on which to schedule kernel calls */ + hipStream_t ro_stream; + hipCtx_t *ro_ctx; +#endif // 0 + int *ro_max_threads_per_block; + hipDevice_t *ro_devices; + int ro_num_devices; +} ompi_op_rocm_component_t; + +/** + * Globally exported variable. Note that it is a *rocm* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_rocm_component_t + mca_op_rocm_component; + +OMPI_DECLSPEC extern +ompi_op_base_stream_handler_fn_t ompi_op_rocm_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +OMPI_DECLSPEC extern +ompi_op_base_3buff_stream_handler_fn_t ompi_op_rocm_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; + +END_C_DECLS + +#endif /* MCA_OP_CUDA_EXPORT_H */ diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c new file mode 100644 index 00000000000..87439f63ed7 --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm_component.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2019-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2021 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "rocm" op component source code. 
+ * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/rocm/op_rocm.h" + +#include + +static int rocm_component_open(void); +static int rocm_component_close(void); +static int rocm_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + rocm_component_op_query(struct ompi_op_t *op, int *priority); +static int rocm_component_register(void); + +ompi_op_rocm_component_t mca_op_rocm_component = { + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "rocm", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = rocm_component_open, + .mca_close_component = rocm_component_close, + .mca_register_component_params = rocm_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = rocm_component_init_query, + .opc_op_query = rocm_component_op_query, + }, + .ro_max_threads_per_block = NULL, + .ro_devices = NULL, + .ro_num_devices = 0, +}; + +/* + * Component open + */ +static int rocm_component_open(void) +{ + /* We checked the flags during register, so if they are set to + * zero either the architecture is not suitable or the user disabled + * AVX support. + * + * A first level check to see what level of AVX is available on the + * hardware. + * + * Note that if this function returns non-OMPI_SUCCESS, then this + * component won't even be shown in ompi_info output (which is + * probably not what you want). + */ + printf("op rocm_component_open\n"); + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int rocm_component_close(void) +{ + if (mca_op_rocm_component.ro_num_devices > 0) { + //hipStreamDestroy(mca_op_rocm_component.ro_stream); + free(mca_op_rocm_component.ro_max_threads_per_block); + mca_op_rocm_component.ro_max_threads_per_block = NULL; + free(mca_op_rocm_component.ro_devices); + mca_op_rocm_component.ro_devices = NULL; + mca_op_rocm_component.ro_num_devices = 0; + } + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int +rocm_component_register(void) +{ + /* TODO: add mca paramters */ + + return OMPI_SUCCESS; +} + + +/* + * Query whether this component wants to be used in this process. 
+ */ +static int +rocm_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + int num_devices; + int rc; + int prio_lo, prio_hi; + //memset(&mca_op_rocm_component, 0, sizeof(mca_op_rocm_component)); + hipInit(0); + CHECK(hipGetDeviceCount, (&num_devices)); + mca_op_rocm_component.ro_num_devices = num_devices; + mca_op_rocm_component.ro_devices = (hipDevice_t*)malloc(num_devices*sizeof(hipDevice_t)); +#if 0 + mca_op_rocm_component.ro_ctx = (hipCtx_t*)malloc(num_devices*sizeof(hipCtx_t)); +#endif // 0 + mca_op_rocm_component.ro_max_threads_per_block = (int*)malloc(num_devices*sizeof(int)); + for (int i = 0; i < num_devices; ++i) { + CHECK(hipDeviceGet, (&mca_op_rocm_component.ro_devices[i], i)); +#if 0 + rc = hipCtxCreate(&mca_op_rocm_component.ro_ctx[i], + 0, mca_op_rocm_component.ro_devices[i]); + if (hipSuccess != rc) { + CHECK(hipDevicePrimaryCtxRetain, + (&mca_op_rocm_component.ro_ctx[i], mca_op_rocm_component.ro_devices[i])); + } +#endif // 0 + rc = hipDeviceGetAttribute(&mca_op_rocm_component.ro_max_threads_per_block[i], + hipDeviceAttributeMaxThreadsPerBlock, + mca_op_rocm_component.ro_devices[i]); + if (hipSuccess != rc) { + /* fall-back to value that should work on every device */ + mca_op_rocm_component.ro_max_threads_per_block[i] = 512; + } + } + +#if 0 + /* try to create a high-priority stream */ + rc = hipDeviceGetStreamPriorityRange(&prio_lo, &prio_hi); + if (hipSuccess != rc) { + hipStreamCreateWithPriority(&mca_op_rocm_component.ro_stream, hipStreamNonBlocking, prio_hi); + } else { + mca_op_rocm_component.ro_stream = 0; + } +#endif // 0 + printf("op rocm_component_init_query\n"); + return OMPI_SUCCESS; +} + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t* +rocm_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = NULL; + + module = OBJ_NEW(ompi_op_base_module_t); + module->opm_device_enabled = true; + for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = ompi_op_rocm_functions[op->o_f_to_c_index][i]; + module->opm_3buff_fns[i] = ompi_op_rocm_3buff_functions[op->o_f_to_c_index][i]; + + if( NULL != module->opm_fns[i] ) { + OBJ_RETAIN(module); + } + if( NULL != module->opm_3buff_fns[i] ) { + OBJ_RETAIN(module); + } + } + *priority = 50; + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/rocm/op_rocm_functions.c b/ompi/mca/op/rocm/op_rocm_functions.c new file mode 100644 index 00000000000..717edb94094 --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm_functions.c @@ -0,0 +1,1796 @@ +/* + * Copyright (c) 2019-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/rocm/op_rocm.h" +#include "opal/mca/accelerator/accelerator.h" + +#include "ompi/mca/op/rocm/op_rocm.h" +#include "ompi/mca/op/rocm/op_rocm_impl.h" + + +static inline void device_op_pre(const void *orig_source1, + void **source1, + int *source1_device, + const void *orig_source2, + void **source2, + int *source2_device, + void *orig_target, + void **target, + int *target_device, + int count, + struct ompi_datatype_t *dtype, + int *threads_per_block, + int *device, + opal_accelerator_stream_t *stream) +{ + uint64_t target_flags = -1, source1_flags = -1, source2_flags = -1; + int target_rc, source1_rc, source2_rc = -1; + + *target = orig_target; + *source1 = (void*)orig_source1; + if (NULL != orig_source2) { + *source2 = (void*)orig_source2; + } + + target_rc = opal_accelerator.check_addr(*target, target_device, &target_flags); + source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); + *device = *target_device; + + if (NULL != orig_source2) { + source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); + //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", + // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, *source2_device, *device); + } + + //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", + // target_rc, *target_device, source_rc, *source_device, *device); + + if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { + /* no buffers are on any device, select device 0 */ + *device = 0; + } else if (*target_device == -1) { + if (*source1_device == -1 && NULL != orig_source2) { + *device = *source2_device; + } else { + *device = *source1_device; + } + } + + if (0 == target_rc || 0 == source1_rc || *target_device != *source1_device) { + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + if (0 == target_rc) { + // allocate memory on the device for the target buffer + //printf("copying target from device %d to host\n", *target_device); + opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*target, orig_target, nbytes, *(hipStream_t*)stream->stream)); + *target_device = -1; // mark target device as host + } + + if (0 == source1_rc || *device != *source1_device) { + // allocate memory on the device for the source buffer + //printf("allocating source on device %d\n", *device); + opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); + if (0 == source1_rc) { + /* copy from host to device */ + //printf("copying source from host to device %d\n", *device); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source1, orig_source1, nbytes, *(hipStream_t*)stream->stream)); + } else { + /* copy from one device to another device */ + /* TODO: does this actually work? Can we enable P2P? 
*/ + //printf("attempting cross-device copy for source\n"); + CHECK(hipMemcpyDtoDAsync, ((hipDeviceptr_t)*source1, (hipDeviceptr_t)orig_source1, nbytes, *(hipStream_t*)stream->stream)); + } + } + + } + if (NULL != source2_device && *target_device != *source2_device) { + // allocate memory on the device for the source buffer + //printf("allocating source on device %d\n", *device); + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + opal_accelerator.mem_alloc_stream(*device, source2, nbytes, stream); + if (0 == source2_rc) { + /* copy from host to device */ + //printf("copying source from host to device %d\n", *device); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source2, orig_source2, nbytes, *(hipStream_t*)stream->stream)); + } else { + /* copy from one device to another device */ + /* TODO: does this actually work? Can we enable P2P? */ + //printf("attempting cross-device copy for source\n"); + CHECK(hipMemcpyDtoDAsync, ((hipDeviceptr_t)*source2, (hipDeviceptr_t)orig_source2, nbytes, *(hipStream_t*)stream->stream)); + } + } + *threads_per_block = mca_op_rocm_component.ro_max_threads_per_block[*device]; +} + +static inline void device_op_post(void *source1, + int source1_device, + void *source2, + int source2_device, + void *orig_target, + void *target, + int target_device, + int count, + struct ompi_datatype_t *dtype, + int device, + opal_accelerator_stream_t *stream) +{ + if (-1 == target_device) { + + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + CHECK(hipMemcpyDtoHAsync, (orig_target, (hipDeviceptr_t)target, nbytes, *(hipStream_t *)stream->stream)); + } + + if (-1 == target_device) { + opal_accelerator.mem_release_stream(device, target, stream); + //CHECK(hipFreeAsync, ((hipDeviceptr_t)target, mca_op_rocm_component.ro_stream)); + } + if (source1_device != device) { + opal_accelerator.mem_release_stream(device, source1, stream); + //CHECK(hipFreeAsync, ((hipDeviceptr_t)source, mca_op_rocm_component.ro_stream)); + } + if (NULL != source2 && source2_device != device) { + opal_accelerator.mem_release_stream(device, source2, stream); + //CHECK(hipFreeAsync, ((hipDeviceptr_t)source, mca_op_rocm_component.ro_stream)); + } +} + +#define FUNC(name, type_name, type) \ + static \ + void ompi_op_rocm_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + int threads_per_block; \ + int source_device, target_device, device; \ + type *source, *target; \ + int n = *count; \ + device_op_pre(in, (void**)&source, &source_device, NULL, NULL, NULL, \ + inout, (void**)&target, &target_device, \ + n, *dtype, \ + &threads_per_block, &device, stream); \ + hipStream_t *custream = (hipStream_t*)stream->stream; \ + ompi_op_rocm_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, *custream); \ + device_op_post(source, source_device, NULL, -1, inout, target, target_device, n, *dtype, device, stream);\ + } + +#define OP_FUNC(name, type_name, type, op, ...) FUNC(name, __VA_ARGS__##type_name, __VA_ARGS__##type) + +/* reuse the macro above, no work is actually done so we don't care about the func */ +#define FUNC_FUNC(name, type_name, type, ...) FUNC(name, __VA_ARGS__##type_name, __VA_ARGS__##type) + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. 
The core operation + * in all functions that use this macro is the same. + * + * This macro is for minloc and maxloc + */ +#define LOC_FUNC(name, type_name, op) FUNC(name, type_name, ompi_op_predefined_##type_name##_t) + +/* Dispatch Fortran types to C types */ +#define FORT_INT_FUNC(name, type_name, type) \ + static \ + void ompi_op_rocm_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + \ + _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ + switch(sizeof(type)) { \ + case sizeof(int8_t): \ + ompi_op_rocm_2buff_##name##_int8_t(in, inout, count, dtype, stream, module); \ + break; \ + case sizeof(int16_t): \ + ompi_op_rocm_2buff_##name##_int16_t(in, inout, count, dtype, stream, module); \ + break; \ + case sizeof(int32_t): \ + ompi_op_rocm_2buff_##name##_int32_t(in, inout, count, dtype, stream, module); \ + break; \ + case sizeof(int64_t): \ + ompi_op_rocm_2buff_##name##_int64_t(in, inout, count, dtype, stream, module); \ + break; \ + } \ + } + +/* Dispatch Fortran types to C types */ +#define FORT_FLOAT_FUNC(name, type_name, type) \ + static \ + void ompi_op_rocm_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ + switch(sizeof(type)) { \ + case sizeof(float): \ + ompi_op_rocm_2buff_##name##_float(in, inout, count, dtype, stream, module); \ + break; \ + case sizeof(double): \ + ompi_op_rocm_2buff_##name##_double(in, inout, count, dtype, stream, module); \ + break; \ + case sizeof(long double): \ + ompi_op_rocm_2buff_##name##_long_double(in, inout, count, dtype, stream, module); \ + break; \ + } \ + } + +/************************************************************************* + * Max + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC(max, int8_t, int8_t) +FUNC_FUNC(max, uint8_t, uint8_t) +FUNC_FUNC(max, int16_t, int16_t) +FUNC_FUNC(max, uint16_t, uint16_t) +FUNC_FUNC(max, int32_t, int32_t) +FUNC_FUNC(max, uint32_t, uint32_t) +FUNC_FUNC(max, int64_t, int64_t) +FUNC_FUNC(max, uint64_t, uint64_t) +FUNC_FUNC(max, long, long) +FUNC_FUNC(max, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC(max, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC(max, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC(max, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC(max, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC(max, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC(max, fortran_integer16, ompi_fortran_integer16_t) +#endif + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC(max, short_float, opal_short_float_t) +#endif +#endif // 0 +FUNC_FUNC(max, float, float) +FUNC_FUNC(max, double, double) +FUNC_FUNC(max, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC(max, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC(max, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC(max, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC(max, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC(min, int8_t, int8_t) +FUNC_FUNC(min, uint8_t, uint8_t) +FUNC_FUNC(min, int16_t, int16_t) +FUNC_FUNC(min, uint16_t, uint16_t) +FUNC_FUNC(min, int32_t, int32_t) +FUNC_FUNC(min, uint32_t, uint32_t) +FUNC_FUNC(min, int64_t, int64_t) +FUNC_FUNC(min, uint64_t, uint64_t) +FUNC_FUNC(min, long, long) +FUNC_FUNC(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC(min, fortran_integer16, ompi_fortran_integer16_t) +#endif + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC(min, short_float, opal_short_float_t) +#endif +#endif // 0 + +FUNC_FUNC(min, float, float) +FUNC_FUNC(min, double, double) +FUNC_FUNC(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC(sum, int8_t, int8_t, +=) +OP_FUNC(sum, uint8_t, uint8_t, +=) +OP_FUNC(sum, int16_t, int16_t, +=) +OP_FUNC(sum, uint16_t, uint16_t, +=) +OP_FUNC(sum, int32_t, int32_t, +=) +OP_FUNC(sum, uint32_t, uint32_t, +=) +OP_FUNC(sum, int64_t, int64_t, +=) +OP_FUNC(sum, uint64_t, uint64_t, +=) +OP_FUNC(sum, long, long, +=) +OP_FUNC(sum, unsigned_long, unsigned long, +=) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC(sum, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC(sum, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC(sum, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC(sum, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC(sum, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC(sum, fortran_integer16, ompi_fortran_integer16_t) +#endif + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC(sum, short_float, short float, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC(sum, short_float, opal_short_float_t, +=) +#endif +#endif // 0 + +OP_FUNC(sum, float, float, +=) +OP_FUNC(sum, double, double, +=) +OP_FUNC(sum, long_double, long double, +=) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC(sum, 
fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC(sum, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC(sum, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC(sum, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC(sum, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC(sum, fortran_real16, ompi_fortran_real16_t) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC(sum, c_float_complex, float _Complex, +=) +OP_FUNC(sum, c_double_complex, double _Complex, +=) +OP_FUNC(sum, c_long_double_complex, long double _Complex, +=) +#endif // 0 + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC(prod, int8_t, int8_t, *=) +OP_FUNC(prod, uint8_t, uint8_t, *=) +OP_FUNC(prod, int16_t, int16_t, *=) +OP_FUNC(prod, uint16_t, uint16_t, *=) +OP_FUNC(prod, int32_t, int32_t, *=) +OP_FUNC(prod, uint32_t, uint32_t, *=) +OP_FUNC(prod, int64_t, int64_t, *=) +OP_FUNC(prod, uint64_t, uint64_t, *=) +OP_FUNC(prod, long, long, *=) +OP_FUNC(prod, unsigned_long, unsigned long, *=) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC(prod, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC(prod, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC(prod, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC(prod, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC(prod, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC(prod, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ + +#if 0 +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC(prod, short_float, short float, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC(prod, short_float, opal_short_float_t, *=) +#endif +#endif // 0 + +OP_FUNC(prod, float, float, *=) +OP_FUNC(prod, double, double, *=) +OP_FUNC(prod, long_double, long double, *=) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC(prod, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC(prod, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC(prod, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC(prod, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC(prod, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC(prod, fortran_real16, ompi_fortran_real16_t) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(prod, c_short_float_complex, short float _Complex, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC(prod, c_float_complex, float _Complex, *=) +OP_FUNC(prod, c_double_complex, double _Complex, *=) 
+OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC(land, int8_t, int8_t) +FUNC_FUNC(land, uint8_t, uint8_t) +FUNC_FUNC(land, int16_t, int16_t) +FUNC_FUNC(land, uint16_t, uint16_t) +FUNC_FUNC(land, int32_t, int32_t) +FUNC_FUNC(land, uint32_t, uint32_t) +FUNC_FUNC(land, int64_t, int64_t) +FUNC_FUNC(land, uint64_t, uint64_t) +FUNC_FUNC(land, long, long) +FUNC_FUNC(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FORT_INT_FUNC(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC(lor, int8_t, int8_t) +FUNC_FUNC(lor, uint8_t, uint8_t) +FUNC_FUNC(lor, int16_t, int16_t) +FUNC_FUNC(lor, uint16_t, uint16_t) +FUNC_FUNC(lor, int32_t, int32_t) +FUNC_FUNC(lor, uint32_t, uint32_t) +FUNC_FUNC(lor, int64_t, int64_t) +FUNC_FUNC(lor, uint64_t, uint64_t) +FUNC_FUNC(lor, long, long) +FUNC_FUNC(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FORT_INT_FUNC(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 
1: 0)) +/* C integer */ +FUNC_FUNC(lxor, int8_t, int8_t) +FUNC_FUNC(lxor, uint8_t, uint8_t) +FUNC_FUNC(lxor, int16_t, int16_t) +FUNC_FUNC(lxor, uint16_t, uint16_t) +FUNC_FUNC(lxor, int32_t, int32_t) +FUNC_FUNC(lxor, uint32_t, uint32_t) +FUNC_FUNC(lxor, int64_t, int64_t) +FUNC_FUNC(lxor, uint64_t, uint64_t) +FUNC_FUNC(lxor, long, long) +FUNC_FUNC(lxor, unsigned_long, unsigned long) + + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FORT_INT_FUNC(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC(band, int8_t, int8_t) +FUNC_FUNC(band, uint8_t, uint8_t) +FUNC_FUNC(band, int16_t, int16_t) +FUNC_FUNC(band, uint16_t, uint16_t) +FUNC_FUNC(band, int32_t, int32_t) +FUNC_FUNC(band, uint32_t, uint32_t) +FUNC_FUNC(band, int64_t, int64_t) +FUNC_FUNC(band, uint64_t, uint64_t) +FUNC_FUNC(band, long, long) +FUNC_FUNC(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC(bor, int8_t, int8_t) +FUNC_FUNC(bor, uint8_t, uint8_t) +FUNC_FUNC(bor, int16_t, int16_t) +FUNC_FUNC(bor, uint16_t, uint16_t) +FUNC_FUNC(bor, int32_t, int32_t) +FUNC_FUNC(bor, uint32_t, uint32_t) +FUNC_FUNC(bor, int64_t, int64_t) +FUNC_FUNC(bor, uint64_t, uint64_t) +FUNC_FUNC(bor, long, long) +FUNC_FUNC(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C integer */ +FUNC_FUNC(bxor, int8_t, int8_t) +FUNC_FUNC(bxor, uint8_t, uint8_t) +FUNC_FUNC(bxor, int16_t, int16_t) +FUNC_FUNC(bxor, uint16_t, uint16_t) +FUNC_FUNC(bxor, int32_t, 
int32_t) +FUNC_FUNC(bxor, uint32_t, uint32_t) +FUNC_FUNC(bxor, int64_t, int64_t) +FUNC_FUNC(bxor, uint64_t, uint64_t) +FUNC_FUNC(bxor, long, long) +FUNC_FUNC(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC(bxor, byte, char) + +/************************************************************************* + * Max location + *************************************************************************/ + +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC(maxloc, 2real, >) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC(maxloc, 2double_precision, >) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC(maxloc, 2integer, >) +#endif +#endif // 0 +LOC_FUNC(maxloc, float_int, >) +LOC_FUNC(maxloc, double_int, >) +LOC_FUNC(maxloc, long_int, >) +LOC_FUNC(maxloc, 2int, >) +LOC_FUNC(maxloc, short_int, >) +LOC_FUNC(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC(minloc, 2real, <) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC(minloc, 2double_precision, <) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC(minloc, 2integer, <) +#endif +#endif // 0 +LOC_FUNC(minloc, float_int, <) +LOC_FUNC(minloc, double_int, <) +LOC_FUNC(minloc, long_int, <) +LOC_FUNC(minloc, 2int, <) +LOC_FUNC(minloc, short_int, <) +LOC_FUNC(minloc, long_double_int, <) + + + +/* + * This is a three buffer (2 input and 1 output) version of the reduction + * routines, needed for some optimizations. + */ +#define FUNC_3BUF(name, type_name, type) \ + static \ + void ompi_op_rocm_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + int threads_per_block; \ + int source1_device, source2_device, target_device, device; \ + type *source1, *source2, *target; \ + int n = *count; \ + device_op_pre(in1, (void**)&source1, &source1_device, \ + in2, (void**)&source2, &source2_device, \ + out, (void**)&target, &target_device, \ + n, *dtype, \ + &threads_per_block, &device, stream); \ + hipStream_t *custream = (hipStream_t*)stream->stream; \ + ompi_op_rocm_3buff_##name##_##type_name##_submit(source1, source2, target, n, threads_per_block, *custream);\ + device_op_post(source1, source1_device, source2, source2_device, out, target, target_device, n, *dtype, device, stream);\ + } + + +#define OP_FUNC_3BUF(name, type_name, type, op, ...) FUNC_3BUF(name, __VA_ARGS__##type_name, __VA_ARGS__##type) + +/* reuse the macro above, no work is actually done so we don't care about the func */ +#define FUNC_FUNC_3BUF(name, type_name, type, ...) 
FUNC_3BUF(name, __VA_ARGS__##type_name, __VA_ARGS__##type) + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for minloc and maxloc + */ +#define LOC_FUNC_3BUF(name, type_name, op) FUNC_3BUF(name, type_name, ompi_op_predefined_##type_name##_t) + + +/* Dispatch Fortran types to C types */ +#define FORT_INT_FUNC_3BUF(name, type_name, type) \ + static \ + void ompi_op_rocm_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + \ + _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ + switch(sizeof(type)) { \ + case sizeof(int8_t): \ + ompi_op_rocm_3buff_##name##_int8_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(int16_t): \ + ompi_op_rocm_3buff_##name##_int16_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(int32_t): \ + ompi_op_rocm_3buff_##name##_int32_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(int64_t): \ + ompi_op_rocm_3buff_##name##_int64_t(in1, in2, out, count, dtype, stream, module); \ + break; \ + } \ + } + +/* Dispatch Fortran types to C types */ +#define FORT_FLOAT_FUNC_3BUF(name, type_name, type) \ + static \ + void ompi_op_rocm_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ + struct ompi_datatype_t **dtype, \ + opal_accelerator_stream_t *stream, \ + struct ompi_op_base_module_1_0_0_t *module) { \ + _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ + switch(sizeof(type)) { \ + case sizeof(float): \ + ompi_op_rocm_3buff_##name##_float(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(double): \ + ompi_op_rocm_3buff_##name##_double(in1, in2, out, count, dtype, stream, module); \ + break; \ + case sizeof(long double): \ + ompi_op_rocm_3buff_##name##_long_double(in1, in2, out, count, dtype, stream, module); \ + break; \ + } \ + } + + +/************************************************************************* + * Max + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF(max, int8_t, int8_t) +FUNC_FUNC_3BUF(max, uint8_t, uint8_t) +FUNC_FUNC_3BUF(max, int16_t, int16_t) +FUNC_FUNC_3BUF(max, uint16_t, uint16_t) +FUNC_FUNC_3BUF(max, int32_t, int32_t) +FUNC_FUNC_3BUF(max, uint32_t, uint32_t) +FUNC_FUNC_3BUF(max, int64_t, int64_t) +FUNC_FUNC_3BUF(max, uint64_t, uint64_t) +FUNC_FUNC_3BUF(max, long, long) +FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t) +#endif +/* Floating point */ +#if 0 +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF(max, short_float, opal_short_float_t) +#endif +#endif // 0 +FUNC_FUNC_3BUF(max, float, float) +FUNC_FUNC_3BUF(max, double, double) +FUNC_FUNC_3BUF(max, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF(min, int8_t, int8_t) +FUNC_FUNC_3BUF(min, uint8_t, uint8_t) +FUNC_FUNC_3BUF(min, int16_t, int16_t) +FUNC_FUNC_3BUF(min, uint16_t, uint16_t) +FUNC_FUNC_3BUF(min, int32_t, int32_t) +FUNC_FUNC_3BUF(min, uint32_t, uint32_t) +FUNC_FUNC_3BUF(min, int64_t, int64_t) +FUNC_FUNC_3BUF(min, uint64_t, uint64_t) +FUNC_FUNC_3BUF(min, long, long) +FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC_3BUF(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC_3BUF(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if 0 +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF(min, short_float, opal_short_float_t) +#endif +#endif // 0 +FUNC_FUNC_3BUF(min, float, float) +FUNC_FUNC_3BUF(min, double, double) +FUNC_FUNC_3BUF(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF(sum, int8_t, int8_t, +) +OP_FUNC_3BUF(sum, uint8_t, uint8_t, +) +OP_FUNC_3BUF(sum, int16_t, int16_t, +) +OP_FUNC_3BUF(sum, uint16_t, uint16_t, +) +OP_FUNC_3BUF(sum, int32_t, int32_t, +) +OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) +OP_FUNC_3BUF(sum, int64_t, int64_t, +) +OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) +OP_FUNC_3BUF(sum, long, long, +) +OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC_3BUF(sum, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if 0 +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF(sum, short_float, short float, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF(sum, short_float, 
opal_short_float_t, +) +#endif +#endif // 0 +OP_FUNC_3BUF(sum, float, float, +) +OP_FUNC_3BUF(sum, double, double, +) +OP_FUNC_3BUF(sum, long_double, long double, +) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF(sum, c_short_float_complex, short float _Complex, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF(sum, c_float_complex, float _Complex, +) +OP_FUNC_3BUF(sum, c_double_complex, double _Complex, +) +OP_FUNC_3BUF(sum, c_long_double_complex, long double _Complex, +) +#endif // 0 + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF(prod, int8_t, int8_t, *) +OP_FUNC_3BUF(prod, uint8_t, uint8_t, *) +OP_FUNC_3BUF(prod, int16_t, int16_t, *) +OP_FUNC_3BUF(prod, uint16_t, uint16_t, *) +OP_FUNC_3BUF(prod, int32_t, int32_t, *) +OP_FUNC_3BUF(prod, uint32_t, uint32_t, *) +OP_FUNC_3BUF(prod, int64_t, int64_t, *) +OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) +OP_FUNC_3BUF(prod, long, long, *) +OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if 0 +#if defined(HAVE_SHORT_FLOAT) +FORT_FLOAT_FUNC_3BUF(prod, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FORT_FLOAT_FUNC_3BUF(prod, short_float, opal_short_float_t) +#endif +#endif // 0 +OP_FUNC_3BUF(prod, float, float, *) +OP_FUNC_3BUF(prod, double, double, *) +OP_FUNC_3BUF(prod, long_double, long double, *) +#if OMPI_HAVE_FORTRAN_REAL +FORT_FLOAT_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FORT_FLOAT_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FORT_FLOAT_FUNC_3BUF(prod, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FORT_FLOAT_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FORT_FLOAT_FUNC_3BUF(prod, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FORT_FLOAT_FUNC_3BUF(prod, 
fortran_real16, ompi_fortran_real16_t) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) +OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) +OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC_3BUF(land, int8_t, int8_t) +FUNC_FUNC_3BUF(land, uint8_t, uint8_t) +FUNC_FUNC_3BUF(land, int16_t, int16_t) +FUNC_FUNC_3BUF(land, uint16_t, uint16_t) +FUNC_FUNC_3BUF(land, int32_t, int32_t) +FUNC_FUNC_3BUF(land, uint32_t, uint32_t) +FUNC_FUNC_3BUF(land, int64_t, int64_t) +FUNC_FUNC_3BUF(land, uint64_t, uint64_t) +FUNC_FUNC_3BUF(land, long, long) +FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FORT_INT_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC_3BUF(lor, int8_t, int8_t) +FUNC_FUNC_3BUF(lor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lor, int16_t, int16_t) +FUNC_FUNC_3BUF(lor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lor, int32_t, int32_t) +FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lor, int64_t, int64_t) +FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lor, long, long) +FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FORT_INT_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 
1: 0)) +/* C integer */ +FUNC_FUNC_3BUF(lxor, int8_t, int8_t) +FUNC_FUNC_3BUF(lxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lxor, int16_t, int16_t) +FUNC_FUNC_3BUF(lxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lxor, int32_t, int32_t) +FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lxor, int64_t, int64_t) +FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lxor, long, long) +FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FORT_INT_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC_3BUF(band, int8_t, int8_t) +FUNC_FUNC_3BUF(band, uint8_t, uint8_t) +FUNC_FUNC_3BUF(band, int16_t, int16_t) +FUNC_FUNC_3BUF(band, uint16_t, uint16_t) +FUNC_FUNC_3BUF(band, int32_t, int32_t) +FUNC_FUNC_3BUF(band, uint32_t, uint32_t) +FUNC_FUNC_3BUF(band, int64_t, int64_t) +FUNC_FUNC_3BUF(band, uint64_t, uint64_t) +FUNC_FUNC_3BUF(band, long, long) +FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FORT_INT_FUNC_3BUF(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC_3BUF(bor, int8_t, int8_t) +FUNC_FUNC_3BUF(bor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bor, int16_t, int16_t) +FUNC_FUNC_3BUF(bor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bor, int32_t, int32_t) +FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bor, int64_t, int64_t) +FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bor, long, long) +FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FORT_INT_FUNC_3BUF(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef 
current_func +#define current_func(a, b) ((a) ^ (b)) +/* C integer */ +FUNC_FUNC_3BUF(bxor, int8_t, int8_t) +FUNC_FUNC_3BUF(bxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bxor, int16_t, int16_t) +FUNC_FUNC_3BUF(bxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bxor, int32_t, int32_t) +FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bxor, int64_t, int64_t) +FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bxor, long, long) +FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FORT_INT_FUNC_3BUF(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FORT_INT_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FORT_INT_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FORT_INT_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FORT_INT_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FORT_INT_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FORT_INT_FUNC_3BUF(bxor, byte, char) + +/************************************************************************* + * Min and max location "pair" datatypes + *************************************************************************/ + +/* +#if OMPI_HAVE_FORTRAN_REAL +LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) +#endif +LOC_STRUCT_3BUF(float_int, float, int) +LOC_STRUCT_3BUF(double_int, double, int) +LOC_STRUCT_3BUF(long_int, long, int) +LOC_STRUCT_3BUF(2int, int, int) +LOC_STRUCT_3BUF(short_int, short, int) +LOC_STRUCT_3BUF(long_double_int, long double, int) +*/ + +/************************************************************************* + * Max location + *************************************************************************/ +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(maxloc, 2real, >) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(maxloc, 2double_precision, >) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(maxloc, 2integer, >) +#endif +#endif // 0 +LOC_FUNC_3BUF(maxloc, float_int, >) +LOC_FUNC_3BUF(maxloc, double_int, >) +LOC_FUNC_3BUF(maxloc, long_int, >) +LOC_FUNC_3BUF(maxloc, 2int, >) +LOC_FUNC_3BUF(maxloc, short_int, >) +LOC_FUNC_3BUF(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(minloc, 2real, <) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(minloc, 2double_precision, <) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(minloc, 2integer, <) +#endif +#endif // 0 +LOC_FUNC_3BUF(minloc, float_int, <) +LOC_FUNC_3BUF(minloc, double_int, <) +LOC_FUNC_3BUF(minloc, long_int, <) +LOC_FUNC_3BUF(minloc, 2int, <) +LOC_FUNC_3BUF(minloc, short_int, <) +LOC_FUNC_3BUF(minloc, long_double_int, <) + +/* + * Helpful defines, because there's soooo many names! + * + * **NOTE** These #define's used to be strictly ordered but the use of + * designated initializers removed this restrictions. 
When adding new + * operators ALWAYS use a designated initializer! + */ + +/** C integer ***********************************************************/ +#define C_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = ompi_op_rocm_##ftype##_##name##_int8_t, \ + [OMPI_OP_BASE_TYPE_UINT8_T] = ompi_op_rocm_##ftype##_##name##_uint8_t, \ + [OMPI_OP_BASE_TYPE_INT16_T] = ompi_op_rocm_##ftype##_##name##_int16_t, \ + [OMPI_OP_BASE_TYPE_UINT16_T] = ompi_op_rocm_##ftype##_##name##_uint16_t, \ + [OMPI_OP_BASE_TYPE_INT32_T] = ompi_op_rocm_##ftype##_##name##_int32_t, \ + [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_rocm_##ftype##_##name##_uint32_t, \ + [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_rocm_##ftype##_##name##_int64_t, \ + [OMPI_OP_BASE_TYPE_LONG] = ompi_op_rocm_##ftype##_##name##_long, \ + [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_rocm_##ftype##_##name##_unsigned_long, \ + [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_rocm_##ftype##_##name##_uint64_t + +/** All the Fortran integers ********************************************/ + +#if OMPI_HAVE_FORTRAN_INTEGER +#define FORTRAN_INTEGER_PLAIN(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_integer +#else +#define FORTRAN_INTEGER_PLAIN(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +#define FORTRAN_INTEGER1(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_integer1 +#else +#define FORTRAN_INTEGER1(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +#define FORTRAN_INTEGER2(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_integer2 +#else +#define FORTRAN_INTEGER2(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +#define FORTRAN_INTEGER4(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_integer4 +#else +#define FORTRAN_INTEGER4(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +#define FORTRAN_INTEGER8(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_integer8 +#else +#define FORTRAN_INTEGER8(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +#define FORTRAN_INTEGER16(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_integer16 +#else +#define FORTRAN_INTEGER16(name, ftype) NULL +#endif + +#define FORTRAN_INTEGER(name, ftype) \ + [OMPI_OP_BASE_TYPE_INTEGER] = FORTRAN_INTEGER_PLAIN(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER1] = FORTRAN_INTEGER1(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER2] = FORTRAN_INTEGER2(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER4] = FORTRAN_INTEGER4(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER8] = FORTRAN_INTEGER8(name, ftype), \ + [OMPI_OP_BASE_TYPE_INTEGER16] = FORTRAN_INTEGER16(name, ftype) + +/** All the Fortran reals ***********************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +#define FLOATING_POINT_FORTRAN_REAL_PLAIN(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_real +#else +#define FLOATING_POINT_FORTRAN_REAL_PLAIN(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +#define FLOATING_POINT_FORTRAN_REAL2(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_real2 +#else +#define FLOATING_POINT_FORTRAN_REAL2(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +#define FLOATING_POINT_FORTRAN_REAL4(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_real4 +#else +#define FLOATING_POINT_FORTRAN_REAL4(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +#define FLOATING_POINT_FORTRAN_REAL8(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_real8 +#else +#define FLOATING_POINT_FORTRAN_REAL8(name, ftype) NULL +#endif +/* If: + - we have fortran REAL*16, *and* + - fortran REAL*16 matches the bit representation of the + corresponding C type + Only 
then do we put in function pointers for REAL*16 reductions. + Otherwise, just put in NULL. */ +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +#define FLOATING_POINT_FORTRAN_REAL16(name, ftype) ompi_op_rocm_##ftype##_##name##_fortran_real16 +#else +#define FLOATING_POINT_FORTRAN_REAL16(name, ftype) NULL +#endif + +#define FLOATING_POINT_FORTRAN_REAL(name, ftype) \ + [OMPI_OP_BASE_TYPE_REAL] = FLOATING_POINT_FORTRAN_REAL_PLAIN(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL2] = FLOATING_POINT_FORTRAN_REAL2(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL4] = FLOATING_POINT_FORTRAN_REAL4(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL8] = FLOATING_POINT_FORTRAN_REAL8(name, ftype), \ + [OMPI_OP_BASE_TYPE_REAL16] = FLOATING_POINT_FORTRAN_REAL16(name, ftype) + +/** Fortran double precision ********************************************/ + +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +#define FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype) \ + ompi_op_rocm_##ftype##_##name##_fortran_double_precision +#else +#define FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype) NULL +#endif + +/** Floating point, including all the Fortran reals *********************/ + +//#if defined(HAVE_SHORT_FLOAT) || defined(HAVE_OPAL_SHORT_FLOAT_T) +//#define SHORT_FLOAT(name, ftype) ompi_op_rocm_##ftype##_##name##_short_float +//#else +#define SHORT_FLOAT(name, ftype) NULL +//#endif +#define FLOAT(name, ftype) ompi_op_rocm_##ftype##_##name##_float +#define DOUBLE(name, ftype) ompi_op_rocm_##ftype##_##name##_double +#define LONG_DOUBLE(name, ftype) ompi_op_rocm_##ftype##_##name##_long_double + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_SHORT_FLOAT] = SHORT_FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype), \ + FLOATING_POINT_FORTRAN_REAL(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE_PRECISION] = FLOATING_POINT_FORTRAN_DOUBLE_PRECISION(name, ftype), \ + [OMPI_OP_BASE_TYPE_LONG_DOUBLE] = LONG_DOUBLE(name, ftype) + +/** Fortran logical *****************************************************/ + +#if OMPI_HAVE_FORTRAN_LOGICAL +#define FORTRAN_LOGICAL(name, ftype) \ + ompi_op_rocm_##ftype##_##name##_fortran_logical /* OMPI_OP_CUDA_TYPE_LOGICAL */ +#else +#define FORTRAN_LOGICAL(name, ftype) NULL +#endif + +#define LOGICAL(name, ftype) \ + [OMPI_OP_BASE_TYPE_LOGICAL] = FORTRAN_LOGICAL(name, ftype), \ + [OMPI_OP_BASE_TYPE_BOOL] = ompi_op_rocm_##ftype##_##name##_bool + +/** Complex *****************************************************/ +#if 0 + +#if defined(HAVE_SHORT_FLOAT__COMPLEX) || defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +#define SHORT_FLOAT_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_short_float_complex +#else +#define SHORT_FLOAT_COMPLEX(name, ftype) NULL +#endif +#define FLOAT_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_float_complex +#define DOUBLE_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_double_complex +#define LONG_DOUBLE_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_long_double_complex +#else +#define SHORT_FLOAT_COMPLEX(name, ftype) NULL +#define FLOAT_COMPLEX(name, ftype) NULL +#define DOUBLE_COMPLEX(name, ftype) NULL +#define LONG_DOUBLE_COMPLEX(name, ftype) NULL +#endif // 0 + +#define COMPLEX(name, ftype) \ + [OMPI_OP_CUDA_TYPE_C_SHORT_FLOAT_COMPLEX] = SHORT_FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_CUDA_TYPE_C_FLOAT_COMPLEX] = FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_CUDA_TYPE_C_DOUBLE_COMPLEX] = DOUBLE_COMPLEX(name, ftype), \ + [OMPI_OP_CUDA_TYPE_C_LONG_DOUBLE_COMPLEX] = 
LONG_DOUBLE_COMPLEX(name, ftype) + +/** Byte ****************************************************************/ + +#define BYTE(name, ftype) \ + [OMPI_OP_BASE_TYPE_BYTE] = ompi_op_rocm_##ftype##_##name##_byte + +/** Fortran complex *****************************************************/ +/** Fortran "2" types ***************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL && OMPI_SIZEOF_FLOAT == OMPI_SIZEOF_FORTRAN_REAL +#define TWOLOC_FORTRAN_2REAL(name, ftype) ompi_op_rocm_##ftype##_##name##_2double_precision +#else +#define TWOLOC_FORTRAN_2REAL(name, ftype) NULL +#endif + +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION && OMPI_SIZEOF_DOUBLE == OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION +#define TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype) ompi_op_rocm_##ftype##_##name##_2double_precision +#else +#define TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype) NULL +#endif +#if OMPI_HAVE_FORTRAN_INTEGER && OMPI_SIZEOF_INT == OMPI_SIZEOF_FORTRAN_INTEGER +#define TWOLOC_FORTRAN_2INTEGER(name, ftype) ompi_op_rocm_##ftype##_##name##_2int +#else +#define TWOLOC_FORTRAN_2INTEGER(name, ftype) NULL +#endif + +/** All "2" types *******************************************************/ + +#define TWOLOC(name, ftype) \ + [OMPI_OP_BASE_TYPE_2REAL] = TWOLOC_FORTRAN_2REAL(name, ftype), \ + [OMPI_OP_BASE_TYPE_2DOUBLE_PRECISION] = TWOLOC_FORTRAN_2DOUBLE_PRECISION(name, ftype), \ + [OMPI_OP_BASE_TYPE_2INTEGER] = TWOLOC_FORTRAN_2INTEGER(name, ftype), \ + [OMPI_OP_BASE_TYPE_FLOAT_INT] = ompi_op_rocm_##ftype##_##name##_float_int, \ + [OMPI_OP_BASE_TYPE_DOUBLE_INT] = ompi_op_rocm_##ftype##_##name##_double_int, \ + [OMPI_OP_BASE_TYPE_LONG_INT] = ompi_op_rocm_##ftype##_##name##_long_int, \ + [OMPI_OP_BASE_TYPE_2INT] = ompi_op_rocm_##ftype##_##name##_2int, \ + [OMPI_OP_BASE_TYPE_SHORT_INT] = ompi_op_rocm_##ftype##_##name##_short_int, \ + [OMPI_OP_BASE_TYPE_LONG_DOUBLE_INT] = ompi_op_rocm_##ftype##_##name##_long_double_int + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + +ompi_op_base_stream_handler_fn_t ompi_op_rocm_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = + { + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 2buff), + FORTRAN_INTEGER(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 2buff), + FORTRAN_INTEGER(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 2buff), + FORTRAN_INTEGER(sum, 2buff), + FLOATING_POINT(sum, 2buff), + NULL, + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 2buff), + FORTRAN_INTEGER(prod, 2buff), + FLOATING_POINT(prod, 2buff), + NULL, + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + C_INTEGER(land, 2buff), + LOGICAL(land, 2buff), + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 2buff), + FORTRAN_INTEGER(band, 2buff), + BYTE(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + C_INTEGER(lor, 2buff), + LOGICAL(lor, 2buff), + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 
2buff), + FORTRAN_INTEGER(bor, 2buff), + BYTE(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + C_INTEGER(lxor, 2buff), + LOGICAL(lxor, 2buff), + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 2buff), + FORTRAN_INTEGER(bxor, 2buff), + BYTE(bxor, 2buff), + }, + /* Corresponds to MPI_MAXLOC */ + [OMPI_OP_BASE_FORTRAN_MAXLOC] = { + TWOLOC(maxloc, 2buff), + }, + /* Corresponds to MPI_MINLOC */ + [OMPI_OP_BASE_FORTRAN_MINLOC] = { + TWOLOC(minloc, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + + }; + +ompi_op_base_3buff_stream_handler_fn_t ompi_op_rocm_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = + { + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER(max, 3buff), + FORTRAN_INTEGER(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER(min, 3buff), + FORTRAN_INTEGER(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER(sum, 3buff), + FORTRAN_INTEGER(sum, 3buff), + FLOATING_POINT(sum, 3buff), + NULL, + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER(prod, 3buff), + FORTRAN_INTEGER(prod, 3buff), + FLOATING_POINT(prod, 3buff), + NULL, + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + C_INTEGER(land, 3buff), + LOGICAL(land, 3buff), + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER(band, 3buff), + FORTRAN_INTEGER(band, 3buff), + BYTE(band, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + C_INTEGER(lor, 3buff), + LOGICAL(lor, 3buff), + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER(bor, 3buff), + FORTRAN_INTEGER(bor, 3buff), + BYTE(bor, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + C_INTEGER(lxor, 3buff), + LOGICAL(lxor, 3buff), + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER(bxor, 3buff), + FORTRAN_INTEGER(bxor, 3buff), + BYTE(bxor, 3buff), + }, + /* Corresponds to MPI_MAXLOC */ + [OMPI_OP_BASE_FORTRAN_MAXLOC] = { + TWOLOC(maxloc, 3buff), + }, + /* Corresponds to MPI_MINLOC */ + [OMPI_OP_BASE_FORTRAN_MINLOC] = { + TWOLOC(minloc, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, + }; \ No newline at end of file diff --git a/ompi/mca/op/rocm/op_rocm_impl.c b/ompi/mca/op/rocm/op_rocm_impl.c new file mode 100644 index 00000000000..9f964c3a4b7 --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm_impl.c @@ -0,0 +1,1024 @@ +#include "hip/hip_runtime.h" +/* + * Copyright (c) 2019-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2020 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include
+
+#include
+
+#include "op_rocm_impl.h"
+
+/* TODO: missing support for
+ * - short float (conditional on whether short float is available)
+ * - complex
+ * - 3buff implementation
+ */
+
+#define THREADS_PER_BLOCK 512
+
+#define OP_FUNC(name, type_name, type, op)                                      \
+    static __global__ void                                                      \
+    ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \
+        const int index = blockIdx.x * blockDim.x + threadIdx.x;                \
+        const int stride = blockDim.x * gridDim.x;                              \
+        for (int i = index; i < n; i += stride) {                               \
+            inout[i] = inout[i] op in[i];                                       \
+        }                                                                       \
+    }                                                                           \
+    void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in,       \
+                                                          type *inout,          \
+                                                          int count,            \
+                                                          int threads_per_block,\
+                                                          hipStream_t stream) { \
+        int threads = threads_per_block;                                        \
+        int blocks = (count + threads-1) / threads;                             \
+        int n = count;                                                          \
+        hipStream_t s = stream;                                                 \
+        ompi_op_rocm_2buff_##name##_##type_name##_kernel<<<blocks, threads, 0, s>>>(in, inout, n); \
+    }
+
+
+#define FUNC_FUNC(name, type_name, type)                                        \
+    static __global__ void                                                      \
+    ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \
+        const int index = blockIdx.x * blockDim.x + threadIdx.x;                \
+        const int stride = blockDim.x * gridDim.x;                              \
+        for (int i = index; i < n; i += stride) {                               \
+            inout[i] = current_func(inout[i], in[i]);                           \
+        }                                                                       \
+    }                                                                           \
+    void                                                                        \
+    ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in,            \
+                                                     type *inout,               \
+                                                     int count,                 \
+                                                     int threads_per_block,     \
+                                                     hipStream_t stream) {      \
+        int threads = threads_per_block;                                        \
+        int blocks = (count + threads-1) / threads;                             \
+        int n = count;                                                          \
+        hipStream_t s = stream;                                                 \
+        ompi_op_rocm_2buff_##name##_##type_name##_kernel<<<blocks, threads, 0, s>>>(in, inout, n); \
+    }
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types. The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for minloc and maxloc
+ */
+
+#define LOC_FUNC(name, type_name, op)                                           \
+    static __global__ void                                                      \
+    ompi_op_rocm_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in, \
+                                                     ompi_op_predefined_##type_name##_t *inout, \
+                                                     int n)                     \
+    {                                                                           \
+        const int index = blockIdx.x * blockDim.x + threadIdx.x;                \
+        const int stride = blockDim.x * gridDim.x;                              \
+        for (int i = index; i < n; i += stride) {                               \
+            const ompi_op_predefined_##type_name##_t *a = &in[i];               \
+            ompi_op_predefined_##type_name##_t *b = &inout[i];                  \
+            if (a->v op b->v) {                                                 \
+                b->v = a->v;                                                    \
+                b->k = a->k;                                                    \
+            } else if (a->v == b->v) {                                          \
+                b->k = (b->k < a->k ? b->k : a->k);                             \
+            }                                                                   \
+        }                                                                       \
+    }                                                                           \
+    void                                                                        \
+    ompi_op_rocm_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \
+                                                     ompi_op_predefined_##type_name##_t *b, \
+                                                     int count,                 \
+                                                     int threads_per_block,     \
+                                                     hipStream_t stream) {      \
+        int threads = threads_per_block;                                        \
+        int blocks = (count + threads-1) / threads;                             \
+        hipStream_t s = stream;                                                 \
+        ompi_op_rocm_2buff_##name##_##type_name##_kernel<<<blocks, threads, 0, s>>>(a, b, count); \
+    }
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+
+#undef current_func
+#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) +/* C integer */ +FUNC_FUNC(max, int8_t, int8_t) +FUNC_FUNC(max, uint8_t, uint8_t) +FUNC_FUNC(max, int16_t, int16_t) +FUNC_FUNC(max, uint16_t, uint16_t) +FUNC_FUNC(max, int32_t, int32_t) +FUNC_FUNC(max, uint32_t, uint32_t) +FUNC_FUNC(max, int64_t, int64_t) +FUNC_FUNC(max, uint64_t, uint64_t) +FUNC_FUNC(max, long, long) +FUNC_FUNC(max, unsigned_long, unsigned long) + +FUNC_FUNC(max, float, float) +FUNC_FUNC(max, double, double) +FUNC_FUNC(max, long_double, long double) + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) +/* C integer */ +FUNC_FUNC(min, int8_t, int8_t) +FUNC_FUNC(min, uint8_t, uint8_t) +FUNC_FUNC(min, int16_t, int16_t) +FUNC_FUNC(min, uint16_t, uint16_t) +FUNC_FUNC(min, int32_t, int32_t) +FUNC_FUNC(min, uint32_t, uint32_t) +FUNC_FUNC(min, int64_t, int64_t) +FUNC_FUNC(min, uint64_t, uint64_t) +FUNC_FUNC(min, long, long) +FUNC_FUNC(min, unsigned_long, unsigned long) + + +FUNC_FUNC(min, float, float) +FUNC_FUNC(min, double, double) +FUNC_FUNC(min, long_double, long double) + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC(sum, int8_t, int8_t, +=) +OP_FUNC(sum, uint8_t, uint8_t, +=) +OP_FUNC(sum, int16_t, int16_t, +=) +OP_FUNC(sum, uint16_t, uint16_t, +=) +OP_FUNC(sum, int32_t, int32_t, +=) +OP_FUNC(sum, uint32_t, uint32_t, +=) +OP_FUNC(sum, int64_t, int64_t, +=) +OP_FUNC(sum, uint64_t, uint64_t, +=) +OP_FUNC(sum, long, long, +=) +OP_FUNC(sum, unsigned_long, unsigned long, +=) + +OP_FUNC(sum, float, float, +=) +OP_FUNC(sum, double, double, +=) +OP_FUNC(sum, long_double, long double, +=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +#endif +#endif // 0 +#undef current_func +#define current_func(a, b) (hipCmulf(a,b)) +FUNC_FUNC(sum, c_float_complex, hipFloatComplex) +#undef current_func +#define current_func(a, b) (hipCmul(a,b)) +FUNC_FUNC(sum, c_double_complex, hipDoubleComplex) +//OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC(prod, int8_t, int8_t, *=) +OP_FUNC(prod, uint8_t, uint8_t, *=) +OP_FUNC(prod, int16_t, int16_t, *=) +OP_FUNC(prod, uint16_t, uint16_t, *=) +OP_FUNC(prod, int32_t, int32_t, *=) +OP_FUNC(prod, uint32_t, uint32_t, *=) +OP_FUNC(prod, int64_t, int64_t, *=) +OP_FUNC(prod, uint64_t, uint64_t, *=) +OP_FUNC(prod, long, long, *=) +OP_FUNC(prod, unsigned_long, unsigned long, *=) + +OP_FUNC(prod, float, float, *=) +OP_FUNC(prod, double, double, *=) +OP_FUNC(prod, long_double, long double, *=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC(prod, c_short_float_complex, short float _Complex, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC(prod, c_float_complex, float _Complex, *=) +OP_FUNC(prod, c_double_complex, double _Complex, *=) +OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) +#endif // 0 + 
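/*
 * Aside (illustration only, not part of this patch): the macro layer above can be
 * hard to read, so the following minimal sketch shows roughly what one 2buff
 * expansion boils down to -- here an elementwise max over int32_t. The names
 * example_max_int32_kernel / example_max_int32_submit and EXAMPLE_THREADS_PER_BLOCK
 * are made up for this sketch; the grid-stride loop and the
 * blocks = ceil(count / threads) launch math mirror the OP_FUNC/FUNC_FUNC macros,
 * and the kernel is enqueued asynchronously on the caller-provided HIP stream.
 */
#include <hip/hip_runtime.h>
#include <stdint.h>

#define EXAMPLE_THREADS_PER_BLOCK 512

/* Grid-stride loop: each thread handles elements index, index+stride, index+2*stride, ... */
static __global__ void example_max_int32_kernel(const int32_t *in, int32_t *inout, int n)
{
    const int index  = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        inout[i] = (inout[i] > in[i]) ? inout[i] : in[i];
    }
}

/* Host-side wrapper: derive the block count from the element count and enqueue the
 * kernel on the given stream; completion is observed by synchronizing the stream or
 * by ordering later work (e.g. a device-to-host copy) on the same stream. */
void example_max_int32_submit(const int32_t *in, int32_t *inout, int count,
                              hipStream_t stream)
{
    int threads = EXAMPLE_THREADS_PER_BLOCK;
    int blocks  = (count + threads - 1) / threads;
    example_max_int32_kernel<<<blocks, threads, 0, stream>>>(in, inout, count);
}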
+/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC(land, int8_t, int8_t) +FUNC_FUNC(land, uint8_t, uint8_t) +FUNC_FUNC(land, int16_t, int16_t) +FUNC_FUNC(land, uint16_t, uint16_t) +FUNC_FUNC(land, int32_t, int32_t) +FUNC_FUNC(land, uint32_t, uint32_t) +FUNC_FUNC(land, int64_t, int64_t) +FUNC_FUNC(land, uint64_t, uint64_t) +FUNC_FUNC(land, long, long) +FUNC_FUNC(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC(lor, int8_t, int8_t) +FUNC_FUNC(lor, uint8_t, uint8_t) +FUNC_FUNC(lor, int16_t, int16_t) +FUNC_FUNC(lor, uint16_t, uint16_t) +FUNC_FUNC(lor, int32_t, int32_t) +FUNC_FUNC(lor, uint32_t, uint32_t) +FUNC_FUNC(lor, int64_t, int64_t) +FUNC_FUNC(lor, uint64_t, uint64_t) +FUNC_FUNC(lor, long, long) +FUNC_FUNC(lor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 1: 0)) +/* C integer */ +FUNC_FUNC(lxor, int8_t, int8_t) +FUNC_FUNC(lxor, uint8_t, uint8_t) +FUNC_FUNC(lxor, int16_t, int16_t) +FUNC_FUNC(lxor, uint16_t, uint16_t) +FUNC_FUNC(lxor, int32_t, int32_t) +FUNC_FUNC(lxor, uint32_t, uint32_t) +FUNC_FUNC(lxor, int64_t, int64_t) +FUNC_FUNC(lxor, uint64_t, uint64_t) +FUNC_FUNC(lxor, long, long) +FUNC_FUNC(lxor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC(band, int8_t, int8_t) +FUNC_FUNC(band, uint8_t, uint8_t) +FUNC_FUNC(band, int16_t, int16_t) +FUNC_FUNC(band, uint16_t, uint16_t) +FUNC_FUNC(band, int32_t, int32_t) +FUNC_FUNC(band, uint32_t, uint32_t) +FUNC_FUNC(band, int64_t, int64_t) +FUNC_FUNC(band, uint64_t, uint64_t) +FUNC_FUNC(band, long, long) +FUNC_FUNC(band, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC(bor, int8_t, int8_t) +FUNC_FUNC(bor, uint8_t, uint8_t) +FUNC_FUNC(bor, int16_t, int16_t) +FUNC_FUNC(bor, uint16_t, uint16_t) +FUNC_FUNC(bor, int32_t, int32_t) +FUNC_FUNC(bor, uint32_t, uint32_t) +FUNC_FUNC(bor, int64_t, int64_t) +FUNC_FUNC(bor, uint64_t, uint64_t) +FUNC_FUNC(bor, long, long) +FUNC_FUNC(bor, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + 
*************************************************************************/
+
+#undef current_func
+#define current_func(a, b) ((a) ^ (b))
+/* C integer */
+FUNC_FUNC(bxor, int8_t, int8_t)
+FUNC_FUNC(bxor, uint8_t, uint8_t)
+FUNC_FUNC(bxor, int16_t, int16_t)
+FUNC_FUNC(bxor, uint16_t, uint16_t)
+FUNC_FUNC(bxor, int32_t, int32_t)
+FUNC_FUNC(bxor, uint32_t, uint32_t)
+FUNC_FUNC(bxor, int64_t, int64_t)
+FUNC_FUNC(bxor, uint64_t, uint64_t)
+FUNC_FUNC(bxor, long, long)
+FUNC_FUNC(bxor, unsigned_long, unsigned long)
+
+/* Byte */
+FUNC_FUNC(bxor, byte, char)
+
+/*************************************************************************
+ * Max location
+ *************************************************************************/
+
+LOC_FUNC(maxloc, float_int, >)
+LOC_FUNC(maxloc, double_int, >)
+LOC_FUNC(maxloc, long_int, >)
+LOC_FUNC(maxloc, 2int, >)
+LOC_FUNC(maxloc, short_int, >)
+LOC_FUNC(maxloc, long_double_int, >)
+
+/*************************************************************************
+ * Min location
+ *************************************************************************/
+
+LOC_FUNC(minloc, float_int, <)
+LOC_FUNC(minloc, double_int, <)
+LOC_FUNC(minloc, long_int, <)
+LOC_FUNC(minloc, 2int, <)
+LOC_FUNC(minloc, short_int, <)
+LOC_FUNC(minloc, long_double_int, <)
+
+
+/*
+ * This is a three buffer (2 input and 1 output) version of the reduction
+ * routines, needed for some optimizations.
+ */
+#define OP_FUNC_3BUF(name, type_name, type, op) \
+    static __global__ void \
+    ompi_op_rocm_3buff_##name##_##type_name##_kernel(const type *in1, const type* in2, \
+                                                     type *out, int n) { \
+        const int index = blockIdx.x * blockDim.x + threadIdx.x; \
+        const int stride = blockDim.x * gridDim.x; \
+        for (int i = index; i < n; i += stride) { \
+            out[i] = in1[i] op in2[i]; \
+        } \
+    } \
+    void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \
+                                                          type *out, int count, \
+                                                          int threads_per_block, \
+                                                          hipStream_t stream) { \
+        int threads = threads_per_block; \
+        int blocks = (count+threads-1) / threads; \
+        ompi_op_rocm_3buff_##name##_##type_name##_kernel<<<blocks, threads, 0, stream>>>(in1, in2, out, count); \
+    }
+
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for (out = op(in1, in2))
+ */
+#define FUNC_FUNC_3BUF(name, type_name, type) \
+    static __global__ void \
+    ompi_op_rocm_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, \
+                                                     type *out, int n) { \
+        const int index = blockIdx.x * blockDim.x + threadIdx.x; \
+        const int stride = blockDim.x * gridDim.x; \
+        for (int i = index; i < n; i += stride) { \
+            out[i] = current_func(in1[i], in2[i]); \
+        } \
+    } \
+    void \
+    ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \
+                                                     type *out, int count, \
+                                                     int threads_per_block, \
+                                                     hipStream_t stream) { \
+        int threads = threads_per_block; \
+        int blocks = (count+threads-1) / threads; \
+        ompi_op_rocm_3buff_##name##_##type_name##_kernel<<<blocks, threads, 0, stream>>>(in1, in2, out, count); \
+    }
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for minloc and maxloc
+ */
+/*
+#define LOC_STRUCT(type_name, type1, type2) \
+    typedef struct { \
+        type1 v; \
+        type2 k; \
+    } ompi_op_predefined_##type_name##_t;
+*/
+
+#define LOC_FUNC_3BUF(name, type_name, op) \
+    static __global__ void \
+    ompi_op_rocm_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in1, \
+                                                     const ompi_op_predefined_##type_name##_t *in2, \
+                                                     ompi_op_predefined_##type_name##_t *out, \
+                                                     int n) \
+    { \
+        const int index = blockIdx.x * blockDim.x + threadIdx.x; \
+        const int stride = blockDim.x * gridDim.x; \
+        for (int i = index; i < n; i += stride) { \
+            const ompi_op_predefined_##type_name##_t *a1 = &in1[i]; \
+            const ompi_op_predefined_##type_name##_t *a2 = &in2[i]; \
+            ompi_op_predefined_##type_name##_t *b = &out[i]; \
+            if (a1->v op a2->v) { \
+                b->v = a1->v; \
+                b->k = a1->k; \
+            } else if (a1->v == a2->v) { \
+                b->v = a1->v; \
+                b->k = (a2->k < a1->k ? a2->k : a1->k); \
+            } else { \
+                b->v = a2->v; \
+                b->k = a2->k; \
+            } \
+        } \
+    } \
+    void \
+    ompi_op_rocm_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *in1, \
+                                                     const ompi_op_predefined_##type_name##_t *in2, \
+                                                     ompi_op_predefined_##type_name##_t *out, \
+                                                     int count, \
+                                                     int threads_per_block, \
+                                                     hipStream_t stream) \
+    { \
+        int threads = threads_per_block; \
+        int blocks = (count+threads-1) / threads; \
+        ompi_op_rocm_3buff_##name##_##type_name##_kernel<<<blocks, threads, 0, stream>>>(in1, in2, out, count); \
+    }
+
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+
+#undef current_func
+#define current_func(a, b) ((a) > (b) ? (a) : (b))
+/* C integer */
+FUNC_FUNC_3BUF(max, int8_t, int8_t)
+FUNC_FUNC_3BUF(max, uint8_t, uint8_t)
+FUNC_FUNC_3BUF(max, int16_t, int16_t)
+FUNC_FUNC_3BUF(max, uint16_t, uint16_t)
+FUNC_FUNC_3BUF(max, int32_t, int32_t)
+FUNC_FUNC_3BUF(max, uint32_t, uint32_t)
+FUNC_FUNC_3BUF(max, int64_t, int64_t)
+FUNC_FUNC_3BUF(max, uint64_t, uint64_t)
+FUNC_FUNC_3BUF(max, long, long)
+FUNC_FUNC_3BUF(max, unsigned_long, unsigned long)
+
+/* Fortran integer */
+#if OMPI_HAVE_FORTRAN_INTEGER
+FUNC_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER1
+FUNC_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER2
+FUNC_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER4
+FUNC_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER8
+FUNC_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER16
+FUNC_FUNC_3BUF(max, fortran_integer16, ompi_fortran_integer16_t)
+#endif
+/* Floating point */
+#if defined(HAVE_SHORT_FLOAT)
+FUNC_FUNC_3BUF(max, short_float, short float)
+#elif defined(HAVE_OPAL_SHORT_FLOAT_T)
+FUNC_FUNC_3BUF(max, short_float, opal_short_float_t)
+#endif
+FUNC_FUNC_3BUF(max, float, float)
+FUNC_FUNC_3BUF(max, double, double)
+FUNC_FUNC_3BUF(max, long_double, long double)
+#if OMPI_HAVE_FORTRAN_REAL
+FUNC_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t)
+#endif
+#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION
+FUNC_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL2
+FUNC_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL4
+FUNC_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL8
+FUNC_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) +/* C integer */ +FUNC_FUNC_3BUF(min, int8_t, int8_t) +FUNC_FUNC_3BUF(min, uint8_t, uint8_t) +FUNC_FUNC_3BUF(min, int16_t, int16_t) +FUNC_FUNC_3BUF(min, uint16_t, uint16_t) +FUNC_FUNC_3BUF(min, int32_t, int32_t) +FUNC_FUNC_3BUF(min, uint32_t, uint32_t) +FUNC_FUNC_3BUF(min, int64_t, int64_t) +FUNC_FUNC_3BUF(min, uint64_t, uint64_t) +FUNC_FUNC_3BUF(min, long, long) +FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF(min, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF(min, float, float) +FUNC_FUNC_3BUF(min, double, double) +FUNC_FUNC_3BUF(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF(sum, int8_t, int8_t, +) +OP_FUNC_3BUF(sum, uint8_t, uint8_t, +) +OP_FUNC_3BUF(sum, int16_t, int16_t, +) +OP_FUNC_3BUF(sum, uint16_t, uint16_t, +) +OP_FUNC_3BUF(sum, int32_t, int32_t, +) +OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) +OP_FUNC_3BUF(sum, int64_t, int64_t, +) +OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) +OP_FUNC_3BUF(sum, long, long, +) +OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF(sum, fortran_integer8, 
ompi_fortran_integer8_t, +)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER16
+OP_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t, +)
+#endif
+/* Floating point */
+#if defined(HAVE_SHORT_FLOAT)
+OP_FUNC_3BUF(sum, short_float, short float, +)
+#elif defined(HAVE_OPAL_SHORT_FLOAT_T)
+OP_FUNC_3BUF(sum, short_float, opal_short_float_t, +)
+#endif
+OP_FUNC_3BUF(sum, float, float, +)
+OP_FUNC_3BUF(sum, double, double, +)
+OP_FUNC_3BUF(sum, long_double, long double, +)
+#if OMPI_HAVE_FORTRAN_REAL
+OP_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t, +)
+#endif
+#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION
+OP_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t, +)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL2
+OP_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t, +)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL4
+OP_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t, +)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL8
+OP_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t, +)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C
+OP_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t, +)
+#endif
+/* Complex */
+#if 0
+#if defined(HAVE_SHORT_FLOAT__COMPLEX)
+OP_FUNC_3BUF(sum, c_short_float_complex, short float _Complex, +)
+#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T)
+COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t)
+#endif
+#endif // 0
+#undef current_func
+#define current_func(a, b) (hipCaddf(a,b))
+FUNC_FUNC_3BUF(sum, c_float_complex, hipFloatComplex)
+#undef current_func
+#define current_func(a, b) (hipCadd(a,b))
+FUNC_FUNC_3BUF(sum, c_double_complex, hipDoubleComplex)
+//OP_FUNC_3BUF(sum, c_long_double_complex, cuLongDoubleComplex, +)
+
+/*************************************************************************
+ * Product
+ *************************************************************************/
+
+/* C integer */
+OP_FUNC_3BUF(prod, int8_t, int8_t, *)
+OP_FUNC_3BUF(prod, uint8_t, uint8_t, *)
+OP_FUNC_3BUF(prod, int16_t, int16_t, *)
+OP_FUNC_3BUF(prod, uint16_t, uint16_t, *)
+OP_FUNC_3BUF(prod, int32_t, int32_t, *)
+OP_FUNC_3BUF(prod, uint32_t, uint32_t, *)
+OP_FUNC_3BUF(prod, int64_t, int64_t, *)
+OP_FUNC_3BUF(prod, uint64_t, uint64_t, *)
+OP_FUNC_3BUF(prod, long, long, *)
+OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *)
+
+/* Fortran integer */
+#if OMPI_HAVE_FORTRAN_INTEGER
+OP_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t, *)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER1
+OP_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t, *)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER2
+OP_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t, *)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER4
+OP_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t, *)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER8
+OP_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t, *)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER16
+OP_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t, *)
+#endif
+/* Floating point */
+#if defined(HAVE_SHORT_FLOAT)
+OP_FUNC_3BUF(prod, short_float, short float, *)
+#elif defined(HAVE_OPAL_SHORT_FLOAT_T)
+OP_FUNC_3BUF(prod, short_float, opal_short_float_t, *)
+#endif
+OP_FUNC_3BUF(prod, float, float, *)
+OP_FUNC_3BUF(prod, double, double, *)
+OP_FUNC_3BUF(prod, long_double, long double, *)
+#if OMPI_HAVE_FORTRAN_REAL
+OP_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t, *)
+#endif
+#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION
+OP_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t, *)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL2
+OP_FUNC_3BUF(prod,
fortran_real2, ompi_fortran_real2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF(prod, fortran_real8, ompi_fortran_real8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF(prod, fortran_real16, ompi_fortran_real16_t, *) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) +OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) +OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC_3BUF(land, int8_t, int8_t) +FUNC_FUNC_3BUF(land, uint8_t, uint8_t) +FUNC_FUNC_3BUF(land, int16_t, int16_t) +FUNC_FUNC_3BUF(land, uint16_t, uint16_t) +FUNC_FUNC_3BUF(land, int32_t, int32_t) +FUNC_FUNC_3BUF(land, uint32_t, uint32_t) +FUNC_FUNC_3BUF(land, int64_t, int64_t) +FUNC_FUNC_3BUF(land, uint64_t, uint64_t) +FUNC_FUNC_3BUF(land, long, long) +FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC_3BUF(lor, int8_t, int8_t) +FUNC_FUNC_3BUF(lor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lor, int16_t, int16_t) +FUNC_FUNC_3BUF(lor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lor, int32_t, int32_t) +FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lor, int64_t, int64_t) +FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lor, long, long) +FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 
1: 0)) +/* C integer */ +FUNC_FUNC_3BUF(lxor, int8_t, int8_t) +FUNC_FUNC_3BUF(lxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(lxor, int16_t, int16_t) +FUNC_FUNC_3BUF(lxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(lxor, int32_t, int32_t) +FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(lxor, int64_t, int64_t) +FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(lxor, long, long) +FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC_3BUF(band, int8_t, int8_t) +FUNC_FUNC_3BUF(band, uint8_t, uint8_t) +FUNC_FUNC_3BUF(band, int16_t, int16_t) +FUNC_FUNC_3BUF(band, uint16_t, uint16_t) +FUNC_FUNC_3BUF(band, int32_t, int32_t) +FUNC_FUNC_3BUF(band, uint32_t, uint32_t) +FUNC_FUNC_3BUF(band, int64_t, int64_t) +FUNC_FUNC_3BUF(band, uint64_t, uint64_t) +FUNC_FUNC_3BUF(band, long, long) +FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC_3BUF(bor, int8_t, int8_t) +FUNC_FUNC_3BUF(bor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bor, int16_t, int16_t) +FUNC_FUNC_3BUF(bor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bor, int32_t, int32_t) +FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bor, int64_t, int64_t) +FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bor, long, long) +FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C 
integer */ +FUNC_FUNC_3BUF(bxor, int8_t, int8_t) +FUNC_FUNC_3BUF(bxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF(bxor, int16_t, int16_t) +FUNC_FUNC_3BUF(bxor, uint16_t, uint16_t) +FUNC_FUNC_3BUF(bxor, int32_t, int32_t) +FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF(bxor, int64_t, int64_t) +FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF(bxor, long, long) +FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF(bxor, byte, char) + +/************************************************************************* + * Min and max location "pair" datatypes + *************************************************************************/ + +/* +#if OMPI_HAVE_FORTRAN_REAL +LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) +#endif +LOC_STRUCT_3BUF(float_int, float, int) +LOC_STRUCT_3BUF(double_int, double, int) +LOC_STRUCT_3BUF(long_int, long, int) +LOC_STRUCT_3BUF(2int, int, int) +LOC_STRUCT_3BUF(short_int, short, int) +LOC_STRUCT_3BUF(long_double_int, long double, int) +*/ + +/************************************************************************* + * Max location + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(maxloc, 2real, >) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(maxloc, 2double_precision, >) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(maxloc, 2integer, >) +#endif +LOC_FUNC_3BUF(maxloc, float_int, >) +LOC_FUNC_3BUF(maxloc, double_int, >) +LOC_FUNC_3BUF(maxloc, long_int, >) +LOC_FUNC_3BUF(maxloc, 2int, >) +LOC_FUNC_3BUF(maxloc, short_int, >) +LOC_FUNC_3BUF(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF(minloc, 2real, <) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF(minloc, 2double_precision, <) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF(minloc, 2integer, <) +#endif +LOC_FUNC_3BUF(minloc, float_int, <) +LOC_FUNC_3BUF(minloc, double_int, <) +LOC_FUNC_3BUF(minloc, long_int, <) +LOC_FUNC_3BUF(minloc, 2int, <) +LOC_FUNC_3BUF(minloc, short_int, <) +LOC_FUNC_3BUF(minloc, long_double_int, <) diff --git a/ompi/mca/op/rocm/op_rocm_impl.h b/ompi/mca/op/rocm/op_rocm_impl.h new file mode 100644 index 00000000000..0606c508280 --- /dev/null +++ b/ompi/mca/op/rocm/op_rocm_impl.h @@ -0,0 +1,899 @@ +/* + * Copyright (c) 2019-2023 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2020 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include +#include + +#ifndef BEGIN_C_DECLS +#if defined(c_plusplus) || defined(__cplusplus) +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else +# define BEGIN_C_DECLS /* empty */ +# define END_C_DECLS /* empty */ +#endif +#endif + +BEGIN_C_DECLS + +#define OP_FUNC_SIG(name, type_name, type, op) \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + hipStream_t stream); + +#define FUNC_FUNC_SIG(name, type_name, type) \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + hipStream_t stream); + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for minloc and maxloc + */ +#define LOC_STRUCT(type_name, type1, type2) \ + typedef struct { \ + type1 v; \ + type2 k; \ + } ompi_op_predefined_##type_name##_t; + +#define LOC_FUNC_SIG(name, type_name, op) \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ + int threads_per_block, \ + hipStream_t stream); + +/************************************************************************* + * Max + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_SIG(max, int8_t, int8_t) +FUNC_FUNC_SIG(max, uint8_t, uint8_t) +FUNC_FUNC_SIG(max, int16_t, int16_t) +FUNC_FUNC_SIG(max, uint16_t, uint16_t) +FUNC_FUNC_SIG(max, int32_t, int32_t) +FUNC_FUNC_SIG(max, uint32_t, uint32_t) +FUNC_FUNC_SIG(max, int64_t, int64_t) +FUNC_FUNC_SIG(max, uint64_t, uint64_t) +FUNC_FUNC_SIG(max, long, long) +FUNC_FUNC_SIG(max, unsigned_long, unsigned long) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_SIG(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_SIG(max, short_float, opal_short_float_t) +#endif +#endif // 0 + +FUNC_FUNC_SIG(max, float, float) +FUNC_FUNC_SIG(max, double, double) +FUNC_FUNC_SIG(max, long_double, long double) + +/************************************************************************* + * Min + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_SIG(min, int8_t, int8_t) +FUNC_FUNC_SIG(min, uint8_t, uint8_t) +FUNC_FUNC_SIG(min, int16_t, int16_t) +FUNC_FUNC_SIG(min, uint16_t, uint16_t) +FUNC_FUNC_SIG(min, int32_t, int32_t) +FUNC_FUNC_SIG(min, uint32_t, uint32_t) +FUNC_FUNC_SIG(min, int64_t, int64_t) +FUNC_FUNC_SIG(min, uint64_t, uint64_t) +FUNC_FUNC_SIG(min, long, long) +FUNC_FUNC_SIG(min, unsigned_long, unsigned long) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_SIG(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_SIG(min, short_float, opal_short_float_t) +#endif +#endif // 0 + +FUNC_FUNC_SIG(min, float, float) +FUNC_FUNC_SIG(min, double, double) +FUNC_FUNC_SIG(min, long_double, long double) + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_SIG(sum, 
int8_t, int8_t, +=) +OP_FUNC_SIG(sum, uint8_t, uint8_t, +=) +OP_FUNC_SIG(sum, int16_t, int16_t, +=) +OP_FUNC_SIG(sum, uint16_t, uint16_t, +=) +OP_FUNC_SIG(sum, int32_t, int32_t, +=) +OP_FUNC_SIG(sum, uint32_t, uint32_t, +=) +OP_FUNC_SIG(sum, int64_t, int64_t, +=) +OP_FUNC_SIG(sum, uint64_t, uint64_t, +=) +OP_FUNC_SIG(sum, long, long, +=) +OP_FUNC_SIG(sum, unsigned_long, unsigned long, +=) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_SIG(sum, short_float, short float, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_SIG(sum, short_float, opal_short_float_t, +=) +#endif +#endif // 0 + +OP_FUNC_SIG(sum, float, float, +=) +OP_FUNC_SIG(sum, double, double, +=) +OP_FUNC_SIG(sum, long_double, long double, +=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_SIG(sum, c_short_float_complex, short float _Complex, +=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +#endif +#endif // 0 +FUNC_FUNC_SIG(sum, c_float_complex, hipFloatComplex) +FUNC_FUNC_SIG(sum, c_double_complex, hipDoubleComplex) +//OP_FUNC_SIG(sum, c_float_complex, float _Complex, +=) +//OP_FUNC_SIG(sum, c_double_complex, double _Complex, +=) +//OP_FUNC_SIG(sum, c_long_double_complex, long double _Complex, +=) + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC_SIG(prod, int8_t, int8_t, *=) +OP_FUNC_SIG(prod, uint8_t, uint8_t, *=) +OP_FUNC_SIG(prod, int16_t, int16_t, *=) +OP_FUNC_SIG(prod, uint16_t, uint16_t, *=) +OP_FUNC_SIG(prod, int32_t, int32_t, *=) +OP_FUNC_SIG(prod, uint32_t, uint32_t, *=) +OP_FUNC_SIG(prod, int64_t, int64_t, *=) +OP_FUNC_SIG(prod, uint64_t, uint64_t, *=) +OP_FUNC_SIG(prod, long, long, *=) +OP_FUNC_SIG(prod, unsigned_long, unsigned long, *=) + +#if 0 +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_SIG(prod, short_float, short float, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_SIG(prod, short_float, opal_short_float_t, *=) +#endif +#endif // 0 + +OP_FUNC_SIG(prod, float, float, *=) +OP_FUNC_SIG(prod, double, double, *=) +OP_FUNC_SIG(prod, long_double, long double, *=) + +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_SIG(prod, c_short_float_complex, short float _Complex, *=) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_SIG(prod, c_float_complex, float _Complex, *=) +OP_FUNC_SIG(prod, c_double_complex, double _Complex, *=) +OP_FUNC_SIG(prod, c_long_double_complex, long double _Complex, *=) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) && (b)) +/* C integer */ +FUNC_FUNC_SIG(land, int8_t, int8_t) +FUNC_FUNC_SIG(land, uint8_t, uint8_t) +FUNC_FUNC_SIG(land, int16_t, int16_t) +FUNC_FUNC_SIG(land, uint16_t, uint16_t) +FUNC_FUNC_SIG(land, int32_t, int32_t) +FUNC_FUNC_SIG(land, uint32_t, uint32_t) +FUNC_FUNC_SIG(land, int64_t, int64_t) +FUNC_FUNC_SIG(land, uint64_t, uint64_t) +FUNC_FUNC_SIG(land, long, long) +FUNC_FUNC_SIG(land, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC_SIG(land, bool, bool) + +/************************************************************************* + * Logical OR + 
*************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) || (b)) +/* C integer */ +FUNC_FUNC_SIG(lor, int8_t, int8_t) +FUNC_FUNC_SIG(lor, uint8_t, uint8_t) +FUNC_FUNC_SIG(lor, int16_t, int16_t) +FUNC_FUNC_SIG(lor, uint16_t, uint16_t) +FUNC_FUNC_SIG(lor, int32_t, int32_t) +FUNC_FUNC_SIG(lor, uint32_t, uint32_t) +FUNC_FUNC_SIG(lor, int64_t, int64_t) +FUNC_FUNC_SIG(lor, uint64_t, uint64_t) +FUNC_FUNC_SIG(lor, long, long) +FUNC_FUNC_SIG(lor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC_SIG(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a ? 1 : 0) ^ (b ? 1: 0)) +/* C integer */ +FUNC_FUNC_SIG(lxor, int8_t, int8_t) +FUNC_FUNC_SIG(lxor, uint8_t, uint8_t) +FUNC_FUNC_SIG(lxor, int16_t, int16_t) +FUNC_FUNC_SIG(lxor, uint16_t, uint16_t) +FUNC_FUNC_SIG(lxor, int32_t, int32_t) +FUNC_FUNC_SIG(lxor, uint32_t, uint32_t) +FUNC_FUNC_SIG(lxor, int64_t, int64_t) +FUNC_FUNC_SIG(lxor, uint64_t, uint64_t) +FUNC_FUNC_SIG(lxor, long, long) +FUNC_FUNC_SIG(lxor, unsigned_long, unsigned long) + +/* C++ bool */ +FUNC_FUNC_SIG(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) & (b)) +/* C integer */ +FUNC_FUNC_SIG(band, int8_t, int8_t) +FUNC_FUNC_SIG(band, uint8_t, uint8_t) +FUNC_FUNC_SIG(band, int16_t, int16_t) +FUNC_FUNC_SIG(band, uint16_t, uint16_t) +FUNC_FUNC_SIG(band, int32_t, int32_t) +FUNC_FUNC_SIG(band, uint32_t, uint32_t) +FUNC_FUNC_SIG(band, int64_t, int64_t) +FUNC_FUNC_SIG(band, uint64_t, uint64_t) +FUNC_FUNC_SIG(band, long, long) +FUNC_FUNC_SIG(band, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC_SIG(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) | (b)) +/* C integer */ +FUNC_FUNC_SIG(bor, int8_t, int8_t) +FUNC_FUNC_SIG(bor, uint8_t, uint8_t) +FUNC_FUNC_SIG(bor, int16_t, int16_t) +FUNC_FUNC_SIG(bor, uint16_t, uint16_t) +FUNC_FUNC_SIG(bor, int32_t, int32_t) +FUNC_FUNC_SIG(bor, uint32_t, uint32_t) +FUNC_FUNC_SIG(bor, int64_t, int64_t) +FUNC_FUNC_SIG(bor, uint64_t, uint64_t) +FUNC_FUNC_SIG(bor, long, long) +FUNC_FUNC_SIG(bor, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC_SIG(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +#undef current_func +#define current_func(a, b) ((a) ^ (b)) +/* C integer */ +FUNC_FUNC_SIG(bxor, int8_t, int8_t) +FUNC_FUNC_SIG(bxor, uint8_t, uint8_t) +FUNC_FUNC_SIG(bxor, int16_t, int16_t) +FUNC_FUNC_SIG(bxor, uint16_t, uint16_t) +FUNC_FUNC_SIG(bxor, int32_t, int32_t) +FUNC_FUNC_SIG(bxor, uint32_t, uint32_t) +FUNC_FUNC_SIG(bxor, int64_t, int64_t) +FUNC_FUNC_SIG(bxor, uint64_t, uint64_t) +FUNC_FUNC_SIG(bxor, long, long) +FUNC_FUNC_SIG(bxor, unsigned_long, unsigned long) + +/* Byte */ +FUNC_FUNC_SIG(bxor, byte, char) + +/************************************************************************* + * Min and max location "pair" datatypes + 
*************************************************************************/ + +LOC_STRUCT(float_int, float, int) +LOC_STRUCT(double_int, double, int) +LOC_STRUCT(long_int, long, int) +LOC_STRUCT(2int, int, int) +LOC_STRUCT(short_int, short, int) +LOC_STRUCT(long_double_int, long double, int) +LOC_STRUCT(unsigned_long, unsigned long, int) +/* compat types for Fortran */ +LOC_STRUCT(2real, float, float) +LOC_STRUCT(2double_precision, double, double) + +/************************************************************************* + * Max location + *************************************************************************/ + +LOC_FUNC_SIG(maxloc, float_int, >) +LOC_FUNC_SIG(maxloc, double_int, >) +LOC_FUNC_SIG(maxloc, long_int, >) +LOC_FUNC_SIG(maxloc, 2int, >) +LOC_FUNC_SIG(maxloc, short_int, >) +LOC_FUNC_SIG(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +LOC_FUNC_SIG(minloc, float_int, <) +LOC_FUNC_SIG(minloc, double_int, <) +LOC_FUNC_SIG(minloc, long_int, <) +LOC_FUNC_SIG(minloc, 2int, <) +LOC_FUNC_SIG(minloc, short_int, <) +LOC_FUNC_SIG(minloc, long_double_int, <) + + + +#define OP_FUNC_3BUF_SIG(name, type_name, type, op) \ + void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, \ + const type *in2, \ + type *inout, \ + int count, \ + int threads_per_block, \ + hipStream_t stream); + +#define FUNC_FUNC_3BUF_SIG(name, type_name, type) \ + void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, \ + const type *in2, \ + type *inout, \ + int count, \ + int threads_per_block, \ + hipStream_t stream); + +#define LOC_FUNC_3BUF_SIG(name, type_name, op) \ + void ompi_op_rocm_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a1, \ + const ompi_op_predefined_##type_name##_t *a2, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ + int threads_per_block, \ + hipStream_t stream); + + +/************************************************************************* + * Max + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(max, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(max, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(max, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(max, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(max, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(max, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(max, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(max, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(max, long, long) +FUNC_FUNC_3BUF_SIG(max, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(max, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(max, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(max, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(max, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(max, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(max, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF_SIG(max, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF_SIG(max, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF_SIG(max, 
float, float) +FUNC_FUNC_3BUF_SIG(max, double, double) +FUNC_FUNC_3BUF_SIG(max, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF_SIG(max, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF_SIG(max, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF_SIG(max, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF_SIG(max, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF_SIG(max, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF_SIG(max, fortran_real16, ompi_fortran_real16_t) +#endif + + +/************************************************************************* + * Min + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(min, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(min, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(min, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(min, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(min, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(min, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(min, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(min, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(min, long, long) +FUNC_FUNC_3BUF_SIG(min, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(min, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(min, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(min, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(min, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(min, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(min, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +FUNC_FUNC_3BUF_SIG(min, short_float, short float) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +FUNC_FUNC_3BUF_SIG(min, short_float, opal_short_float_t) +#endif +FUNC_FUNC_3BUF_SIG(min, float, float) +FUNC_FUNC_3BUF_SIG(min, double, double) +FUNC_FUNC_3BUF_SIG(min, long_double, long double) +#if OMPI_HAVE_FORTRAN_REAL +FUNC_FUNC_3BUF_SIG(min, fortran_real, ompi_fortran_real_t) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +FUNC_FUNC_3BUF_SIG(min, fortran_double_precision, ompi_fortran_double_precision_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +FUNC_FUNC_3BUF_SIG(min, fortran_real2, ompi_fortran_real2_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +FUNC_FUNC_3BUF_SIG(min, fortran_real4, ompi_fortran_real4_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +FUNC_FUNC_3BUF_SIG(min, fortran_real8, ompi_fortran_real8_t) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +FUNC_FUNC_3BUF_SIG(min, fortran_real16, ompi_fortran_real16_t) +#endif + +/************************************************************************* + * Sum + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF_SIG(sum, int8_t, int8_t, +) +OP_FUNC_3BUF_SIG(sum, uint8_t, uint8_t, +) +OP_FUNC_3BUF_SIG(sum, int16_t, int16_t, +) +OP_FUNC_3BUF_SIG(sum, uint16_t, uint16_t, +) +OP_FUNC_3BUF_SIG(sum, int32_t, int32_t, +) +OP_FUNC_3BUF_SIG(sum, uint32_t, uint32_t, +) +OP_FUNC_3BUF_SIG(sum, int64_t, int64_t, +) 
+OP_FUNC_3BUF_SIG(sum, uint64_t, uint64_t, +) +OP_FUNC_3BUF_SIG(sum, long, long, +) +OP_FUNC_3BUF_SIG(sum, unsigned_long, unsigned long, +) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF_SIG(sum, fortran_integer, ompi_fortran_integer_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF_SIG(sum, fortran_integer1, ompi_fortran_integer1_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF_SIG(sum, fortran_integer2, ompi_fortran_integer2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF_SIG(sum, fortran_integer4, ompi_fortran_integer4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF_SIG(sum, fortran_integer8, ompi_fortran_integer8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +OP_FUNC_3BUF_SIG(sum, fortran_integer16, ompi_fortran_integer16_t, +) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF_SIG(sum, short_float, short float, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF_SIG(sum, short_float, opal_short_float_t, +) +#endif +OP_FUNC_3BUF_SIG(sum, float, float, +) +OP_FUNC_3BUF_SIG(sum, double, double, +) +OP_FUNC_3BUF_SIG(sum, long_double, long double, +) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF_SIG(sum, fortran_real, ompi_fortran_real_t, +) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF_SIG(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF_SIG(sum, fortran_real2, ompi_fortran_real2_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF_SIG(sum, fortran_real4, ompi_fortran_real4_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF_SIG(sum, fortran_real8, ompi_fortran_real8_t, +) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF_SIG(sum, fortran_real16, ompi_fortran_real16_t, +) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF_SIG(sum, c_short_float_complex, short float _Complex, +) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF_SIG(sum, c_float_complex, float _Complex, +) +OP_FUNC_3BUF_SIG(sum, c_double_complex, double _Complex, +) +OP_FUNC_3BUF_SIG(sum, c_long_double_complex, long double _Complex, +) +#endif // 0 + +/************************************************************************* + * Product + *************************************************************************/ + +/* C integer */ +OP_FUNC_3BUF_SIG(prod, int8_t, int8_t, *) +OP_FUNC_3BUF_SIG(prod, uint8_t, uint8_t, *) +OP_FUNC_3BUF_SIG(prod, int16_t, int16_t, *) +OP_FUNC_3BUF_SIG(prod, uint16_t, uint16_t, *) +OP_FUNC_3BUF_SIG(prod, int32_t, int32_t, *) +OP_FUNC_3BUF_SIG(prod, uint32_t, uint32_t, *) +OP_FUNC_3BUF_SIG(prod, int64_t, int64_t, *) +OP_FUNC_3BUF_SIG(prod, uint64_t, uint64_t, *) +OP_FUNC_3BUF_SIG(prod, long, long, *) +OP_FUNC_3BUF_SIG(prod, unsigned_long, unsigned long, *) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +OP_FUNC_3BUF_SIG(prod, fortran_integer, ompi_fortran_integer_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +OP_FUNC_3BUF_SIG(prod, fortran_integer1, ompi_fortran_integer1_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +OP_FUNC_3BUF_SIG(prod, fortran_integer2, ompi_fortran_integer2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +OP_FUNC_3BUF_SIG(prod, fortran_integer4, ompi_fortran_integer4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +OP_FUNC_3BUF_SIG(prod, fortran_integer8, ompi_fortran_integer8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 
+OP_FUNC_3BUF_SIG(prod, fortran_integer16, ompi_fortran_integer16_t, *) +#endif +/* Floating point */ +#if defined(HAVE_SHORT_FLOAT) +OP_FUNC_3BUF_SIG(prod, short_float, short float, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_T) +OP_FUNC_3BUF_SIG(prod, short_float, opal_short_float_t, *) +#endif +OP_FUNC_3BUF_SIG(prod, float, float, *) +OP_FUNC_3BUF_SIG(prod, double, double, *) +OP_FUNC_3BUF_SIG(prod, long_double, long double, *) +#if OMPI_HAVE_FORTRAN_REAL +OP_FUNC_3BUF_SIG(prod, fortran_real, ompi_fortran_real_t, *) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +OP_FUNC_3BUF_SIG(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL2 +OP_FUNC_3BUF_SIG(prod, fortran_real2, ompi_fortran_real2_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL4 +OP_FUNC_3BUF_SIG(prod, fortran_real4, ompi_fortran_real4_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL8 +OP_FUNC_3BUF_SIG(prod, fortran_real8, ompi_fortran_real8_t, *) +#endif +#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C +OP_FUNC_3BUF_SIG(prod, fortran_real16, ompi_fortran_real16_t, *) +#endif +/* Complex */ +#if 0 +#if defined(HAVE_SHORT_FLOAT__COMPLEX) +OP_FUNC_3BUF_SIG(prod, c_short_float_complex, short float _Complex, *) +#elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) +COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) +#endif +OP_FUNC_3BUF_SIG(prod, c_float_complex, float _Complex, *) +OP_FUNC_3BUF_SIG(prod, c_double_complex, double _Complex, *) +OP_FUNC_3BUF_SIG(prod, c_long_double_complex, long double _Complex, *) +#endif // 0 + +/************************************************************************* + * Logical AND + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(land, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(land, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(land, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(land, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(land, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(land, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(land, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(land, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(land, long, long) +FUNC_FUNC_3BUF_SIG(land, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF_SIG(land, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF_SIG(land, bool, bool) + +/************************************************************************* + * Logical OR + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(lor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(lor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(lor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(lor, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(lor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(lor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(lor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(lor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(lor, long, long) +FUNC_FUNC_3BUF_SIG(lor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF_SIG(lor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF_SIG(lor, bool, bool) + +/************************************************************************* + * Logical XOR + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(lxor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(lxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(lxor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(lxor, uint16_t, uint16_t) 
+FUNC_FUNC_3BUF_SIG(lxor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(lxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(lxor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(lxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(lxor, long, long) +FUNC_FUNC_3BUF_SIG(lxor, unsigned_long, unsigned long) + +/* Logical */ +#if OMPI_HAVE_FORTRAN_LOGICAL +FUNC_FUNC_3BUF_SIG(lxor, fortran_logical, ompi_fortran_logical_t) +#endif +/* C++ bool */ +FUNC_FUNC_3BUF_SIG(lxor, bool, bool) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(band, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(band, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(band, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(band, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(band, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(band, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(band, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(band, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(band, long, long) +FUNC_FUNC_3BUF_SIG(band, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(band, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(band, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(band, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(band, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(band, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(band, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF_SIG(band, byte, char) + +/************************************************************************* + * Bitwise OR + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(bor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(bor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(bor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(bor, uint16_t, uint16_t) +FUNC_FUNC_3BUF_SIG(bor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(bor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(bor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(bor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(bor, long, long) +FUNC_FUNC_3BUF_SIG(bor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(bor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(bor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF_SIG(bor, byte, char) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ + +/* C integer */ +FUNC_FUNC_3BUF_SIG(bxor, int8_t, int8_t) +FUNC_FUNC_3BUF_SIG(bxor, uint8_t, uint8_t) +FUNC_FUNC_3BUF_SIG(bxor, int16_t, int16_t) +FUNC_FUNC_3BUF_SIG(bxor, uint16_t, uint16_t) 
+FUNC_FUNC_3BUF_SIG(bxor, int32_t, int32_t) +FUNC_FUNC_3BUF_SIG(bxor, uint32_t, uint32_t) +FUNC_FUNC_3BUF_SIG(bxor, int64_t, int64_t) +FUNC_FUNC_3BUF_SIG(bxor, uint64_t, uint64_t) +FUNC_FUNC_3BUF_SIG(bxor, long, long) +FUNC_FUNC_3BUF_SIG(bxor, unsigned_long, unsigned long) + +/* Fortran integer */ +#if OMPI_HAVE_FORTRAN_INTEGER +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer, ompi_fortran_integer_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER1 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer1, ompi_fortran_integer1_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer2, ompi_fortran_integer2_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer4, ompi_fortran_integer4_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer8, ompi_fortran_integer8_t) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 +FUNC_FUNC_3BUF_SIG(bxor, fortran_integer16, ompi_fortran_integer16_t) +#endif +/* Byte */ +FUNC_FUNC_3BUF_SIG(bxor, byte, char) + +/************************************************************************* + * Max location + *************************************************************************/ + +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF_SIG(maxloc, 2real, >) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF_SIG(maxloc, 2double_precision, >) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF_SIG(maxloc, 2integer, >) +#endif +#endif // 0 +LOC_FUNC_3BUF_SIG(maxloc, float_int, >) +LOC_FUNC_3BUF_SIG(maxloc, double_int, >) +LOC_FUNC_3BUF_SIG(maxloc, long_int, >) +LOC_FUNC_3BUF_SIG(maxloc, 2int, >) +LOC_FUNC_3BUF_SIG(maxloc, short_int, >) +LOC_FUNC_3BUF_SIG(maxloc, long_double_int, >) + +/************************************************************************* + * Min location + *************************************************************************/ + +#if 0 +#if OMPI_HAVE_FORTRAN_REAL +LOC_FUNC_3BUF_SIG(minloc, 2real, <) +#endif +#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION +LOC_FUNC_3BUF_SIG(minloc, 2double_precision, <) +#endif +#if OMPI_HAVE_FORTRAN_INTEGER +LOC_FUNC_3BUF_SIG(minloc, 2integer, <) +#endif +#endif // 0 +LOC_FUNC_3BUF_SIG(minloc, float_int, <) +LOC_FUNC_3BUF_SIG(minloc, double_int, <) +LOC_FUNC_3BUF_SIG(minloc, long_int, <) +LOC_FUNC_3BUF_SIG(minloc, 2int, <) +LOC_FUNC_3BUF_SIG(minloc, short_int, <) +LOC_FUNC_3BUF_SIG(minloc, long_double_int, <) + +END_C_DECLS From 2ccaa879113d02e1c4644f2a6401a5788ec7a060 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Tue, 2 May 2023 20:43:22 -0400 Subject: [PATCH 27/74] implemented funcs in accelerator_rocm modules Signed-off-by: Phuong Nguyen --- ompi/mca/op/rocm/Makefile.am | 21 ++- .../rocm/{op_rocm_impl.c => op_rocm_impl.cu} | 0 opal/mca/accelerator/cuda/accelerator_cuda.c | 8 +- opal/mca/accelerator/rocm/accelerator_rocm.h | 5 + .../rocm/accelerator_rocm_component.c | 57 ++++++- .../rocm/accelerator_rocm_module.c | 158 +++++++++++++++++- 6 files changed, 228 insertions(+), 21 deletions(-) rename ompi/mca/op/rocm/{op_rocm_impl.c => op_rocm_impl.cu} (100%) diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index 61b31b2ee5d..252d1833c7d 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -21,13 +21,16 @@ AM_CPPFLAGS = $(common_rocm_CPPFLAGS) sources = op_rocm_component.c op_rocm.h op_rocm_functions.c op_rocm_impl.h #sources_extended = op_rocm_functions.cu -rocm_sources = op_rocm_impl.c +rocm_sources = op_rocm_impl.cu -NVCC = nvcc -allow-unsupported-compiler +HIPCC = hipcc +HIPCCFLAGS= 
-D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -fPIC +#-I/opt/rocm-5.5.0/include -I/opt/rocm-5.5.0/llvm/bin/../lib/clang/16.0.0 + -.c.l$(OBJEXT): +.cu.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(NVCC) -prefer-non-pic $(NVCCFLAGS) -Wc,-Xcompiler,-fPIC,-g -c $< + $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -prefer-non-pic $(HIPCCFLAGS) -Wc,-Xcompiler,-fPIC,-g -c $< # -o $($@.o:.lo) @@ -59,14 +62,14 @@ endif # The DSO should install itself in $(ompilibdir) (by default, # $prefix/lib/openmpi). -#CUDADIR=/nfs/apps/spacks/2023-01-01/opt/spack/linux-centos7-x86_64/gcc-9.5.0/rocm-11.8.0-u2modnncfevx54ibr5dy27sxkirwsf7f +ROCMDIR=/opt/rocm/lib mcacomponentdir = $(ompilibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_op_rocm_la_SOURCES = $(sources) -mca_op_rocm_la_LIBADD = $(rocm_sources:.c=.lo) +mca_op_rocm_la_LIBADD = $(rocm_sources:.cu=.lo) mca_op_rocm_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(accelerator_rocm_LIBS) -L$(CUDADIR)/lib64 -lrocmrt + $(accelerator_rocm_LIBS) -L$(ROCMDIR)/lib -lhiprtc EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) # Specific information for static builds. @@ -76,8 +79,8 @@ EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) noinst_LTLIBRARIES = $(component_noinst) libmca_op_rocm_la_SOURCES = $(sources) -libmca_op_rocm_la_LIBADD = $(rocm_sources:.c=.lo) +libmca_op_rocm_la_LIBADD = $(rocm_sources:.cu=.lo) libmca_op_rocm_la_LDFLAGS = -module -avoid-version\ - $(accelerator_rocm_LIBS) -L$(CUDADIR)/lib64 -lrocmrt + $(accelerator_rocm_LIBS) -L$(ROCMDIR)/lib -lhiprtc EXTRA_libmca_op_rocm_la_SOURCES = $(rocm_sources) diff --git a/ompi/mca/op/rocm/op_rocm_impl.c b/ompi/mca/op/rocm/op_rocm_impl.cu similarity index 100% rename from ompi/mca/op/rocm/op_rocm_impl.c rename to ompi/mca/op/rocm/op_rocm_impl.cu diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index cd028504a48..54415867070 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -36,8 +36,7 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); -static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, - opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size); @@ -471,8 +470,7 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } -static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, - opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) { CUdeviceptr 
tmp; CUresult result; @@ -867,4 +865,4 @@ static int accelerator_cuda_get_num_devices(int *num_devices) { *num_devices = opal_accelerator_cuda_num_devices; return OPAL_SUCCESS; -} \ No newline at end of file +} diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h index fdc062af612..e9afaa51019 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm.h +++ b/opal/mca/accelerator/rocm/accelerator_rocm.h @@ -63,4 +63,9 @@ OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyD2H_limit; OPAL_DECLSPEC extern int opal_accelerator_rocm_lazy_init(void); +OPAL_DECLSPEC extern hipStream_t opal_accelerator_alloc_stream; +OPAL_DECLSPEC extern opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream; +OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_rocm_stream_lock; +OPAL_DECLSPEC extern int opal_accelerator_rocm_num_devices; + #endif diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index 317de021565..605978a9974 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -20,8 +20,10 @@ #include #include "opal/mca/dl/base/base.h" +#include "opal/mca/accelerator/base/base.h" #include "opal/runtime/opal_params.h" #include "accelerator_rocm.h" +#include "opal/util/proc.h" int opal_accelerator_rocm_memcpy_async = 1; int opal_accelerator_rocm_verbose = 0; @@ -32,7 +34,7 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576; static opal_mutex_t accelerator_rocm_init_lock; static bool accelerator_rocm_init_complete = false; -hipStream_t opal_accelerator_rocm_MemcpyStream = NULL; +hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL; /* * Public string showing the accelerator rocm component version number @@ -40,6 +42,17 @@ hipStream_t opal_accelerator_rocm_MemcpyStream = NULL; const char *opal_accelerator_rocm_component_version_string = "OPAL rocm accelerator MCA component version " OPAL_VERSION; +/* Define global variables, used in accelerator_rocm.c */ +//opal_accelerator_rocm_stream_t opal_accelerator_rocm_memcpy_stream = {0}; +hipStream_t opal_accelerator_rocm_alloc_stream = NULL; +opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream = {0}; +opal_mutex_t opal_accelerator_rocm_stream_lock = {0}; +int opal_accelerator_rocm_num_devices = 0; + +/* Initialization lock for delayed rocm initialization */ +static opal_mutex_t accelerator_rocm_init_lock; +static bool accelerator_rocm_init_complete = false; +static int checkmem; #define HIP_CHECK(condition) \ { \ @@ -175,14 +188,46 @@ int opal_accelerator_rocm_lazy_init() goto out; } - err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream); + hipGetDeviceCount(&opal_accelerator_rocm_num_devices); + + /* Create stream for use in cuMemcpyAsync synchronous copies */ + hipStream_t memcpy_stream; + err = hipStreamCreate(&memcpy_stream); + if (OPAL_UNLIKELY(result != hipSuccess)) { + opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true, + OPAL_PROC_MY_HOSTNAME, err); + goto out; + } + opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t)); + *(hipStream_t*)opal_accelerator_rocm_MemcpyStream = memcpy_stream; + + /* Create stream for use in cuMemcpyAsync synchronous copies */ + err = hipStreamCreateWithFlags(&opal_accelerator_rocm_alloc_stream, 0); + if (OPAL_UNLIKELY(err != hipSuccess)) { + opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true, + 
OPAL_PROC_MY_HOSTNAME, err); + goto out; + } + + /* Create a default stream to be used by various components. + * We try to create a high-priority stream and fall back to a regular stream. + */ + hipStream_t *default_stream = malloc(sizeof(hipStream_t)); + int prio_lo, prio_hi; + err = hipDeviceGetStreamPriorityRange(&prio_lo, &prio_hi); - if (hipSuccess != err) { - opal_output(0, "Could not create hipStream, err=%d %s\n", - err, hipGetErrorString(err)); + if (hipSuccess == err) { + err = hipStreamCreateWithPriority(default_stream, + hipStreamNonBlocking, prio_hi); + } else { + err = hipStreamCreateWithFlags(default_stream, 0); + } + if (OPAL_UNLIKELY(err != hipSuccess)) { + opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true, + OPAL_PROC_MY_HOSTNAME, err); goto out; } + OBJ_CONSTRUCT(&opal_accelerator_rocm_default_stream, opal_accelerator_rocm_stream_t); + opal_accelerator_rocm_default_stream.base.stream = default_stream; - err = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_rocm_init_complete = true; out: @@ -193,7 +238,7 @@ int opal_accelerator_rocm_lazy_init() static opal_accelerator_base_module_t* accelerator_rocm_init(void) { OBJ_CONSTRUCT(&accelerator_rocm_init_lock, opal_mutex_t); - + hipError_t err; if (opal_rocm_runtime_initialized) { diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index d5640db2100..71684227480 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -13,8 +13,10 @@ #include "opal/mca/accelerator/base/base.h" #include "opal/constants.h" #include "opal/util/output.h" +#include "opal/util/proc.h" /* Accelerator API's */ +static int mca_accelerator_rocm_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int mca_accelerator_rocm_check_addr(const void *addr, int *dev_id, uint64_t *flags); static int mca_accelerator_rocm_create_stream(int dev_id, opal_accelerator_stream_t **stream); @@ -26,10 +28,13 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_mem_alloc(int dev_id, void **ptr, size_t size); static int mca_accelerator_rocm_mem_release(int dev_id, void *ptr); +static int mca_accelerator_rocm_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream); +static int mca_accelerator_rocm_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int mca_accelerator_rocm_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -42,8 +47,14 @@ static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, i static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream); + +static int mca_accelerator_rocm_get_num_devices(int *num_devices); + opal_accelerator_base_module_t
opal_accelerator_rocm_module = { + mca_accelerator_rocm_get_default_stream, //DONE + mca_accelerator_rocm_check_addr, mca_accelerator_rocm_create_stream, @@ -54,9 +65,12 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_memcpy_async, mca_accelerator_rocm_memcpy, + mca_accelerator_rocm_memmove_async, //DONE mca_accelerator_rocm_memmove, mca_accelerator_rocm_mem_alloc, mca_accelerator_rocm_mem_release, + mca_accelerator_rocm_mem_alloc_stream, //DONE + mca_accelerator_rocm_mem_release_stream, //DONE mca_accelerator_rocm_get_address_range, mca_accelerator_rocm_host_register, @@ -66,7 +80,10 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_get_device_pci_attr, mca_accelerator_rocm_device_can_access_peer, - mca_accelerator_rocm_get_buffer_id + mca_accelerator_rocm_get_buffer_id, + + mca_accelerator_rocm_wait_stream, //DONE + mca_accelerator_rocm_get_num_devices //DONE }; @@ -111,6 +128,16 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 return ret; } +static int mca_accelerator_rocm_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) +{ + int delayed_init = opal_accelerator_rocm_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + *stream = &opal_accelerator_rocm_default_stream; + return OPAL_SUCCESS; +} + static int mca_accelerator_rocm_create_stream(int dev_id, opal_accelerator_stream_t **stream) { if (NULL == stream) { @@ -316,6 +343,43 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de return OPAL_SUCCESS; } + +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) +{ + hipDeviceptr_t tmp; + hipError_t result; + void *ptr; + + int delayed_init = opal_accelerator_rocm_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == dest || NULL == src || size <= 0) { + return OPAL_ERR_BAD_PARAM; + } + + result = mca_accelerator_rocm_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) { + return OPAL_ERROR; + } + tmp = (hipDeviceptr_t)ptr; + result = hipMemcpyAsync(tmp, (hipDeviceptr_t) src, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipMemcpyAsync failed", true, tmp, src, size, + result); + return OPAL_ERROR; + } + result = hipMemcpyAsync((hipDeviceptr_t) dest, tmp, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipMemcpyAsync failed", true, dest, tmp, + size, result); + return OPAL_ERROR; + } + return mca_accelerator_rocm_mem_release_stream(src_dev_id, ptr, stream); +} + + static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type) @@ -566,3 +630,95 @@ static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal #endif return OPAL_SUCCESS; } + +static int mca_accelerator_rocm_mem_alloc_stream( + int dev_id, + void **addr, + size_t size, + opal_accelerator_stream_t *stream) +{ +//#if HIP_VERSION >= ??? 
//TODO + hipError_t result; + + int delayed_init = opal_accelerator_rocm_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == stream || NULL == addr || 0 == size) { + return OPAL_ERR_BAD_PARAM; + } + + /* Try to allocate the memory from a memory pool, if available */ + /* get the default pool */ + hipMemPool_t mpool; + result = hipDeviceGetDefaultMemPool(&mpool, dev_id); + if (hipSuccess == result) { + result = hipMallocFromPoolAsync(addr, size, mpool, *(hipStream_t*)stream->stream); + if (hipSuccess == result) { + return OPAL_SUCCESS; + } + } + if (hipErrorNotSupported != result) { + opal_show_help("help-accelerator-rocm.txt", "hipMallocFromPoolAsync failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + /* fall-back to regular stream allocation */ + + result = hipMallocAsync(addr, size, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipMalloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +//#else +// return mca_accelerator_rocm_mem_alloc(dev_id, addr, size); +//#endif // HIP_VERSION +} + +static int mca_accelerator_rocm_mem_release_stream( + int dev_id, + void *addr, + opal_accelerator_stream_t *stream) +{ +//#if HIP_VERSION >= ??? //TODO + hipError_t result; + + if (NULL == stream || NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = hipFreeAsync(addr, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipMalloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +//#else + /* wait for everything on the device to complete */ +// mca_accelerator_rocm_wait_stream(stream); +// return mca_accelerator_rocm_mem_release(dev_id, addr); +//#endif // HIP_VERSION >= 11020 +} + +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream) +{ + hipError_t result; + result = hipStreamSynchronize(*(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipStreamSynchronize failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + +static int mca_accelerator_rocm_get_num_devices(int *num_devices) +{ + *num_devices = opal_accelerator_rocm_num_devices; + return OPAL_SUCCESS; +} From a6f1cce785567d97031ce292a1d5cd319536d691 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Wed, 3 May 2023 12:17:19 -0400 Subject: [PATCH 28/74] add -I include path to Makefile Signed-off-by: Phuong Nguyen --- ompi/mca/op/rocm/Makefile.am | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index 252d1833c7d..5d986f6a95d 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -24,13 +24,12 @@ sources = op_rocm_component.c op_rocm.h op_rocm_functions.c op_rocm_impl.h rocm_sources = op_rocm_impl.cu HIPCC = hipcc -HIPCCFLAGS= -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -fPIC -#-I/opt/rocm-5.5.0/include -I/opt/rocm-5.5.0/llvm/bin/../lib/clang/16.0.0 +HIPCCFLAGS= -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.5.0/include .cu.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -prefer-non-pic $(HIPCCFLAGS) -Wc,-Xcompiler,-fPIC,-g -c $< + $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -prefer-non-pic 
$(HIPCCFLAGS) -Wc,-fPIC,-g -c $< # -o $($@.o:.lo) @@ -62,14 +61,14 @@ endif # The DSO should install itself in $(ompilibdir) (by default, # $prefix/lib/openmpi). -ROCMDIR=/opt/rocm/lib +ROCMDIR=/opt/rocm-5.5.0/lib mcacomponentdir = $(ompilibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_op_rocm_la_SOURCES = $(sources) mca_op_rocm_la_LIBADD = $(rocm_sources:.cu=.lo) mca_op_rocm_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(accelerator_rocm_LIBS) -L$(ROCMDIR)/lib -lhiprtc + $(accelerator_rocm_LIBS) $(HIPCCFLAGS) -L$(ROCMDIR)/lib -lhiprtc EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) # Specific information for static builds. @@ -81,6 +80,6 @@ noinst_LTLIBRARIES = $(component_noinst) libmca_op_rocm_la_SOURCES = $(sources) libmca_op_rocm_la_LIBADD = $(rocm_sources:.cu=.lo) libmca_op_rocm_la_LDFLAGS = -module -avoid-version\ - $(accelerator_rocm_LIBS) -L$(ROCMDIR)/lib -lhiprtc + $(accelerator_rocm_LIBS) ${HIPCCFLAGS} -L$(ROCMDIR)/lib -lhiprtc EXTRA_libmca_op_rocm_la_SOURCES = $(rocm_sources) From ce0b88d57637cb4f38207946f31af069e1fb947c Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 15 May 2023 20:08:38 -0400 Subject: [PATCH 29/74] added rocm codes into test example Signed-off-by: Phuong Nguyen --- test/datatype/reduce_local.c | 49 +++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index b412479a93a..36733fb9e45 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -21,7 +21,8 @@ #include // TODO: detect through configure -#define HAVE_CUDA 1 +//#define HAVE_CUDA 1 +#define HAVE_ROCM 1 #include "mpi.h" #include "ompi/communicator/communicator.h" @@ -230,7 +231,7 @@ static allocator_t host_allocator = { .free = &host_free, .fini = &host_fini}; -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) #include static void cuda_init() { // nothing to be done @@ -262,6 +263,40 @@ static allocator_t cuda_allocator = { .memcpy = &cuda_memcpy, .free = &cuda_free, .fini = &cuda_fini}; + +#elif defined(HAVE_ROCM) +#include +static void rocm_init() { + // nothing to be done +} +static void *rocm_allocate(size_t size, size_t align) { + (void)align; // ignored + void *ptr; + int err; + if (hipSuccess != (err = hipMalloc(&ptr, size))) { + fprintf(stderr, "hipMalloc failed to allocate %zuB: %s", size, hipGetErrorName(err)); + return NULL; + } + return ptr; +} +static void* rocm_memcpy(void *dst, const void *src, size_t size) { + hipMemcpy(dst, src, size, hipMemcpyDefault); + return dst; +} +static void rocm_free(void *ptr) { + hipFree(ptr); +} +static void rocm_fini() { + // nothing to be done +} +static allocator_t rocm_allocator = { + .flags = ALLOCATOR_DISCRETE, + .init = &rocm_init, + .allocate = &rocm_allocate, + .memcpy = &rocm_memcpy, + .free = &rocm_free, + .fini = &rocm_fini}; + #endif int main(int argc, char **argv) @@ -356,11 +391,16 @@ int main(int argc, char **argv) // default allocator break; } else -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) if (0 == strncmp("cuda", optarg, 4)) { allocator = &cuda_allocator; break; } else +#elif defined(HAVE_ROCM) + if (0 == strncmp("rocm", optarg, 4)) { + allocator = &rocm_allocator; + break; + } else #endif { fprintf(stderr, "Unsupported allocator: %s\n", optarg); @@ -379,6 +419,9 @@ int main(int argc, char **argv) " -d : host" #ifdef HAVE_CUDA ", cuda" +#endif +#ifdef HAVE_ROCM + ", rocm" #endif "\n" " -i : shift on all buffers to check alignment\n" From 
ad420fe001b8d26b7afa42120d70fd9cad659550 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Tue, 16 May 2023 14:58:02 -0400 Subject: [PATCH 30/74] fixed kernel launches in hip Signed-off-by: Phuong Nguyen --- ompi/mca/op/rocm/Makefile.am | 11 ++++---- ompi/mca/op/rocm/op_rocm_component.c | 5 ++++ ompi/mca/op/rocm/op_rocm_functions.c | 4 ++- .../{op_rocm_impl.cu => op_rocm_impl.cpp} | 27 ++++++++++++------- 4 files changed, 31 insertions(+), 16 deletions(-) rename ompi/mca/op/rocm/{op_rocm_impl.cu => op_rocm_impl.cpp} (97%) diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index 5d986f6a95d..4994f7dea50 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -9,7 +9,7 @@ # $HEADER$ # -# This component provides support for offloading reduce ops to CUDA devices. +# This component provides support for offloading reduce ops to ROCM devices. # # See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent # for more details on how to make Open MPI components. @@ -20,14 +20,13 @@ AM_CPPFLAGS = $(common_rocm_CPPFLAGS) sources = op_rocm_component.c op_rocm.h op_rocm_functions.c op_rocm_impl.h -#sources_extended = op_rocm_functions.cu -rocm_sources = op_rocm_impl.cu +rocm_sources = op_rocm_impl.cpp HIPCC = hipcc HIPCCFLAGS= -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.5.0/include -.cu.l$(OBJEXT): +.cpp.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -prefer-non-pic $(HIPCCFLAGS) -Wc,-fPIC,-g -c $< @@ -66,7 +65,7 @@ ROCMDIR=/opt/rocm-5.5.0/lib mcacomponentdir = $(ompilibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_op_rocm_la_SOURCES = $(sources) -mca_op_rocm_la_LIBADD = $(rocm_sources:.cu=.lo) +mca_op_rocm_la_LIBADD = $(rocm_sources:.cpp=.lo) mca_op_rocm_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ $(accelerator_rocm_LIBS) $(HIPCCFLAGS) -L$(ROCMDIR)/lib -lhiprtc EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) @@ -78,7 +77,7 @@ EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) noinst_LTLIBRARIES = $(component_noinst) libmca_op_rocm_la_SOURCES = $(sources) -libmca_op_rocm_la_LIBADD = $(rocm_sources:.cu=.lo) +libmca_op_rocm_la_LIBADD = $(rocm_sources:.cpp=.lo) libmca_op_rocm_la_LDFLAGS = -module -avoid-version\ $(accelerator_rocm_LIBS) ${HIPCCFLAGS} -L$(ROCMDIR)/lib -lhiprtc EXTRA_libmca_op_rocm_la_SOURCES = $(rocm_sources) diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c index 87439f63ed7..85477f18be4 100644 --- a/ompi/mca/op/rocm/op_rocm_component.c +++ b/ompi/mca/op/rocm/op_rocm_component.c @@ -97,6 +97,7 @@ static int rocm_component_close(void) mca_op_rocm_component.ro_num_devices = 0; } + printf("op rocm_component_close\n"); return OMPI_SUCCESS; } @@ -108,6 +109,7 @@ rocm_component_register(void) { /* TODO: add mca paramters */ + printf("op rocm_component_register\n"); return OMPI_SUCCESS; } @@ -148,6 +150,8 @@ rocm_component_init_query(bool enable_progress_threads, /* fall-back to value that should work on every device */ mca_op_rocm_component.ro_max_threads_per_block[i] = 512; } + //TODO + printf("OUTPUT - nthreads: %d\n", mca_op_rocm_component.ro_max_threads_per_block[i]); } #if 0 @@ -185,5 +189,6 @@ rocm_component_op_query(struct ompi_op_t *op, int *priority) } } *priority = 50; + printf("op rocm_component_op_query\n"); return (ompi_op_base_module_1_0_0_t *) module; } diff --git a/ompi/mca/op/rocm/op_rocm_functions.c b/ompi/mca/op/rocm/op_rocm_functions.c index 717edb94094..a4d15c999a3 
100644 --- a/ompi/mca/op/rocm/op_rocm_functions.c +++ b/ompi/mca/op/rocm/op_rocm_functions.c @@ -57,6 +57,8 @@ static inline void device_op_pre(const void *orig_source1, source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); *device = *target_device; + // TODO + printf("OUT - target device & rc %d %d source %d %d\n", *target_device, target_rc, *source1_device, source1_rc); if (NULL != orig_source2) { source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", @@ -1793,4 +1795,4 @@ ompi_op_base_3buff_stream_handler_fn_t ompi_op_rocm_3buff_functions[OMPI_OP_BASE ACCUMULATE */ NULL, }, - }; \ No newline at end of file + }; diff --git a/ompi/mca/op/rocm/op_rocm_impl.cu b/ompi/mca/op/rocm/op_rocm_impl.cpp similarity index 97% rename from ompi/mca/op/rocm/op_rocm_impl.cu rename to ompi/mca/op/rocm/op_rocm_impl.cpp index 9f964c3a4b7..c174bc31f29 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.cu +++ b/ompi/mca/op/rocm/op_rocm_impl.cpp @@ -44,7 +44,9 @@ int blocks = (count + threads-1) / threads; \ int n = count; \ hipStream_t s = stream; \ - ompi_op_rocm_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, s,\ + in, inout, n); \ } @@ -67,7 +69,9 @@ int blocks = (count + threads-1) / threads; \ int n = count; \ hipStream_t s = stream; \ - ompi_op_rocm_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, s, \ + in, inout, n); \ } /* @@ -106,7 +110,9 @@ int threads = threads_per_block; \ int blocks = (count + threads-1) / threads; \ hipStream_t s = stream; \ - ompi_op_rocm_2buff_##name##_##type_name##_kernel<<>>(a, b, count); \ + hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, s, \ + a, b, count); \ } /************************************************************************* @@ -395,8 +401,9 @@ LOC_FUNC(minloc, long_double_int, <) hipStream_t stream) { \ int threads = threads_per_block; \ int blocks = (count+threads-1) / threads; \ - ompi_op_rocm_3buff_##name##_##type_name##_kernel<<>>(in1, in2, out, count); \ + hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, stream, \ + in1, in2, out, count); \ } @@ -424,8 +431,9 @@ LOC_FUNC(minloc, long_double_int, <) hipStream_t stream) { \ int threads = threads_per_block; \ int blocks = (count+threads-1) / threads; \ - ompi_op_rocm_3buff_##name##_##type_name##_kernel<<>>(in1, in2, out, count); \ + hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, stream, \ + in1, in2, out, count); \ } /* @@ -478,8 +486,9 @@ LOC_FUNC(minloc, long_double_int, <) { \ int threads = threads_per_block; \ int blocks = (count+threads-1) / threads; \ - ompi_op_rocm_3buff_##name##_##type_name##_kernel<<>>(in1, in2, out, count); \ + hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, stream, \ + in1, in2, out, count); \ } From c3c32873a7fdde7b0fb0f0be5ac3b4ad0f506db5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 27 Jun 2023 13:54:07 +0000 Subject: [PATCH 31/74] Make headers in reduce_local better parsable Signed-off-by: Joseph Schuchart --- test/datatype/reduce_local.c | 24 
+++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index 36733fb9e45..3306c34f86f 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -68,20 +68,31 @@ static int total_errors = 0; _a < _b ? _a : _b; \ }) +static void print_header(int max_shift) { + printf("%-10s %-10s %-10s %-10s %-10s", "Op", "Type", "TypeSize", "Check", "Count"); + if (1 == max_shift) { + printf("%-10s", "'Time (seconds)'"); + } else { + for (int i = 0; i < max_shift; ++i) { + printf(" %-10s ", "'Shift %d [s]'"); + } + } + printf("\n"); +} + static void print_status(char *op, char *type, int type_size, int count, int max_shift, double *duration, int repeats, int correct) { if (correct) { - printf("%-10s %s %-10d%s ", op, type, type_size, - (verbose ? " [\033[1;32msuccess\033[0m]" : "")); + printf("%-10s %s %-10d success", op, type, type_size); } else { - printf("%-10s %s [\033[1;31mfail\033[0m]", op, type); + printf("%-10s %s %-10d [\033[1;31mfail\033[0m]", op, type, type_size); total_errors++; } if (1 == max_shift) { - printf(" count %-10d time (seconds) %.8f seconds\n", count, duration[0] / repeats); + printf(" %-10d %.8f\n", count, duration[0] / repeats); } else { - printf(" count %-10d time (seconds / shifts) ", count); + printf(" %-10d ", count); for (int i = 0; i < max_shift; i++) { printf("%.8f ", duration[i] / repeats); } @@ -248,6 +259,7 @@ static void *cuda_allocate(size_t size, size_t align) { } static void* cuda_memcpy(void *dst, const void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyDefault); + cudaDeviceSynchronize(); return dst; } static void cuda_free(void *ptr) { @@ -453,6 +465,8 @@ int main(int argc, char **argv) size = ompi_comm_size(MPI_COMM_WORLD); (void) size; + print_header(max_shift); + for (uint32_t type_idx = 0; type_idx < strlen(type); type_idx++) { for (uint32_t op_idx = 0; do_ops[op_idx] >= 0; op_idx++) { op = array_of_ops[do_ops[op_idx]].name; From 9674aae641328a5714582f444f1271bbea4c5b2a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 27 Jun 2023 13:54:58 +0000 Subject: [PATCH 32/74] CUDA: disable internal memory pool (seems broken) Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/cuda/accelerator_cuda.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 54415867070..d953cb955b9 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -537,15 +537,17 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) return OPAL_ERR_BAD_PARAM; } +#if 0 /* prefer managed memory */ result = cudaMallocManaged(ptr, size, cudaMemAttachGlobal); if (cudaSuccess == result) { return OPAL_SUCCESS; } +#endif // 0 /* fall-back to discrete memory */ -#if CUDA_VERSION >= 11020 +#if CUDA_VERSION >= 11020 && 0 /* Try to allocate the memory from a memory pool, if available */ /* get the default pool */ cudaMemPool_t mpool; @@ -556,6 +558,7 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) /* this is a blocking function, so wait for the allocation to happen */ result = cuStreamSynchronize(opal_accelerator_cuda_alloc_stream); if (cudaSuccess == result) { + printf("CUDA from mempool %p\n", *ptr); return OPAL_SUCCESS; } } @@ -574,6 +577,7 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) OPAL_PROC_MY_HOSTNAME, result); 
return OPAL_ERROR; } + //printf("CUDA from cuMemAlloc %p\n", *ptr); return 0; } From 628c0f1cfc78bbb0463e67f7d88bdff7e63c353d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 27 Jun 2023 13:58:47 +0000 Subject: [PATCH 33/74] Op: minor comment correction Signed-off-by: Joseph Schuchart --- ompi/op/op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/op/op.h b/ompi/op/op.h index 9fb0b66203a..814a8464030 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -831,7 +831,7 @@ static inline void ompi_op_preferred_device(ompi_op_t *op, int source_dev, if (!ompi_op_is_intrinsic (op)) { return; } - /* quick check: can we execute on both sides? */ + /* quick check: can we execute on the device? */ int dtype_id = ompi_op_ddt_map[dtype->id]; if (NULL == op->o_device_op || NULL == op->o_device_op->do_intrinsic.fns[dtype_id]) { /* not available on the gpu, must select host */ From 251dac453a69711a46f1a1186336566e01280fb4 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jun 2023 17:09:56 +0000 Subject: [PATCH 34/74] Reduce_local: set hip device during init Signed-off-by: Joseph Schuchart --- test/datatype/reduce_local.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index 3306c34f86f..e5a2d078abd 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -74,7 +74,9 @@ static void print_header(int max_shift) { printf("%-10s", "'Time (seconds)'"); } else { for (int i = 0; i < max_shift; ++i) { - printf(" %-10s ", "'Shift %d [s]'"); + char str[128]; + snprintf(str, 128, "'Shift %d [s]'", i); + printf(" %-10s ", str); } } printf("\n"); @@ -279,7 +281,14 @@ static allocator_t cuda_allocator = { #elif defined(HAVE_ROCM) #include static void rocm_init() { - // nothing to be done + hipError_t ret = hipInit(0); + assert(hipSuccess == ret); + int num_devs = 0; + ret = hipGetDeviceCount(&num_devs); + assert(hipSuccess == ret); + assert(num_devs > 0); + ret = hipSetDevice(0); + assert(hipSuccess == ret); } static void *rocm_allocate(size_t size, size_t align) { (void)align; // ignored From 7589d1748a468a124b4f57d2ca3e87aa1cbfac77 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jun 2023 17:10:30 +0000 Subject: [PATCH 35/74] CUDA accelerator: fix compiler warnings Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/cuda/accelerator_cuda.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index d953cb955b9..cc7b7c38489 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -259,7 +259,7 @@ static int accelerator_cuda_get_default_stream(int dev_id, opal_accelerator_stre if (OPAL_UNLIKELY(0 != delayed_init)) { return delayed_init; } - *stream = &opal_accelerator_cuda_default_stream; + *stream = &opal_accelerator_cuda_default_stream.base; return OPAL_SUCCESS; } @@ -460,7 +460,7 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERROR; } result = cuStreamSynchronize(*(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream); -#endif 0 +#endif //0 result = cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, @@ -511,11 +511,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, int 
ret; CUresult result; - ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream, type); + ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream.base, type); if (OPAL_SUCCESS != ret) { return OPAL_ERROR; } - result = accelerator_cuda_wait_stream(&opal_accelerator_cuda_memcpy_stream); + result = accelerator_cuda_wait_stream(&opal_accelerator_cuda_memcpy_stream.base); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); From ead6847df35e62997b3eda9a9b557505c5a3e383 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jun 2023 17:13:17 +0000 Subject: [PATCH 36/74] Device op: pass device to lower-level op to avoid recurring queries Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 22 +- ompi/mca/coll/base/coll_base_reduce.c | 2 +- ompi/mca/op/cuda/op_cuda.h | 1 + ompi/mca/op/cuda/op_cuda_component.c | 24 +- ompi/mca/op/cuda/op_cuda_functions.c | 148 ++++++----- ompi/mca/op/cuda/op_cuda_impl.cu | 187 +++++++++---- ompi/mca/op/cuda/op_cuda_impl.h | 54 ++-- ompi/mca/op/op.h | 6 +- ompi/mca/op/rocm/Makefile.am | 4 +- ompi/mca/op/rocm/configure.m4 | 36 +++ ompi/mca/op/rocm/op_rocm.h | 1 + ompi/mca/op/rocm/op_rocm_component.c | 27 +- ompi/mca/op/rocm/op_rocm_functions.c | 191 ++++++++------ ompi/mca/op/rocm/op_rocm_impl.cpp | 249 ++++++++++++------ ompi/mca/op/rocm/op_rocm_impl.h | 46 ++-- ompi/op/op.h | 95 ++++--- .../rocm/accelerator_rocm_module.c | 2 + 17 files changed, 683 insertions(+), 412 deletions(-) create mode 100644 ompi/mca/op/rocm/configure.m4 diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index b263f900428..504cd9468c5 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -241,10 +241,10 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (tmpsend == sbuf) { tmpsend = inplacebuf; /* tmpsend = tmprecv (op) sbuf */ - ompi_3buff_op_reduce_stream(op, sbuf, tmprecv, tmpsend, count, dtype, stream); + ompi_3buff_op_reduce_stream(op, sbuf, tmprecv, tmpsend, count, dtype, op_dev, stream); } else { /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); + ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, op_dev, stream); } newrank = rank >> 1; } @@ -283,14 +283,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (tmpsend == sbuf) { /* special case: 1st iteration takes one input from the sbuf */ /* tmprecv = sbuf (op) tmprecv */ - ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, stream); + ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, op_dev, stream); /* send the current recv buffer, and use the tmp buffer to receive */ tmpsend = tmprecv; tmprecv = inplacebuf; } else if (have_next_iter || tmprecv == recvbuf) { /* All iterations, and the last if tmprecv is the recv buffer */ /* tmprecv = tmpsend (op) tmprecv */ - ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, stream); + ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, op_dev, stream); /* swap send and receive buffers */ tmpswap = tmprecv; tmprecv = tmpsend; @@ -299,7 +299,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Last iteration if tmprecv is not the recv buffer, then tmpsend is */ /* Make 
sure we reduce into the receive buffer * tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); + ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, op_dev, stream); } } else { if (tmpsend == sbuf) { @@ -307,18 +307,18 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* tmpsend = tmprecv (op) sbuf */ tmpsend = inplacebuf; if (have_next_iter || tmpsend == recvbuf) { - ompi_3buff_op_reduce_stream(op, tmprecv, sbuf, tmpsend, count, dtype, stream); + ompi_3buff_op_reduce_stream(op, tmprecv, sbuf, tmpsend, count, dtype, op_dev, stream); } else { - ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, stream); + ompi_op_reduce_stream(op, sbuf, tmprecv, count, dtype, op_dev, stream); tmpsend = tmprecv; } } else if (have_next_iter || tmpsend == recvbuf) { /* All other iterations: reduce into tmpsend for next iteration */ /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, stream); + ompi_op_reduce_stream(op, tmprecv, tmpsend, count, dtype, op_dev, stream); } else { /* Last iteration: reduce into rbuf and set tmpsend to rbuf (needed at the end) */ - ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, stream); + ompi_op_reduce_stream(op, tmpsend, tmprecv, count, dtype, op_dev, stream); tmpsend = tmprecv; } } @@ -1255,11 +1255,11 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* Reduce on the right half of the buffers (result in rbuf) */ if (MPI_IN_PLACE != sbuf) { /* rbuf = sbuf (op) tmp_buf */ - ompi_3buff_op_reduce_stream(op, sbuf, tmp_buf, recvbuf, count_lhalf, dtype, stream); + ompi_3buff_op_reduce_stream(op, sbuf, tmp_buf, recvbuf, count_lhalf, dtype, op_dev, stream); } else { /* rbuf = rbuf (op) tmp_buf */ - ompi_op_reduce_stream(op, tmp_buf, recvbuf, count_lhalf, dtype, stream); + ompi_op_reduce_stream(op, tmp_buf, recvbuf, count_lhalf, dtype, op_dev, stream); } diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index e7cf7e0656a..72efde701f7 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -114,7 +114,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplify the loops */ - + if (!ompi_op_is_commute(op) && MPI_IN_PLACE != sendbuf) { ompi_datatype_copy_content_same_ddt(datatype, original_count, (char*)accumbuf, diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index 0298b64b561..86fdc3c6ace 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -56,6 +56,7 @@ typedef struct { CUcontext *cu_ctx; #endif // 0 int *cu_max_threads_per_block; + int *cu_max_blocks; CUdevice *cu_devices; int cu_num_devices; } ompi_op_cuda_component_t; diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index a377e477bc7..b519ffe60ec 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -59,6 +59,7 @@ ompi_op_cuda_component_t mca_op_cuda_component = { .opc_op_query = cuda_component_op_query, }, .cu_max_threads_per_block = NULL, + .cu_max_blocks = NULL, .cu_devices = NULL, .cu_num_devices = 0, }; @@ -92,6 +93,8 @@ static int cuda_component_close(void) //cuStreamDestroy(mca_op_cuda_component.cu_stream); free(mca_op_cuda_component.cu_max_threads_per_block); mca_op_cuda_component.cu_max_threads_per_block = NULL; + 
free(mca_op_cuda_component.cu_max_blocks); + mca_op_cuda_component.cu_max_blocks = NULL; free(mca_op_cuda_component.cu_devices); mca_op_cuda_component.cu_devices = NULL; mca_op_cuda_component.cu_num_devices = 0; @@ -127,27 +130,24 @@ cuda_component_init_query(bool enable_progress_threads, CHECK(cuDeviceGetCount, (&num_devices)); mca_op_cuda_component.cu_num_devices = num_devices; mca_op_cuda_component.cu_devices = (CUdevice*)malloc(num_devices*sizeof(CUdevice)); -#if 0 - mca_op_cuda_component.cu_ctx = (CUcontext*)malloc(num_devices*sizeof(CUcontext)); -#endif // 0 mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int)); + mca_op_cuda_component.cu_max_blocks = (int*)malloc(num_devices*sizeof(int)); for (int i = 0; i < num_devices; ++i) { CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i)); -#if 0 - rc = cuCtxCreate(&mca_op_cuda_component.cu_ctx[i], - 0, mca_op_cuda_component.cu_devices[i]); - if (CUDA_SUCCESS != rc) { - CHECK(cuDevicePrimaryCtxRetain, - (&mca_op_cuda_component.cu_ctx[i], mca_op_cuda_component.cu_devices[i])); - } -#endif // 0 rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_threads_per_block[i], - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, mca_op_cuda_component.cu_devices[i]); if (CUDA_SUCCESS != rc) { /* fall-back to value that should work on every device */ mca_op_cuda_component.cu_max_threads_per_block[i] = 512; } + rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_blocks[i], + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + mca_op_cuda_component.cu_devices[i]); + if (CUDA_SUCCESS != rc) { + /* fall-back to value that should work on every device */ + mca_op_cuda_component.cu_max_blocks[i] = 512; + } } #if 0 diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 55dbd38a1a8..f3e211aad1e 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -41,6 +41,7 @@ static inline void device_op_pre(const void *orig_source1, int count, struct ompi_datatype_t *dtype, int *threads_per_block, + int *max_blocks, int *device, opal_accelerator_stream_t *stream) { @@ -53,80 +54,91 @@ static inline void device_op_pre(const void *orig_source1, *source2 = (void*)orig_source2; } - target_rc = opal_accelerator.check_addr(*target, target_device, &target_flags); - source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); - *device = *target_device; + if (*device != MCA_ACCELERATOR_NO_DEVICE_ID) { + /* we got the device from the caller, just adjust the output parameters */ + *target_device = *device; + *source1_device = *device; + if (NULL != source2_device) { + *source2_device = *device; + } + } else { - if (NULL != orig_source2) { - source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); - //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", - // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, *source2_device, *device); - } + target_rc = opal_accelerator.check_addr(*target, target_device, &target_flags); + source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); + *device = *target_device; - //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", - // target_rc, *target_device, source_rc, *source_device, *device); - - if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { - /* no buffers are on any device, 
select device 0 */ - *device = 0; - } else if (*target_device == -1) { - if (*source1_device == -1 && NULL != orig_source2) { - *device = *source2_device; - } else { - *device = *source1_device; + if (NULL != orig_source2) { + source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); + //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", + // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, *source2_device, *device); } - } - if (0 == target_rc || 0 == source1_rc || *target_device != *source1_device) { - size_t nbytes; - ompi_datatype_type_size(dtype, &nbytes); - nbytes *= count; + //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", + // target_rc, *target_device, source_rc, *source_device, *device); - if (0 == target_rc) { - // allocate memory on the device for the target buffer - //printf("copying target from device %d to host\n", *target_device); - opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); - CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*target, orig_target, nbytes, *(CUstream*)stream->stream)); - *target_device = -1; // mark target device as host + if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { + /* no buffers are on any device, select device 0 */ + *device = 0; + } else if (*target_device == -1) { + if (*source1_device == -1 && NULL != orig_source2) { + *device = *source2_device; + } else { + *device = *source1_device; + } } - if (0 == source1_rc || *device != *source1_device) { + if (0 == target_rc || 0 == source1_rc || *target_device != *source1_device) { + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + if (0 == target_rc) { + // allocate memory on the device for the target buffer + //printf("copying target from device %d to host\n", *target_device); + opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*target, orig_target, nbytes, *(CUstream*)stream->stream)); + *target_device = -1; // mark target device as host + } + + if (0 == source1_rc || *device != *source1_device) { + // allocate memory on the device for the source buffer + //printf("allocating source on device %d\n", *device); + opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); + if (0 == source1_rc) { + /* copy from host to device */ + //printf("copying source from host to device %d\n", *device); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source1, orig_source1, nbytes, *(CUstream*)stream->stream)); + } else { + /* copy from one device to another device */ + /* TODO: does this actually work? Can we enable P2P? 
*/ + //printf("attempting cross-device copy for source\n"); + CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source1, (CUdeviceptr)orig_source1, nbytes, *(CUstream*)stream->stream)); + } + } + + } + if (NULL != source2_device && *target_device != *source2_device) { // allocate memory on the device for the source buffer //printf("allocating source on device %d\n", *device); - opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); - if (0 == source1_rc) { + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + opal_accelerator.mem_alloc_stream(*device, source2, nbytes, stream); + if (0 == source2_rc) { /* copy from host to device */ //printf("copying source from host to device %d\n", *device); - CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source1, orig_source1, nbytes, *(CUstream*)stream->stream)); + CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source2, orig_source2, nbytes, *(CUstream*)stream->stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ //printf("attempting cross-device copy for source\n"); - CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source1, (CUdeviceptr)orig_source1, nbytes, *(CUstream*)stream->stream)); + CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source2, (CUdeviceptr)orig_source2, nbytes, *(CUstream*)stream->stream)); } } - - } - if (NULL != source2_device && *target_device != *source2_device) { - // allocate memory on the device for the source buffer - //printf("allocating source on device %d\n", *device); - size_t nbytes; - ompi_datatype_type_size(dtype, &nbytes); - nbytes *= count; - - opal_accelerator.mem_alloc_stream(*device, source2, nbytes, stream); - if (0 == source2_rc) { - /* copy from host to device */ - //printf("copying source from host to device %d\n", *device); - CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source2, orig_source2, nbytes, *(CUstream*)stream->stream)); - } else { - /* copy from one device to another device */ - /* TODO: does this actually work? Can we enable P2P? 
*/ - //printf("attempting cross-device copy for source\n"); - CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source2, (CUdeviceptr)orig_source2, nbytes, *(CUstream*)stream->stream)); - } } *threads_per_block = mca_op_cuda_component.cu_max_threads_per_block[*device]; + *max_blocks = mca_op_cuda_component.cu_max_blocks[*device]; } static inline void device_op_post(void *source1, @@ -141,7 +153,7 @@ static inline void device_op_post(void *source1, int device, opal_accelerator_stream_t *stream) { - if (-1 == target_device) { + if (MCA_ACCELERATOR_NO_DEVICE_ID == target_device) { size_t nbytes; ompi_datatype_type_size(dtype, &nbytes); @@ -150,7 +162,7 @@ static inline void device_op_post(void *source1, CHECK(cuMemcpyDtoHAsync, (orig_target, (CUdeviceptr)target, nbytes, *(CUstream *)stream->stream)); } - if (-1 == target_device) { + if (MCA_ACCELERATOR_NO_DEVICE_ID == target_device) { opal_accelerator.mem_release_stream(device, target, stream); //CHECK(cuMemFreeAsync, ((CUdeviceptr)target, mca_op_cuda_component.cu_stream)); } @@ -168,19 +180,20 @@ static inline void device_op_post(void *source1, static \ void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ - int threads_per_block; \ - int source_device, target_device, device; \ + int threads_per_block, max_blocks; \ + int source_device, target_device; \ type *source, *target; \ int n = *count; \ device_op_pre(in, (void**)&source, &source_device, NULL, NULL, NULL, \ inout, (void**)&target, &target_device, \ n, *dtype, \ - &threads_per_block, &device, stream); \ + &threads_per_block, &max_blocks, &device, stream); \ CUstream *custream = (CUstream*)stream->stream; \ - ompi_op_cuda_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, *custream); \ - device_op_post(source, source_device, NULL, -1, inout, target, target_device, n, *dtype, device, stream);\ + ompi_op_cuda_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, max_blocks, *custream); \ + device_op_post(source, source_device, NULL, -1, inout, target, target_device, n, *dtype, device, stream); \ } #define OP_FUNC(name, type_name, type, op, ...) 
FUNC(name, __VA_ARGS__##type_name, __VA_ARGS__##type) @@ -784,21 +797,22 @@ LOC_FUNC(minloc, long_double_int, <) */ #define FUNC_3BUF(name, type_name, type) \ static \ - void ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ + void ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ - int threads_per_block; \ - int source1_device, source2_device, target_device, device; \ + int threads_per_block, max_blocks; \ + int source1_device, source2_device, target_device; \ type *source1, *source2, *target; \ int n = *count; \ device_op_pre(in1, (void**)&source1, &source1_device, \ in2, (void**)&source2, &source2_device, \ out, (void**)&target, &target_device, \ n, *dtype, \ - &threads_per_block, &device, stream); \ + &threads_per_block, &max_blocks, &device, stream); \ CUstream *custream = (CUstream*)stream->stream; \ - ompi_op_cuda_3buff_##name##_##type_name##_submit(source1, source2, target, n, threads_per_block, *custream);\ + ompi_op_cuda_3buff_##name##_##type_name##_submit(source1, source2, target, n, threads_per_block, max_blocks, *custream);\ device_op_post(source1, source1_device, source2, source2_device, out, target, target_device, n, *dtype, device, stream);\ } diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index 2045d6a4aaa..6ee3e0a512b 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -15,22 +15,23 @@ #include +#include + #include "op_cuda_impl.h" /* TODO: missing support for * - short float (conditional on whether short float is available) * - complex - * - 3buff implementation */ -#define THREADS_PER_BLOCK 512 - #define OP_FUNC(name, type_name, type, op) \ static __global__ void \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ + /*if (index < n) { int i = index;*/ \ inout[i] = inout[i] op in[i]; \ } \ } \ @@ -38,18 +39,54 @@ type *inout, \ int count, \ int threads_per_block, \ + int max_blocks, \ CUstream stream) { \ - int threads = threads_per_block; \ - int blocks = (count + threads-1) / threads; \ + int threads = min(count, threads_per_block); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ int n = count; \ CUstream s = stream; \ ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ } +#if defined(USE_VECTORS) +#define OPV_FUNC(name, type_name, type, vtype, vlen, op) \ + static __global__ void \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n/vlen; i += stride) { \ + ((vtype*)inout)[i] = ((vtype*)inout)[i] op ((vtype*)in)[i]; \ + } \ + int remainder = n%vlen; \ + if (index == (n/vlen) && remainder != 0) { \ + while(remainder) { \ + int idx = n - remainder--; \ + inout[idx] = inout[idx] op in[idx]; \ + } \ + } \ + } \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int 
threads_per_block, \ + int max_blocks, \ + CUstream stream) { \ + int threads = min(count/vlen, threads_per_block); \ + int blocks = min(((count/vlen) + threads-1) / threads, max_blocks); \ + int n = count; \ + CUstream s = stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + } +#else // USE_VECTORS +#define OPV_FUNC(name, type_name, type, vtype, vlen, op) OP_FUNC(name, type_name, type, op) +#endif // USE_VECTORS + #define FUNC_FUNC(name, type_name, type) \ static __global__ void \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ @@ -61,11 +98,13 @@ type *inout, \ int count, \ int threads_per_block, \ + int max_blocks, \ CUstream stream) { \ - int threads = threads_per_block; \ - int blocks = (count + threads-1) / threads; \ + int threads = min(count, threads_per_block); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ int n = count; \ CUstream s = stream; \ + blocks = (blocks > 64) ? 64 : blocks; \ ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ } @@ -79,8 +118,8 @@ #define LOC_FUNC(name, type_name, op) \ static __global__ void \ - ompi_op_cuda_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in, \ - ompi_op_predefined_##type_name##_t *inout, \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *__restrict__ in, \ + ompi_op_predefined_##type_name##_t *__restrict__ inout, \ int n) \ { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ @@ -101,9 +140,10 @@ ompi_op_predefined_##type_name##_t *b, \ int count, \ int threads_per_block, \ + int max_blocks, \ CUstream stream) { \ - int threads = threads_per_block; \ - int blocks = (count + threads-1) / threads; \ + int threads = min(count, threads_per_block); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ CUstream s = stream; \ ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(a, b, count); \ } @@ -113,7 +153,11 @@ *************************************************************************/ #undef current_func +#if defined(DO_NOT_USE_INTRINSICS) #define current_func(a, b) ((a) > (b) ? (a) : (b)) +#else // DO_NOT_USE_INTRINSICS +#define current_func(a, b) max(a, b) +#endif // DO_NOT_USE_INTRINSICS /* C integer */ FUNC_FUNC(max, int8_t, int8_t) FUNC_FUNC(max, uint8_t, uint8_t) @@ -126,16 +170,32 @@ FUNC_FUNC(max, uint64_t, uint64_t) FUNC_FUNC(max, long, long) FUNC_FUNC(max, unsigned_long, unsigned long) +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) +FUNC_FUNC(max, long_double, long double) + +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fmaxf(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(max, float, float) + +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fmax(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(max, double, double) -FUNC_FUNC(max, long_double, long double) /************************************************************************* * Min *************************************************************************/ #undef current_func +#if defined(DO_NOT_USE_INTRINSICS) #define current_func(a, b) ((a) < (b) ? 
(a) : (b)) +#else // DO_NOT_USE_INTRINSICS +#define current_func(a, b) min(a, b) +#endif // DO_NOT_USE_INTRINSICS /* C integer */ FUNC_FUNC(min, int8_t, int8_t) FUNC_FUNC(min, uint8_t, uint8_t) @@ -149,8 +209,20 @@ FUNC_FUNC(min, long, long) FUNC_FUNC(min, unsigned_long, unsigned long) +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fminf(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(min, float, float) + +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fmin(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(min, double, double) + +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) FUNC_FUNC(min, long_double, long double) /************************************************************************* @@ -158,20 +230,20 @@ FUNC_FUNC(min, long_double, long double) *************************************************************************/ /* C integer */ -OP_FUNC(sum, int8_t, int8_t, +=) -OP_FUNC(sum, uint8_t, uint8_t, +=) -OP_FUNC(sum, int16_t, int16_t, +=) -OP_FUNC(sum, uint16_t, uint16_t, +=) -OP_FUNC(sum, int32_t, int32_t, +=) -OP_FUNC(sum, uint32_t, uint32_t, +=) -OP_FUNC(sum, int64_t, int64_t, +=) -OP_FUNC(sum, uint64_t, uint64_t, +=) -OP_FUNC(sum, long, long, +=) -OP_FUNC(sum, unsigned_long, unsigned long, +=) - -OP_FUNC(sum, float, float, +=) -OP_FUNC(sum, double, double, +=) -OP_FUNC(sum, long_double, long double, +=) +OP_FUNC(sum, int8_t, int8_t, +) +OP_FUNC(sum, uint8_t, uint8_t, +) +OP_FUNC(sum, int16_t, int16_t, +) +OP_FUNC(sum, uint16_t, uint16_t, +) +OP_FUNC(sum, int32_t, int32_t, +) +OP_FUNC(sum, uint32_t, uint32_t, +) +OP_FUNC(sum, int64_t, int64_t, +) +OP_FUNC(sum, uint64_t, uint64_t, +) +OP_FUNC(sum, long, long, +) +OP_FUNC(sum, unsigned_long, unsigned long, +) + +OPV_FUNC(sum, float, float, float4, 4, +) +OPV_FUNC(sum, double, double, double4, 4, +) +OP_FUNC(sum, long_double, long double, +) /* Complex */ #if 0 @@ -194,20 +266,20 @@ FUNC_FUNC(sum, c_double_complex, cuDoubleComplex) *************************************************************************/ /* C integer */ -OP_FUNC(prod, int8_t, int8_t, *=) -OP_FUNC(prod, uint8_t, uint8_t, *=) -OP_FUNC(prod, int16_t, int16_t, *=) -OP_FUNC(prod, uint16_t, uint16_t, *=) -OP_FUNC(prod, int32_t, int32_t, *=) -OP_FUNC(prod, uint32_t, uint32_t, *=) -OP_FUNC(prod, int64_t, int64_t, *=) -OP_FUNC(prod, uint64_t, uint64_t, *=) -OP_FUNC(prod, long, long, *=) -OP_FUNC(prod, unsigned_long, unsigned long, *=) - -OP_FUNC(prod, float, float, *=) -OP_FUNC(prod, double, double, *=) -OP_FUNC(prod, long_double, long double, *=) +OP_FUNC(prod, int8_t, int8_t, *) +OP_FUNC(prod, uint8_t, uint8_t, *) +OP_FUNC(prod, int16_t, int16_t, *) +OP_FUNC(prod, uint16_t, uint16_t, *) +OP_FUNC(prod, int32_t, int32_t, *) +OP_FUNC(prod, uint32_t, uint32_t, *) +OP_FUNC(prod, int64_t, int64_t, *) +OP_FUNC(prod, uint64_t, uint64_t, *) +OP_FUNC(prod, long, long, *) +OP_FUNC(prod, unsigned_long, unsigned long, *) + +OPV_FUNC(prod, float, float, float4, 4, *) +OPV_FUNC(prod, double, double, double4, 4, *) +OP_FUNC(prod, long_double, long double, *) /* Complex */ #if 0 @@ -380,8 +452,9 @@ LOC_FUNC(minloc, long_double_int, <) */ #define OP_FUNC_3BUF(name, type_name, type, op) \ static __global__ void \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type* in2, \ - type *out, int n) { \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *__restrict__ in1, \ + const type *__restrict__ in2, \ + type *__restrict__ out, int n) { \ const int index = 
blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ @@ -391,9 +464,10 @@ LOC_FUNC(minloc, long_double_int, <) void ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \ type *out, int count, \ int threads_per_block, \ + int max_blocks, \ CUstream stream) { \ - int threads = threads_per_block; \ - int blocks = (count+threads-1) / threads; \ + int threads = min(count, threads_per_block); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ ompi_op_cuda_3buff_##name##_##type_name##_kernel<<<blocks, threads, 0, stream>>>(in1, in2, out, count); \ } @@ -408,8 +482,9 @@ LOC_FUNC(minloc, long_double_int, <) */ #define FUNC_FUNC_3BUF(name, type_name, type) \ static __global__ void \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, \ - type *out, int n) { \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel(const type *__restrict__ in1, \ + const type *__restrict__ in2, \ + type *__restrict__ out, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ @@ -420,9 +495,10 @@ LOC_FUNC(minloc, long_double_int, <) ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \ type *out, int count, \ int threads_per_block, \ + int max_blocks, \ CUstream stream) { \ - int threads = threads_per_block; \ - int blocks = (count+threads-1) / threads; \ + int threads = min(count, threads_per_block); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ ompi_op_cuda_3buff_##name##_##type_name##_kernel<<<blocks, threads, 0, stream>>>(in1, in2, out, count); \ } @@ -444,9 +520,9 @@ LOC_FUNC(minloc, long_double_int, <) #define LOC_FUNC_3BUF(name, type_name, op) \ static __global__ void \ - ompi_op_cuda_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in1, \ - const ompi_op_predefined_##type_name##_t *in2, \ - ompi_op_predefined_##type_name##_t *out, \ + ompi_op_cuda_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *__restrict__ in1, \ + const ompi_op_predefined_##type_name##_t *__restrict__ in2, \ + ompi_op_predefined_##type_name##_t *__restrict__ out, \ int n) \ { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ @@ -473,10 +549,11 @@ LOC_FUNC(minloc, long_double_int, <) ompi_op_predefined_##type_name##_t *out, \ int count, \ int threads_per_block, \ + int max_blocks, \ CUstream stream) \ { \ - int threads = threads_per_block; \ - int blocks = (count+threads-1) / threads; \ + int threads = min(count, threads_per_block); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ ompi_op_cuda_3buff_##name##_##type_name##_kernel<<<blocks, threads, 0, stream>>>(in1, in2, out, count); \ } diff --git a/ompi/mca/op/cuda/op_cuda_impl.h b/ompi/mca/op/cuda/op_cuda_impl.h index 7ab95cd446b..2f9b5961f44 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.h +++ b/ompi/mca/op/cuda/op_cuda_impl.h @@ -28,18 +28,20 @@ BEGIN_C_DECLS -#define OP_FUNC_SIG(name, type_name, type, op) \ - void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ - type *inout, \ - int count, \ - int threads_per_block, \ +#define OP_FUNC_SIG(name, type_name, type, op) \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ CUstream stream); -#define FUNC_FUNC_SIG(name, type_name, type) \ - void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ - type
*inout, \ - int count, \ - int threads_per_block, \ +#define FUNC_FUNC_SIG(name, type_name, type) \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ CUstream stream); /* @@ -56,10 +58,11 @@ BEGIN_C_DECLS } ompi_op_predefined_##type_name##_t; #define LOC_FUNC_SIG(name, type_name, op) \ - void ompi_op_cuda_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \ - ompi_op_predefined_##type_name##_t *b, \ - int count, \ - int threads_per_block, \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ CUstream stream); /************************************************************************* @@ -369,28 +372,31 @@ LOC_FUNC_SIG(minloc, long_double_int, <) -#define OP_FUNC_3BUF_SIG(name, type_name, type, op) \ +#define OP_FUNC_3BUF_SIG(name, type_name, type, op) \ void ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, \ const type *in2, \ - type *inout, \ - int count, \ - int threads_per_block, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ CUstream stream); -#define FUNC_FUNC_3BUF_SIG(name, type_name, type) \ +#define FUNC_FUNC_3BUF_SIG(name, type_name, type) \ void ompi_op_cuda_3buff_##name##_##type_name##_submit(const type *in1, \ const type *in2, \ - type *inout, \ - int count, \ - int threads_per_block, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ CUstream stream); #define LOC_FUNC_3BUF_SIG(name, type_name, op) \ void ompi_op_cuda_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a1, \ const ompi_op_predefined_##type_name##_t *a2, \ - ompi_op_predefined_##type_name##_t *b, \ - int count, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ int threads_per_block, \ + int max_blocks, \ CUstream stream); diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h index 838daf6e765..2c0d3ee337a 100644 --- a/ompi/mca/op/op.h +++ b/ompi/mca/op/op.h @@ -268,7 +268,7 @@ typedef void (*ompi_op_base_handler_fn_1_0_0_t)(const void *, void *, int *, typedef ompi_op_base_handler_fn_1_0_0_t ompi_op_base_handler_fn_t; /** - * Typedef for 2-buffer op functions. + * Typedef for 2-buffer op functions on streams/devices. * * We don't use MPI_User_function because this would create a * confusing dependency loop between this file and mpi.h. So this is @@ -277,6 +277,7 @@ typedef ompi_op_base_handler_fn_1_0_0_t ompi_op_base_handler_fn_t; */ typedef void (*ompi_op_base_stream_handler_fn_1_0_0_t)(const void *, void *, int *, struct ompi_datatype_t **, + int device, opal_accelerator_stream_t *stream, struct ompi_op_base_module_1_0_0_t *); @@ -294,12 +295,13 @@ typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(const void *, typedef ompi_op_base_3buff_handler_fn_1_0_0_t ompi_op_base_3buff_handler_fn_t; /* - * Typedef for 3-buffer (two input and one output) op functions. + * Typedef for 3-buffer (two input and one output) op functions on streams. 
*/ typedef void (*ompi_op_base_3buff_stream_handler_fn_1_0_0_t)(const void *, const void *, void *, int *, struct ompi_datatype_t **, + int device, opal_accelerator_stream_t*, struct ompi_op_base_module_1_0_0_t *); diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index 4994f7dea50..091033f5284 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -24,11 +24,11 @@ rocm_sources = op_rocm_impl.cpp HIPCC = hipcc HIPCCFLAGS= -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.5.0/include - + .cpp.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -prefer-non-pic $(HIPCCFLAGS) -Wc,-fPIC,-g -c $< + $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -O2 -fvectorize -prefer-non-pic $(HIPCCFLAGS) -Wc,-fPIC,-g -c $< # -o $($@.o:.lo) diff --git a/ompi/mca/op/rocm/configure.m4 b/ompi/mca/op/rocm/configure.m4 new file mode 100644 index 00000000000..79de7769fa9 --- /dev/null +++ b/ompi/mca/op/rocm/configure.m4 @@ -0,0 +1,36 @@ +# -*- shell-script -*- +# +# Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. +# All Rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# If ROCm support was requested, then build the ROCm support library. +# This code checks to make sure the check was done earlier by the +# opal_check_rocm.m4 code. It also copies the flags and libs under +# opal_rocm_CPPFLAGS, opal_rocm_LDFLAGS, and opal_rocm_LIBS + +AC_DEFUN([MCA_ompi_op_rocm_CONFIG],[ + + AC_CONFIG_FILES([ompi/mca/op/rocm/Makefile]) + + OPAL_CHECK_ROCM([op_rocm]) + + AS_IF([test "x$ROCM_SUPPORT" = "x1"], + [$1], + [$2]) + + AC_SUBST([op_rocm_CPPFLAGS]) + AC_SUBST([op_rocm_LDFLAGS]) + AC_SUBST([op_rocm_LIBS]) + +])dnl diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h index 0773a519aff..6e8139fa239 100644 --- a/ompi/mca/op/rocm/op_rocm.h +++ b/ompi/mca/op/rocm/op_rocm.h @@ -56,6 +56,7 @@ typedef struct { hipCtx_t *ro_ctx; #endif // 0 int *ro_max_threads_per_block; + int *ro_max_blocks; hipDevice_t *ro_devices; int ro_num_devices; } ompi_op_rocm_component_t; diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c index 85477f18be4..844ee4224a3 100644 --- a/ompi/mca/op/rocm/op_rocm_component.c +++ b/ompi/mca/op/rocm/op_rocm_component.c @@ -59,6 +59,7 @@ ompi_op_rocm_component_t mca_op_rocm_component = { .opc_op_query = rocm_component_op_query, }, .ro_max_threads_per_block = NULL, + .ro_max_blocks = NULL, .ro_devices = NULL, .ro_num_devices = 0, }; @@ -92,6 +93,8 @@ static int rocm_component_close(void) //hipStreamDestroy(mca_op_rocm_component.ro_stream); free(mca_op_rocm_component.ro_max_threads_per_block); mca_op_rocm_component.ro_max_threads_per_block = NULL; + free(mca_op_rocm_component.ro_max_blocks); + mca_op_rocm_component.ro_max_blocks = NULL; free(mca_op_rocm_component.ro_devices); mca_op_rocm_component.ro_devices = NULL; mca_op_rocm_component.ro_num_devices = 0; @@ -129,29 +132,25 @@ rocm_component_init_query(bool enable_progress_threads, CHECK(hipGetDeviceCount, (&num_devices)); mca_op_rocm_component.ro_num_devices = num_devices; mca_op_rocm_component.ro_devices = (hipDevice_t*)malloc(num_devices*sizeof(hipDevice_t)); -#if 0 - mca_op_rocm_component.ro_ctx = (hipCtx_t*)malloc(num_devices*sizeof(hipCtx_t)); -#endif // 0
mca_op_rocm_component.ro_max_threads_per_block = (int*)malloc(num_devices*sizeof(int)); + mca_op_rocm_component.ro_max_blocks = (int*)malloc(num_devices*sizeof(int)); for (int i = 0; i < num_devices; ++i) { CHECK(hipDeviceGet, (&mca_op_rocm_component.ro_devices[i], i)); -#if 0 - rc = hipCtxCreate(&mca_op_rocm_component.ro_ctx[i], - 0, mca_op_rocm_component.ro_devices[i]); - if (hipSuccess != rc) { - CHECK(hipDevicePrimaryCtxRetain, - (&mca_op_rocm_component.ro_ctx[i], mca_op_rocm_component.ro_devices[i])); - } -#endif // 0 rc = hipDeviceGetAttribute(&mca_op_rocm_component.ro_max_threads_per_block[i], - hipDeviceAttributeMaxThreadsPerBlock, + hipDeviceAttributeMaxBlockDimX, mca_op_rocm_component.ro_devices[i]); if (hipSuccess != rc) { /* fall-back to value that should work on every device */ mca_op_rocm_component.ro_max_threads_per_block[i] = 512; } - //TODO - printf("OUTPUT - nthreads: %d\n", mca_op_rocm_component.ro_max_threads_per_block[i]); + + rc = hipDeviceGetAttribute(&mca_op_rocm_component.ro_max_blocks[i], + hipDeviceAttributeMaxGridDimX, + mca_op_rocm_component.ro_devices[i]); + if (hipSuccess != rc) { + /* we'll try to max out the blocks */ + mca_op_rocm_component.ro_max_blocks[i] = 512; + } } #if 0 diff --git a/ompi/mca/op/rocm/op_rocm_functions.c b/ompi/mca/op/rocm/op_rocm_functions.c index a4d15c999a3..59436c0897f 100644 --- a/ompi/mca/op/rocm/op_rocm_functions.c +++ b/ompi/mca/op/rocm/op_rocm_functions.c @@ -41,6 +41,7 @@ static inline void device_op_pre(const void *orig_source1, int count, struct ompi_datatype_t *dtype, int *threads_per_block, + int *max_blocks, int *device, opal_accelerator_stream_t *stream) { @@ -53,82 +54,95 @@ static inline void device_op_pre(const void *orig_source1, *source2 = (void*)orig_source2; } - target_rc = opal_accelerator.check_addr(*target, target_device, &target_flags); - source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); - *device = *target_device; + if (*device != MCA_ACCELERATOR_NO_DEVICE_ID) { + /* we got the device from the caller, just adjust the output parameters */ + *target_device = *device; + *source1_device = *device; + if (NULL != source2_device) { + *source2_device = *device; + } + } else { + + target_rc = opal_accelerator.check_addr(*target, target_device, &target_flags); + source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); + *device = *target_device; + + // TODO + //printf("OUT - target device & rc %d %d source %d %d\n", *target_device, target_rc, *source1_device, source1_rc); + if (NULL != orig_source2) { + source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); + //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", + // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, *source2_device, *device); + } - // TODO - printf("OUT - target device & rc %d %d source %d %d\n", *target_device, target_rc, *source1_device, source1_rc); - if (NULL != orig_source2) { - source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); - //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", - // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, *source2_device, *device); - } + //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", + // target_rc, *target_device, source_rc, 
*source_device, *device); - //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", - // target_rc, *target_device, source_rc, *source_device, *device); - - if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { - /* no buffers are on any device, select device 0 */ - *device = 0; - } else if (*target_device == -1) { - if (*source1_device == -1 && NULL != orig_source2) { - *device = *source2_device; - } else { - *device = *source1_device; + if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { + /* no buffers are on any device, select device 0 */ + *device = 0; + } else if (*target_device == -1) { + if (*source1_device == -1 && NULL != orig_source2) { + *device = *source2_device; + } else { + *device = *source1_device; + } } - } - if (0 == target_rc || 0 == source1_rc || *target_device != *source1_device) { - size_t nbytes; - ompi_datatype_type_size(dtype, &nbytes); - nbytes *= count; + if (0 == target_rc || 0 == source1_rc || *target_device != *source1_device) { + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + if (0 == target_rc) { + // allocate memory on the device for the target buffer + //printf("copying target from device %d to host\n", *target_device); + opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*target, orig_target, nbytes, *(hipStream_t*)stream->stream)); + *target_device = -1; // mark target device as host + } - if (0 == target_rc) { - // allocate memory on the device for the target buffer - //printf("copying target from device %d to host\n", *target_device); - opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); - CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*target, orig_target, nbytes, *(hipStream_t*)stream->stream)); - *target_device = -1; // mark target device as host - } + if (0 == source1_rc || *device != *source1_device) { + // allocate memory on the device for the source buffer + //printf("allocating source on device %d\n", *device); + opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); + if (0 == source1_rc) { + /* copy from host to device */ + //printf("copying source from host to device %d\n", *device); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source1, orig_source1, nbytes, *(hipStream_t*)stream->stream)); + } else { + /* copy from one device to another device */ + /* TODO: does this actually work? Can we enable P2P? 
*/ + //printf("attempting cross-device copy for source\n"); + CHECK(hipMemcpyDtoDAsync, ((hipDeviceptr_t)*source1, (hipDeviceptr_t)orig_source1, nbytes, *(hipStream_t*)stream->stream)); + } + } - if (0 == source1_rc || *device != *source1_device) { + } + if (NULL != source2_device && *target_device != *source2_device) { // allocate memory on the device for the source buffer //printf("allocating source on device %d\n", *device); - opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); - if (0 == source1_rc) { + size_t nbytes; + ompi_datatype_type_size(dtype, &nbytes); + nbytes *= count; + + opal_accelerator.mem_alloc_stream(*device, source2, nbytes, stream); + if (0 == source2_rc) { /* copy from host to device */ //printf("copying source from host to device %d\n", *device); - CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source1, orig_source1, nbytes, *(hipStream_t*)stream->stream)); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source2, orig_source2, nbytes, *(hipStream_t*)stream->stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ //printf("attempting cross-device copy for source\n"); - CHECK(hipMemcpyDtoDAsync, ((hipDeviceptr_t)*source1, (hipDeviceptr_t)orig_source1, nbytes, *(hipStream_t*)stream->stream)); + CHECK(hipMemcpyDtoDAsync, ((hipDeviceptr_t)*source2, (hipDeviceptr_t)orig_source2, nbytes, *(hipStream_t*)stream->stream)); } } - } - if (NULL != source2_device && *target_device != *source2_device) { - // allocate memory on the device for the source buffer - //printf("allocating source on device %d\n", *device); - size_t nbytes; - ompi_datatype_type_size(dtype, &nbytes); - nbytes *= count; - opal_accelerator.mem_alloc_stream(*device, source2, nbytes, stream); - if (0 == source2_rc) { - /* copy from host to device */ - //printf("copying source from host to device %d\n", *device); - CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source2, orig_source2, nbytes, *(hipStream_t*)stream->stream)); - } else { - /* copy from one device to another device */ - /* TODO: does this actually work? Can we enable P2P? 
*/ - //printf("attempting cross-device copy for source\n"); - CHECK(hipMemcpyDtoDAsync, ((hipDeviceptr_t)*source2, (hipDeviceptr_t)orig_source2, nbytes, *(hipStream_t*)stream->stream)); - } - } *threads_per_block = mca_op_rocm_component.ro_max_threads_per_block[*device]; + *max_blocks = mca_op_rocm_component.ro_max_blocks[*device]; + } static inline void device_op_post(void *source1, @@ -143,7 +157,7 @@ static inline void device_op_post(void *source1, int device, opal_accelerator_stream_t *stream) { - if (-1 == target_device) { + if (MCA_ACCELERATOR_NO_DEVICE_ID == target_device) { size_t nbytes; ompi_datatype_type_size(dtype, &nbytes); @@ -152,17 +166,14 @@ static inline void device_op_post(void *source1, CHECK(hipMemcpyDtoHAsync, (orig_target, (hipDeviceptr_t)target, nbytes, *(hipStream_t *)stream->stream)); } - if (-1 == target_device) { + if (MCA_ACCELERATOR_NO_DEVICE_ID == target_device) { opal_accelerator.mem_release_stream(device, target, stream); - //CHECK(hipFreeAsync, ((hipDeviceptr_t)target, mca_op_rocm_component.ro_stream)); } if (source1_device != device) { opal_accelerator.mem_release_stream(device, source1, stream); - //CHECK(hipFreeAsync, ((hipDeviceptr_t)source, mca_op_rocm_component.ro_stream)); } if (NULL != source2 && source2_device != device) { opal_accelerator.mem_release_stream(device, source2, stream); - //CHECK(hipFreeAsync, ((hipDeviceptr_t)source, mca_op_rocm_component.ro_stream)); } } @@ -170,19 +181,20 @@ static inline void device_op_post(void *source1, static \ void ompi_op_rocm_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ - int threads_per_block; \ - int source_device, target_device, device; \ + int threads_per_block, max_blocks; \ + int source_device, target_device; \ type *source, *target; \ int n = *count; \ device_op_pre(in, (void**)&source, &source_device, NULL, NULL, NULL, \ inout, (void**)&target, &target_device, \ n, *dtype, \ - &threads_per_block, &device, stream); \ - hipStream_t *custream = (hipStream_t*)stream->stream; \ - ompi_op_rocm_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, *custream); \ - device_op_post(source, source_device, NULL, -1, inout, target, target_device, n, *dtype, device, stream);\ + &threads_per_block, &max_blocks, &device, stream); \ + hipStream_t *custream = (hipStream_t*)stream->stream; \ + ompi_op_rocm_2buff_##name##_##type_name##_submit(source, target, n, threads_per_block, max_blocks, *custream);\ + device_op_post(source, source_device, NULL, -1, inout, target, target_device, n, *dtype, device, stream); \ } #define OP_FUNC(name, type_name, type, op, ...) 
FUNC(name, __VA_ARGS__##type_name, __VA_ARGS__##type) @@ -204,22 +216,23 @@ static inline void device_op_post(void *source1, static \ void ompi_op_rocm_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ \ _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ switch(sizeof(type)) { \ case sizeof(int8_t): \ - ompi_op_rocm_2buff_##name##_int8_t(in, inout, count, dtype, stream, module); \ + ompi_op_rocm_2buff_##name##_int8_t(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(int16_t): \ - ompi_op_rocm_2buff_##name##_int16_t(in, inout, count, dtype, stream, module); \ + ompi_op_rocm_2buff_##name##_int16_t(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(int32_t): \ - ompi_op_rocm_2buff_##name##_int32_t(in, inout, count, dtype, stream, module); \ + ompi_op_rocm_2buff_##name##_int32_t(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(int64_t): \ - ompi_op_rocm_2buff_##name##_int64_t(in, inout, count, dtype, stream, module); \ + ompi_op_rocm_2buff_##name##_int64_t(in, inout, count, dtype, device, stream, module); \ break; \ } \ } @@ -229,18 +242,19 @@ static inline void device_op_post(void *source1, static \ void ompi_op_rocm_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ switch(sizeof(type)) { \ case sizeof(float): \ - ompi_op_rocm_2buff_##name##_float(in, inout, count, dtype, stream, module); \ + ompi_op_rocm_2buff_##name##_float(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(double): \ - ompi_op_rocm_2buff_##name##_double(in, inout, count, dtype, stream, module); \ + ompi_op_rocm_2buff_##name##_double(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(long double): \ - ompi_op_rocm_2buff_##name##_long_double(in, inout, count, dtype, stream, module); \ + ompi_op_rocm_2buff_##name##_long_double(in, inout, count, dtype, device, stream, module); \ break; \ } \ } @@ -786,21 +800,22 @@ LOC_FUNC(minloc, long_double_int, <) */ #define FUNC_3BUF(name, type_name, type) \ static \ - void ompi_op_rocm_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ + void ompi_op_rocm_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ - int threads_per_block; \ - int source1_device, source2_device, target_device, device; \ + int threads_per_block, max_blocks; \ + int source1_device, source2_device, target_device; \ type *source1, *source2, *target; \ int n = *count; \ device_op_pre(in1, (void**)&source1, &source1_device, \ in2, (void**)&source2, &source2_device, \ out, (void**)&target, &target_device, \ n, *dtype, \ - &threads_per_block, &device, stream); \ + &threads_per_block, &max_blocks, &device, stream); \ hipStream_t *custream = (hipStream_t*)stream->stream; \ - ompi_op_rocm_3buff_##name##_##type_name##_submit(source1, source2, target, n, threads_per_block, *custream);\ + ompi_op_rocm_3buff_##name##_##type_name##_submit(source1, source2, target, n, threads_per_block, 
max_blocks, *custream);\ device_op_post(source1, source1_device, source2, source2_device, out, target, target_device, n, *dtype, device, stream);\ } @@ -825,22 +840,23 @@ LOC_FUNC(minloc, long_double_int, <) static \ void ompi_op_rocm_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ \ _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ switch(sizeof(type)) { \ case sizeof(int8_t): \ - ompi_op_rocm_3buff_##name##_int8_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_rocm_3buff_##name##_int8_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(int16_t): \ - ompi_op_rocm_3buff_##name##_int16_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_rocm_3buff_##name##_int16_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(int32_t): \ - ompi_op_rocm_3buff_##name##_int32_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_rocm_3buff_##name##_int32_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(int64_t): \ - ompi_op_rocm_3buff_##name##_int64_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_rocm_3buff_##name##_int64_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ } \ } @@ -850,18 +866,19 @@ LOC_FUNC(minloc, long_double_int, <) static \ void ompi_op_rocm_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ switch(sizeof(type)) { \ case sizeof(float): \ - ompi_op_rocm_3buff_##name##_float(in1, in2, out, count, dtype, stream, module); \ + ompi_op_rocm_3buff_##name##_float(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(double): \ - ompi_op_rocm_3buff_##name##_double(in1, in2, out, count, dtype, stream, module); \ + ompi_op_rocm_3buff_##name##_double(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(long double): \ - ompi_op_rocm_3buff_##name##_long_double(in1, in2, out, count, dtype, stream, module); \ + ompi_op_rocm_3buff_##name##_long_double(in1, in2, out, count, dtype, device, stream, module); \ break; \ } \ } diff --git a/ompi/mca/op/rocm/op_rocm_impl.cpp b/ompi/mca/op/rocm/op_rocm_impl.cpp index c174bc31f29..be73c493007 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.cpp +++ b/ompi/mca/op/rocm/op_rocm_impl.cpp @@ -18,6 +18,9 @@ #include "op_rocm_impl.h" +//#define DO_NOT_USE_INTRINSICS 1 +#define USE_VECTORS 1 + /* TODO: missing support for * - short float (conditional on whether short float is available) * - complex @@ -26,9 +29,13 @@ #define THREADS_PER_BLOCK 512 +#define VECLEN 2 +#define VECTYPE(t) t##VECLEN + #define OP_FUNC(name, type_name, type, op) \ static __global__ void \ - ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ @@ -39,20 +46,58 @@ type *inout, \ int count, \ int threads_per_block, \ - hipStream_t stream) { \ - int threads = 
threads_per_block; \ - int blocks = (count + threads-1) / threads; \ + int max_blocks, \ + hipStream_t stream) { \ + int threads = min(threads_per_block, count); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ + int n = count; \ + hipStream_t s = stream; \ + hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, s, \ + in, inout, n); \ + } + +#if defined(USE_VECTORS) +#define OPV_FUNC(name, type_name, type, vtype, vlen, op) \ + static __global__ void \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n/vlen; i += stride) { \ + ((vtype*)inout)[i] = ((vtype*)inout)[i] op ((vtype*)in)[i]; \ + } \ + int remainder = n%vlen; \ + if (index == (n/vlen) && remainder != 0) { \ + while(remainder) { \ + int idx = n - remainder--; \ + inout[idx] = inout[idx] op in[idx]; \ + } \ + } \ + } \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ + hipStream_t stream) { \ + int threads = min(threads_per_block, (count/vlen)); \ + int blocks = min(((count/vlen) + threads-1) / threads, max_blocks); \ int n = count; \ - hipStream_t s = stream; \ - hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ - dim3(blocks), dim3(threads), 0, s,\ - in, inout, n); \ + hipStream_t s = stream; \ + hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, s, \ + in, inout, n); \ } +#else // USE_VECTORS +#define OPV_FUNC(name, type_name, type, vtype, vlen, op) OP_FUNC(name, type_name, type, op) +#endif // USE_VECTORS #define FUNC_FUNC(name, type_name, type) \ static __global__ void \ - ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *in, type *inout, int n) { \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ @@ -64,14 +109,15 @@ type *inout, \ int count, \ int threads_per_block, \ - hipStream_t stream) { \ - int threads = threads_per_block; \ - int blocks = (count + threads-1) / threads; \ + int max_blocks, \ + hipStream_t stream) { \ + int threads = min(threads_per_block, count); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ int n = count; \ - hipStream_t s = stream; \ - hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ - dim3(blocks), dim3(threads), 0, s, \ - in, inout, n); \ + hipStream_t s = stream; \ + hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, s, \ + in, inout, n); \ } /* @@ -84,8 +130,8 @@ #define LOC_FUNC(name, type_name, op) \ static __global__ void \ - ompi_op_rocm_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in, \ - ompi_op_predefined_##type_name##_t *inout, \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *__restrict__ in, \ + ompi_op_predefined_##type_name##_t *__restrict__ inout, \ int n) \ { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ @@ -106,13 +152,14 @@ ompi_op_predefined_##type_name##_t *b, \ int count, \ int threads_per_block, \ - hipStream_t stream) 
{ \ - int threads = threads_per_block; \ - int blocks = (count + threads-1) / threads; \ - hipStream_t s = stream; \ - hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ - dim3(blocks), dim3(threads), 0, s, \ - a, b, count); \ + int max_blocks, \ + hipStream_t stream) { \ + int threads = min(threads_per_block, count); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ + hipStream_t s = stream; \ + hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, s, \ + a, b, count); \ } /************************************************************************* @@ -120,7 +167,11 @@ *************************************************************************/ #undef current_func +#if defined(DO_NOT_USE_INTRINSICS) #define current_func(a, b) ((a) > (b) ? (a) : (b)) +#else // DO_NOT_USE_INTRINSICS +#define current_func(a, b) max(a, b) +#endif // DO_NOT_USE_INTRINSICS /* C integer */ FUNC_FUNC(max, int8_t, int8_t) FUNC_FUNC(max, uint8_t, uint8_t) @@ -133,16 +184,31 @@ FUNC_FUNC(max, uint64_t, uint64_t) FUNC_FUNC(max, long, long) FUNC_FUNC(max, unsigned_long, unsigned long) +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fmaxf(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(max, float, float) + +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fmax(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(max, double, double) + +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) FUNC_FUNC(max, long_double, long double) /************************************************************************* * Min *************************************************************************/ - #undef current_func +#if defined(DO_NOT_USE_INTRINSICS) #define current_func(a, b) ((a) < (b) ? (a) : (b)) +#else // DO_NOT_USE_INTRINSICS +#define current_func(a, b) min(a, b) +#endif // DO_NOT_USE_INTRINSICS /* C integer */ FUNC_FUNC(min, int8_t, int8_t) FUNC_FUNC(min, uint8_t, uint8_t) @@ -155,30 +221,42 @@ FUNC_FUNC(min, uint64_t, uint64_t) FUNC_FUNC(min, long, long) FUNC_FUNC(min, unsigned_long, unsigned long) - +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fminf(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(min, float, float) + +#if !defined(DO_NOT_USE_INTRINSICS) +#undef current_func +#define current_func(a, b) fmin(a, b) +#endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(min, double, double) + +#undef current_func +#define current_func(a, b) ((a) < (b) ? 
(a) : (b)) FUNC_FUNC(min, long_double, long double) + /************************************************************************* * Sum *************************************************************************/ /* C integer */ -OP_FUNC(sum, int8_t, int8_t, +=) -OP_FUNC(sum, uint8_t, uint8_t, +=) -OP_FUNC(sum, int16_t, int16_t, +=) -OP_FUNC(sum, uint16_t, uint16_t, +=) -OP_FUNC(sum, int32_t, int32_t, +=) -OP_FUNC(sum, uint32_t, uint32_t, +=) -OP_FUNC(sum, int64_t, int64_t, +=) -OP_FUNC(sum, uint64_t, uint64_t, +=) -OP_FUNC(sum, long, long, +=) -OP_FUNC(sum, unsigned_long, unsigned long, +=) - -OP_FUNC(sum, float, float, +=) -OP_FUNC(sum, double, double, +=) -OP_FUNC(sum, long_double, long double, +=) +OP_FUNC(sum, int8_t, int8_t, +) +OP_FUNC(sum, uint8_t, uint8_t, +) +OP_FUNC(sum, int16_t, int16_t, +) +OP_FUNC(sum, uint16_t, uint16_t, +) +OP_FUNC(sum, int32_t, int32_t, +) +OP_FUNC(sum, uint32_t, uint32_t, +) +OP_FUNC(sum, int64_t, int64_t, +) +OP_FUNC(sum, uint64_t, uint64_t, +) +OP_FUNC(sum, long, long, +) +OP_FUNC(sum, unsigned_long, unsigned long, +) + +OPV_FUNC(sum, float, float, float4, 4, +) +OPV_FUNC(sum, double, double, double4, 4, +) +OP_FUNC(sum, long_double, long double, +) /* Complex */ #if 0 @@ -201,20 +279,20 @@ FUNC_FUNC(sum, c_double_complex, hipDoubleComplex) *************************************************************************/ /* C integer */ -OP_FUNC(prod, int8_t, int8_t, *=) -OP_FUNC(prod, uint8_t, uint8_t, *=) -OP_FUNC(prod, int16_t, int16_t, *=) -OP_FUNC(prod, uint16_t, uint16_t, *=) -OP_FUNC(prod, int32_t, int32_t, *=) -OP_FUNC(prod, uint32_t, uint32_t, *=) -OP_FUNC(prod, int64_t, int64_t, *=) -OP_FUNC(prod, uint64_t, uint64_t, *=) -OP_FUNC(prod, long, long, *=) -OP_FUNC(prod, unsigned_long, unsigned long, *=) - -OP_FUNC(prod, float, float, *=) -OP_FUNC(prod, double, double, *=) -OP_FUNC(prod, long_double, long double, *=) +OP_FUNC(prod, int8_t, int8_t, *) +OP_FUNC(prod, uint8_t, uint8_t, *) +OP_FUNC(prod, int16_t, int16_t, *) +OP_FUNC(prod, uint16_t, uint16_t, *) +OP_FUNC(prod, int32_t, int32_t, *) +OP_FUNC(prod, uint32_t, uint32_t, *) +OP_FUNC(prod, int64_t, int64_t, *) +OP_FUNC(prod, uint64_t, uint64_t, *) +OP_FUNC(prod, long, long, *) +OP_FUNC(prod, unsigned_long, unsigned long, *) + +OPV_FUNC(prod, float, float, float4, 4, *) +OPV_FUNC(prod, double, double, double4, 4, *) +OP_FUNC(prod, long_double, long double, *) /* Complex */ #if 0 @@ -387,8 +465,9 @@ LOC_FUNC(minloc, long_double_int, <) */ #define OP_FUNC_3BUF(name, type_name, type, op) \ static __global__ void \ - ompi_op_rocm_3buff_##name##_##type_name##_kernel(const type *in1, const type* in2, \ - type *out, int n) { \ + ompi_op_rocm_3buff_##name##_##type_name##_kernel(const type *__restrict__ in1, \ + const type *__restrict__ in2, \ + type *__restrict__ out, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ @@ -398,12 +477,13 @@ LOC_FUNC(minloc, long_double_int, <) void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \ type *out, int count, \ int threads_per_block, \ - hipStream_t stream) { \ - int threads = threads_per_block; \ - int blocks = (count+threads-1) / threads; \ - hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ - dim3(blocks), dim3(threads), 0, stream, \ - in1, in2, out, count); \ + int max_blocks, \ + hipStream_t stream) { \ + int threads = min(threads_per_block, count); \ + int blocks = min((count + threads-1) / threads, 
max_blocks); \ + hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, stream, \ + in1, in2, out, count); \ } @@ -416,8 +496,9 @@ LOC_FUNC(minloc, long_double_int, <) */ #define FUNC_FUNC_3BUF(name, type_name, type) \ static __global__ void \ - ompi_op_rocm_3buff_##name##_##type_name##_kernel(const type *in1, const type *in2, \ - type *out, int n) { \ + ompi_op_rocm_3buff_##name##_##type_name##_kernel(const type *__restrict__ in1, \ + const type *__restrict__ in2, \ + type *__restrict__ out, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ @@ -428,12 +509,13 @@ LOC_FUNC(minloc, long_double_int, <) ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, const type *in2, \ type *out, int count, \ int threads_per_block, \ - hipStream_t stream) { \ - int threads = threads_per_block; \ - int blocks = (count+threads-1) / threads; \ - hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ - dim3(blocks), dim3(threads), 0, stream, \ - in1, in2, out, count); \ + int max_blocks, \ + hipStream_t stream) { \ + int threads = min(threads_per_block, count); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ + hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, stream, \ + in1, in2, out, count); \ } /* @@ -453,9 +535,9 @@ LOC_FUNC(minloc, long_double_int, <) #define LOC_FUNC_3BUF(name, type_name, op) \ static __global__ void \ - ompi_op_rocm_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *in1, \ - const ompi_op_predefined_##type_name##_t *in2, \ - ompi_op_predefined_##type_name##_t *out, \ + ompi_op_rocm_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *__restrict__ in1, \ + const ompi_op_predefined_##type_name##_t *__restrict__ in2, \ + ompi_op_predefined_##type_name##_t *__restrict__ out, \ int n) \ { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ @@ -477,18 +559,19 @@ LOC_FUNC(minloc, long_double_int, <) } \ } \ void \ - ompi_op_rocm_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *in1, \ - const ompi_op_predefined_##type_name##_t *in2, \ - ompi_op_predefined_##type_name##_t *out, \ + ompi_op_rocm_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *__restrict__ in1, \ + const ompi_op_predefined_##type_name##_t *__restrict__ in2, \ + ompi_op_predefined_##type_name##_t *__restrict__ out, \ int count, \ int threads_per_block, \ - hipStream_t stream) \ + int max_blocks, \ + hipStream_t stream) \ { \ - int threads = threads_per_block; \ - int blocks = (count+threads-1) / threads; \ - hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ - dim3(blocks), dim3(threads), 0, stream, \ - in1, in2, out, count); \ + int threads = min(threads_per_block, count); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ + hipLaunchKernelGGL(ompi_op_rocm_3buff_##name##_##type_name##_kernel, \ + dim3(blocks), dim3(threads), 0, stream, \ + in1, in2, out, count); \ } diff --git a/ompi/mca/op/rocm/op_rocm_impl.h b/ompi/mca/op/rocm/op_rocm_impl.h index 0606c508280..5af40fb9c92 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.h +++ b/ompi/mca/op/rocm/op_rocm_impl.h @@ -28,18 +28,20 @@ BEGIN_C_DECLS -#define OP_FUNC_SIG(name, type_name, type, op) \ - void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ +#define 
OP_FUNC_SIG(name, type_name, type, op) \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ type *inout, \ - int count, \ + int count, \ int threads_per_block, \ + int max_blocks, \ hipStream_t stream); -#define FUNC_FUNC_SIG(name, type_name, type) \ - void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ +#define FUNC_FUNC_SIG(name, type_name, type) \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ type *inout, \ - int count, \ + int count, \ int threads_per_block, \ + int max_blocks, \ hipStream_t stream); /* @@ -56,10 +58,11 @@ BEGIN_C_DECLS } ompi_op_predefined_##type_name##_t; #define LOC_FUNC_SIG(name, type_name, op) \ - void ompi_op_rocm_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \ - ompi_op_predefined_##type_name##_t *b, \ - int count, \ - int threads_per_block, \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ hipStream_t stream); /************************************************************************* @@ -369,28 +372,31 @@ LOC_FUNC_SIG(minloc, long_double_int, <) -#define OP_FUNC_3BUF_SIG(name, type_name, type, op) \ - void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, \ - const type *in2, \ +#define OP_FUNC_3BUF_SIG(name, type_name, type, op) \ + void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, \ + const type *in2, \ type *inout, \ - int count, \ + int count, \ int threads_per_block, \ + int max_blocks, \ hipStream_t stream); -#define FUNC_FUNC_3BUF_SIG(name, type_name, type) \ - void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, \ - const type *in2, \ +#define FUNC_FUNC_3BUF_SIG(name, type_name, type) \ + void ompi_op_rocm_3buff_##name##_##type_name##_submit(const type *in1, \ + const type *in2, \ type *inout, \ - int count, \ + int count, \ int threads_per_block, \ + int max_blocks, \ hipStream_t stream); #define LOC_FUNC_3BUF_SIG(name, type_name, op) \ void ompi_op_rocm_3buff_##name##_##type_name##_submit(const ompi_op_predefined_##type_name##_t *a1, \ const ompi_op_predefined_##type_name##_t *a2, \ - ompi_op_predefined_##type_name##_t *b, \ - int count, \ + ompi_op_predefined_##type_name##_t *b, \ + int count, \ int threads_per_block, \ + int max_blocks, \ hipStream_t stream); diff --git a/ompi/op/op.h b/ompi/op/op.h index 814a8464030..519520f1712 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -551,6 +551,7 @@ static inline bool ompi_op_supports_device(const ompi_op_t * op, const ompi_data static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, void *target, size_t full_count, ompi_datatype_t * dtype, + int device, opal_accelerator_stream_t *stream) { MPI_Fint f_dtype, f_count; @@ -580,7 +581,7 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, } shift = done_count * ext; // Recurse one level in iterations of 'int' - ompi_op_reduce_stream(op, (char*)source + shift, (char*)target + shift, iter_count, dtype, stream); + ompi_op_reduce_stream(op, (char*)source + shift, (char*)target + shift, iter_count, dtype, device, stream); done_count += iter_count; } return; @@ -610,17 +611,31 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, */ bool use_device_op = false; - int source_dev_id, target_dev_id; - uint64_t source_flags, target_flags; - int target_check_addr = opal_accelerator.check_addr(target, 
&target_dev_id, &target_flags); - int source_check_addr = opal_accelerator.check_addr(source, &source_dev_id, &source_flags); /* check if either of the buffers is on a device and if so make sure we can * access handle it properly */ - if (target_check_addr > 0 || source_check_addr > 0) { - if (ompi_datatype_is_predefined(dtype) && + if (device != MCA_ACCELERATOR_NO_DEVICE_ID && + ompi_datatype_is_predefined(dtype) && + 0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC) && + NULL != op->o_device_op) { + use_device_op = true; + } + + if (!use_device_op) { + /* query the accelerator for whether we can still execute */ + int source_dev_id, target_dev_id; + uint64_t source_flags, target_flags; + int target_check_addr = opal_accelerator.check_addr(target, &target_dev_id, &target_flags); + int source_check_addr = opal_accelerator.check_addr(source, &source_dev_id, &source_flags); + if (target_check_addr > 0 && + source_check_addr > 0 && + ompi_datatype_is_predefined(dtype) && 0 != (op->o_flags & OMPI_OP_FLAGS_INTRINSIC) && NULL != op->o_device_op) { use_device_op = true; + if (target_dev_id == source_dev_id) { + /* both inputs are on the same device; if not, the op will take care of that */ + device = target_dev_id; + } } else { /* check whether we can access the memory from the host */ if ((target_check_addr == 0 || (target_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && @@ -644,16 +659,17 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, } if (use_device_op) { if (NULL == op->o_device_op) { + fprintf(stderr, "no suitable device op module found!\n"); abort(); // TODO: be more graceful! } opal_accelerator_stream_t *actual_stream = stream; bool flush_stream = false; if (NULL == stream) { - opal_accelerator.get_default_stream(target_dev_id, &actual_stream); + opal_accelerator.get_default_stream(device, &actual_stream); flush_stream = true; } op->o_device_op->do_intrinsic.fns[dtype_id](source, target, - &count, &dtype, actual_stream, + &count, &dtype, device, actual_stream, op->o_device_op->do_intrinsic.modules[dtype_id]); if (flush_stream) { opal_accelerator.wait_stream(actual_stream); @@ -687,7 +703,7 @@ static inline void ompi_op_reduce(ompi_op_t * op, void *source, void *target, size_t full_count, ompi_datatype_t * dtype) { - ompi_op_reduce_stream(op, source, target, full_count, dtype, NULL); + ompi_op_reduce_stream(op, source, target, full_count, dtype, MCA_ACCELERATOR_NO_DEVICE_ID, NULL); } static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, void * restrict source2, @@ -723,11 +739,13 @@ static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, v static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, void *source2, void *target, int count, ompi_datatype_t * dtype, + int device, opal_accelerator_stream_t *stream) { void *restrict src1; void *restrict src2; void *restrict tgt; + bool use_device_op = false; src1 = source1; src2 = source2; tgt = target; @@ -738,28 +756,37 @@ static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, return; } - bool use_device_op = false; - int source1_dev_id, source2_dev_id, target_dev_id; - uint64_t source1_flags, source2_flags, target_flags; - int target_check_addr = opal_accelerator.check_addr(target, &target_dev_id, &target_flags); - int source1_check_addr = opal_accelerator.check_addr(source1, &source1_dev_id, &source1_flags); - int source2_check_addr = opal_accelerator.check_addr(source2, &source2_dev_id, &source2_flags); - /* check if either of the buffers
is on a device and if so make sure we can - * access handle it properly */ - if (target_check_addr > 0 || source1_check_addr > 0 || source2_check_addr > 0) { - if (ompi_datatype_is_predefined(dtype) && - op->o_flags & OMPI_OP_FLAGS_INTRINSIC && - NULL != op->o_device_op) { - use_device_op = true; - } else { - /* check whether we can access the memory from the host */ - if ((target_check_addr == 0 || (target_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && - (source1_check_addr == 0 || (source1_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && - (source2_check_addr == 0 || (source2_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { - /* nothing to be done, we won't need device-capable ops */ + if (device != MCA_ACCELERATOR_NO_DEVICE_ID && + ompi_datatype_is_predefined(dtype) && + op->o_flags & OMPI_OP_FLAGS_INTRINSIC && + NULL != op->o_device_op) { + use_device_op = true; + } + + if (!use_device_op) { + int source1_dev_id, source2_dev_id, target_dev_id; + uint64_t source1_flags, source2_flags, target_flags; + int target_check_addr = opal_accelerator.check_addr(target, &target_dev_id, &target_flags); + int source1_check_addr = opal_accelerator.check_addr(source1, &source1_dev_id, &source1_flags); + int source2_check_addr = opal_accelerator.check_addr(source2, &source2_dev_id, &source2_flags); + /* check if either of the buffers is on a device and if so make sure we can + * access handle it properly */ + if (target_check_addr > 0 || source1_check_addr > 0 || source2_check_addr > 0) { + if (ompi_datatype_is_predefined(dtype) && + op->o_flags & OMPI_OP_FLAGS_INTRINSIC && + NULL != op->o_device_op) { + use_device_op = true; + device = target_dev_id; } else { - fprintf(stderr, "3buff op: no suitable op module found for device memory!\n"); - abort(); + /* check whether we can access the memory from the host */ + if ((target_check_addr == 0 || (target_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && + (source1_check_addr == 0 || (source1_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) && + (source2_check_addr == 0 || (source2_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { + /* nothing to be done, we won't need device-capable ops */ + } else { + fprintf(stderr, "3buff op: no suitable op module found for device memory!\n"); + abort(); + } } } } @@ -780,11 +807,11 @@ static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, opal_accelerator_stream_t *actual_stream = stream; bool flush_stream = false; if (NULL == stream) { - opal_accelerator.get_default_stream(target_dev_id, &actual_stream); + opal_accelerator.get_default_stream(device, &actual_stream); flush_stream = true; } op->o_device_op->do_3buff_intrinsic.fns[dtype_id](source1, source2, target, - &count, &dtype, actual_stream, + &count, &dtype, device, actual_stream, op->o_device_op->do_3buff_intrinsic.modules[dtype_id]); if (flush_stream) { opal_accelerator.wait_stream(actual_stream); @@ -810,7 +837,7 @@ static inline void ompi_3buff_op_reduce(ompi_op_t * op, void *source1, tgt = target; if (OPAL_LIKELY(ompi_op_is_intrinsic (op))) { - ompi_3buff_op_reduce_stream(op, source1, source2, target, count, dtype, NULL); + ompi_3buff_op_reduce_stream(op, source1, source2, target, count, dtype, MCA_ACCELERATOR_NO_DEVICE_ID, NULL); #if 0 op->o_3buff_intrinsic.fns[ompi_op_ddt_map[dtype->id]](src1, src2, tgt, &count, diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 71684227480..0f0ce05235f 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ 
b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -112,6 +112,7 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 //*flags |= MCA_ACCELERATOR_FLAGS_HOST_ATOMICS; /* First access on a device pointer triggers ROCM support lazy initialization. */ opal_accelerator_rocm_lazy_init(); + *dev_id = srcAttr.device; ret = 1; #if HIP_VERSION >= 50731921 } else if (hipMemoryTypeUnified == srcAttr.type) { @@ -122,6 +123,7 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 //*flags |= MCA_ACCELERATOR_FLAGS_HOST_LDSTR; //*flags |= MCA_ACCELERATOR_FLAGS_HOST_ATOMICS; ret = 1; + *dev_id = srcAttr.device; } } From ee31b602a943763ad6c538624b77a78e11b07d13 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jun 2023 18:54:26 +0000 Subject: [PATCH 37/74] CUDA/ROCm: Fix vectorized ops and rocm integration Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/Makefile.am | 2 +- ompi/mca/op/cuda/op_cuda_functions.c | 32 ++++++++++++++++------------ ompi/mca/op/cuda/op_cuda_impl.cu | 11 +++------- ompi/mca/op/rocm/Makefile.am | 8 +++---- ompi/mca/op/rocm/op_rocm_impl.cpp | 5 +++-- 5 files changed, 28 insertions(+), 30 deletions(-) diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index f96d8b8a896..27b3edf6aaf 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -23,7 +23,7 @@ sources = op_cuda_component.c op_cuda.h op_cuda_functions.c op_cuda_impl.h #sources_extended = op_cuda_functions.cu cu_sources = op_cuda_impl.cu -NVCC = nvcc -allow-unsupported-compiler +NVCC = nvcc -O2 .cu.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index f3e211aad1e..125c5140aa6 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -215,22 +215,23 @@ static inline void device_op_post(void *source1, static \ void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ \ _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ switch(sizeof(type)) { \ case sizeof(int8_t): \ - ompi_op_cuda_2buff_##name##_int8_t(in, inout, count, dtype, stream, module); \ + ompi_op_cuda_2buff_##name##_int8_t(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(int16_t): \ - ompi_op_cuda_2buff_##name##_int16_t(in, inout, count, dtype, stream, module); \ + ompi_op_cuda_2buff_##name##_int16_t(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(int32_t): \ - ompi_op_cuda_2buff_##name##_int32_t(in, inout, count, dtype, stream, module); \ + ompi_op_cuda_2buff_##name##_int32_t(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(int64_t): \ - ompi_op_cuda_2buff_##name##_int64_t(in, inout, count, dtype, stream, module); \ + ompi_op_cuda_2buff_##name##_int64_t(in, inout, count, dtype, device, stream, module); \ break; \ } \ } @@ -240,18 +241,19 @@ static inline void device_op_post(void *source1, static \ void ompi_op_cuda_2buff_##name##_##type_name(const void *in, void *inout, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ switch(sizeof(type)) { \ case 
sizeof(float): \ - ompi_op_cuda_2buff_##name##_float(in, inout, count, dtype, stream, module); \ + ompi_op_cuda_2buff_##name##_float(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(double): \ - ompi_op_cuda_2buff_##name##_double(in, inout, count, dtype, stream, module); \ + ompi_op_cuda_2buff_##name##_double(in, inout, count, dtype, device, stream, module); \ break; \ case sizeof(long double): \ - ompi_op_cuda_2buff_##name##_long_double(in, inout, count, dtype, stream, module); \ + ompi_op_cuda_2buff_##name##_long_double(in, inout, count, dtype, device, stream, module); \ break; \ } \ } @@ -837,22 +839,23 @@ LOC_FUNC(minloc, long_double_int, <) static \ void ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ \ _Static_assert(sizeof(type) >= sizeof(int8_t) && sizeof(type) <= sizeof(int64_t)); \ switch(sizeof(type)) { \ case sizeof(int8_t): \ - ompi_op_cuda_3buff_##name##_int8_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_cuda_3buff_##name##_int8_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(int16_t): \ - ompi_op_cuda_3buff_##name##_int16_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_cuda_3buff_##name##_int16_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(int32_t): \ - ompi_op_cuda_3buff_##name##_int32_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_cuda_3buff_##name##_int32_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(int64_t): \ - ompi_op_cuda_3buff_##name##_int64_t(in1, in2, out, count, dtype, stream, module); \ + ompi_op_cuda_3buff_##name##_int64_t(in1, in2, out, count, dtype, device, stream, module); \ break; \ } \ } @@ -862,18 +865,19 @@ LOC_FUNC(minloc, long_double_int, <) static \ void ompi_op_cuda_3buff_##name##_##type_name(const void *in1, const void *in2, void *out, int *count, \ struct ompi_datatype_t **dtype, \ + int device, \ opal_accelerator_stream_t *stream, \ struct ompi_op_base_module_1_0_0_t *module) { \ _Static_assert(sizeof(type) >= sizeof(float) && sizeof(type) <= sizeof(long double)); \ switch(sizeof(type)) { \ case sizeof(float): \ - ompi_op_cuda_3buff_##name##_float(in1, in2, out, count, dtype, stream, module); \ + ompi_op_cuda_3buff_##name##_float(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(double): \ - ompi_op_cuda_3buff_##name##_double(in1, in2, out, count, dtype, stream, module); \ + ompi_op_cuda_3buff_##name##_double(in1, in2, out, count, dtype, device, stream, module); \ break; \ case sizeof(long double): \ - ompi_op_cuda_3buff_##name##_long_double(in1, in2, out, count, dtype, stream, module); \ + ompi_op_cuda_3buff_##name##_long_double(in1, in2, out, count, dtype, device, stream, module); \ break; \ } \ } diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index 6ee3e0a512b..6dd61ab5ced 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -11,12 +11,6 @@ * $HEADER$ */ -#include - -#include - -#include - #include "op_cuda_impl.h" /* TODO: missing support for @@ -73,8 +67,9 @@ int threads_per_block, \ int max_blocks, \ CUstream stream) { \ - int threads = min(count/vlen, threads_per_block); \ - int blocks = min(((count/vlen) + threads-1) / threads, max_blocks); \ + int vcount = (count + vlen-1)/vlen; \ + int threads = 
min(threads_per_block, vcount); \ + int blocks = min((vcount + threads-1) / threads, max_blocks); \ int n = count; \ CUstream s = stream; \ ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index 091033f5284..d89724719e8 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -23,7 +23,7 @@ sources = op_rocm_component.c op_rocm.h op_rocm_functions.c op_rocm_impl.h rocm_sources = op_rocm_impl.cpp HIPCC = hipcc -HIPCCFLAGS= -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.5.0/include +#HIPCCFLAGS= -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.5.0/include .cpp.l$(OBJEXT): @@ -60,14 +60,12 @@ endif # The DSO should install itself in $(ompilibdir) (by default, # $prefix/lib/openmpi). -ROCMDIR=/opt/rocm-5.5.0/lib - mcacomponentdir = $(ompilibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_op_rocm_la_SOURCES = $(sources) mca_op_rocm_la_LIBADD = $(rocm_sources:.cpp=.lo) mca_op_rocm_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(accelerator_rocm_LIBS) $(HIPCCFLAGS) -L$(ROCMDIR)/lib -lhiprtc + $(accelerator_rocm_LIBS) $(HIPCCFLAGS) EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) # Specific information for static builds. @@ -79,6 +77,6 @@ noinst_LTLIBRARIES = $(component_noinst) libmca_op_rocm_la_SOURCES = $(sources) libmca_op_rocm_la_LIBADD = $(rocm_sources:.cpp=.lo) libmca_op_rocm_la_LDFLAGS = -module -avoid-version\ - $(accelerator_rocm_LIBS) ${HIPCCFLAGS} -L$(ROCMDIR)/lib -lhiprtc + $(accelerator_rocm_LIBS) ${HIPCCFLAGS} EXTRA_libmca_op_rocm_la_SOURCES = $(rocm_sources) diff --git a/ompi/mca/op/rocm/op_rocm_impl.cpp b/ompi/mca/op/rocm/op_rocm_impl.cpp index be73c493007..d8ad5f7fc90 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.cpp +++ b/ompi/mca/op/rocm/op_rocm_impl.cpp @@ -81,8 +81,9 @@ int threads_per_block, \ int max_blocks, \ hipStream_t stream) { \ - int threads = min(threads_per_block, (count/vlen)); \ - int blocks = min(((count/vlen) + threads-1) / threads, max_blocks); \ + int vcount = (count + vlen-1)/vlen; \ + int threads = min(threads_per_block, vcount); \ + int blocks = min((vcount + threads-1) / threads, max_blocks); \ int n = count; \ hipStream_t s = stream; \ hipLaunchKernelGGL(ompi_op_rocm_2buff_##name##_##type_name##_kernel, \ From 9ab499a6aafe1650971380b43a5c3577b7ec2ef9 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jun 2023 19:42:19 +0000 Subject: [PATCH 38/74] Reduce_local: use OPAL defines to detect device support Signed-off-by: Joseph Schuchart --- test/datatype/reduce_local.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index e5a2d078abd..134376676a2 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -22,13 +22,15 @@ // TODO: detect through configure //#define HAVE_CUDA 1 -#define HAVE_ROCM 1 +//#define HAVE_ROCM 1 #include "mpi.h" #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/runtime/mpiruntime.h" +#include "opal_config.h" + typedef struct op_name_s { char *name; char *mpi_op_name; @@ -244,7 +246,7 @@ static allocator_t host_allocator = { .free = &host_free, .fini = &host_fini}; -#if defined(HAVE_CUDA) +#if defined(OPAL_CUDA_SUPPORT) #include static void cuda_init() { // nothing to be done @@ -278,7 +280,7 @@ static allocator_t cuda_allocator = { .free = &cuda_free, .fini = &cuda_fini}; 
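A note on the launch-geometry change in the two *_submit hunks above (op_cuda_impl.cu and op_rocm_impl.cpp): the old count/vlen truncates, so a buffer shorter than the vector width requested zero threads, and for counts that are not a multiple of vlen the tail-handling thread (index == count/vlen) could be missing from the grid. The rounded-up form fixes both. Annotated copy of the corrected computation from the wrappers (min() comes from the CUDA/HIP host headers):

    int vcount  = (count + vlen - 1) / vlen;             /* ceil(count/vlen): >= 1 even for count < vlen */
    int threads = min(threads_per_block, vcount);        /* old code could compute 0 threads here        */
    int blocks  = min((vcount + threads - 1) / threads,  /* enough blocks to cover vcount ...            */
                      max_blocks);                       /* ... subject to the per-device cap            */
    /* example: count = 3, vlen = 4  ->  vcount = 1, threads = 1, blocks = 1
     * old formula: threads = 3/4 = 0, an invalid launch configuration */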
-#elif defined(HAVE_ROCM) +#elif defined(OPAL_ROCM_SUPPORT) #include static void rocm_init() { hipError_t ret = hipInit(0); @@ -412,12 +414,12 @@ int main(int argc, char **argv) // default allocator break; } else -#if defined(HAVE_CUDA) +#if defined(OPAL_CUDA_SUPPORT) if (0 == strncmp("cuda", optarg, 4)) { allocator = &cuda_allocator; break; } else -#elif defined(HAVE_ROCM) +#elif defined(OPAL_ROCM_SUPPORT) if (0 == strncmp("rocm", optarg, 4)) { allocator = &rocm_allocator; break; @@ -438,10 +440,10 @@ int main(int argc, char **argv) " -o : comma separated list of operations to execute among\n" " sum, min, max, prod, bor, bxor, band\n" " -d : host" -#ifdef HAVE_CUDA +#ifdef OPAL_CUDA_SUPPORT ", cuda" #endif -#ifdef HAVE_ROCM +#ifdef OPAL_ROCM_SUPPORT ", rocm" #endif "\n" From dbd855d4dcff92f2c9cea6dda4818043715bf29e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 28 Jun 2023 19:42:58 +0000 Subject: [PATCH 39/74] CUDA op: fix vectorized ops Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_impl.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index 6dd61ab5ced..64af502d968 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -18,6 +18,8 @@ * - complex */ +#define USE_VECTORS 1 + #define OP_FUNC(name, type_name, type, op) \ static __global__ void \ ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ @@ -51,7 +53,13 @@ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n/vlen; i += stride) { \ - ((vtype*)inout)[i] = ((vtype*)inout)[i] op ((vtype*)in)[i]; \ + vtype vin = ((vtype*)in)[i]; \ + vtype vinout = ((vtype*)inout)[i]; \ + vinout.x = vinout.x op vin.x; \ + vinout.y = vinout.y op vin.y; \ + vinout.z = vinout.z op vin.z; \ + vinout.w = vinout.w op vin.w; \ + ((vtype*)inout)[i] = vinout; \ } \ int remainder = n%vlen; \ if (index == (n/vlen) && remainder != 0) { \ From 02120c9e36c62660b4925196f050e991adf272e0 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 12 Jul 2023 16:05:28 +0000 Subject: [PATCH 40/74] Reduce: add vectors to cuda implementation Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/Makefile.am | 1 + ompi/mca/op/cuda/op_cuda_impl.cu | 320 +++++++++++++++++++++++++------ ompi/mca/op/cuda/op_cuda_impl.h | 6 + test/datatype/reduce_local.c | 2 +- 4 files changed, 272 insertions(+), 57 deletions(-) diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index 27b3edf6aaf..26af0c3e7f6 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -24,6 +24,7 @@ sources = op_cuda_component.c op_cuda.h op_cuda_functions.c op_cuda_impl.h cu_sources = op_cuda_impl.cu NVCC = nvcc -O2 +NVCCFLAGS=-Wc,--gpu-architecture=compute_53 .cu.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index 64af502d968..d10084ffa37 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -13,12 +13,59 @@ #include "op_cuda_impl.h" +#include + +#include + +#define ISSIGNED(x) std::is_signed_v + +template +static inline __device__ constexpr T tmax(T a, T b) { + return (a > b) ? a : b; +} + +template +static inline __device__ constexpr T tmin(T a, T b) { + return (a < b) ? 
a : b; +} + +template +static inline __device__ constexpr T tsum(T a, T b) { + return a+b; +} + +template +static inline __device__ constexpr T tprod(T a, T b) { + return a*b; +} + +template +static inline __device__ T vmax(const T& a, const T& b) { + return T{tmax(a.x, b.x), tmax(a.y, b.y), tmax(a.z, b.z), tmax(a.w, b.w)}; +} + +template +static inline __device__ T vmin(const T& a, const T& b) { + return T{tmin(a.x, b.x), tmin(a.y, b.y), tmin(a.z, b.z), tmin(a.w, b.w)}; +} + +template +static inline __device__ T vsum(const T& a, const T& b) { + return T{tsum(a.x, b.x), tsum(a.y, b.y), tsum(a.z, b.z), tsum(a.w, b.w)}; +} + +template +static inline __device__ T vprod(const T& a, const T& b) { + return T{(a.x * b.x), (a.y * b.y), (a.z * b.z), (a.w * b.w)}; +} + + /* TODO: missing support for * - short float (conditional on whether short float is available) * - complex */ -#define USE_VECTORS 1 +//#define USE_VECTORS 1 #define OP_FUNC(name, type_name, type, op) \ static __global__ void \ @@ -86,14 +133,14 @@ #define OPV_FUNC(name, type_name, type, vtype, vlen, op) OP_FUNC(name, type_name, type, op) #endif // USE_VECTORS -#define FUNC_FUNC(name, type_name, type) \ +#define FUNC_FUNC_FN(name, type_name, type, fn) \ static __global__ void \ ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ type *__restrict__ inout, int n) { \ const int index = blockIdx.x * blockDim.x + threadIdx.x; \ const int stride = blockDim.x * gridDim.x; \ for (int i = index; i < n; i += stride) { \ - inout[i] = current_func(inout[i], in[i]); \ + inout[i] = fn(inout[i], in[i]); \ } \ } \ void \ @@ -107,10 +154,47 @@ int blocks = min((count + threads-1) / threads, max_blocks); \ int n = count; \ CUstream s = stream; \ - blocks = (blocks > 64) ? 64 : blocks; \ ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ } +#define FUNC_FUNC(name, type_name, type) FUNC_FUNC_FN(name, type_name, type, current_func) + +#if defined(USE_VECTORS) +#define VFUNC_FUNC(name, type_name, type, vtype, vlen, vfn, fn) \ + static __global__ void \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n/vlen; i += stride) { \ + ((vtype*)inout)[i] = vfn(((vtype*)inout)[i], ((vtype*)in)[i]); \ + } \ + int remainder = n%vlen; \ + if (index == (n/vlen) && remainder != 0) { \ + while(remainder) { \ + int idx = n - remainder--; \ + inout[idx] = fn(inout[idx], in[idx]); \ + } \ + } \ + } \ + static void \ + ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ + CUstream stream) { \ + int vcount = (count + vlen-1)/vlen; \ + int threads = min(threads_per_block, vcount); \ + int blocks = min((vcount + threads-1) / threads, max_blocks); \ + int n = count; \ + CUstream s = stream; \ + ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + } +#else +#define VFUNC_FUNC(name, type_name, type, vtype, vlen, vfn, fn) FUNC_FUNC_FN(name, type_name, type, fn) +#endif // defined(USE_VECTORS) + /* * Since all the functions in this file are essentially identical, we * use a macro to substitute in names and types. 
The core operation @@ -151,27 +235,89 @@ ompi_op_cuda_2buff_##name##_##type_name##_kernel<<>>(a, b, count); \ } +#define OPV_DISPATCH(name, type_name, type) \ + void ompi_op_cuda_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ + CUstream stream) { \ + static_assert(sizeof(type_name) <= sizeof(unsigned long long), "Unknown size type"); \ + if constexpr(!ISSIGNED(type)) { \ + if constexpr(sizeof(type_name) == sizeof(unsigned char)) { \ + ompi_op_cuda_2buff_##name##_uchar_submit((const unsigned char*)in, (unsigned char*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned short)) { \ + ompi_op_cuda_2buff_##name##_ushort_submit((const unsigned short*)in, (unsigned short*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned int)) { \ + ompi_op_cuda_2buff_##name##_uint_submit((const unsigned int*)in, (unsigned int*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned long)) { \ + ompi_op_cuda_2buff_##name##_ulong_submit((const unsigned long*)in, (unsigned long*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned long long)) { \ + ompi_op_cuda_2buff_##name##_ulonglong_submit((const unsigned long long*)in, (unsigned long long*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } \ + } else { \ + if constexpr(sizeof(type_name) == sizeof(char)) { \ + ompi_op_cuda_2buff_##name##_char_submit((const char*)in, (char*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(short)) { \ + ompi_op_cuda_2buff_##name##_short_submit((const short*)in, (short*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(int)) { \ + ompi_op_cuda_2buff_##name##_int_submit((const int*)in, (int*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(long)) { \ + ompi_op_cuda_2buff_##name##_long_submit((const long*)in, (long*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(long long)) { \ + ompi_op_cuda_2buff_##name##_longlong_submit((const long long*)in, (long long*)inout, count,\ + threads_per_block, \ + max_blocks, stream); \ + } \ + } \ + } + /************************************************************************* * Max *************************************************************************/ +/* C integer */ +VFUNC_FUNC(max, char, char, char4, 4, vmax, max) +VFUNC_FUNC(max, uchar, unsigned char, uchar4, 4, vmax, max) +VFUNC_FUNC(max, short, short, short4, 4, vmax, max) +VFUNC_FUNC(max, ushort, unsigned short, ushort4, 4, vmax, max) +VFUNC_FUNC(max, int, int, int4, 4, vmax, max) +VFUNC_FUNC(max, uint, unsigned int, uint4, 4, vmax, max) + #undef current_func -#if defined(DO_NOT_USE_INTRINSICS) -#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) -#else // DO_NOT_USE_INTRINSICS #define current_func(a, b) max(a, b) -#endif // DO_NOT_USE_INTRINSICS -/* C integer */ -FUNC_FUNC(max, int8_t, int8_t) -FUNC_FUNC(max, uint8_t, uint8_t) -FUNC_FUNC(max, int16_t, int16_t) -FUNC_FUNC(max, uint16_t, uint16_t) -FUNC_FUNC(max, int32_t, int32_t) -FUNC_FUNC(max, uint32_t, uint32_t) -FUNC_FUNC(max, int64_t, int64_t) -FUNC_FUNC(max, uint64_t, uint64_t) FUNC_FUNC(max, long, long) -FUNC_FUNC(max, unsigned_long, unsigned long) +FUNC_FUNC(max, ulong, unsigned long) +FUNC_FUNC(max, longlong, long long) +FUNC_FUNC(max, ulonglong, unsigned long long) + +/* dispatch fixed-size types */ +OPV_DISPATCH(max, int8_t, int8_t) +OPV_DISPATCH(max, uint8_t, uint8_t) +OPV_DISPATCH(max, int16_t, int16_t) +OPV_DISPATCH(max, uint16_t, uint16_t) +OPV_DISPATCH(max, int32_t, int32_t) +OPV_DISPATCH(max, uint32_t, uint32_t) +OPV_DISPATCH(max, int64_t, int64_t) +OPV_DISPATCH(max, uint64_t, uint64_t) #undef current_func #define current_func(a, b) ((a) > (b) ? (a) : (b)) @@ -189,27 +335,40 @@ FUNC_FUNC(max, float, float) #endif // DO_NOT_USE_INTRINSICS FUNC_FUNC(max, double, double) +// __CUDA_ARCH__ is only defined when compiling device code +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 +#undef current_func +#define current_func(a, b) __hmax2(a, b) +VFUNC_FUNC(max, halfx, half, half2, 2, __hmax2, __hmax) +#endif // __CUDA_ARCH__ + /************************************************************************* * Min *************************************************************************/ +/* C integer */ +VFUNC_FUNC(min, char, char, char4, 4, vmin, min) +VFUNC_FUNC(min, uchar, unsigned char, uchar4, 4, vmin, min) +VFUNC_FUNC(min, short, short, short4, 4, vmin, min) +VFUNC_FUNC(min, ushort, unsigned short, ushort4, 4, vmin, min) +VFUNC_FUNC(min, int, int, int4, 4, vmin, min) +VFUNC_FUNC(min, uint, unsigned int, uint4, 4, vmin, min) + #undef current_func -#if defined(DO_NOT_USE_INTRINSICS) -#define current_func(a, b) ((a) < (b) ? (a) : (b)) -#else // DO_NOT_USE_INTRINSICS #define current_func(a, b) min(a, b) -#endif // DO_NOT_USE_INTRINSICS -/* C integer */ -FUNC_FUNC(min, int8_t, int8_t) -FUNC_FUNC(min, uint8_t, uint8_t) -FUNC_FUNC(min, int16_t, int16_t) -FUNC_FUNC(min, uint16_t, uint16_t) -FUNC_FUNC(min, int32_t, int32_t) -FUNC_FUNC(min, uint32_t, uint32_t) -FUNC_FUNC(min, int64_t, int64_t) -FUNC_FUNC(min, uint64_t, uint64_t) FUNC_FUNC(min, long, long) -FUNC_FUNC(min, unsigned_long, unsigned long) +FUNC_FUNC(min, ulong, unsigned long) +FUNC_FUNC(min, longlong, long long) +FUNC_FUNC(min, ulonglong, unsigned long long) +OPV_DISPATCH(min, int8_t, int8_t) +OPV_DISPATCH(min, uint8_t, uint8_t) +OPV_DISPATCH(min, int16_t, int16_t) +OPV_DISPATCH(min, uint16_t, uint16_t) +OPV_DISPATCH(min, int32_t, int32_t) +OPV_DISPATCH(min, uint32_t, uint32_t) +OPV_DISPATCH(min, int64_t, int64_t) +OPV_DISPATCH(min, uint64_t, uint64_t) + #if !defined(DO_NOT_USE_INTRINSICS) @@ -228,26 +387,52 @@ FUNC_FUNC(min, double, double) #define current_func(a, b) ((a) < (b) ? 
(a) : (b)) FUNC_FUNC(min, long_double, long double) +// __CUDA_ARCH__ is only defined when compiling device code +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 +#undef current_func +#define current_func(a, b) __hmin2(a, b) +VFUNC_FUNC(min, half, half, half2, 2, __hmin2, __hmin) +#endif // __CUDA_ARCH__ + /************************************************************************* * Sum *************************************************************************/ /* C integer */ -OP_FUNC(sum, int8_t, int8_t, +) -OP_FUNC(sum, uint8_t, uint8_t, +) -OP_FUNC(sum, int16_t, int16_t, +) -OP_FUNC(sum, uint16_t, uint16_t, +) -OP_FUNC(sum, int32_t, int32_t, +) -OP_FUNC(sum, uint32_t, uint32_t, +) -OP_FUNC(sum, int64_t, int64_t, +) -OP_FUNC(sum, uint64_t, uint64_t, +) -OP_FUNC(sum, long, long, +) -OP_FUNC(sum, unsigned_long, unsigned long, +) +VFUNC_FUNC(sum, char, char, char4, 4, vsum, tsum) +VFUNC_FUNC(sum, uchar, unsigned char, uchar4, 4, vsum, tsum) +VFUNC_FUNC(sum, short, short, short4, 4, vsum, tsum) +VFUNC_FUNC(sum, ushort, unsigned short, ushort4, 4, vsum, tsum) +VFUNC_FUNC(sum, int, int, int4, 4, vsum, tsum) +VFUNC_FUNC(sum, uint, unsigned int, uint4, 4, vsum, tsum) + +#undef current_func +#define current_func(a, b) tsum(a, b) +FUNC_FUNC(sum, long, long) +FUNC_FUNC(sum, ulong, unsigned long) +FUNC_FUNC(sum, longlong, long long) +FUNC_FUNC(sum, ulonglong, unsigned long long) + +OPV_DISPATCH(sum, int8_t, int8_t) +OPV_DISPATCH(sum, uint8_t, uint8_t) +OPV_DISPATCH(sum, int16_t, int16_t) +OPV_DISPATCH(sum, uint16_t, uint16_t) +OPV_DISPATCH(sum, int32_t, int32_t) +OPV_DISPATCH(sum, uint32_t, uint32_t) +OPV_DISPATCH(sum, int64_t, int64_t) +OPV_DISPATCH(sum, uint64_t, uint64_t) OPV_FUNC(sum, float, float, float4, 4, +) OPV_FUNC(sum, double, double, double4, 4, +) OP_FUNC(sum, long_double, long double, +) +// __CUDA_ARCH__ is only defined when compiling device code +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 +#undef current_func +#define current_func(a, b) __hadd2(a, b) +VFUNC_FUNC(sum, half, half, half2, 2, __hadd2, __hadd) +#endif // __CUDA_ARCH__ + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -257,10 +442,10 @@ COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) #endif #endif // 0 #undef current_func -#define current_func(a, b) (cuCmulf(a,b)) +#define current_func(a, b) (cuCaddf(a,b)) FUNC_FUNC(sum, c_float_complex, cuFloatComplex) #undef current_func -#define current_func(a, b) (cuCmul(a,b)) +#define current_func(a, b) (cuCadd(a,b)) FUNC_FUNC(sum, c_double_complex, cuDoubleComplex) //OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) @@ -269,21 +454,40 @@ FUNC_FUNC(sum, c_double_complex, cuDoubleComplex) *************************************************************************/ /* C integer */ -OP_FUNC(prod, int8_t, int8_t, *) -OP_FUNC(prod, uint8_t, uint8_t, *) -OP_FUNC(prod, int16_t, int16_t, *) -OP_FUNC(prod, uint16_t, uint16_t, *) -OP_FUNC(prod, int32_t, int32_t, *) -OP_FUNC(prod, uint32_t, uint32_t, *) -OP_FUNC(prod, int64_t, int64_t, *) -OP_FUNC(prod, uint64_t, uint64_t, *) -OP_FUNC(prod, long, long, *) -OP_FUNC(prod, unsigned_long, unsigned long, *) +#undef current_func +#define current_func(a, b) tprod(a, b) +FUNC_FUNC(prod, char, char) +FUNC_FUNC(prod, uchar, unsigned char) +FUNC_FUNC(prod, short, short) +FUNC_FUNC(prod, ushort, unsigned short) +FUNC_FUNC(prod, int, int) +FUNC_FUNC(prod, uint, unsigned int) +FUNC_FUNC(prod, long, long) +FUNC_FUNC(prod, ulong, unsigned long) +FUNC_FUNC(prod, longlong, long long) +FUNC_FUNC(prod, ulonglong, 
unsigned long long) + +OPV_DISPATCH(prod, int8_t, int8_t) +OPV_DISPATCH(prod, uint8_t, uint8_t) +OPV_DISPATCH(prod, int16_t, int16_t) +OPV_DISPATCH(prod, uint16_t, uint16_t) +OPV_DISPATCH(prod, int32_t, int32_t) +OPV_DISPATCH(prod, uint32_t, uint32_t) +OPV_DISPATCH(prod, int64_t, int64_t) +OPV_DISPATCH(prod, uint64_t, uint64_t) + OPV_FUNC(prod, float, float, float4, 4, *) OPV_FUNC(prod, double, double, double4, 4, *) OP_FUNC(prod, long_double, long double, *) +// __CUDA_ARCH__ is only defined when compiling device code +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 +#undef current_func +#define current_func(a, b) __hmul2(a, b) +VFUNC_FUNC(prod, half, half, half2, 2, __hmul2, __hmul) +#endif // __CUDA_ARCH__ + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -291,10 +495,14 @@ OP_FUNC(prod, c_short_float_complex, short float _Complex, *=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC(prod, c_float_complex, float _Complex, *=) -OP_FUNC(prod, c_double_complex, double _Complex, *=) OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) #endif // 0 +#undef current_func +#define current_func(a, b) (cuCmulf(a,b)) +FUNC_FUNC(prod, c_float_complex, cuFloatComplex) +#undef current_func +#define current_func(a, b) (cuCmul(a,b)) +FUNC_FUNC(prod, c_double_complex, cuDoubleComplex) /************************************************************************* * Logical AND diff --git a/ompi/mca/op/cuda/op_cuda_impl.h b/ompi/mca/op/cuda/op_cuda_impl.h index 2f9b5961f44..501ee802a31 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.h +++ b/ompi/mca/op/cuda/op_cuda_impl.h @@ -15,6 +15,7 @@ #include #include +#include #ifndef BEGIN_C_DECLS #if defined(c_plusplus) || defined(__cplusplus) @@ -139,6 +140,10 @@ OP_FUNC_SIG(sum, uint64_t, uint64_t, +=) OP_FUNC_SIG(sum, long, long, +=) OP_FUNC_SIG(sum, unsigned_long, unsigned long, +=) +#if __CUDA_ARCH__ >= 530 +OP_FUNC_SIG(sum, half, half, +=) +#endif // __CUDA_ARCH__ + #if 0 /* Floating point */ #if defined(HAVE_SHORT_FLOAT) @@ -191,6 +196,7 @@ OP_FUNC_SIG(prod, short_float, opal_short_float_t, *=) #endif #endif // 0 +OP_FUNC_SIG(prod, float, float, *=) OP_FUNC_SIG(prod, float, float, *=) OP_FUNC_SIG(prod, double, double, *=) OP_FUNC_SIG(prod, long_double, long double, *=) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index 134376676a2..4bf3ce93990 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -1006,7 +1006,7 @@ int main(int argc, char **argv) MPI_OP_TEST(+, mpi_op, MPI_DOUBLE, double, init_in_double, in_double, init_inout_double, inout_double, - inout_double_for_check, count, "g"); + inout_double_for_check, count, "f"); } if (0 == strcmp(op, "prod")) { MPI_OP_TEST(*, mpi_op, MPI_DOUBLE, double, From 7cdbe248c27f25c522a6fb54886566bf77a8005e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 18 Jul 2023 13:38:58 +0000 Subject: [PATCH 41/74] Allreduce: cleanup and minor fixes Replace ompi_op_reduce with ompi_op_reduce_stream(..., NULL) to avoid repeated checking for locality in ompi_op_reduce Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 73 ++++++++++++------------ 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 504cd9468c5..5a042b07489 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -172,6 +172,8 @@ 
ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, opal_accelerator.get_default_stream(op_dev, &stream); } + /* TODO: These copies are only relevant if buffers are not on the same device. + * Can we figure out whether the op-device can access these remote buffers directly? */ tmpsend = (char*) sbuf; if (op_dev != recvbuf_dev) { /* copy data to where the op wants it to be */ @@ -212,11 +214,6 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, adjsize = opal_next_poweroftwo (size); adjsize >>= 1; - /* wait for above copies to complete */ - if (NULL != stream) { - opal_accelerator.wait_stream(stream); - } - /* Handle non-power-of-two case: - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and sets new rank to -1. @@ -227,6 +224,10 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, extra_ranks = size - adjsize; if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { + /* wait for above copies to complete */ + if (NULL != stream) { + opal_accelerator.wait_stream(stream); + } /* wait for tmpsend to be copied */ ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, @@ -337,7 +338,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } tmpsend = (char*)rbuf; } else { - /* wait for previous ops to complete to complete */ + /* wait for previous ops to complete */ if (NULL != stream) { opal_accelerator.wait_stream(stream); } @@ -350,6 +351,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Ensure that the final result is in rbuf */ if (tmpsend != rbuf) { + /* TODO: catch this case in the 3buf selection above. Maybe already caught? */ ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, (char*)rbuf, tmpsend, stream); if (ret < 0) { line = __LINE__; goto error_hndl; } } @@ -530,13 +532,6 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, use_sbuf = false; } -#if 0 - if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } -#endif // 0 - /* Computation loop */ /* @@ -598,10 +593,12 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, if (use_sbuf) { void *tmpsbuf = ((char*)sbuf) + (ptrdiff_t)block_offset * extent; /* tmprecv = inbuf[inbi ^ 0x1] (op) sbuf */ - ompi_3buff_op_reduce(op, inbuf[inbi ^ 0x1], tmpsbuf, tmprecv, block_count, dtype); + ompi_3buff_op_reduce_stream(op, inbuf[inbi ^ 0x1], tmpsbuf, tmprecv, block_count, + dtype, op_dev, NULL); } else { /* tmprecv = inbuf[inbi ^ 0x1] (op) tmprecv */ - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); + ompi_op_reduce_stream(op, inbuf[inbi ^ 0x1], tmprecv, block_count, + dtype, op_dev, NULL); } /* send previous block to send_to */ @@ -623,7 +620,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, ((ptrdiff_t)recv_from * late_segcount + split_rank)); block_count = ((recv_from < split_rank)? 
early_segcount : late_segcount); tmprecv = ((char*)recvbuf) + (ptrdiff_t)block_offset * extent; - ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype); + ompi_op_reduce_stream(op, inbuf[inbi], tmprecv, block_count, dtype, op_dev, NULL); /* Distribution loop - variation of ring allgather */ send_to = (rank + 1) % size; @@ -766,7 +763,6 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int int ret, line, rank, size, k, recv_from, send_to; int early_blockcount, late_blockcount, split_rank; int segcount, max_segcount, num_phases, phase, block_count, inbi; - int inbuf_dev[2] = {-1, -1}; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; ptrdiff_t block_offset, max_real_segsize; @@ -826,16 +822,19 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); + int sendbuf_dev, recvbuf_dev, op_dev; + ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); /* Allocate and initialize temporary buffers */ - inbuf[0] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, count, dtype, &inbuf_dev[0], module); + inbuf[0] = ompi_coll_base_allocate_on_device(op_dev, max_real_segsize, module); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } if (size > 2) { - inbuf[1] = ompi_coll_base_allocate_op_tmpbuf(NULL, rbuf, max_real_segsize, op, count, dtype, &inbuf_dev[1], module); + inbuf[1] = ompi_coll_base_allocate_on_device(op_dev, max_real_segsize, module); if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } } /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE != sbuf) { + /* TODO: can we avoid this copy? 
*/ ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } @@ -921,7 +920,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype); + ompi_op_reduce_stream(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, + dtype, op_dev, NULL); /* send previous block to send_to */ ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to, @@ -950,7 +950,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; - ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype); + ompi_op_reduce_stream(op, inbuf[inbi], tmprecv, phase_count, + dtype, op_dev, NULL); } /* Distribution loop - variation of ring allgather */ @@ -982,8 +983,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int } - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); + ompi_coll_base_free_tmpbuf(inbuf[0], op_dev, module); + ompi_coll_base_free_tmpbuf(inbuf[1], op_dev, module); return MPI_SUCCESS; @@ -992,8 +993,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int __FILE__, line, rank, ret)); ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning - ompi_coll_base_free_tmpbuf(inbuf[0], inbuf_dev[0], module); - ompi_coll_base_free_tmpbuf(inbuf[1], inbuf_dev[1], module); + ompi_coll_base_free_tmpbuf(inbuf[0], op_dev, module); + ompi_coll_base_free_tmpbuf(inbuf[1], op_dev, module); return ret; } @@ -1217,15 +1218,16 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( * We're not using a stream here, the reduction will make sure that the result is available upon return */ if (MPI_IN_PLACE != sbuf) { /* rbuf = sbuf (op) tmp_buf */ - ompi_3buff_op_reduce(op, - (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, - (char *)sbuf + (ptrdiff_t)count_lhalf * extent, - (char *)recvbuf + count_lhalf * extent, - count_rhalf, dtype); + ompi_3buff_op_reduce_stream(op, + (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + (char *)sbuf + (ptrdiff_t)count_lhalf * extent, + (char *)recvbuf + count_lhalf * extent, + count_rhalf, dtype, op_dev, NULL); } else { /* rbuf = rbuf (op) tmp_buf */ - ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, - (char *)recvbuf + count_lhalf * extent, count_rhalf, dtype); + ompi_op_reduce_stream(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + (char *)recvbuf + count_lhalf * extent, count_rhalf, + dtype, op_dev, NULL); } /* Send the right half to the left neighbor */ @@ -1256,7 +1258,6 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( if (MPI_IN_PLACE != sbuf) { /* rbuf = sbuf (op) tmp_buf */ ompi_3buff_op_reduce_stream(op, sbuf, tmp_buf, recvbuf, count_lhalf, dtype, op_dev, stream); - } else { /* rbuf = rbuf (op) tmp_buf */ ompi_op_reduce_stream(op, tmp_buf, recvbuf, count_lhalf, dtype, op_dev, stream); @@ -1349,9 +1350,9 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Local reduce: 
rbuf[] = tmp_buf[] rbuf[] */ - ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, - (char *)recvbuf + (ptrdiff_t)rindex[step] * extent, - rcount[step], dtype); + ompi_op_reduce_stream(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, + (char *)recvbuf + (ptrdiff_t)rindex[step] * extent, + rcount[step], dtype, op_dev, NULL); /* Move the current window to the received message */ if (step + 1 < nsteps) { From c7fe5f6b831533a7c6ac991e08d3900780c7daf3 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 19 Jul 2023 15:39:45 +0000 Subject: [PATCH 42/74] Add MCA op_[cuda|rocm]_max_num_[blocks|threads] These variables allow users to limit the maximum number of blocks and threads per block in the reduction kernels. The implementation will fall back to the device limit if lower. Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda.h | 2 ++ ompi/mca/op/cuda/op_cuda_component.c | 39 +++++++++++++++++++++++++++- ompi/mca/op/rocm/op_rocm.h | 2 ++ ompi/mca/op/rocm/op_rocm_component.c | 33 +++++++++++++++++++++-- 4 files changed, 73 insertions(+), 3 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index 86fdc3c6ace..2b7bb2831ee 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -55,6 +55,8 @@ typedef struct { CUstream cu_stream; CUcontext *cu_ctx; #endif // 0 + int cu_max_num_blocks; + int cu_max_num_threads; int *cu_max_threads_per_block; int *cu_max_blocks; CUdevice *cu_devices; diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index b519ffe60ec..611d936cd42 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -58,6 +58,8 @@ ompi_op_cuda_component_t mca_op_cuda_component = { .opc_init_query = cuda_component_init_query, .opc_op_query = cuda_component_op_query, }, + .cu_max_num_blocks = -1, + .cu_max_num_threads = -1, .cu_max_threads_per_block = NULL, .cu_max_blocks = NULL, .cu_devices = NULL, @@ -109,7 +111,24 @@ static int cuda_component_close(void) static int cuda_component_register(void) { - /* TODO: add mca paramters */ + mca_base_var_enum_flag_t *new_enum_flag = NULL; + (void) mca_base_component_var_register(&mca_op_cuda_component.super.opc_version, + "max_num_blocks", + "Maximum number of thread blocks in kernels (-1: device limit)", + MCA_BASE_VAR_TYPE_INT, + &(new_enum_flag->super), 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_op_cuda_component.cu_max_num_blocks); + + (void) mca_base_component_var_register(&mca_op_cuda_component.super.opc_version, + "max_num_threads", + "Maximum number of threads per block in kernels (-1: device limit)", + MCA_BASE_VAR_TYPE_INT, + &(new_enum_flag->super), 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_op_cuda_component.cu_max_num_threads); return OMPI_SUCCESS; } @@ -141,6 +160,15 @@ cuda_component_init_query(bool enable_progress_threads, /* fall-back to value that should work on every device */ mca_op_cuda_component.cu_max_threads_per_block[i] = 512; } + if (-1 < mca_op_cuda_component.cu_max_num_threads) { + if (mca_op_cuda_component.cu_max_threads_per_block[i] >= mca_op_cuda_component.cu_max_num_threads) { + mca_op_cuda_component.cu_max_threads_per_block[i] = mca_op_cuda_component.cu_max_num_threads; + } else { + printf("WARN: CUDA device %d does not support %d threads per block, falling back to %d\n", + i, mca_op_cuda_component.cu_max_num_threads, mca_op_cuda_component.cu_max_threads_per_block[i]); + } + } + rc = 
cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_blocks[i], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, mca_op_cuda_component.cu_devices[i]); @@ -148,6 +176,15 @@ cuda_component_init_query(bool enable_progress_threads, /* fall-back to value that should work on every device */ mca_op_cuda_component.cu_max_blocks[i] = 512; } + printf("max threads %d max blocks %d\n", mca_op_cuda_component.cu_max_threads_per_block[i], mca_op_cuda_component.cu_max_blocks[i]); + if (-1 < mca_op_cuda_component.cu_max_num_blocks) { + if (mca_op_cuda_component.cu_max_blocks[i] >= mca_op_cuda_component.cu_max_num_blocks) { + mca_op_cuda_component.cu_max_blocks[i] = mca_op_cuda_component.cu_max_num_blocks; + } else { + printf("WARN: CUDA device %d does not support %d blocks, falling back to %d\n", + i, mca_op_cuda_component.cu_max_num_blocks, mca_op_cuda_component.cu_max_blocks[i]); + } + } } #if 0 diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h index 6e8139fa239..dba668f828a 100644 --- a/ompi/mca/op/rocm/op_rocm.h +++ b/ompi/mca/op/rocm/op_rocm.h @@ -55,6 +55,8 @@ typedef struct { hipStream_t ro_stream; hipCtx_t *ro_ctx; #endif // 0 + int ro_max_num_blocks; + int ro_max_num_threads; int *ro_max_threads_per_block; int *ro_max_blocks; hipDevice_t *ro_devices; diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c index 844ee4224a3..d17bf53663c 100644 --- a/ompi/mca/op/rocm/op_rocm_component.c +++ b/ompi/mca/op/rocm/op_rocm_component.c @@ -58,6 +58,8 @@ ompi_op_rocm_component_t mca_op_rocm_component = { .opc_init_query = rocm_component_init_query, .opc_op_query = rocm_component_op_query, }, + .ro_max_num_blocks = -1, + .ro_max_num_threads = -1, .ro_max_threads_per_block = NULL, .ro_max_blocks = NULL, .ro_devices = NULL, @@ -112,7 +114,25 @@ rocm_component_register(void) { /* TODO: add mca paramters */ - printf("op rocm_component_register\n"); + mca_base_var_enum_flag_t *new_enum_flag = NULL; + (void) mca_base_component_var_register(&mca_op_rocm_component.super.opc_version, + "max_num_blocks", + "Maximum number of thread blocks in kernels (-1: device limit)", + MCA_BASE_VAR_TYPE_INT, + &(new_enum_flag->super), 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_op_rocm_component.ro_max_num_blocks); + + (void) mca_base_component_var_register(&mca_op_rocm_component.super.opc_version, + "max_num_threads", + "Maximum number of threads per block in kernels (-1: device limit)", + MCA_BASE_VAR_TYPE_INT, + &(new_enum_flag->super), 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_op_rocm_component.ro_max_num_threads); + return OMPI_SUCCESS; } @@ -143,6 +163,11 @@ rocm_component_init_query(bool enable_progress_threads, /* fall-back to value that should work on every device */ mca_op_rocm_component.ro_max_threads_per_block[i] = 512; } + if (-1 < mca_op_rocm_component.ro_max_num_threads) { + if (mca_op_rocm_component.ro_max_threads_per_block[i] > mca_op_rocm_component.ro_max_num_threads) { + mca_op_rocm_component.ro_max_threads_per_block[i] = mca_op_rocm_component.ro_max_num_threads; + } + } rc = hipDeviceGetAttribute(&mca_op_rocm_component.ro_max_blocks[i], hipDeviceAttributeMaxGridDimX, @@ -151,6 +176,11 @@ rocm_component_init_query(bool enable_progress_threads, /* we'll try to max out the blocks */ mca_op_rocm_component.ro_max_blocks[i] = 512; } + if (-1 < mca_op_rocm_component.ro_max_num_blocks) { + if (mca_op_rocm_component.ro_max_blocks[i] > mca_op_rocm_component.ro_max_num_blocks) { + mca_op_rocm_component.ro_max_blocks[i] = 
mca_op_rocm_component.ro_max_num_blocks; + } + } } #if 0 @@ -162,7 +192,6 @@ rocm_component_init_query(bool enable_progress_threads, mca_op_rocm_component.ro_stream = 0; } #endif // 0 - printf("op rocm_component_init_query\n"); return OMPI_SUCCESS; } From 42bd424fc3c053188ef7fc715c9ae165b6950f00 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 19 Jul 2023 22:10:03 +0000 Subject: [PATCH 43/74] Fix the generation of "unsigned char" ops. Signed-off-by: George Bosilca --- ompi/mca/op/cuda/op_cuda_impl.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index d10084ffa37..2b5dba7e3d5 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -257,7 +257,7 @@ static inline __device__ T vprod(const T& a, const T& b) { threads_per_block, \ max_blocks, stream); \ } else if constexpr(sizeof(type_name) == sizeof(unsigned long)) { \ - ompi_op_cuda_2buff_##name##_ulong_submit((const unsigned long*)in, (unsigned long*)inout, count, \ + ompi_op_cuda_2buff_##name##_unsigned_long_submit((const unsigned long*)in, (unsigned long*)inout, count, \ threads_per_block, \ max_blocks, stream); \ } else if constexpr(sizeof(type_name) == sizeof(unsigned long long)) { \ @@ -305,7 +305,7 @@ VFUNC_FUNC(max, uint, unsigned int, uint4, 4, vmax, max) #undef current_func #define current_func(a, b) max(a, b) FUNC_FUNC(max, long, long) -FUNC_FUNC(max, ulong, unsigned long) +FUNC_FUNC(max, unsigned_long, unsigned long) FUNC_FUNC(max, longlong, long long) FUNC_FUNC(max, ulonglong, unsigned long long) @@ -357,7 +357,7 @@ VFUNC_FUNC(min, uint, unsigned int, uint4, 4, vmin, min) #undef current_func #define current_func(a, b) min(a, b) FUNC_FUNC(min, long, long) -FUNC_FUNC(min, ulong, unsigned long) +FUNC_FUNC(min, unsigned_long, unsigned long) FUNC_FUNC(min, longlong, long long) FUNC_FUNC(min, ulonglong, unsigned long long) OPV_DISPATCH(min, int8_t, int8_t) @@ -409,7 +409,7 @@ VFUNC_FUNC(sum, uint, unsigned int, uint4, 4, vsum, tsum) #undef current_func #define current_func(a, b) tsum(a, b) FUNC_FUNC(sum, long, long) -FUNC_FUNC(sum, ulong, unsigned long) +FUNC_FUNC(sum, unsigned_long, unsigned long) FUNC_FUNC(sum, longlong, long long) FUNC_FUNC(sum, ulonglong, unsigned long long) @@ -463,7 +463,7 @@ FUNC_FUNC(prod, ushort, unsigned short) FUNC_FUNC(prod, int, int) FUNC_FUNC(prod, uint, unsigned int) FUNC_FUNC(prod, long, long) -FUNC_FUNC(prod, ulong, unsigned long) +FUNC_FUNC(prod, unsigned_long, unsigned long) FUNC_FUNC(prod, longlong, long long) FUNC_FUNC(prod, ulonglong, unsigned long long) From 8e3d042b008d7259a950f2e8dec2f20c77d41664 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 19 Jul 2023 22:10:29 +0000 Subject: [PATCH 44/74] We need CXX17 for the CUDA ops. 
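The dispatch helpers added with the vectorized reductions (OPV_DISPATCH) branch on sizeof(type) with if constexpr and test signedness with std::is_signed_v; both are C++17 features, hence the --std c++17 added to NVCCFLAGS below. A reduced, self-contained sketch of the pattern (illustrative only, not the actual macro; the printf stands in for the kernel launch):

    #include <cstdint>
    #include <cstdio>
    #include <type_traits>

    /* Map a fixed-width integer type onto the native-type kernel of the
     * same size and signedness, resolved entirely at compile time. */
    template <typename T>
    void dispatch(int count) {
        if constexpr (std::is_signed_v<T>) {                      /* C++17 */
            if constexpr (sizeof(T) == sizeof(int))
                std::printf("%d elements -> signed int kernel\n", count);
            else if constexpr (sizeof(T) == sizeof(long long))
                std::printf("%d elements -> signed long long kernel\n", count);
        } else {
            if constexpr (sizeof(T) == sizeof(unsigned int))
                std::printf("%d elements -> unsigned int kernel\n", count);
            else if constexpr (sizeof(T) == sizeof(unsigned long long))
                std::printf("%d elements -> unsigned long long kernel\n", count);
        }
    }

    /* usage: dispatch<int32_t>(1024); dispatch<uint64_t>(1024); */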
Signed-off-by: George Bosilca --- ompi/mca/op/cuda/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index 26af0c3e7f6..6f7dc89d97e 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -23,8 +23,8 @@ sources = op_cuda_component.c op_cuda.h op_cuda_functions.c op_cuda_impl.h #sources_extended = op_cuda_functions.cu cu_sources = op_cuda_impl.cu -NVCC = nvcc -O2 -NVCCFLAGS=-Wc,--gpu-architecture=compute_53 +NVCC = nvcc -g +NVCCFLAGS= --std c++17 --gpu-architecture=compute_80 .cu.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ From 7524f99ee0c2bec011f993f3c904b4359140133e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 09:51:41 +0000 Subject: [PATCH 45/74] ROCM: add vectorization of some basic ops Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_impl.cu | 2 +- ompi/mca/op/rocm/Makefile.am | 2 +- ompi/mca/op/rocm/op_rocm_impl.cpp | 297 ++++++++++++++++++++++++------ 3 files changed, 240 insertions(+), 61 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index 2b5dba7e3d5..faa52e43860 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -65,7 +65,7 @@ static inline __device__ T vprod(const T& a, const T& b) { * - complex */ -//#define USE_VECTORS 1 +#define USE_VECTORS 1 #define OP_FUNC(name, type_name, type, op) \ static __global__ void \ diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index d89724719e8..c2e941dee96 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -28,7 +28,7 @@ HIPCC = hipcc .cpp.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -O2 -fvectorize -prefer-non-pic $(HIPCCFLAGS) -Wc,-fPIC,-g -c $< + $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -O2 -std=c++17 -fvectorize -prefer-non-pic $(HIPCCFLAGS) -Wc,-fPIC,-g -c $< # -o $($@.o:.lo) diff --git a/ompi/mca/op/rocm/op_rocm_impl.cpp b/ompi/mca/op/rocm/op_rocm_impl.cpp index d8ad5f7fc90..c1789a13f14 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.cpp +++ b/ompi/mca/op/rocm/op_rocm_impl.cpp @@ -1,4 +1,3 @@ -#include "hip/hip_runtime.h" /* * Copyright (c) 2019-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -12,7 +11,9 @@ * $HEADER$ */ +#include "hip/hip_runtime.h" #include +#include #include @@ -21,14 +22,56 @@ //#define DO_NOT_USE_INTRINSICS 1 #define USE_VECTORS 1 +#include + +#define ISSIGNED(x) std::is_signed_v + +template +static inline __device__ constexpr T tmax(T a, T b) { + return (a > b) ? a : b; +} + +template +static inline __device__ constexpr T tmin(T a, T b) { + return (a < b) ? 
a : b; +} + +template +static inline __device__ constexpr T tsum(T a, T b) { + return a+b; +} + +template +static inline __device__ constexpr T tprod(T a, T b) { + return a*b; +} + +template +static inline __device__ T vmax(const T& a, const T& b) { + return T{tmax(a.x, b.x), tmax(a.y, b.y), tmax(a.z, b.z), tmax(a.w, b.w)}; +} + +template +static inline __device__ T vmin(const T& a, const T& b) { + return T{tmin(a.x, b.x), tmin(a.y, b.y), tmin(a.z, b.z), tmin(a.w, b.w)}; +} + +template +static inline __device__ T vsum(const T& a, const T& b) { + return T{tsum(a.x, b.x), tsum(a.y, b.y), tsum(a.z, b.z), tsum(a.w, b.w)}; +} + +template +static inline __device__ T vprod(const T& a, const T& b) { + return T{(a.x * b.x), (a.y * b.y), (a.z * b.z), (a.w * b.w)}; +} + + /* TODO: missing support for * - short float (conditional on whether short float is available) * - complex - * - 3buff implementation */ -#define THREADS_PER_BLOCK 512 - #define VECLEN 2 #define VECTYPE(t) t##VECLEN @@ -121,6 +164,44 @@ in, inout, n); \ } + +#if defined(USE_VECTORS) +#define VFUNC_FUNC(name, type_name, type, vtype, vlen, vfn, fn) \ + static __global__ void \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n/vlen; i += stride) { \ + ((vtype*)inout)[i] = vfn(((vtype*)inout)[i], ((vtype*)in)[i]); \ + } \ + int remainder = n%vlen; \ + if (index == (n/vlen) && remainder != 0) { \ + while(remainder) { \ + int idx = n - remainder--; \ + inout[idx] = fn(inout[idx], in[idx]); \ + } \ + } \ + } \ + static void \ + ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ + hipStream_t stream) { \ + int vcount = (count + vlen-1)/vlen; \ + int threads = min(threads_per_block, vcount); \ + int blocks = min((vcount + threads-1) / threads, max_blocks); \ + int n = count; \ + hipStream_t s = stream; \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + } +#else +#define VFUNC_FUNC(name, type_name, type, vtype, vlen, vfn, fn) FUNC_FUNC_FN(name, type_name, type, fn) +#endif // defined(USE_VECTORS) + + /* * Since all the functions in this file are essentially identical, we * use a macro to substitute in names and types. 
The core operation @@ -163,27 +244,91 @@ a, b, count); \ } + +#define OPV_DISPATCH(name, type_name, type) \ + void ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ + hipStream_t stream) { \ + static_assert(sizeof(type_name) <= sizeof(unsigned long long), "Unknown size type"); \ + if constexpr(!ISSIGNED(type)) { \ + if constexpr(sizeof(type_name) == sizeof(unsigned char)) { \ + ompi_op_rocm_2buff_##name##_uchar_submit((const unsigned char*)in, (unsigned char*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned short)) { \ + ompi_op_rocm_2buff_##name##_ushort_submit((const unsigned short*)in, (unsigned short*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned int)) { \ + ompi_op_rocm_2buff_##name##_uint_submit((const unsigned int*)in, (unsigned int*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned long)) { \ + ompi_op_rocm_2buff_##name##_ulong_submit((const unsigned long*)in, (unsigned long*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(unsigned long long)) { \ + ompi_op_rocm_2buff_##name##_ulonglong_submit((const unsigned long long*)in, (unsigned long long*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } \ + } else { \ + if constexpr(sizeof(type_name) == sizeof(char)) { \ + ompi_op_rocm_2buff_##name##_char_submit((const char*)in, (char*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(short)) { \ + ompi_op_rocm_2buff_##name##_short_submit((const short*)in, (short*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(int)) { \ + ompi_op_rocm_2buff_##name##_int_submit((const int*)in, (int*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(long)) { \ + ompi_op_rocm_2buff_##name##_long_submit((const long*)in, (long*)inout, count, \ + threads_per_block, \ + max_blocks, stream); \ + } else if constexpr(sizeof(type_name) == sizeof(long long)) { \ + ompi_op_rocm_2buff_##name##_longlong_submit((const long long*)in, (long long*)inout, count,\ + threads_per_block, \ + max_blocks, stream); \ + } \ + } \ + } + /************************************************************************* * Max *************************************************************************/ +/* C integer */ +VFUNC_FUNC(max, char, char, char4, 4, vmax, max) +VFUNC_FUNC(max, uchar, unsigned char, uchar4, 4, vmax, max) +VFUNC_FUNC(max, short, short, short4, 4, vmax, max) +VFUNC_FUNC(max, ushort, unsigned short, ushort4, 4, vmax, max) +VFUNC_FUNC(max, int, int, int4, 4, vmax, max) +VFUNC_FUNC(max, uint, unsigned int, uint4, 4, vmax, max) + #undef current_func -#if defined(DO_NOT_USE_INTRINSICS) -#define current_func(a, b) ((a) > (b) ? 
(a) : (b)) -#else // DO_NOT_USE_INTRINSICS #define current_func(a, b) max(a, b) -#endif // DO_NOT_USE_INTRINSICS -/* C integer */ -FUNC_FUNC(max, int8_t, int8_t) -FUNC_FUNC(max, uint8_t, uint8_t) -FUNC_FUNC(max, int16_t, int16_t) -FUNC_FUNC(max, uint16_t, uint16_t) -FUNC_FUNC(max, int32_t, int32_t) -FUNC_FUNC(max, uint32_t, uint32_t) -FUNC_FUNC(max, int64_t, int64_t) -FUNC_FUNC(max, uint64_t, uint64_t) FUNC_FUNC(max, long, long) -FUNC_FUNC(max, unsigned_long, unsigned long) +FUNC_FUNC(max, ulong, unsigned long) +FUNC_FUNC(max, longlong, long long) +FUNC_FUNC(max, ulonglong, unsigned long long) + + +/* dispatch fixed-size types */ +OPV_DISPATCH(max, int8_t, int8_t) +OPV_DISPATCH(max, uint8_t, uint8_t) +OPV_DISPATCH(max, int16_t, int16_t) +OPV_DISPATCH(max, uint16_t, uint16_t) +OPV_DISPATCH(max, int32_t, int32_t) +OPV_DISPATCH(max, uint32_t, uint32_t) +OPV_DISPATCH(max, int64_t, int64_t) +OPV_DISPATCH(max, uint64_t, uint64_t) #if !defined(DO_NOT_USE_INTRINSICS) #undef current_func @@ -204,23 +349,29 @@ FUNC_FUNC(max, long_double, long double) /************************************************************************* * Min *************************************************************************/ + +/* C integer */ +VFUNC_FUNC(min, char, char, char4, 4, vmin, min) +VFUNC_FUNC(min, uchar, unsigned char, uchar4, 4, vmin, min) +VFUNC_FUNC(min, short, short, short4, 4, vmin, min) +VFUNC_FUNC(min, ushort, unsigned short, ushort4, 4, vmin, min) +VFUNC_FUNC(min, int, int, int4, 4, vmin, min) +VFUNC_FUNC(min, uint, unsigned int, uint4, 4, vmin, min) + #undef current_func -#if defined(DO_NOT_USE_INTRINSICS) -#define current_func(a, b) ((a) < (b) ? (a) : (b)) -#else // DO_NOT_USE_INTRINSICS #define current_func(a, b) min(a, b) -#endif // DO_NOT_USE_INTRINSICS -/* C integer */ -FUNC_FUNC(min, int8_t, int8_t) -FUNC_FUNC(min, uint8_t, uint8_t) -FUNC_FUNC(min, int16_t, int16_t) -FUNC_FUNC(min, uint16_t, uint16_t) -FUNC_FUNC(min, int32_t, int32_t) -FUNC_FUNC(min, uint32_t, uint32_t) -FUNC_FUNC(min, int64_t, int64_t) -FUNC_FUNC(min, uint64_t, uint64_t) FUNC_FUNC(min, long, long) -FUNC_FUNC(min, unsigned_long, unsigned long) +FUNC_FUNC(min, ulong, unsigned long) +FUNC_FUNC(min, longlong, long long) +FUNC_FUNC(min, ulonglong, unsigned long long) +OPV_DISPATCH(min, int8_t, int8_t) +OPV_DISPATCH(min, uint8_t, uint8_t) +OPV_DISPATCH(min, int16_t, int16_t) +OPV_DISPATCH(min, uint16_t, uint16_t) +OPV_DISPATCH(min, int32_t, int32_t) +OPV_DISPATCH(min, uint32_t, uint32_t) +OPV_DISPATCH(min, int64_t, int64_t) +OPV_DISPATCH(min, uint64_t, uint64_t) #if !defined(DO_NOT_USE_INTRINSICS) #undef current_func @@ -244,16 +395,28 @@ FUNC_FUNC(min, long_double, long double) *************************************************************************/ /* C integer */ -OP_FUNC(sum, int8_t, int8_t, +) -OP_FUNC(sum, uint8_t, uint8_t, +) -OP_FUNC(sum, int16_t, int16_t, +) -OP_FUNC(sum, uint16_t, uint16_t, +) -OP_FUNC(sum, int32_t, int32_t, +) -OP_FUNC(sum, uint32_t, uint32_t, +) -OP_FUNC(sum, int64_t, int64_t, +) -OP_FUNC(sum, uint64_t, uint64_t, +) -OP_FUNC(sum, long, long, +) -OP_FUNC(sum, unsigned_long, unsigned long, +) +VFUNC_FUNC(sum, char, char, char4, 4, vsum, tsum) +VFUNC_FUNC(sum, uchar, unsigned char, uchar4, 4, vsum, tsum) +VFUNC_FUNC(sum, short, short, short4, 4, vsum, tsum) +VFUNC_FUNC(sum, ushort, unsigned short, ushort4, 4, vsum, tsum) +VFUNC_FUNC(sum, int, int, int4, 4, vsum, tsum) +VFUNC_FUNC(sum, uint, unsigned int, uint4, 4, vsum, tsum) + +#undef current_func +#define current_func(a, b) tsum(a, b) +FUNC_FUNC(sum, 
long, long) +FUNC_FUNC(sum, ulong, unsigned long) +FUNC_FUNC(sum, longlong, long long) +FUNC_FUNC(sum, ulonglong, unsigned long long) + +OPV_DISPATCH(sum, int8_t, int8_t) +OPV_DISPATCH(sum, uint8_t, uint8_t) +OPV_DISPATCH(sum, int16_t, int16_t) +OPV_DISPATCH(sum, uint16_t, uint16_t) +OPV_DISPATCH(sum, int32_t, int32_t) +OPV_DISPATCH(sum, uint32_t, uint32_t) +OPV_DISPATCH(sum, int64_t, int64_t) +OPV_DISPATCH(sum, uint64_t, uint64_t) OPV_FUNC(sum, float, float, float4, 4, +) OPV_FUNC(sum, double, double, double4, 4, +) @@ -268,10 +431,10 @@ COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) #endif #endif // 0 #undef current_func -#define current_func(a, b) (hipCmulf(a,b)) +#define current_func(a, b) (hipCaddf(a,b)) FUNC_FUNC(sum, c_float_complex, hipFloatComplex) #undef current_func -#define current_func(a, b) (hipCmul(a,b)) +#define current_func(a, b) (hipCadd(a,b)) FUNC_FUNC(sum, c_double_complex, hipDoubleComplex) //OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) @@ -280,16 +443,28 @@ FUNC_FUNC(sum, c_double_complex, hipDoubleComplex) *************************************************************************/ /* C integer */ -OP_FUNC(prod, int8_t, int8_t, *) -OP_FUNC(prod, uint8_t, uint8_t, *) -OP_FUNC(prod, int16_t, int16_t, *) -OP_FUNC(prod, uint16_t, uint16_t, *) -OP_FUNC(prod, int32_t, int32_t, *) -OP_FUNC(prod, uint32_t, uint32_t, *) -OP_FUNC(prod, int64_t, int64_t, *) -OP_FUNC(prod, uint64_t, uint64_t, *) -OP_FUNC(prod, long, long, *) -OP_FUNC(prod, unsigned_long, unsigned long, *) +#undef current_func +#define current_func(a, b) tprod(a, b) +FUNC_FUNC(prod, char, char) +FUNC_FUNC(prod, uchar, unsigned char) +FUNC_FUNC(prod, short, short) +FUNC_FUNC(prod, ushort, unsigned short) +FUNC_FUNC(prod, int, int) +FUNC_FUNC(prod, uint, unsigned int) +FUNC_FUNC(prod, long, long) +FUNC_FUNC(prod, ulong, unsigned long) +FUNC_FUNC(prod, longlong, long long) +FUNC_FUNC(prod, ulonglong, unsigned long long) + +OPV_DISPATCH(prod, int8_t, int8_t) +OPV_DISPATCH(prod, uint8_t, uint8_t) +OPV_DISPATCH(prod, int16_t, int16_t) +OPV_DISPATCH(prod, uint16_t, uint16_t) +OPV_DISPATCH(prod, int32_t, int32_t) +OPV_DISPATCH(prod, uint32_t, uint32_t) +OPV_DISPATCH(prod, int64_t, int64_t) +OPV_DISPATCH(prod, uint64_t, uint64_t) + OPV_FUNC(prod, float, float, float4, 4, *) OPV_FUNC(prod, double, double, double4, 4, *) @@ -298,14 +473,18 @@ OP_FUNC(prod, long_double, long double, *) /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) -OP_FUNC(prod, c_short_float_complex, short float _Complex, *=) +OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) -COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) +COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC(prod, c_float_complex, float _Complex, *=) -OP_FUNC(prod, c_double_complex, double _Complex, *=) -OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) #endif // 0 +#undef current_func +#define current_func(a, b) (hipCmulf(a,b)) +FUNC_FUNC(prod, c_float_complex, hipFloatComplex) +#undef current_func +#define current_func(a, b) (hipCmul(a,b)) +FUNC_FUNC(prod, c_double_complex, hipDoubleComplex) +//OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) /************************************************************************* * Logical AND From cfe8a5a1cfe7ba53d12ae9e5b896212cc7ab0775 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 12:55:47 +0200 Subject: [PATCH 46/74] Device allocators: correctly handle non-zero ID 
single accelerator The accelerator component may report the availability of a single accelerator whose ID is not zero. Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_frame.c | 5 ++--- ompi/mca/coll/base/coll_base_functions.h | 1 + ompi/mca/coll/base/coll_base_util.c | 12 +++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_frame.c b/ompi/mca/coll/base/coll_base_frame.c index f7b4fa572d9..45b7fcaeb67 100644 --- a/ompi/mca/coll/base/coll_base_frame.c +++ b/ompi/mca/coll/base/coll_base_frame.c @@ -72,6 +72,7 @@ coll_base_comm_construct(mca_coll_base_comm_t *data) { memset ((char *) data + sizeof (data->super), 0, sizeof (*data) - sizeof (data->super)); data->device_allocators = NULL; + data->num_device_allocators = 0; } static void @@ -112,9 +113,7 @@ coll_base_comm_destruct(mca_coll_base_comm_t *data) } if (NULL != data->device_allocators) { - int num_devices; - opal_accelerator.num_devices(&num_devices); - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < data->num_device_allocators; ++i) { if (NULL != data->device_allocators[i]) { data->device_allocators[i]->alc_finalize(data->device_allocators[i]); } diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index eca2502493d..b3657c2f7f4 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -519,6 +519,7 @@ struct mca_coll_base_comm_t { /* pointer to per-device memory cache */ mca_allocator_base_module_t **device_allocators; + int num_device_allocators; }; typedef struct mca_coll_base_comm_t mca_coll_base_comm_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t); diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index d18770f3a13..1efb67215f2 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -624,10 +624,16 @@ void *ompi_coll_base_allocate_on_device(int device, size_t size, return malloc(size); } - if (NULL == module->base_data->device_allocators) { + if (module->base_data->num_device_allocators <= device) { int num_dev; opal_accelerator.num_devices(&num_dev); - module->base_data->device_allocators = calloc(num_dev, sizeof(mca_allocator_base_module_t *)); + printf("ompi_coll_base_allocate_on_device num_dev %d device %d\n", num_dev, device); + if (num_dev < device+1) num_dev = device+1; + module->base_data->device_allocators = realloc(module->base_data->device_allocators, num_dev * sizeof(mca_allocator_base_module_t *)); + for (int i = module->base_data->num_device_allocators; i < num_dev; ++i) { + module->base_data->device_allocators[i] = NULL; + } + module->base_data->num_device_allocators = num_dev; } //printf("allocators %p module %p\n", module->base_data->device_allocators, module->base_data->device_allocators[device]); if (NULL == (allocator_module = module->base_data->device_allocators[device])) { @@ -654,4 +660,4 @@ void ompi_coll_base_free_on_device(int device, void *ptr, mca_coll_base_module_t allocator_module = module->base_data->device_allocators[device]; allocator_module->alc_free(allocator_module, ptr); } -} \ No newline at end of file +} From 3bc76767948da9dc140a6002e611123a674b4407 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 12:56:40 +0200 Subject: [PATCH 47/74] CUDA op: consistently name unsigned_long functions as ulong Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_functions.c | 44 ++++++++++++++-------------- 
ompi/mca/op/cuda/op_cuda_impl.cu | 36 +++++++++++------------ ompi/mca/op/cuda/op_cuda_impl.h | 42 +++++++++++++------------- 3 files changed, 61 insertions(+), 61 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 125c5140aa6..7335a9cd001 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -274,7 +274,7 @@ FUNC_FUNC(max, uint32_t, uint32_t) FUNC_FUNC(max, int64_t, int64_t) FUNC_FUNC(max, uint64_t, uint64_t) FUNC_FUNC(max, long, long) -FUNC_FUNC(max, unsigned_long, unsigned long) +FUNC_FUNC(max, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -343,7 +343,7 @@ FUNC_FUNC(min, uint32_t, uint32_t) FUNC_FUNC(min, int64_t, int64_t) FUNC_FUNC(min, uint64_t, uint64_t) FUNC_FUNC(min, long, long) -FUNC_FUNC(min, unsigned_long, unsigned long) +FUNC_FUNC(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -410,7 +410,7 @@ OP_FUNC(sum, uint32_t, uint32_t, +=) OP_FUNC(sum, int64_t, int64_t, +=) OP_FUNC(sum, uint64_t, uint64_t, +=) OP_FUNC(sum, long, long, +=) -OP_FUNC(sum, unsigned_long, unsigned long, +=) +OP_FUNC(sum, ulong, unsigned long, +=) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -488,7 +488,7 @@ OP_FUNC(prod, uint32_t, uint32_t, *=) OP_FUNC(prod, int64_t, int64_t, *=) OP_FUNC(prod, uint64_t, uint64_t, *=) OP_FUNC(prod, long, long, *=) -OP_FUNC(prod, unsigned_long, unsigned long, *=) +OP_FUNC(prod, ulong, unsigned long, *=) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -568,7 +568,7 @@ FUNC_FUNC(land, uint32_t, uint32_t) FUNC_FUNC(land, int64_t, int64_t) FUNC_FUNC(land, uint64_t, uint64_t) FUNC_FUNC(land, long, long) -FUNC_FUNC(land, unsigned_long, unsigned long) +FUNC_FUNC(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -593,7 +593,7 @@ FUNC_FUNC(lor, uint32_t, uint32_t) FUNC_FUNC(lor, int64_t, int64_t) FUNC_FUNC(lor, uint64_t, uint64_t) FUNC_FUNC(lor, long, long) -FUNC_FUNC(lor, unsigned_long, unsigned long) +FUNC_FUNC(lor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -618,7 +618,7 @@ FUNC_FUNC(lxor, uint32_t, uint32_t) FUNC_FUNC(lxor, int64_t, int64_t) FUNC_FUNC(lxor, uint64_t, uint64_t) FUNC_FUNC(lxor, long, long) -FUNC_FUNC(lxor, unsigned_long, unsigned long) +FUNC_FUNC(lxor, ulong, unsigned long) /* Logical */ @@ -644,7 +644,7 @@ FUNC_FUNC(band, uint32_t, uint32_t) FUNC_FUNC(band, int64_t, int64_t) FUNC_FUNC(band, uint64_t, uint64_t) FUNC_FUNC(band, long, long) -FUNC_FUNC(band, unsigned_long, unsigned long) +FUNC_FUNC(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -684,7 +684,7 @@ FUNC_FUNC(bor, uint32_t, uint32_t) FUNC_FUNC(bor, int64_t, int64_t) FUNC_FUNC(bor, uint64_t, uint64_t) FUNC_FUNC(bor, long, long) -FUNC_FUNC(bor, unsigned_long, unsigned long) +FUNC_FUNC(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -724,7 +724,7 @@ FUNC_FUNC(bxor, uint32_t, uint32_t) FUNC_FUNC(bxor, int64_t, int64_t) FUNC_FUNC(bxor, uint64_t, uint64_t) FUNC_FUNC(bxor, long, long) -FUNC_FUNC(bxor, unsigned_long, unsigned long) +FUNC_FUNC(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -899,7 +899,7 @@ FUNC_FUNC_3BUF(max, uint32_t, uint32_t) FUNC_FUNC_3BUF(max, int64_t, int64_t) FUNC_FUNC_3BUF(max, uint64_t, uint64_t) FUNC_FUNC_3BUF(max, long, long) -FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(max, ulong, unsigned long) /* Fortran integer */ #if 
OMPI_HAVE_FORTRAN_INTEGER @@ -964,7 +964,7 @@ FUNC_FUNC_3BUF(min, uint32_t, uint32_t) FUNC_FUNC_3BUF(min, int64_t, int64_t) FUNC_FUNC_3BUF(min, uint64_t, uint64_t) FUNC_FUNC_3BUF(min, long, long) -FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1029,7 +1029,7 @@ OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) OP_FUNC_3BUF(sum, int64_t, int64_t, +) OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF(sum, long, long, +) -OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) +OP_FUNC_3BUF(sum, ulong, unsigned long, +) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1105,7 +1105,7 @@ OP_FUNC_3BUF(prod, uint32_t, uint32_t, *) OP_FUNC_3BUF(prod, int64_t, int64_t, *) OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF(prod, long, long, *) -OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) +OP_FUNC_3BUF(prod, ulong, unsigned long, *) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1183,7 +1183,7 @@ FUNC_FUNC_3BUF(land, uint32_t, uint32_t) FUNC_FUNC_3BUF(land, int64_t, int64_t) FUNC_FUNC_3BUF(land, uint64_t, uint64_t) FUNC_FUNC_3BUF(land, long, long) -FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1208,7 +1208,7 @@ FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lor, int64_t, int64_t) FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lor, long, long) -FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1233,7 +1233,7 @@ FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lxor, int64_t, int64_t) FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lxor, long, long) -FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lxor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1258,7 +1258,7 @@ FUNC_FUNC_3BUF(band, uint32_t, uint32_t) FUNC_FUNC_3BUF(band, int64_t, int64_t) FUNC_FUNC_3BUF(band, uint64_t, uint64_t) FUNC_FUNC_3BUF(band, long, long) -FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1298,7 +1298,7 @@ FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bor, int64_t, int64_t) FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bor, long, long) -FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1338,7 +1338,7 @@ FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bxor, int64_t, int64_t) FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bxor, long, long) -FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1444,7 +1444,7 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_cuda_##ftype##_##name##_uint32_t, \ [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_cuda_##ftype##_##name##_int64_t, \ [OMPI_OP_BASE_TYPE_LONG] = ompi_op_cuda_##ftype##_##name##_long, \ - [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_cuda_##ftype##_##name##_unsigned_long, \ + [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_cuda_##ftype##_##name##_long, \ [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_cuda_##ftype##_##name##_uint64_t /** All the Fortran integers ********************************************/ @@ -1811,4 +1811,4 @@ 
ompi_op_base_3buff_stream_handler_fn_t ompi_op_cuda_3buff_functions[OMPI_OP_BASE ACCUMULATE */ NULL, }, - }; \ No newline at end of file + }; diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index faa52e43860..8b3b77cf477 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -339,7 +339,7 @@ FUNC_FUNC(max, double, double) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 #undef current_func #define current_func(a, b) __hmax2(a, b) -VFUNC_FUNC(max, halfx, half, half2, 2, __hmax2, __hmax) +//VFUNC_FUNC(max, halfx, half, half2, 2, __hmax2, __hmax) #endif // __CUDA_ARCH__ /************************************************************************* @@ -391,7 +391,7 @@ FUNC_FUNC(min, long_double, long double) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 #undef current_func #define current_func(a, b) __hmin2(a, b) -VFUNC_FUNC(min, half, half, half2, 2, __hmin2, __hmin) +//VFUNC_FUNC(min, half, half, half2, 2, __hmin2, __hmin) #endif // __CUDA_ARCH__ /************************************************************************* @@ -520,7 +520,7 @@ FUNC_FUNC(land, uint32_t, uint32_t) FUNC_FUNC(land, int64_t, int64_t) FUNC_FUNC(land, uint64_t, uint64_t) FUNC_FUNC(land, long, long) -FUNC_FUNC(land, unsigned_long, unsigned long) +FUNC_FUNC(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -545,7 +545,7 @@ FUNC_FUNC(lor, uint32_t, uint32_t) FUNC_FUNC(lor, int64_t, int64_t) FUNC_FUNC(lor, uint64_t, uint64_t) FUNC_FUNC(lor, long, long) -FUNC_FUNC(lor, unsigned_long, unsigned long) +FUNC_FUNC(lor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC(lor, bool, bool) @@ -566,7 +566,7 @@ FUNC_FUNC(lxor, uint32_t, uint32_t) FUNC_FUNC(lxor, int64_t, int64_t) FUNC_FUNC(lxor, uint64_t, uint64_t) FUNC_FUNC(lxor, long, long) -FUNC_FUNC(lxor, unsigned_long, unsigned long) +FUNC_FUNC(lxor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC(lxor, bool, bool) @@ -587,7 +587,7 @@ FUNC_FUNC(band, uint32_t, uint32_t) FUNC_FUNC(band, int64_t, int64_t) FUNC_FUNC(band, uint64_t, uint64_t) FUNC_FUNC(band, long, long) -FUNC_FUNC(band, unsigned_long, unsigned long) +FUNC_FUNC(band, ulong, unsigned long) /* Byte */ FUNC_FUNC(band, byte, char) @@ -608,7 +608,7 @@ FUNC_FUNC(bor, uint32_t, uint32_t) FUNC_FUNC(bor, int64_t, int64_t) FUNC_FUNC(bor, uint64_t, uint64_t) FUNC_FUNC(bor, long, long) -FUNC_FUNC(bor, unsigned_long, unsigned long) +FUNC_FUNC(bor, ulong, unsigned long) /* Byte */ FUNC_FUNC(bor, byte, char) @@ -629,7 +629,7 @@ FUNC_FUNC(bxor, uint32_t, uint32_t) FUNC_FUNC(bxor, int64_t, int64_t) FUNC_FUNC(bxor, uint64_t, uint64_t) FUNC_FUNC(bxor, long, long) -FUNC_FUNC(bxor, unsigned_long, unsigned long) +FUNC_FUNC(bxor, ulong, unsigned long) /* Byte */ FUNC_FUNC(bxor, byte, char) @@ -786,7 +786,7 @@ FUNC_FUNC_3BUF(max, uint32_t, uint32_t) FUNC_FUNC_3BUF(max, int64_t, int64_t) FUNC_FUNC_3BUF(max, uint64_t, uint64_t) FUNC_FUNC_3BUF(max, long, long) -FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(max, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -852,7 +852,7 @@ FUNC_FUNC_3BUF(min, uint32_t, uint32_t) FUNC_FUNC_3BUF(min, int64_t, int64_t) FUNC_FUNC_3BUF(min, uint64_t, uint64_t) FUNC_FUNC_3BUF(min, long, long) -FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -915,7 +915,7 @@ OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) OP_FUNC_3BUF(sum, int64_t, int64_t, +) OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) 
OP_FUNC_3BUF(sum, long, long, +) -OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) +OP_FUNC_3BUF(sum, ulong, unsigned long, +) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -993,7 +993,7 @@ OP_FUNC_3BUF(prod, uint32_t, uint32_t, *) OP_FUNC_3BUF(prod, int64_t, int64_t, *) OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF(prod, long, long, *) -OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) +OP_FUNC_3BUF(prod, ulong, unsigned long, *) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1069,7 +1069,7 @@ FUNC_FUNC_3BUF(land, uint32_t, uint32_t) FUNC_FUNC_3BUF(land, int64_t, int64_t) FUNC_FUNC_3BUF(land, uint64_t, uint64_t) FUNC_FUNC_3BUF(land, long, long) -FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1094,7 +1094,7 @@ FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lor, int64_t, int64_t) FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lor, long, long) -FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1119,7 +1119,7 @@ FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lxor, int64_t, int64_t) FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lxor, long, long) -FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lxor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1144,7 +1144,7 @@ FUNC_FUNC_3BUF(band, uint32_t, uint32_t) FUNC_FUNC_3BUF(band, int64_t, int64_t) FUNC_FUNC_3BUF(band, uint64_t, uint64_t) FUNC_FUNC_3BUF(band, long, long) -FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1184,7 +1184,7 @@ FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bor, int64_t, int64_t) FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bor, long, long) -FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1224,7 +1224,7 @@ FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bxor, int64_t, int64_t) FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bxor, long, long) -FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER diff --git a/ompi/mca/op/cuda/op_cuda_impl.h b/ompi/mca/op/cuda/op_cuda_impl.h index 501ee802a31..10ecbd3d084 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.h +++ b/ompi/mca/op/cuda/op_cuda_impl.h @@ -80,7 +80,7 @@ FUNC_FUNC_SIG(max, uint32_t, uint32_t) FUNC_FUNC_SIG(max, int64_t, int64_t) FUNC_FUNC_SIG(max, uint64_t, uint64_t) FUNC_FUNC_SIG(max, long, long) -FUNC_FUNC_SIG(max, unsigned_long, unsigned long) +FUNC_FUNC_SIG(max, ulong, unsigned long) #if 0 /* Floating point */ @@ -109,7 +109,7 @@ FUNC_FUNC_SIG(min, uint32_t, uint32_t) FUNC_FUNC_SIG(min, int64_t, int64_t) FUNC_FUNC_SIG(min, uint64_t, uint64_t) FUNC_FUNC_SIG(min, long, long) -FUNC_FUNC_SIG(min, unsigned_long, unsigned long) +FUNC_FUNC_SIG(min, ulong, unsigned long) #if 0 /* Floating point */ @@ -138,7 +138,7 @@ OP_FUNC_SIG(sum, uint32_t, uint32_t, +=) OP_FUNC_SIG(sum, int64_t, int64_t, +=) OP_FUNC_SIG(sum, uint64_t, uint64_t, +=) OP_FUNC_SIG(sum, long, long, +=) -OP_FUNC_SIG(sum, unsigned_long, unsigned long, +=) +OP_FUNC_SIG(sum, ulong, unsigned long, +=) #if __CUDA_ARCH__ >= 530 OP_FUNC_SIG(sum, half, half, +=) @@ -185,7 +185,7 @@ OP_FUNC_SIG(prod, uint32_t, 
uint32_t, *=) OP_FUNC_SIG(prod, int64_t, int64_t, *=) OP_FUNC_SIG(prod, uint64_t, uint64_t, *=) OP_FUNC_SIG(prod, long, long, *=) -OP_FUNC_SIG(prod, unsigned_long, unsigned long, *=) +OP_FUNC_SIG(prod, ulong, unsigned long, *=) #if 0 /* Floating point */ @@ -229,7 +229,7 @@ FUNC_FUNC_SIG(land, uint32_t, uint32_t) FUNC_FUNC_SIG(land, int64_t, int64_t) FUNC_FUNC_SIG(land, uint64_t, uint64_t) FUNC_FUNC_SIG(land, long, long) -FUNC_FUNC_SIG(land, unsigned_long, unsigned long) +FUNC_FUNC_SIG(land, ulong, unsigned long) /* C++ bool */ FUNC_FUNC_SIG(land, bool, bool) @@ -250,7 +250,7 @@ FUNC_FUNC_SIG(lor, uint32_t, uint32_t) FUNC_FUNC_SIG(lor, int64_t, int64_t) FUNC_FUNC_SIG(lor, uint64_t, uint64_t) FUNC_FUNC_SIG(lor, long, long) -FUNC_FUNC_SIG(lor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(lor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC_SIG(lor, bool, bool) @@ -271,7 +271,7 @@ FUNC_FUNC_SIG(lxor, uint32_t, uint32_t) FUNC_FUNC_SIG(lxor, int64_t, int64_t) FUNC_FUNC_SIG(lxor, uint64_t, uint64_t) FUNC_FUNC_SIG(lxor, long, long) -FUNC_FUNC_SIG(lxor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(lxor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC_SIG(lxor, bool, bool) @@ -292,7 +292,7 @@ FUNC_FUNC_SIG(band, uint32_t, uint32_t) FUNC_FUNC_SIG(band, int64_t, int64_t) FUNC_FUNC_SIG(band, uint64_t, uint64_t) FUNC_FUNC_SIG(band, long, long) -FUNC_FUNC_SIG(band, unsigned_long, unsigned long) +FUNC_FUNC_SIG(band, ulong, unsigned long) /* Byte */ FUNC_FUNC_SIG(band, byte, char) @@ -313,7 +313,7 @@ FUNC_FUNC_SIG(bor, uint32_t, uint32_t) FUNC_FUNC_SIG(bor, int64_t, int64_t) FUNC_FUNC_SIG(bor, uint64_t, uint64_t) FUNC_FUNC_SIG(bor, long, long) -FUNC_FUNC_SIG(bor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(bor, ulong, unsigned long) /* Byte */ FUNC_FUNC_SIG(bor, byte, char) @@ -334,7 +334,7 @@ FUNC_FUNC_SIG(bxor, uint32_t, uint32_t) FUNC_FUNC_SIG(bxor, int64_t, int64_t) FUNC_FUNC_SIG(bxor, uint64_t, uint64_t) FUNC_FUNC_SIG(bxor, long, long) -FUNC_FUNC_SIG(bxor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(bxor, ulong, unsigned long) /* Byte */ FUNC_FUNC_SIG(bxor, byte, char) @@ -349,7 +349,7 @@ LOC_STRUCT(long_int, long, int) LOC_STRUCT(2int, int, int) LOC_STRUCT(short_int, short, int) LOC_STRUCT(long_double_int, long double, int) -LOC_STRUCT(unsigned_long, unsigned long, int) +LOC_STRUCT(ulong, unsigned long, int) /* compat types for Fortran */ LOC_STRUCT(2real, float, float) LOC_STRUCT(2double_precision, double, double) @@ -420,7 +420,7 @@ FUNC_FUNC_3BUF_SIG(max, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(max, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(max, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(max, long, long) -FUNC_FUNC_3BUF_SIG(max, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(max, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -484,7 +484,7 @@ FUNC_FUNC_3BUF_SIG(min, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(min, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(min, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(min, long, long) -FUNC_FUNC_3BUF_SIG(min, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -547,7 +547,7 @@ OP_FUNC_3BUF_SIG(sum, uint32_t, uint32_t, +) OP_FUNC_3BUF_SIG(sum, int64_t, int64_t, +) OP_FUNC_3BUF_SIG(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF_SIG(sum, long, long, +) -OP_FUNC_3BUF_SIG(sum, unsigned_long, unsigned long, +) +OP_FUNC_3BUF_SIG(sum, ulong, unsigned long, +) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -621,7 +621,7 @@ OP_FUNC_3BUF_SIG(prod, uint32_t, uint32_t, *) 
OP_FUNC_3BUF_SIG(prod, int64_t, int64_t, *) OP_FUNC_3BUF_SIG(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF_SIG(prod, long, long, *) -OP_FUNC_3BUF_SIG(prod, unsigned_long, unsigned long, *) +OP_FUNC_3BUF_SIG(prod, ulong, unsigned long, *) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -695,7 +695,7 @@ FUNC_FUNC_3BUF_SIG(land, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(land, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(land, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(land, long, long) -FUNC_FUNC_3BUF_SIG(land, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -718,7 +718,7 @@ FUNC_FUNC_3BUF_SIG(lor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(lor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(lor, long, long) -FUNC_FUNC_3BUF_SIG(lor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(lor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -741,7 +741,7 @@ FUNC_FUNC_3BUF_SIG(lxor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(lxor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(lxor, long, long) -FUNC_FUNC_3BUF_SIG(lxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(lxor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -764,7 +764,7 @@ FUNC_FUNC_3BUF_SIG(band, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(band, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(band, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(band, long, long) -FUNC_FUNC_3BUF_SIG(band, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -802,7 +802,7 @@ FUNC_FUNC_3BUF_SIG(bor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(bor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(bor, long, long) -FUNC_FUNC_3BUF_SIG(bor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -840,7 +840,7 @@ FUNC_FUNC_3BUF_SIG(bxor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(bxor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(bxor, long, long) -FUNC_FUNC_3BUF_SIG(bxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER From 9c1da7eafd9530cd7649a75111e561e2012d14bc Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 12:56:55 +0200 Subject: [PATCH 48/74] ROCM op: remove debug output Signed-off-by: Joseph Schuchart --- ompi/mca/op/rocm/op_rocm_component.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c index d17bf53663c..911ccb3d818 100644 --- a/ompi/mca/op/rocm/op_rocm_component.c +++ b/ompi/mca/op/rocm/op_rocm_component.c @@ -82,7 +82,6 @@ static int rocm_component_open(void) * component won't even be shown in ompi_info output (which is * probably not what you want). 
*/ - printf("op rocm_component_open\n"); return OMPI_SUCCESS; } @@ -102,7 +101,6 @@ static int rocm_component_close(void) mca_op_rocm_component.ro_num_devices = 0; } - printf("op rocm_component_close\n"); return OMPI_SUCCESS; } @@ -217,6 +215,5 @@ rocm_component_op_query(struct ompi_op_t *op, int *priority) } } *priority = 50; - printf("op rocm_component_op_query\n"); return (ompi_op_base_module_1_0_0_t *) module; } From a20f671de12baa8c2b1d57121dccb0bec8cf24ad Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 12:57:36 +0200 Subject: [PATCH 49/74] Reduce_local test: correctly test for OPAL_CUDA_SUPPORT and OPAL_ROCM_SUPPORT These macros are defined to either 1 or 0 Signed-off-by: Joseph Schuchart --- test/datatype/reduce_local.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index 4bf3ce93990..6a155f00fd3 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -246,7 +246,7 @@ static allocator_t host_allocator = { .free = &host_free, .fini = &host_fini}; -#if defined(OPAL_CUDA_SUPPORT) +#if defined(OPAL_CUDA_SUPPORT) && OPAL_CUDA_SUPPORT #include static void cuda_init() { // nothing to be done @@ -280,7 +280,7 @@ static allocator_t cuda_allocator = { .free = &cuda_free, .fini = &cuda_fini}; -#elif defined(OPAL_ROCM_SUPPORT) +#elif defined(OPAL_ROCM_SUPPORT) && OPAL_ROCM_SUPPORT #include static void rocm_init() { hipError_t ret = hipInit(0); @@ -414,12 +414,12 @@ int main(int argc, char **argv) // default allocator break; } else -#if defined(OPAL_CUDA_SUPPORT) +#if defined(OPAL_CUDA_SUPPORT) && OPAL_CUDA_SUPPORT if (0 == strncmp("cuda", optarg, 4)) { allocator = &cuda_allocator; break; } else -#elif defined(OPAL_ROCM_SUPPORT) +#elif defined(OPAL_ROCM_SUPPORT) && OPAL_ROCM_SUPPORT if (0 == strncmp("rocm", optarg, 4)) { allocator = &rocm_allocator; break; @@ -440,10 +440,10 @@ int main(int argc, char **argv) " -o : comma separated list of operations to execute among\n" " sum, min, max, prod, bor, bxor, band\n" " -d : host" -#ifdef OPAL_CUDA_SUPPORT +#if defined(OPAL_CUDA_SUPPORT) && OPAL_CUDA_SUPPORT ", cuda" #endif -#ifdef OPAL_ROCM_SUPPORT +#if defined(OPAL_ROCM_SUPPORT) && OPAL_ROCM_SUPPORT ", rocm" #endif "\n" From 97338db163ae20916d8c99398f72551d0aec304a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 12:09:51 +0000 Subject: [PATCH 50/74] More unsigned_long -> ulong fixes in CUDA and ROCm op Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_functions.c | 2 +- ompi/mca/op/cuda/op_cuda_impl.cu | 10 +++---- ompi/mca/op/rocm/op_rocm_functions.c | 42 ++++++++++++++-------------- ompi/mca/op/rocm/op_rocm_impl.cpp | 32 ++++++++++----------- ompi/mca/op/rocm/op_rocm_impl.h | 42 ++++++++++++++-------------- 5 files changed, 64 insertions(+), 64 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 7335a9cd001..97af100abfd 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -1444,7 +1444,7 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_cuda_##ftype##_##name##_uint32_t, \ [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_cuda_##ftype##_##name##_int64_t, \ [OMPI_OP_BASE_TYPE_LONG] = ompi_op_cuda_##ftype##_##name##_long, \ - [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_cuda_##ftype##_##name##_long, \ + [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_cuda_##ftype##_##name##_ulong, \ [OMPI_OP_BASE_TYPE_UINT64_T] = 
ompi_op_cuda_##ftype##_##name##_uint64_t /** All the Fortran integers ********************************************/ diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index 8b3b77cf477..65bdecea60e 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -257,7 +257,7 @@ static inline __device__ T vprod(const T& a, const T& b) { threads_per_block, \ max_blocks, stream); \ } else if constexpr(sizeof(type_name) == sizeof(unsigned long)) { \ - ompi_op_cuda_2buff_##name##_unsigned_long_submit((const unsigned long*)in, (unsigned long*)inout, count, \ + ompi_op_cuda_2buff_##name##_ulong_submit((const unsigned long*)in, (unsigned long*)inout, count, \ threads_per_block, \ max_blocks, stream); \ } else if constexpr(sizeof(type_name) == sizeof(unsigned long long)) { \ @@ -305,7 +305,7 @@ VFUNC_FUNC(max, uint, unsigned int, uint4, 4, vmax, max) #undef current_func #define current_func(a, b) max(a, b) FUNC_FUNC(max, long, long) -FUNC_FUNC(max, unsigned_long, unsigned long) +FUNC_FUNC(max, ulong, unsigned long) FUNC_FUNC(max, longlong, long long) FUNC_FUNC(max, ulonglong, unsigned long long) @@ -357,7 +357,7 @@ VFUNC_FUNC(min, uint, unsigned int, uint4, 4, vmin, min) #undef current_func #define current_func(a, b) min(a, b) FUNC_FUNC(min, long, long) -FUNC_FUNC(min, unsigned_long, unsigned long) +FUNC_FUNC(min, ulong, unsigned long) FUNC_FUNC(min, longlong, long long) FUNC_FUNC(min, ulonglong, unsigned long long) OPV_DISPATCH(min, int8_t, int8_t) @@ -409,7 +409,7 @@ VFUNC_FUNC(sum, uint, unsigned int, uint4, 4, vsum, tsum) #undef current_func #define current_func(a, b) tsum(a, b) FUNC_FUNC(sum, long, long) -FUNC_FUNC(sum, unsigned_long, unsigned long) +FUNC_FUNC(sum, ulong, unsigned long) FUNC_FUNC(sum, longlong, long long) FUNC_FUNC(sum, ulonglong, unsigned long long) @@ -463,7 +463,7 @@ FUNC_FUNC(prod, ushort, unsigned short) FUNC_FUNC(prod, int, int) FUNC_FUNC(prod, uint, unsigned int) FUNC_FUNC(prod, long, long) -FUNC_FUNC(prod, unsigned_long, unsigned long) +FUNC_FUNC(prod, ulong, unsigned long) FUNC_FUNC(prod, longlong, long long) FUNC_FUNC(prod, ulonglong, unsigned long long) diff --git a/ompi/mca/op/rocm/op_rocm_functions.c b/ompi/mca/op/rocm/op_rocm_functions.c index 59436c0897f..d13113a8acf 100644 --- a/ompi/mca/op/rocm/op_rocm_functions.c +++ b/ompi/mca/op/rocm/op_rocm_functions.c @@ -275,7 +275,7 @@ FUNC_FUNC(max, uint32_t, uint32_t) FUNC_FUNC(max, int64_t, int64_t) FUNC_FUNC(max, uint64_t, uint64_t) FUNC_FUNC(max, long, long) -FUNC_FUNC(max, unsigned_long, unsigned long) +FUNC_FUNC(max, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -344,7 +344,7 @@ FUNC_FUNC(min, uint32_t, uint32_t) FUNC_FUNC(min, int64_t, int64_t) FUNC_FUNC(min, uint64_t, uint64_t) FUNC_FUNC(min, long, long) -FUNC_FUNC(min, unsigned_long, unsigned long) +FUNC_FUNC(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -411,7 +411,7 @@ OP_FUNC(sum, uint32_t, uint32_t, +=) OP_FUNC(sum, int64_t, int64_t, +=) OP_FUNC(sum, uint64_t, uint64_t, +=) OP_FUNC(sum, long, long, +=) -OP_FUNC(sum, unsigned_long, unsigned long, +=) +OP_FUNC(sum, ulong, unsigned long, +=) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -489,7 +489,7 @@ OP_FUNC(prod, uint32_t, uint32_t, *=) OP_FUNC(prod, int64_t, int64_t, *=) OP_FUNC(prod, uint64_t, uint64_t, *=) OP_FUNC(prod, long, long, *=) -OP_FUNC(prod, unsigned_long, unsigned long, *=) +OP_FUNC(prod, ulong, unsigned long, *=) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER 
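For readers following these renames: the fixed-width MPI types (int8_t ... uint64_t) get no kernels of their own; the OPV_DISPATCH macro introduced earlier in this series forwards them by size and signedness to the kernels named after the fundamental C types, so the suffix spelling ("ulong", never "unsigned_long") must agree between the .cu/.cpp kernel definitions, the headers, and the function tables, or the build fails with undefined symbols. Below is a minimal, macro-free sketch of that forwarding, not part of the patch, assuming the HIP runtime headers and the ompi_op_rocm_2buff_*_submit declarations from op_rocm_impl.h are in scope, and using <type_traits> as a stand-in for the ISSIGNED macro.

#include <type_traits>

// Sketch only: a non-macro rendering of OPV_DISPATCH for the "max" op.
template <typename T>
static void dispatch_max_2buff(const T *in, T *inout, int count,
                               int threads_per_block, int max_blocks,
                               hipStream_t stream)
{
    static_assert(sizeof(T) <= sizeof(unsigned long long), "Unknown size type");
    if constexpr (!std::is_signed_v<T>) {            // stands in for !ISSIGNED(T)
        if constexpr (sizeof(T) == sizeof(unsigned long)) {
            ompi_op_rocm_2buff_max_ulong_submit((const unsigned long *) in,
                                                (unsigned long *) inout, count,
                                                threads_per_block, max_blocks, stream);
        } // ... remaining widths elided, see OPV_DISPATCH above ...
    } else {
        if constexpr (sizeof(T) == sizeof(long)) {
            ompi_op_rocm_2buff_max_long_submit((const long *) in, (long *) inout, count,
                                               threads_per_block, max_blocks, stream);
        } // ... remaining widths elided ...
    }
}

// With this in place a uint64_t reduction resolves to the ulong kernel at compile time:
//   dispatch_max_2buff<uint64_t>(in, inout, count, threads_per_block, max_blocks, stream);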
@@ -569,7 +569,7 @@ FUNC_FUNC(land, uint32_t, uint32_t) FUNC_FUNC(land, int64_t, int64_t) FUNC_FUNC(land, uint64_t, uint64_t) FUNC_FUNC(land, long, long) -FUNC_FUNC(land, unsigned_long, unsigned long) +FUNC_FUNC(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -594,7 +594,7 @@ FUNC_FUNC(lor, uint32_t, uint32_t) FUNC_FUNC(lor, int64_t, int64_t) FUNC_FUNC(lor, uint64_t, uint64_t) FUNC_FUNC(lor, long, long) -FUNC_FUNC(lor, unsigned_long, unsigned long) +FUNC_FUNC(lor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -619,7 +619,7 @@ FUNC_FUNC(lxor, uint32_t, uint32_t) FUNC_FUNC(lxor, int64_t, int64_t) FUNC_FUNC(lxor, uint64_t, uint64_t) FUNC_FUNC(lxor, long, long) -FUNC_FUNC(lxor, unsigned_long, unsigned long) +FUNC_FUNC(lxor, ulong, unsigned long) /* Logical */ @@ -645,7 +645,7 @@ FUNC_FUNC(band, uint32_t, uint32_t) FUNC_FUNC(band, int64_t, int64_t) FUNC_FUNC(band, uint64_t, uint64_t) FUNC_FUNC(band, long, long) -FUNC_FUNC(band, unsigned_long, unsigned long) +FUNC_FUNC(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -685,7 +685,7 @@ FUNC_FUNC(bor, uint32_t, uint32_t) FUNC_FUNC(bor, int64_t, int64_t) FUNC_FUNC(bor, uint64_t, uint64_t) FUNC_FUNC(bor, long, long) -FUNC_FUNC(bor, unsigned_long, unsigned long) +FUNC_FUNC(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -725,7 +725,7 @@ FUNC_FUNC(bxor, uint32_t, uint32_t) FUNC_FUNC(bxor, int64_t, int64_t) FUNC_FUNC(bxor, uint64_t, uint64_t) FUNC_FUNC(bxor, long, long) -FUNC_FUNC(bxor, unsigned_long, unsigned long) +FUNC_FUNC(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -900,7 +900,7 @@ FUNC_FUNC_3BUF(max, uint32_t, uint32_t) FUNC_FUNC_3BUF(max, int64_t, int64_t) FUNC_FUNC_3BUF(max, uint64_t, uint64_t) FUNC_FUNC_3BUF(max, long, long) -FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(max, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -965,7 +965,7 @@ FUNC_FUNC_3BUF(min, uint32_t, uint32_t) FUNC_FUNC_3BUF(min, int64_t, int64_t) FUNC_FUNC_3BUF(min, uint64_t, uint64_t) FUNC_FUNC_3BUF(min, long, long) -FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1030,7 +1030,7 @@ OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) OP_FUNC_3BUF(sum, int64_t, int64_t, +) OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF(sum, long, long, +) -OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) +OP_FUNC_3BUF(sum, ulong, unsigned long, +) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1106,7 +1106,7 @@ OP_FUNC_3BUF(prod, uint32_t, uint32_t, *) OP_FUNC_3BUF(prod, int64_t, int64_t, *) OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF(prod, long, long, *) -OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) +OP_FUNC_3BUF(prod, ulong, unsigned long, *) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1184,7 +1184,7 @@ FUNC_FUNC_3BUF(land, uint32_t, uint32_t) FUNC_FUNC_3BUF(land, int64_t, int64_t) FUNC_FUNC_3BUF(land, uint64_t, uint64_t) FUNC_FUNC_3BUF(land, long, long) -FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1209,7 +1209,7 @@ FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lor, int64_t, int64_t) FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lor, long, long) -FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lor, ulong, unsigned long) 
/* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1234,7 +1234,7 @@ FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lxor, int64_t, int64_t) FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lxor, long, long) -FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lxor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1259,7 +1259,7 @@ FUNC_FUNC_3BUF(band, uint32_t, uint32_t) FUNC_FUNC_3BUF(band, int64_t, int64_t) FUNC_FUNC_3BUF(band, uint64_t, uint64_t) FUNC_FUNC_3BUF(band, long, long) -FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1299,7 +1299,7 @@ FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bor, int64_t, int64_t) FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bor, long, long) -FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1339,7 +1339,7 @@ FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bxor, int64_t, int64_t) FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bxor, long, long) -FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1445,7 +1445,7 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_rocm_##ftype##_##name##_uint32_t, \ [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_rocm_##ftype##_##name##_int64_t, \ [OMPI_OP_BASE_TYPE_LONG] = ompi_op_rocm_##ftype##_##name##_long, \ - [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_rocm_##ftype##_##name##_unsigned_long, \ + [OMPI_OP_BASE_TYPE_ulong] = ompi_op_rocm_##ftype##_##name##_ulong, \ [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_rocm_##ftype##_##name##_uint64_t /** All the Fortran integers ********************************************/ diff --git a/ompi/mca/op/rocm/op_rocm_impl.cpp b/ompi/mca/op/rocm/op_rocm_impl.cpp index c1789a13f14..7a39f846941 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.cpp +++ b/ompi/mca/op/rocm/op_rocm_impl.cpp @@ -502,7 +502,7 @@ FUNC_FUNC(land, uint32_t, uint32_t) FUNC_FUNC(land, int64_t, int64_t) FUNC_FUNC(land, uint64_t, uint64_t) FUNC_FUNC(land, long, long) -FUNC_FUNC(land, unsigned_long, unsigned long) +FUNC_FUNC(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -527,7 +527,7 @@ FUNC_FUNC(lor, uint32_t, uint32_t) FUNC_FUNC(lor, int64_t, int64_t) FUNC_FUNC(lor, uint64_t, uint64_t) FUNC_FUNC(lor, long, long) -FUNC_FUNC(lor, unsigned_long, unsigned long) +FUNC_FUNC(lor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC(lor, bool, bool) @@ -548,7 +548,7 @@ FUNC_FUNC(lxor, uint32_t, uint32_t) FUNC_FUNC(lxor, int64_t, int64_t) FUNC_FUNC(lxor, uint64_t, uint64_t) FUNC_FUNC(lxor, long, long) -FUNC_FUNC(lxor, unsigned_long, unsigned long) +FUNC_FUNC(lxor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC(lxor, bool, bool) @@ -569,7 +569,7 @@ FUNC_FUNC(band, uint32_t, uint32_t) FUNC_FUNC(band, int64_t, int64_t) FUNC_FUNC(band, uint64_t, uint64_t) FUNC_FUNC(band, long, long) -FUNC_FUNC(band, unsigned_long, unsigned long) +FUNC_FUNC(band, ulong, unsigned long) /* Byte */ FUNC_FUNC(band, byte, char) @@ -590,7 +590,7 @@ FUNC_FUNC(bor, uint32_t, uint32_t) FUNC_FUNC(bor, int64_t, int64_t) FUNC_FUNC(bor, uint64_t, uint64_t) FUNC_FUNC(bor, long, long) -FUNC_FUNC(bor, unsigned_long, unsigned long) +FUNC_FUNC(bor, ulong, unsigned long) /* Byte */ FUNC_FUNC(bor, byte, char) @@ -611,7 +611,7 @@ FUNC_FUNC(bxor, uint32_t, 
uint32_t) FUNC_FUNC(bxor, int64_t, int64_t) FUNC_FUNC(bxor, uint64_t, uint64_t) FUNC_FUNC(bxor, long, long) -FUNC_FUNC(bxor, unsigned_long, unsigned long) +FUNC_FUNC(bxor, ulong, unsigned long) /* Byte */ FUNC_FUNC(bxor, byte, char) @@ -771,7 +771,7 @@ FUNC_FUNC_3BUF(max, uint32_t, uint32_t) FUNC_FUNC_3BUF(max, int64_t, int64_t) FUNC_FUNC_3BUF(max, uint64_t, uint64_t) FUNC_FUNC_3BUF(max, long, long) -FUNC_FUNC_3BUF(max, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(max, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -837,7 +837,7 @@ FUNC_FUNC_3BUF(min, uint32_t, uint32_t) FUNC_FUNC_3BUF(min, int64_t, int64_t) FUNC_FUNC_3BUF(min, uint64_t, uint64_t) FUNC_FUNC_3BUF(min, long, long) -FUNC_FUNC_3BUF(min, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -900,7 +900,7 @@ OP_FUNC_3BUF(sum, uint32_t, uint32_t, +) OP_FUNC_3BUF(sum, int64_t, int64_t, +) OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF(sum, long, long, +) -OP_FUNC_3BUF(sum, unsigned_long, unsigned long, +) +OP_FUNC_3BUF(sum, ulong, unsigned long, +) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -978,7 +978,7 @@ OP_FUNC_3BUF(prod, uint32_t, uint32_t, *) OP_FUNC_3BUF(prod, int64_t, int64_t, *) OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF(prod, long, long, *) -OP_FUNC_3BUF(prod, unsigned_long, unsigned long, *) +OP_FUNC_3BUF(prod, ulong, unsigned long, *) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1054,7 +1054,7 @@ FUNC_FUNC_3BUF(land, uint32_t, uint32_t) FUNC_FUNC_3BUF(land, int64_t, int64_t) FUNC_FUNC_3BUF(land, uint64_t, uint64_t) FUNC_FUNC_3BUF(land, long, long) -FUNC_FUNC_3BUF(land, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1079,7 +1079,7 @@ FUNC_FUNC_3BUF(lor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lor, int64_t, int64_t) FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lor, long, long) -FUNC_FUNC_3BUF(lor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1104,7 +1104,7 @@ FUNC_FUNC_3BUF(lxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(lxor, int64_t, int64_t) FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lxor, long, long) -FUNC_FUNC_3BUF(lxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(lxor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -1129,7 +1129,7 @@ FUNC_FUNC_3BUF(band, uint32_t, uint32_t) FUNC_FUNC_3BUF(band, int64_t, int64_t) FUNC_FUNC_3BUF(band, uint64_t, uint64_t) FUNC_FUNC_3BUF(band, long, long) -FUNC_FUNC_3BUF(band, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1169,7 +1169,7 @@ FUNC_FUNC_3BUF(bor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bor, int64_t, int64_t) FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bor, long, long) -FUNC_FUNC_3BUF(bor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -1209,7 +1209,7 @@ FUNC_FUNC_3BUF(bxor, uint32_t, uint32_t) FUNC_FUNC_3BUF(bxor, int64_t, int64_t) FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bxor, long, long) -FUNC_FUNC_3BUF(bxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER diff --git a/ompi/mca/op/rocm/op_rocm_impl.h b/ompi/mca/op/rocm/op_rocm_impl.h index 5af40fb9c92..688218b8068 100644 --- 
a/ompi/mca/op/rocm/op_rocm_impl.h +++ b/ompi/mca/op/rocm/op_rocm_impl.h @@ -79,7 +79,7 @@ FUNC_FUNC_SIG(max, uint32_t, uint32_t) FUNC_FUNC_SIG(max, int64_t, int64_t) FUNC_FUNC_SIG(max, uint64_t, uint64_t) FUNC_FUNC_SIG(max, long, long) -FUNC_FUNC_SIG(max, unsigned_long, unsigned long) +FUNC_FUNC_SIG(max, ulong, unsigned long) #if 0 /* Floating point */ @@ -108,7 +108,7 @@ FUNC_FUNC_SIG(min, uint32_t, uint32_t) FUNC_FUNC_SIG(min, int64_t, int64_t) FUNC_FUNC_SIG(min, uint64_t, uint64_t) FUNC_FUNC_SIG(min, long, long) -FUNC_FUNC_SIG(min, unsigned_long, unsigned long) +FUNC_FUNC_SIG(min, ulong, unsigned long) #if 0 /* Floating point */ @@ -137,7 +137,7 @@ OP_FUNC_SIG(sum, uint32_t, uint32_t, +=) OP_FUNC_SIG(sum, int64_t, int64_t, +=) OP_FUNC_SIG(sum, uint64_t, uint64_t, +=) OP_FUNC_SIG(sum, long, long, +=) -OP_FUNC_SIG(sum, unsigned_long, unsigned long, +=) +OP_FUNC_SIG(sum, ulong, unsigned long, +=) #if 0 /* Floating point */ @@ -180,7 +180,7 @@ OP_FUNC_SIG(prod, uint32_t, uint32_t, *=) OP_FUNC_SIG(prod, int64_t, int64_t, *=) OP_FUNC_SIG(prod, uint64_t, uint64_t, *=) OP_FUNC_SIG(prod, long, long, *=) -OP_FUNC_SIG(prod, unsigned_long, unsigned long, *=) +OP_FUNC_SIG(prod, ulong, unsigned long, *=) #if 0 /* Floating point */ @@ -223,7 +223,7 @@ FUNC_FUNC_SIG(land, uint32_t, uint32_t) FUNC_FUNC_SIG(land, int64_t, int64_t) FUNC_FUNC_SIG(land, uint64_t, uint64_t) FUNC_FUNC_SIG(land, long, long) -FUNC_FUNC_SIG(land, unsigned_long, unsigned long) +FUNC_FUNC_SIG(land, ulong, unsigned long) /* C++ bool */ FUNC_FUNC_SIG(land, bool, bool) @@ -244,7 +244,7 @@ FUNC_FUNC_SIG(lor, uint32_t, uint32_t) FUNC_FUNC_SIG(lor, int64_t, int64_t) FUNC_FUNC_SIG(lor, uint64_t, uint64_t) FUNC_FUNC_SIG(lor, long, long) -FUNC_FUNC_SIG(lor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(lor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC_SIG(lor, bool, bool) @@ -265,7 +265,7 @@ FUNC_FUNC_SIG(lxor, uint32_t, uint32_t) FUNC_FUNC_SIG(lxor, int64_t, int64_t) FUNC_FUNC_SIG(lxor, uint64_t, uint64_t) FUNC_FUNC_SIG(lxor, long, long) -FUNC_FUNC_SIG(lxor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(lxor, ulong, unsigned long) /* C++ bool */ FUNC_FUNC_SIG(lxor, bool, bool) @@ -286,7 +286,7 @@ FUNC_FUNC_SIG(band, uint32_t, uint32_t) FUNC_FUNC_SIG(band, int64_t, int64_t) FUNC_FUNC_SIG(band, uint64_t, uint64_t) FUNC_FUNC_SIG(band, long, long) -FUNC_FUNC_SIG(band, unsigned_long, unsigned long) +FUNC_FUNC_SIG(band, ulong, unsigned long) /* Byte */ FUNC_FUNC_SIG(band, byte, char) @@ -307,7 +307,7 @@ FUNC_FUNC_SIG(bor, uint32_t, uint32_t) FUNC_FUNC_SIG(bor, int64_t, int64_t) FUNC_FUNC_SIG(bor, uint64_t, uint64_t) FUNC_FUNC_SIG(bor, long, long) -FUNC_FUNC_SIG(bor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(bor, ulong, unsigned long) /* Byte */ FUNC_FUNC_SIG(bor, byte, char) @@ -328,7 +328,7 @@ FUNC_FUNC_SIG(bxor, uint32_t, uint32_t) FUNC_FUNC_SIG(bxor, int64_t, int64_t) FUNC_FUNC_SIG(bxor, uint64_t, uint64_t) FUNC_FUNC_SIG(bxor, long, long) -FUNC_FUNC_SIG(bxor, unsigned_long, unsigned long) +FUNC_FUNC_SIG(bxor, ulong, unsigned long) /* Byte */ FUNC_FUNC_SIG(bxor, byte, char) @@ -343,7 +343,7 @@ LOC_STRUCT(long_int, long, int) LOC_STRUCT(2int, int, int) LOC_STRUCT(short_int, short, int) LOC_STRUCT(long_double_int, long double, int) -LOC_STRUCT(unsigned_long, unsigned long, int) +LOC_STRUCT(ulong, unsigned long, int) /* compat types for Fortran */ LOC_STRUCT(2real, float, float) LOC_STRUCT(2double_precision, double, double) @@ -414,7 +414,7 @@ FUNC_FUNC_3BUF_SIG(max, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(max, int64_t, int64_t) 
FUNC_FUNC_3BUF_SIG(max, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(max, long, long) -FUNC_FUNC_3BUF_SIG(max, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(max, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -478,7 +478,7 @@ FUNC_FUNC_3BUF_SIG(min, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(min, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(min, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(min, long, long) -FUNC_FUNC_3BUF_SIG(min, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(min, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -541,7 +541,7 @@ OP_FUNC_3BUF_SIG(sum, uint32_t, uint32_t, +) OP_FUNC_3BUF_SIG(sum, int64_t, int64_t, +) OP_FUNC_3BUF_SIG(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF_SIG(sum, long, long, +) -OP_FUNC_3BUF_SIG(sum, unsigned_long, unsigned long, +) +OP_FUNC_3BUF_SIG(sum, ulong, unsigned long, +) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -615,7 +615,7 @@ OP_FUNC_3BUF_SIG(prod, uint32_t, uint32_t, *) OP_FUNC_3BUF_SIG(prod, int64_t, int64_t, *) OP_FUNC_3BUF_SIG(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF_SIG(prod, long, long, *) -OP_FUNC_3BUF_SIG(prod, unsigned_long, unsigned long, *) +OP_FUNC_3BUF_SIG(prod, ulong, unsigned long, *) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -689,7 +689,7 @@ FUNC_FUNC_3BUF_SIG(land, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(land, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(land, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(land, long, long) -FUNC_FUNC_3BUF_SIG(land, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(land, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -712,7 +712,7 @@ FUNC_FUNC_3BUF_SIG(lor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(lor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(lor, long, long) -FUNC_FUNC_3BUF_SIG(lor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(lor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -735,7 +735,7 @@ FUNC_FUNC_3BUF_SIG(lxor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(lxor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(lxor, long, long) -FUNC_FUNC_3BUF_SIG(lxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(lxor, ulong, unsigned long) /* Logical */ #if OMPI_HAVE_FORTRAN_LOGICAL @@ -758,7 +758,7 @@ FUNC_FUNC_3BUF_SIG(band, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(band, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(band, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(band, long, long) -FUNC_FUNC_3BUF_SIG(band, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(band, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -796,7 +796,7 @@ FUNC_FUNC_3BUF_SIG(bor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(bor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(bor, long, long) -FUNC_FUNC_3BUF_SIG(bor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(bor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER @@ -834,7 +834,7 @@ FUNC_FUNC_3BUF_SIG(bxor, uint32_t, uint32_t) FUNC_FUNC_3BUF_SIG(bxor, int64_t, int64_t) FUNC_FUNC_3BUF_SIG(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(bxor, long, long) -FUNC_FUNC_3BUF_SIG(bxor, unsigned_long, unsigned long) +FUNC_FUNC_3BUF_SIG(bxor, ulong, unsigned long) /* Fortran integer */ #if OMPI_HAVE_FORTRAN_INTEGER From 541b8a05e7939360235890f39b15eb263ee1757c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 12:13:32 +0000 Subject: [PATCH 51/74] Fix type in ulong conversion Signed-off-by: Joseph Schuchart --- 
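The one-line change below corrects an over-eager rename from the previous commit: only the generated kernel suffix becomes "ulong"; the op framework's type enumerators keep their UNSIGNED_LONG spelling, since they index the dispatch tables. A simplified sketch of the designated-initializer table the macro expands to, where kernel_fn_t is a hypothetical stand-in for the real stream-handler typedef and the kernel names are those generated by FUNC_FUNC above:

/* Hypothetical, simplified function-pointer type for illustration only. */
typedef void (*kernel_fn_t)(void);

static kernel_fn_t max_2buff_table[] = {
    [OMPI_OP_BASE_TYPE_LONG]          = (kernel_fn_t) ompi_op_rocm_2buff_max_long,
    /* enumerator keeps its original spelling; only the function suffix is "ulong" */
    [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = (kernel_fn_t) ompi_op_rocm_2buff_max_ulong,
};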
ompi/mca/op/rocm/op_rocm_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/op/rocm/op_rocm_functions.c b/ompi/mca/op/rocm/op_rocm_functions.c index d13113a8acf..fbb68dfdab4 100644 --- a/ompi/mca/op/rocm/op_rocm_functions.c +++ b/ompi/mca/op/rocm/op_rocm_functions.c @@ -1445,7 +1445,7 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) [OMPI_OP_BASE_TYPE_UINT32_T] = ompi_op_rocm_##ftype##_##name##_uint32_t, \ [OMPI_OP_BASE_TYPE_INT64_T] = ompi_op_rocm_##ftype##_##name##_int64_t, \ [OMPI_OP_BASE_TYPE_LONG] = ompi_op_rocm_##ftype##_##name##_long, \ - [OMPI_OP_BASE_TYPE_ulong] = ompi_op_rocm_##ftype##_##name##_ulong, \ + [OMPI_OP_BASE_TYPE_UNSIGNED_LONG] = ompi_op_rocm_##ftype##_##name##_ulong, \ [OMPI_OP_BASE_TYPE_UINT64_T] = ompi_op_rocm_##ftype##_##name##_uint64_t /** All the Fortran integers ********************************************/ From 8cb2feb68377a2a6cc887bfe22f76dc9f0a5541a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 18:51:06 +0000 Subject: [PATCH 52/74] Reduce_local: access only host-side memory in error message Signed-off-by: Joseph Schuchart --- test/datatype/reduce_local.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/datatype/reduce_local.c b/test/datatype/reduce_local.c index 6a155f00fd3..8ec54a243d8 100644 --- a/test/datatype/reduce_local.c +++ b/test/datatype/reduce_local.c @@ -160,7 +160,7 @@ do { \ if(((CHECK_BUF+_k)[i]) == (((INIT_INBUF+_k)[i]) OPNAME ((INIT_INOUT_BUF+_k)[i]))) \ continue; \ printf("First error at alignment %d position %d (%" TYPE_PREFIX " %s %" TYPE_PREFIX " != %" TYPE_PREFIX ")\n", \ - _k, i, (INBUF+_k)[i], (#OPNAME), (INIT_INOUT_BUF+_k)[i], (INOUT_BUF+_k)[i]); \ + _k, i, (INIT_INBUF+_k)[i], (#OPNAME), (INIT_INOUT_BUF+_k)[i], (INIT_INOUT_BUF+_k)[i]); \ correctness = 0; \ break; \ } \ @@ -189,7 +189,7 @@ do { \ if(_v2 == OPNAME(_v1, _v3)) \ continue; \ printf("First error at alignment %d position %d (%" TYPE_PREFIX " != %s(%" TYPE_PREFIX ", %" TYPE_PREFIX ")\n", \ - _k, i, _v1, (#OPNAME), _v3, _v2); \ + _k, i, _v2, (#OPNAME), _v1, _v2); \ correctness = 0; \ break; \ } \ From 2996ba0ad21a0a2b2a71992660e17d64e5ea53da Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Jul 2023 20:27:29 +0000 Subject: [PATCH 53/74] Make sure CUDA accelerator is initialized before querying number of devices Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/cuda/accelerator_cuda.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index cc7b7c38489..e228a9f811e 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -783,6 +783,12 @@ static int accelerator_cuda_mem_alloc_stream( size_t size, opal_accelerator_stream_t *stream) { + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + #if CUDA_VERSION >= 11020 cudaError_t result; @@ -867,6 +873,12 @@ static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream) static int accelerator_cuda_get_num_devices(int *num_devices) { + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + *num_devices = opal_accelerator_cuda_num_devices; return OPAL_SUCCESS; } From 246003fac4b9e295503dc32b7b701171b61dbe1e Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 24 Jul 2023 17:21:51 +0000 Subject: [PATCH 54/74] 
Accelerator: provide peak bandwidth estimate Signed-off-by: Joseph Schuchart --- ompi/op/op.h | 24 +++++-- opal/mca/accelerator/accelerator.h | 4 ++ opal/mca/accelerator/cuda/accelerator_cuda.c | 22 ++++-- opal/mca/accelerator/cuda/accelerator_cuda.h | 2 + .../cuda/accelerator_cuda_component.c | 68 ++++++++++++++++--- .../null/accelerator_null_component.c | 12 +++- opal/mca/accelerator/rocm/accelerator_rocm.h | 2 +- .../rocm/accelerator_rocm_component.c | 28 ++++++-- .../rocm/accelerator_rocm_module.c | 18 ++++- 9 files changed, 154 insertions(+), 26 deletions(-) diff --git a/ompi/op/op.h b/ompi/op/op.h index 519520f1712..c6ee9244980 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -865,10 +865,26 @@ static inline void ompi_op_preferred_device(ompi_op_t *op, int source_dev, return; } - double host_startup_cost = 0.0; // host has no startup cost - double host_compute_cost = 1.0*count; // host reference 1.0 per element - double device_startup_cost = 10000.0; // to be filled below - double device_compute_cost = 0.0001*count; + size_t size_type; + ompi_datatype_type_size(dtype, &size_type); + + float device_bw; + if (target_dev >= 0) { + opal_accelerator.get_mem_bw(target_dev, &device_bw); + } else if (source_dev >= 0) { + opal_accelerator.get_mem_bw(source_dev, &device_bw); + } + + // assume we reach 50% of theoretical peak on the device + device_bw /= 2.0; + + // TODO: determine at runtime (?) + const float host_bw = 10.0; // 10GB/s + + float host_startup_cost = 0.0; // host has no startup cost + float host_compute_cost = (count*size_type) / (host_bw*1024); // assume 10GB/s memory bandwidth on host + float device_startup_cost = 10.0; // 10us startup cost on device + float device_compute_cost = (count*size_type) / (device_bw*1024); if ((host_startup_cost + host_compute_cost) > (device_startup_cost + device_compute_cost)) { *op_device = (target_dev >= 0) ? 
target_dev : source_dev; diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index 34c6e3147ad..d49fd8077d8 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -484,6 +484,8 @@ typedef int (*opal_accelerator_base_module_wait_stream_fn_t)(opal_accelerator_st typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_devices); +typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int num_devices, float *bw); + /* * the standard public API data structure */ @@ -521,6 +523,8 @@ typedef struct { opal_accelerator_base_module_wait_stream_fn_t wait_stream; opal_accelerator_base_module_get_num_devices_fn_t num_devices; + + opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index e228a9f811e..2c4d525a4a1 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -59,6 +59,8 @@ static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream); static int accelerator_cuda_get_num_devices(int *num_devices); +static int accelerator_cuda_get_mem_bw(int device, float *bw); + opal_accelerator_base_module_t opal_accelerator_cuda_module = { accelerator_cuda_get_default_stream, @@ -91,7 +93,8 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_buffer_id, accelerator_cuda_wait_stream, - accelerator_cuda_get_num_devices + accelerator_cuda_get_num_devices, + accelerator_cuda_get_mem_bw }; static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { @@ -792,11 +795,6 @@ static int accelerator_cuda_mem_alloc_stream( #if CUDA_VERSION >= 11020 cudaError_t result; - int delayed_init = opal_accelerator_cuda_delayed_init(); - if (OPAL_UNLIKELY(0 != delayed_init)) { - return delayed_init; - } - if (NULL == stream || NULL == addr || 0 == size) { return OPAL_ERR_BAD_PARAM; } @@ -882,3 +880,15 @@ static int accelerator_cuda_get_num_devices(int *num_devices) *num_devices = opal_accelerator_cuda_num_devices; return OPAL_SUCCESS; } + +static int accelerator_cuda_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_cuda_mem_bw != NULL); + + *bw = opal_accelerator_cuda_mem_bw[device]; + return OPAL_SUCCESS; +} diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index dd09be2325d..3ef66820d72 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -50,6 +50,8 @@ OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices; +OPAL_DECLSPEC extern float *opal_accelerator_cuda_mem_bw; + OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); END_C_DECLS diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index e84beb4264e..08abd522a8a 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -44,6 +44,8 @@ int opal_accelerator_cuda_num_devices = 0; static opal_mutex_t accelerator_cuda_init_lock; static bool accelerator_cuda_init_complete = false; +float *opal_accelerator_cuda_mem_bw = NULL; + #define STRINGIFY2(x) #x 
#define STRINGIFY(x) STRINGIFY2(x) @@ -125,7 +127,7 @@ static int accelerator_cuda_component_register(void) int opal_accelerator_cuda_delayed_init() { - int result = OPAL_SUCCESS; + CUresult result = OPAL_SUCCESS; int prio_lo, prio_hi; CUcontext cuContext; @@ -142,6 +144,8 @@ int opal_accelerator_cuda_delayed_init() goto out; } + cuDeviceGetCount(&opal_accelerator_cuda_num_devices); + /* Check to see if this process is running in a CUDA context. If * so, all is good. If not, then disable registration of memory. */ result = cuCtxGetCurrent(&cuContext); @@ -150,13 +154,35 @@ int opal_accelerator_cuda_delayed_init() goto out; } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context"); - result = OPAL_ERROR; - goto out; + /* create a context for each device */ + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + goto out; + } + result = cuDevicePrimaryCtxRetain(&cuContext, dev); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDevicePrimaryCtxRetain failed"); + goto out; + } + if (0 == i) { + result = cuCtxPushCurrent(cuContext); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuCtxPushCurrent failed"); + goto out; + } + } + } + } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded"); } - cuDeviceGetCount(&opal_accelerator_cuda_num_devices); /* Create stream for use in cuMemcpyAsync synchronous copies */ CUstream memcpy_stream; @@ -197,11 +223,6 @@ int opal_accelerator_cuda_delayed_init() OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_cuda_stream_t); opal_accelerator_cuda_default_stream.base.stream = default_stream; - cudaMemPool_t mpool; - cuuint64_t threshold = 1*1024*1024; - cudaDeviceGetDefaultMemPool(&mpool, 0); - cudaMemPoolSetAttribute(mpool, cudaMemPoolAttrReleaseThreshold, &threshold); - result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { /* If registering the memory fails, print a message and continue. 
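For reference, the peak-bandwidth estimate consumed by the ompi_op_preferred_device() heuristic above is derived in the next hunk from two device attributes. A minimal standalone sketch of that formula, assuming A100-class attribute values (illustrative numbers, not queried from a real device):

#include <stdio.h>

int main(void)
{
    /* assumed values; at runtime these come from cuDeviceGetAttribute() */
    int mem_clock_rate = 1215000; /* kHz  (CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE)      */
    int bus_width      = 5120;    /* bits (CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH) */
    /* clock * width * 2 (double data rate) gives kbit/s; /8 -> kB/s; /1024/1024 -> GB/s */
    float bw = ((float)mem_clock_rate * (float)bus_width * 2.0f) / 1024 / 1024 / 8;
    printf("estimated peak memory bandwidth: %.0f GB/s\n", bw); /* ~1483 GB/s */
    return 0;
}
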
@@ -212,6 +233,32 @@ int opal_accelerator_cuda_delayed_init() opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuMemHostRegister OK on test region"); } + + opal_accelerator_cuda_mem_bw = malloc(sizeof(float)*opal_accelerator_cuda_num_devices); + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + goto out; + } + int mem_clock_rate; // kHz + result = cuDeviceGetAttribute(&mem_clock_rate, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + dev); + int bus_width; // bit + result = cuDeviceGetAttribute(&bus_width, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + dev); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + printf("clock rate: %d kHz, bus width: %d bit, bandwidth: %f GB/s\n", mem_clock_rate, bus_width, bw); + opal_accelerator_cuda_mem_bw[i] = bw; + } + result = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_cuda_init_complete = true; @@ -268,6 +315,9 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) OBJ_DESTRUCT(&opal_accelerator_cuda_default_stream); } + free(opal_accelerator_cuda_mem_bw); + opal_accelerator_cuda_mem_bw = NULL; + OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); OBJ_DESTRUCT(&accelerator_cuda_init_lock); diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index b9b002e81ed..24f9e04419e 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -76,6 +76,8 @@ static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream); static int accelerator_null_get_num_devices(int *num_devices); +static int accelerator_null_get_mem_bw(int device, float *bw); + /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -146,7 +148,8 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_get_buffer_id, accelerator_null_wait_stream, - accelerator_null_get_num_devices + accelerator_null_get_num_devices, + accelerator_null_get_mem_bw }; static int accelerator_null_open(void) @@ -310,4 +313,11 @@ static int accelerator_null_get_num_devices(int *num_devices) { *num_devices = 0; return OPAL_SUCCESS; +} + + +static int accelerator_null_get_mem_bw(int device, float *bw) +{ + *bw = 1.0; // return something that is not 0 + return OPAL_SUCCESS; } \ No newline at end of file diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h index e9afaa51019..94bffd800e2 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm.h +++ b/opal/mca/accelerator/rocm/accelerator_rocm.h @@ -67,5 +67,5 @@ OPAL_DECLSPEC extern hipStream_t opal_accelerator_alloc_stream; OPAL_DECLSPEC extern opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream; OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_rocm_stream_lock; OPAL_DECLSPEC extern int opal_accelerator_rocm_num_devices; - +OPAL_DECLSPEC extern float *opal_accelerator_rocm_mem_bw; #endif diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index 605978a9974..e9f4ccf5b86 100644 --- 
a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -33,6 +33,7 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576; /* Initialization lock for lazy rocm initialization */ static opal_mutex_t accelerator_rocm_init_lock; static bool accelerator_rocm_init_complete = false; +static int checkmem; hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL; @@ -49,10 +50,7 @@ opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream = {0}; opal_mutex_t opal_accelerator_rocm_stream_lock = {0}; int opal_accelerator_rocm_num_devices = 0; -/* Initialization lock for delayed rocm initialization */ -static opal_mutex_t accelerator_rocm_init_lock; -static bool accelerator_rocm_init_complete = false; -static int checkmem; +float *opal_accelerator_rocm_mem_bw = NULL; #define HIP_CHECK(condition) \ { \ @@ -228,6 +226,24 @@ int opal_accelerator_rocm_lazy_init() OBJ_CONSTRUCT(&opal_accelerator_rocm_default_stream, opal_accelerator_rocm_stream_t); opal_accelerator_rocm_default_stream.base.stream = default_stream; + opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices); + for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) { + int mem_clock_rate; // kHz + err = hipDeviceGetAttribute(&mem_clock_rate, + hipDeviceAttributeMemoryClockRate, + i); + int bus_width; // bit + err = hipDeviceGetAttribute(&bus_width, + hipDeviceAttributeMemoryBusWidth, + i); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + opal_accelerator_rocm_mem_bw[i] = bw; + } + + err = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_rocm_init_complete = true; out: @@ -268,6 +284,10 @@ static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module) opal_accelerator_rocm_MemcpyStream = NULL; } + free(opal_accelerator_rocm_mem_bw); + opal_accelerator_rocm_mem_bw = NULL; + + OBJ_DESTRUCT(&accelerator_rocm_init_lock); return; } diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 0f0ce05235f..378d2ff273c 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -51,6 +51,8 @@ static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream); static int mca_accelerator_rocm_get_num_devices(int *num_devices); +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw); + opal_accelerator_base_module_t opal_accelerator_rocm_module = { mca_accelerator_rocm_get_default_stream, //DONE @@ -83,7 +85,9 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_get_buffer_id, mca_accelerator_rocm_wait_stream, //DONE - mca_accelerator_rocm_get_num_devices //DONE + mca_accelerator_rocm_get_num_devices, //DONE + + mca_accelerator_rocm_get_mem_bw }; @@ -724,3 +728,15 @@ static int mca_accelerator_rocm_get_num_devices(int *num_devices) *num_devices = opal_accelerator_rocm_num_devices; return OPAL_SUCCESS; } + +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_rocm_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_rocm_mem_bw != NULL); + + *bw = opal_accelerator_rocm_mem_bw[device]; + return OPAL_SUCCESS; +} From 6601484ce278ffd14c567a74d7f6dbd5e493c0b4 Mon Sep 17 00:00:00 2001 From: 
Joseph Schuchart Date: Mon, 24 Jul 2023 17:50:13 +0000 Subject: [PATCH 55/74] accelerator/rocm: regular memory behaves like unified memory Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/rocm/accelerator_rocm_module.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 378d2ff273c..0c75981b586 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -116,6 +116,8 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 //*flags |= MCA_ACCELERATOR_FLAGS_HOST_ATOMICS; /* First access on a device pointer triggers ROCM support lazy initialization. */ opal_accelerator_rocm_lazy_init(); + // on Frontier the host can access any device memory + *flags |= MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY; *dev_id = srcAttr.device; ret = 1; #if HIP_VERSION >= 50731921 From d0fe9a21a5b3f8ecc13324efe41f58c93f599893 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 24 Jul 2023 17:50:42 +0000 Subject: [PATCH 56/74] ROCM: add missing FUNC_FUNC_FN macro Signed-off-by: Joseph Schuchart --- ompi/mca/op/rocm/op_rocm_impl.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ompi/mca/op/rocm/op_rocm_impl.cpp b/ompi/mca/op/rocm/op_rocm_impl.cpp index 7a39f846941..f5068f9a2a0 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.cpp +++ b/ompi/mca/op/rocm/op_rocm_impl.cpp @@ -201,6 +201,29 @@ static inline __device__ T vprod(const T& a, const T& b) { #define VFUNC_FUNC(name, type_name, type, vtype, vlen, vfn, fn) FUNC_FUNC_FN(name, type_name, type, fn) #endif // defined(USE_VECTORS) +#define FUNC_FUNC_FN(name, type_name, type, fn) \ + static __global__ void \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel(const type *__restrict__ in, \ + type *__restrict__ inout, int n) { \ + const int index = blockIdx.x * blockDim.x + threadIdx.x; \ + const int stride = blockDim.x * gridDim.x; \ + for (int i = index; i < n; i += stride) { \ + inout[i] = fn(inout[i], in[i]); \ + } \ + } \ + void \ + ompi_op_rocm_2buff_##name##_##type_name##_submit(const type *in, \ + type *inout, \ + int count, \ + int threads_per_block, \ + int max_blocks, \ + hipStream_t stream) { \ + int threads = min(count, threads_per_block); \ + int blocks = min((count + threads-1) / threads, max_blocks); \ + int n = count; \ + hipStream_t s = stream; \ + ompi_op_rocm_2buff_##name##_##type_name##_kernel<<>>(in, inout, n); \ + } /* * Since all the functions in this file are essentially identical, we From 63b64a064a8775e17fd881c3c3ee463eec0e04d5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Jul 2023 10:36:36 +0200 Subject: [PATCH 57/74] opal_datatype_accelerator_memcpy: determine device copy type We know where source and target buffers are located, so pass the right transfer direction to the accelerator memcpy call. 
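In short: opal_accelerator.check_addr() reports a device ID for each buffer (MCA_ACCELERATOR_NO_DEVICE_ID for host memory), and the pair of IDs selects one of the four transfer directions. A condensed sketch of that mapping, mirroring the get_transfer_type() helper added to opal/datatype/opal_datatype_copy.c in the diff below:

#include "opal/mca/accelerator/accelerator.h"

/* sketch only; the patch's get_transfer_type() below is the real implementation */
static opal_accelerator_transfer_type_t
direction(int src_dev, int dst_dev)
{
    int src_host = (MCA_ACCELERATOR_NO_DEVICE_ID == src_dev);
    int dst_host = (MCA_ACCELERATOR_NO_DEVICE_ID == dst_dev);
    if (src_host) {
        return dst_host ? MCA_ACCELERATOR_TRANSFER_HTOH   /* plain memcpy suffices */
                        : MCA_ACCELERATOR_TRANSFER_HTOD;
    }
    return dst_host ? MCA_ACCELERATOR_TRANSFER_DTOH
                    : MCA_ACCELERATOR_TRANSFER_DTOD;
}
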
Signed-off-by: Joseph Schuchart --- opal/datatype/opal_datatype_copy.c | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index d4d6189d3d9..1459b6ad558 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -28,6 +28,7 @@ #include #include +#include #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype.h" @@ -55,11 +56,29 @@ } \ } while (0) + +static opal_accelerator_transfer_type_t get_transfer_type(int src_dev, int dst_dev) +{ + if (src_dev == MCA_ACCELERATOR_NO_DEVICE_ID) { + if (dst_dev == MCA_ACCELERATOR_NO_DEVICE_ID) { + return MCA_ACCELERATOR_TRANSFER_HTOH; + } else { + return MCA_ACCELERATOR_TRANSFER_HTOD; + } + } else { + if (dst_dev == MCA_ACCELERATOR_NO_DEVICE_ID) { + return MCA_ACCELERATOR_TRANSFER_DTOH; + } else { + return MCA_ACCELERATOR_TRANSFER_DTOD; + } + } +} + static void *opal_datatype_accelerator_memcpy(void *dest, const void *src, size_t size, opal_accelerator_stream_t *stream) { int res; - int dev_id; + int src_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID, dst_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID; uint64_t flags; /* If accelerator check addr returns an error, we can only * assume it is a host buffer. If device buffer checking fails, @@ -68,16 +87,17 @@ static void *opal_datatype_accelerator_memcpy(void *dest, const void *src, size_ * and retries are also unlikely to succeed. We identify these * buffers as host buffers as attempting a memcpy would provide * a chance to succeed. */ - if (0 >= opal_accelerator.check_addr(dest, &dev_id, &flags) && - 0 >= opal_accelerator.check_addr(src, &dev_id, &flags)) { + if (0 >= opal_accelerator.check_addr(dest, &dst_dev_id, &flags) && + 0 >= opal_accelerator.check_addr(src, &src_dev_id, &flags)) { return memcpy(dest, src, size); } + //printf("opal_datatype_accelerator_memcpy: dst %p dev %d src %p dev %d transer_type %d\n", dest, dst_dev_id, src, src_dev_id, get_transfer_type(src_dev_id, dst_dev_id)); if (NULL != stream) { - res = opal_accelerator.mem_copy_async(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, - dest, src, size, stream, MCA_ACCELERATOR_TRANSFER_UNSPEC); + res = opal_accelerator.mem_copy_async(dst_dev_id, src_dev_id, + dest, src, size, stream, get_transfer_type(src_dev_id, dst_dev_id)); } else { - res = opal_accelerator.mem_copy(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID, - dest, src, size, MCA_ACCELERATOR_TRANSFER_UNSPEC); + res = opal_accelerator.mem_copy(dst_dev_id, src_dev_id, + dest, src, size, get_transfer_type(src_dev_id, dst_dev_id)); } if (OPAL_SUCCESS != res) { opal_output(0, "Error in accelerator memcpy"); @@ -177,4 +197,4 @@ int32_t opal_datatype_copy_content_same_ddt(const opal_datatype_t *datatype, int char *destination_base, char *source_base) { return opal_datatype_copy_content_same_ddt_stream(datatype, count, destination_base, source_base, NULL); -} \ No newline at end of file +} From 5a29e13842d590471d61cdbc8981c3018b775fe1 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Jul 2023 10:38:53 +0200 Subject: [PATCH 58/74] accelerator rocm: fix global memcpy stream variable Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/rocm/accelerator_rocm.h | 2 +- opal/mca/accelerator/rocm/accelerator_rocm_component.c | 5 +++-- opal/mca/accelerator/rocm/accelerator_rocm_module.c | 10 +++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git 
a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h index 94bffd800e2..abd2f8125c3 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm.h +++ b/opal/mca/accelerator/rocm/accelerator_rocm.h @@ -55,7 +55,7 @@ struct opal_accelerator_rocm_event_t { typedef struct opal_accelerator_rocm_event_t opal_accelerator_rocm_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_rocm_event_t); -OPAL_DECLSPEC extern hipStream_t opal_accelerator_rocm_MemcpyStream; +OPAL_DECLSPEC extern hipStream_t *opal_accelerator_rocm_MemcpyStream; OPAL_DECLSPEC extern int opal_accelerator_rocm_memcpy_async; OPAL_DECLSPEC extern int opal_accelerator_rocm_verbose; OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyH2D_limit; diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index e9f4ccf5b86..4358bb345f3 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -276,11 +276,12 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void) static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module) { - if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) { - hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream); + if (NULL != opal_accelerator_rocm_MemcpyStream) { + hipError_t err = hipStreamDestroy(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err) { opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n"); } + free(opal_accelerator_rocm_MemcpyStream); opal_accelerator_rocm_MemcpyStream = NULL; } diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 0c75981b586..0a414421c14 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -326,14 +326,14 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de if (opal_accelerator_rocm_memcpy_async) { err = hipMemcpyAsync(dest, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error starting async copy\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream after async copy\n"); @@ -408,7 +408,7 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d if (opal_accelerator_rocm_memcpy_async) { err = hipMemcpyAsync(tmp, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); @@ -416,14 +416,14 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d } err = hipMemcpyAsync(dest, tmp, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = 
hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream for memmove\n"); From 5c7c7a10a84ef38f552ca88fa038147365d34a40 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Jul 2023 10:39:14 +0200 Subject: [PATCH 59/74] Thread base: fix missing include file Signed-off-by: Joseph Schuchart --- opal/mca/threads/base/threads_base.c | 1 + 1 file changed, 1 insertion(+) diff --git a/opal/mca/threads/base/threads_base.c b/opal/mca/threads/base/threads_base.c index 227aaeb64d3..d3ae6b5ccaf 100644 --- a/opal/mca/threads/base/threads_base.c +++ b/opal/mca/threads/base/threads_base.c @@ -24,6 +24,7 @@ #include "opal/constants.h" #include "opal/mca/threads/base/base.h" +#include "opal/mca/threads/threads.h" #if OPAL_ENABLE_DEBUG bool opal_debug_threads = false; From 76f00c48d11a7fd760d069613225111061b4d543 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Jul 2023 10:39:41 +0200 Subject: [PATCH 60/74] Accelerator: Remove debug output Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/cuda/accelerator_cuda_component.c | 2 +- opal/mca/accelerator/rocm/accelerator_rocm_module.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index 08abd522a8a..04006f85dd4 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -255,7 +255,7 @@ int opal_accelerator_cuda_delayed_init() * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 */ float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; - printf("clock rate: %d kHz, bus width: %d bit, bandwidth: %f GB/s\n", mem_clock_rate, bus_width, bw); + //printf("clock rate: %d kHz, bus width: %d bit, bandwidth: %f GB/s\n", mem_clock_rate, bus_width, bw); opal_accelerator_cuda_mem_bw[i] = bw; } diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 0a414421c14..8180dc24409 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -132,6 +132,7 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 *dev_id = srcAttr.device; } } + //printf("mca_accelerator_rocm_check_addr %p dev %d ret %d\n", addr, *dev_id, ret); return ret; } From 56bcfee53978dbd9808f11fc3a7d882431199c7d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Jul 2023 12:37:09 +0000 Subject: [PATCH 61/74] Allreduce: don't copy inputs if data can be accessed from the host Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 39 ++++++++++++++++-------- ompi/mca/coll/base/coll_base_util.h | 11 ++++--- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 5a042b07489..1ce45f941c5 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -160,7 +160,9 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* get the device for sbuf and rbuf and where the op would like to execute */ int sendbuf_dev, recvbuf_dev, op_dev; - ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); + uint64_t sendbuf_flags, recvbuf_flags; + 
ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, + &sendbuf_flags, &recvbuf_flags, &op_dev); span = opal_datatype_span(&dtype->super, count, &gap); inplacebuf_free = ompi_coll_base_allocate_on_device(op_dev, span, module); if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; } @@ -172,16 +174,16 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, opal_accelerator.get_default_stream(op_dev, &stream); } - /* TODO: These copies are only relevant if buffers are not on the same device. - * Can we figure out whether the op-device can access these remote buffers directly? */ tmpsend = (char*) sbuf; if (op_dev != recvbuf_dev) { /* copy data to where the op wants it to be */ if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)rbuf, stream); if (ret < 0) { line = __LINE__; goto error_hndl; } - } else { - tmpsend = (char*) sbuf; + } + /* only copy if op is on the device or we cannot access the sendbuf on the host */ + else if (op_dev != MCA_ACCELERATOR_NO_DEVICE_ID || + 0 == (sendbuf_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) { ret = ompi_datatype_copy_content_same_ddt_stream(dtype, count, inplacebuf, (char*)sbuf, stream); if (ret < 0) { line = __LINE__; goto error_hndl; } } @@ -194,9 +196,12 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* Handle MPI_IN_PLACE */ bool use_sbuf = (MPI_IN_PLACE != sbuf); - /* allocate temporary recv buffer if the tmpbuf above is on a different device than the rbuf */ + /* allocate temporary recv buffer if the tmpbuf above is on a different device than the rbuf + * and the op is on the device or we cannot access the recv buffer on the host */ recvbuf = rbuf; - if (op_dev != recvbuf_dev) { + if (op_dev != recvbuf_dev && + (op_dev != MCA_ACCELERATOR_NO_DEVICE_ID || + 0 == (recvbuf_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { recvbuf = ompi_coll_base_allocate_on_device(op_dev, span, module); if (use_sbuf) { /* copy from rbuf */ @@ -505,7 +510,9 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, /* get the device for sbuf and rbuf and where the op would like to execute */ int sendbuf_dev, recvbuf_dev, op_dev; - ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); + uint64_t sendbuf_flags, recvbuf_flags; + ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, + &sendbuf_flags, &recvbuf_flags, &op_dev); if (size > 2) { inbuf[0] = ompi_coll_base_allocate_on_device(op_dev, 2*max_real_segsize, module); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } @@ -520,7 +527,9 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, bool use_sbuf = (MPI_IN_PLACE != sbuf); /* allocate temporary recv buffer if the tmpbuf above is on a different device than the rbuf */ recvbuf = rbuf; - if (op_dev != recvbuf_dev) { + if (op_dev != recvbuf_dev && + /* only copy if op is on the device or the recvbuffer cannot be accessed on the host */ + (op_dev != MCA_ACCELERATOR_NO_DEVICE_ID || 0 == (MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY & recvbuf_flags))) { recvbuf = ompi_coll_base_allocate_on_device(op_dev, typelng*count, module); if (use_sbuf) { /* copy from rbuf */ @@ -823,7 +832,9 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); int sendbuf_dev, recvbuf_dev, 
op_dev; - ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); + uint64_t sendbuf_flags, recvbuf_flags; + ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, + &sendbuf_flags, &recvbuf_flags, &op_dev); /* Allocate and initialize temporary buffers */ inbuf[0] = ompi_coll_base_allocate_on_device(op_dev, max_real_segsize, module); if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } @@ -1146,7 +1157,9 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( /* get the device for sbuf and rbuf and where the op would like to execute */ int sendbuf_dev, recvbuf_dev, op_dev; - ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, &op_dev); + uint64_t sendbuf_flags, recvbuf_flags; + ompi_coll_base_select_device(op, sbuf, rbuf, count, dtype, &sendbuf_dev, &recvbuf_dev, + &sendbuf_flags, &recvbuf_flags, &op_dev); /* Temporary buffer for receiving messages */ char *tmp_buf = NULL; @@ -1156,10 +1169,10 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( tmp_buf = tmp_buf_raw - gap; char *recvbuf = rbuf; - if (op_dev != recvbuf_dev) { + if (op_dev != recvbuf_dev && 0 == (MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY & recvbuf_flags)) { recvbuf = ompi_coll_base_allocate_on_device(op_dev, dsize, module); } - if (op_dev != sendbuf_dev && sbuf != MPI_IN_PLACE) { + if (op_dev != sendbuf_dev && 0 == (MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY & sendbuf_flags) && sbuf != MPI_IN_PLACE) { /* move the data into the recvbuf and set sbuf to MPI_IN_PLACE */ ompi_datatype_copy_content_same_ddt(dtype, count, (char*)recvbuf, (char*)sbuf); sbuf = MPI_IN_PLACE; diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 10d4cfcba17..2598810a238 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -222,14 +222,15 @@ void ompi_coll_base_select_device( struct ompi_datatype_t *dtype, int *sendbuf_device, int *recvbuf_device, + uint64_t *sendbuf_flags, + uint64_t *recvbuf_flags, int *op_device) { - uint64_t sendbuf_flags, recvbuf_flags; /* TODO: move this into ompi_op_select_device to save the extra lookups? 
*/ - *recvbuf_device = -1; - *sendbuf_device = -1; - if (sendbuf != NULL && sendbuf != MPI_IN_PLACE) opal_accelerator.check_addr(sendbuf, sendbuf_device, &sendbuf_flags); - if (recvbuf != NULL) opal_accelerator.check_addr(recvbuf, recvbuf_device, &recvbuf_flags); + *recvbuf_device = MCA_ACCELERATOR_NO_DEVICE_ID; + *sendbuf_device = MCA_ACCELERATOR_NO_DEVICE_ID; + if (sendbuf != NULL && sendbuf != MPI_IN_PLACE) opal_accelerator.check_addr(sendbuf, sendbuf_device, sendbuf_flags); + if (recvbuf != NULL) opal_accelerator.check_addr(recvbuf, recvbuf_device, recvbuf_flags); ompi_op_preferred_device(op, *recvbuf_device, *sendbuf_device, count, dtype, op_device); } From a1f089ecbc1dcb9140784e916bb7c25e36f0f915 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 6 Nov 2023 00:08:00 +0000 Subject: [PATCH 62/74] Be more careful when releasing temporary receive buffers Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 1ce45f941c5..5c03a1b3cf3 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -199,10 +199,12 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, /* allocate temporary recv buffer if the tmpbuf above is on a different device than the rbuf * and the op is on the device or we cannot access the recv buffer on the host */ recvbuf = rbuf; + bool free_recvbuf = false; if (op_dev != recvbuf_dev && (op_dev != MCA_ACCELERATOR_NO_DEVICE_ID || 0 == (recvbuf_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { recvbuf = ompi_coll_base_allocate_on_device(op_dev, span, module); + free_recvbuf = true; if (use_sbuf) { /* copy from rbuf */ ompi_datatype_copy_content_same_ddt_stream(dtype, count, (char*)recvbuf, (char*)sbuf, stream); @@ -367,7 +369,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, } ompi_coll_base_free_tmpbuf(inplacebuf_free, op_dev, module); - if (op_dev != recvbuf_dev) { + if (free_recvbuf) { ompi_coll_base_free_tmpbuf(recvbuf, op_dev, module); } return MPI_SUCCESS; From 33616e65877a8b34e1c7c1b228730fe5de726d62 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 6 Nov 2023 00:12:08 +0000 Subject: [PATCH 63/74] Remove debug output and dead code Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_util.c | 9 ++-- ompi/mca/coll/base/coll_base_util.h | 66 ----------------------------- ompi/op/op.h | 66 ----------------------------- 3 files changed, 3 insertions(+), 138 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 1efb67215f2..9479c68f7cc 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -627,15 +627,13 @@ void *ompi_coll_base_allocate_on_device(int device, size_t size, if (module->base_data->num_device_allocators <= device) { int num_dev; opal_accelerator.num_devices(&num_dev); - printf("ompi_coll_base_allocate_on_device num_dev %d device %d\n", num_dev, device); - if (num_dev < device+1) num_dev = device+1; + if (num_dev < device+1) num_dev = device+1; module->base_data->device_allocators = realloc(module->base_data->device_allocators, num_dev * sizeof(mca_allocator_base_module_t *)); - for (int i = module->base_data->num_device_allocators; i < num_dev; ++i) { - module->base_data->device_allocators[i] = NULL; + for (int i = module->base_data->num_device_allocators; i < 
num_dev; ++i) { + module->base_data->device_allocators[i] = NULL; } module->base_data->num_device_allocators = num_dev; } - //printf("allocators %p module %p\n", module->base_data->device_allocators, module->base_data->device_allocators[device]); if (NULL == (allocator_module = module->base_data->device_allocators[device])) { mca_allocator_base_component_t *allocator_component; allocator_component = mca_allocator_component_lookup("devicebucket"); @@ -646,7 +644,6 @@ void *ompi_coll_base_allocate_on_device(int device, size_t size, assert(allocator_module != NULL); module->base_data->device_allocators[device] = allocator_module; } - //printf("allocator_module %p\n", allocator_module); return allocator_module->alc_alloc(allocator_module, size, 0); } diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 2598810a238..9dd1a6f41d0 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -226,7 +226,6 @@ void ompi_coll_base_select_device( uint64_t *recvbuf_flags, int *op_device) { - /* TODO: move this into ompi_op_select_device to save the extra lookups? */ *recvbuf_device = MCA_ACCELERATOR_NO_DEVICE_ID; *sendbuf_device = MCA_ACCELERATOR_NO_DEVICE_ID; if (sendbuf != NULL && sendbuf != MPI_IN_PLACE) opal_accelerator.check_addr(sendbuf, sendbuf_device, sendbuf_flags); @@ -234,71 +233,6 @@ void ompi_coll_base_select_device( ompi_op_preferred_device(op, *recvbuf_device, *sendbuf_device, count, dtype, op_device); } -/** - * Returns a pointer to memory in the same memory domain as the receive or send buffer. - * Device memory is allocated if either the receive buffer or the send buffer are - * located on the device and if the op supports on-device reductions on the datatype. - * If memory is allocated on the host, device will be set to -1. - */ -static inline -void* ompi_coll_base_allocate_op_tmpbuf( - const void *sendbuf, const void *recvbuf, size_t size, - struct ompi_op_t *op, size_t count, struct ompi_datatype_t *dtype, - int *device, mca_coll_base_module_t *module) -{ - void *res = NULL; - uint64_t flags; - *device = -1; - - ompi_op_select_device(op, sendbuf, recvbuf, count, dtype, device); - if (*device > -1) { - res = ompi_coll_base_allocate_on_device(*device, size, module); - if (NULL == res) { - // fallback to host - *device = -1; - } - } -#if 0 - if ((NULL == op && NULL == dtype) || ompi_op_supports_device(op, dtype)) { - /* if the recvbuf is on the device we take that device */ - if (NULL != recvbuf && 0 < opal_accelerator.check_addr(recvbuf, device, &flags)) { - /* allocate cache on demand */ - res = ompi_coll_base_allocate_on_device(*device, size, module); - if (NULL == res) { - // fallback to host - *device = -1; - } - } else if (MPI_IN_PLACE != sendbuf && NULL != sendbuf && - 0 < opal_accelerator.check_addr(sendbuf, device, &flags)) { - /* send buffer is on a device so try to allocate memory there */ - res = ompi_coll_base_allocate_on_device(*device, size, module); - if (NULL == res) { - // fallback to host - *device = -1; - } - } - } -#endif // 0 - - if (NULL == res) { - res = malloc(size); - } - return res; -} - -#if 0 -/** - * Like ompi_coll_base_allocate_op_tmpbuf but without checking op-datatype - * device compatibility. 
- */ -static inline -void* ompi_coll_base_allocate_tmpbuf( - const void *sendbuf, const void *recvbuf, - size_t size, int *device, mca_coll_base_module_t *module) -{ - return ompi_coll_base_allocate_op_tmpbuf(sendbuf, recvbuf, size, NULL, NULL, device, module); -} -#endif // 0 /** * Frees memory allocated through ompi_coll_base_allocate_op_tmpbuf * or ompi_coll_base_allocate_tmpbuf. diff --git a/ompi/op/op.h b/ompi/op/op.h index c6ee9244980..32b85229081 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -891,72 +891,6 @@ static inline void ompi_op_preferred_device(ompi_op_t *op, int source_dev, } } -//#if 0 -/** - * Determine where the op can run most efficiently. Uses some heuristic based - * on information from opal_accelerator to determine whether it would be more - * efficient to run on a device or on the host. - * - * Either source or target can be NULL, in which case they will be ignored. - * - * Returns -1 for host, or the device number [0..NUMDEV-1] otherwise. - */ -static inline void ompi_op_select_device(ompi_op_t *op, const void *source, - const void *target, size_t count, - ompi_datatype_t *dtype, int *device) -{ - /* default to host */ - *device = -1; - if (!ompi_op_is_intrinsic (op)) { - *device = -1; - return; - } - /* quick check: can we execute on both sides? */ - int dtype_id = ompi_op_ddt_map[dtype->id]; - if (NULL == op->o_device_op || NULL == op->o_device_op->do_intrinsic.fns[dtype_id]) { - /* not available on the gpu, must select host */ - return; - } - - /* Penalty for accessing unified memory from the host - * TODO: how to determine this value? */ - const double host_unified_memory_penalty = 10; - - double host_startup_cost = 0.0; // host has no startup cost - double host_compute_cost = 1.0*count; // host reference 1.0 per element - double device_startup_cost = 10000.0; // to be filled below - double device_compute_cost = 0.0001*count; - double transfer_cost = 0.0; // summed up based on what has to be transferred - int source_dev_id = -1, target_dev_id = -1; - uint64_t source_flags = 0, target_flags = 0; - int target_check_addr = -1; - if (target != NULL) target_check_addr = opal_accelerator.check_addr(target, &target_dev_id, &target_flags); - int source_check_addr = -1; - if (source != NULL) source_check_addr = opal_accelerator.check_addr(source, &source_dev_id, &source_flags); - if (op->o_func.intrinsic.fns[dtype_id]) { - /* op not available on the host, must select a device */ - host_compute_cost = 1E12; - } else if ((target_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY) || (source_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY)) { - /* at least one buffer is on unified memory */ - host_compute_cost *= host_unified_memory_penalty; // reduced bandwidth - } else if (0 > source_check_addr && 0 > target_check_addr) { - /* both buffers are on the device, mark host as unusable */ - host_compute_cost = 1E12; - } else if (0 <= source_check_addr && 0 <= target_check_addr) { - /* both buffers are on the host, mark device as unusable */ - device_compute_cost = 1E12; - } - - /* select a device, or remain on the host */ - //printf("ompi_op_select_device: host startup %f host compute %f device startup %f device compute %f\n", - // host_startup_cost, host_compute_cost, device_startup_cost, device_compute_cost); - if ((host_startup_cost + host_compute_cost) > (device_startup_cost + device_compute_cost)) { - *device = (target_dev_id >= 0) ? 
target_dev_id : source_dev_id; - } - -} -//#endif // 0 - END_C_DECLS #endif /* OMPI_OP_H */ From 9da8b54fa4cb8ef18b8e147c7f3fe5c1f85d82c2 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 6 Nov 2023 00:16:46 +0000 Subject: [PATCH 64/74] Bump max devicebucket allocator max size to 1GB Signed-off-by: Joseph Schuchart --- opal/mca/allocator/devicebucket/allocator_devicebucket.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/opal/mca/allocator/devicebucket/allocator_devicebucket.c b/opal/mca/allocator/devicebucket/allocator_devicebucket.c index 17bd82b408e..e5b4f6d26f5 100644 --- a/opal/mca/allocator/devicebucket/allocator_devicebucket.c +++ b/opal/mca/allocator/devicebucket/allocator_devicebucket.c @@ -74,7 +74,6 @@ struct mca_allocator_base_module_t *mca_allocator_devicebucket_module_init( return NULL; } allocator->super.alc_alloc = mca_allocator_devicebucket_alloc_wrapper; - //allocator->super.alc_realloc = mca_allocator_devicebucket_realloc; allocator->super.alc_realloc = NULL; // not supported allocator->super.alc_free = mca_allocator_devicebucket_free; allocator->super.alc_compact = mca_allocator_devicebucket_cleanup; @@ -86,7 +85,7 @@ struct mca_allocator_base_module_t *mca_allocator_devicebucket_module_init( static int mca_allocator_devicebucket_module_register(void) { mca_allocator_min_cache_size = 4*1024; // 4K - mca_allocator_max_cache_size = 1*1024*1024; // 1M + mca_allocator_max_cache_size = 1*1024*1024*1024; // 1G (void) mca_base_component_var_register(&mca_allocator_devicebucket_component.allocator_version, "min_cache_size", "Minimum allocation cache size", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, @@ -115,7 +114,6 @@ int mca_allocator_devicebucket_module_close(void) void *mca_allocator_devicebucket_alloc_wrapper(struct mca_allocator_base_module_t *allocator, size_t size, size_t align) { - //printf("mca_allocator_devicebucket_alloc_wrapper size %zu align %zu\n", size, align); if (0 == align) { return mca_allocator_devicebucket_alloc(allocator, size); } From 93ded5e7aa9be1c55285cd6f59684b091755842d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 6 Nov 2023 00:17:21 +0000 Subject: [PATCH 65/74] accelerator/cuda: fix error message Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/cuda/accelerator_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 2c4d525a4a1..3ff45de0efe 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -466,7 +466,7 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, #endif //0 result = cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + opal_show_help("help-accelerator-cuda.txt", "cuMemcpy failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } From 182e6fad2014e6f489b4a752a450d468cd632085 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 6 Nov 2023 00:18:05 +0000 Subject: [PATCH 66/74] CUDA: Select compute capability 52 by default Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index 6f7dc89d97e..a5040fa884b 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -24,7 +24,7 @@ sources = 
op_cuda_component.c op_cuda.h op_cuda_functions.c op_cuda_impl.h cu_sources = op_cuda_impl.cu NVCC = nvcc -g -NVCCFLAGS= --std c++17 --gpu-architecture=compute_80 +NVCCFLAGS= --std c++17 --gpu-architecture=compute_52 .cu.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ From e5eb45f1ecfdbca3aeadbd58d937ebe10e5d6bb4 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 18:21:03 +0000 Subject: [PATCH 67/74] Sqash const correctness warnings Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_allreduce.c | 2 +- ompi/op/op.h | 59 ++++++++---------------- 2 files changed, 19 insertions(+), 42 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 5c03a1b3cf3..fdc77d1f416 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -1221,7 +1221,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( * Send the left half of the input vector to the left neighbor, * Recv the right half of the input vector from the left neighbor */ - err = ompi_coll_base_sendrecv(send_buf, count_lhalf, dtype, rank - 1, + err = ompi_coll_base_sendrecv((void*)send_buf, count_lhalf, dtype, rank - 1, MCA_COLL_BASE_TAG_ALLREDUCE, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, count_rhalf, dtype, rank - 1, diff --git a/ompi/op/op.h b/ompi/op/op.h index 32b85229081..0d6cebe96ca 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -548,7 +548,7 @@ static inline bool ompi_op_supports_device(const ompi_op_t * op, const ompi_data * optimization). If you give it an intrinsic op with a datatype that * is not defined to have that operation, it is likely to seg fault. */ -static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, +static inline void ompi_op_reduce_stream(ompi_op_t * op, const void *source, void *target, size_t full_count, ompi_datatype_t * dtype, int device, @@ -668,14 +668,14 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, opal_accelerator.get_default_stream(device, &actual_stream); flush_stream = true; } - op->o_device_op->do_intrinsic.fns[dtype_id](source, target, + op->o_device_op->do_intrinsic.fns[dtype_id]((void*)source, target, &count, &dtype, device, actual_stream, op->o_device_op->do_intrinsic.modules[dtype_id]); if (flush_stream) { opal_accelerator.wait_stream(actual_stream); } } else { - op->o_func.intrinsic.fns[dtype_id](source, target, + op->o_func.intrinsic.fns[dtype_id]((void*)source, target, &count, &dtype, op->o_func.intrinsic.modules[dtype_id]); } @@ -686,31 +686,31 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, void *source, if (0 != (op->o_flags & OMPI_OP_FLAGS_FORTRAN_FUNC)) { f_dtype = OMPI_INT_2_FINT(dtype->d_f_to_c_index); f_count = OMPI_INT_2_FINT(count); - op->o_func.fort_fn(source, target, &f_count, &f_dtype); + op->o_func.fort_fn((void*)source, target, &f_count, &f_dtype); return; } else if (0 != (op->o_flags & OMPI_OP_FLAGS_JAVA_FUNC)) { - op->o_func.java_data.intercept_fn(source, target, &count, &dtype, + op->o_func.java_data.intercept_fn((void*)source, target, &count, &dtype, op->o_func.java_data.baseType, op->o_func.java_data.jnienv, op->o_func.java_data.object); return; } - op->o_func.c_fn(source, target, &count, &dtype); + op->o_func.c_fn((void*)source, target, &count, &dtype); return; } -static inline void ompi_op_reduce(ompi_op_t * op, void *source, +static inline void ompi_op_reduce(ompi_op_t * op, const void *source, void *target, size_t full_count, ompi_datatype_t * dtype) { 
ompi_op_reduce_stream(op, source, target, full_count, dtype, MCA_ACCELERATOR_NO_DEVICE_ID, NULL); } -static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, void * restrict source2, +static inline void ompi_3buff_op_user (ompi_op_t *op, const void * source1, const void * source2, void * restrict result, int count, struct ompi_datatype_t *dtype) { - ompi_datatype_copy_content_same_ddt (dtype, count, result, source1); - op->o_func.c_fn (source2, result, &count, &dtype); + ompi_datatype_copy_content_same_ddt (dtype, count, result, (void*)source1); + op->o_func.c_fn ((void*)source2, result, &count, &dtype); } /** @@ -736,23 +736,16 @@ static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, v * * Otherwise, this function is the same as ompi_op_reduce. */ -static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, - void *source2, void *target, +static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, const void *source1, + const void *source2, void *target, int count, ompi_datatype_t * dtype, int device, opal_accelerator_stream_t *stream) { - void *restrict src1; - void *restrict src2; - void *restrict tgt; bool use_device_op = false; - src1 = source1; - src2 = source2; - tgt = target; - if (OPAL_UNLIKELY(!ompi_op_is_intrinsic (op))) { /* no 3buff variants for user-defined ops */ - ompi_3buff_op_user (op, src1, src2, tgt, count, dtype); + ompi_3buff_op_user (op, source1, source2, target, count, dtype); return; } @@ -801,23 +794,20 @@ static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, dtype_id = ompi_op_ddt_map[dtype->id]; } if (use_device_op) { - if (NULL == op->o_device_op) { - abort(); // TODO: be more graceful! - } opal_accelerator_stream_t *actual_stream = stream; bool flush_stream = false; if (NULL == stream) { opal_accelerator.get_default_stream(device, &actual_stream); flush_stream = true; } - op->o_device_op->do_3buff_intrinsic.fns[dtype_id](source1, source2, target, + op->o_device_op->do_3buff_intrinsic.fns[dtype_id]((void*)source1, (void*)source2, target, &count, &dtype, device, actual_stream, op->o_device_op->do_3buff_intrinsic.modules[dtype_id]); if (flush_stream) { opal_accelerator.wait_stream(actual_stream); } } else { - op->o_3buff_intrinsic.fns[dtype_id](source1, source2, target, + op->o_3buff_intrinsic.fns[dtype_id]((void*)source1, (void*)source2, target, &count, &dtype, op->o_func.intrinsic.modules[dtype_id]); } @@ -825,27 +815,14 @@ static inline void ompi_3buff_op_reduce_stream(ompi_op_t * op, void *source1, } -static inline void ompi_3buff_op_reduce(ompi_op_t * op, void *source1, - void *source2, void *target, +static inline void ompi_3buff_op_reduce(ompi_op_t * op, const void *source1, + const void *source2, void *target, int count, ompi_datatype_t * dtype) { - void *restrict src1; - void *restrict src2; - void *restrict tgt; - src1 = source1; - src2 = source2; - tgt = target; - if (OPAL_LIKELY(ompi_op_is_intrinsic (op))) { ompi_3buff_op_reduce_stream(op, source1, source2, target, count, dtype, MCA_ACCELERATOR_NO_DEVICE_ID, NULL); -#if 0 - op->o_3buff_intrinsic.fns[ompi_op_ddt_map[dtype->id]](src1, src2, - tgt, &count, - &dtype, - op->o_3buff_intrinsic.modules[ompi_op_ddt_map[dtype->id]]); -#endif // 0 } else { - ompi_3buff_op_user (op, src1, src2, tgt, count, dtype); + ompi_3buff_op_user (op, source1, source2, target, count, dtype); } } From 14a5372bcf4cb87714a34f6a651b7c997fb1d3aa Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 18:21:58 +0000 Subject: 
[PATCH 68/74] Squash warnings about mismatched function pointer types Signed-off-by: Joseph Schuchart --- ompi/mca/op/base/op_base_op_select.c | 9 ++++----- ompi/mca/op/cuda/op_cuda.h | 6 ------ ompi/mca/op/cuda/op_cuda_component.c | 4 ++-- ompi/mca/op/op.h | 10 ++++++++-- ompi/mca/op/rocm/op_rocm_component.c | 4 ++-- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/ompi/mca/op/base/op_base_op_select.c b/ompi/mca/op/base/op_base_op_select.c index 344b4250a6a..534a1d63267 100644 --- a/ompi/mca/op/base/op_base_op_select.c +++ b/ompi/mca/op/base/op_base_op_select.c @@ -158,23 +158,22 @@ int ompi_op_base_op_select(ompi_op_t *op) } for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { /* 2-buffer variants */ - if (NULL != avail->ao_module->opm_fns[i]) { + if (NULL != avail->ao_module->opm_stream_fns[i]) { if (NULL != op->o_device_op->do_intrinsic.modules[i]) { OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]); } - // TODO: function signatures don't match, fix it! - op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_fns[i]; + op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_stream_fns[i]; op->o_device_op->do_intrinsic.modules[i] = avail->ao_module; OBJ_RETAIN(avail->ao_module); } /* 3-buffer variants */ - if (NULL != avail->ao_module->opm_3buff_fns[i]) { + if (NULL != avail->ao_module->opm_3buff_stream_fns[i]) { if (NULL != op->o_device_op->do_3buff_intrinsic.modules[i]) { OBJ_RELEASE(op->o_device_op->do_3buff_intrinsic.modules[i]); } op->o_device_op->do_3buff_intrinsic.fns[i] = - avail->ao_module->opm_3buff_fns[i]; + avail->ao_module->opm_3buff_stream_fns[i]; op->o_device_op->do_3buff_intrinsic.modules[i] = avail->ao_module; OBJ_RETAIN(avail->ao_module); } diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index 2b7bb2831ee..8d1e5b52a06 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -49,12 +49,6 @@ BEGIN_C_DECLS typedef struct { /** The base op component struct */ ompi_op_base_component_1_0_0_t super; - -#if 0 - /* a stream on which to schedule kernel calls */ - CUstream cu_stream; - CUcontext *cu_ctx; -#endif // 0 int cu_max_num_blocks; int cu_max_num_threads; int *cu_max_threads_per_block; diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index 611d936cd42..1f50dc9085b 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -211,8 +211,8 @@ cuda_component_op_query(struct ompi_op_t *op, int *priority) module = OBJ_NEW(ompi_op_base_module_t); module->opm_device_enabled = true; for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_cuda_functions[op->o_f_to_c_index][i]; - module->opm_3buff_fns[i] = ompi_op_cuda_3buff_functions[op->o_f_to_c_index][i]; + module->opm_stream_fns[i] = ompi_op_cuda_functions[op->o_f_to_c_index][i]; + module->opm_3buff_stream_fns[i] = ompi_op_cuda_3buff_functions[op->o_f_to_c_index][i]; if( NULL != module->opm_fns[i] ) { OBJ_RETAIN(module); diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h index 2c0d3ee337a..097c2a109b4 100644 --- a/ompi/mca/op/op.h +++ b/ompi/mca/op/op.h @@ -410,8 +410,14 @@ typedef struct ompi_op_base_module_1_0_0_t { /** Function pointers for all the different datatypes to be used with the MPI_Op that this module is used with */ - ompi_op_base_handler_fn_1_0_0_t opm_fns[OMPI_OP_BASE_TYPE_MAX]; - ompi_op_base_3buff_handler_fn_1_0_0_t opm_3buff_fns[OMPI_OP_BASE_TYPE_MAX]; + union { + ompi_op_base_handler_fn_1_0_0_t opm_fns[OMPI_OP_BASE_TYPE_MAX]; + 
ompi_op_base_stream_handler_fn_1_0_0_t opm_stream_fns[OMPI_OP_BASE_TYPE_MAX]; + }; + union { + ompi_op_base_3buff_handler_fn_1_0_0_t opm_3buff_fns[OMPI_OP_BASE_TYPE_MAX]; + ompi_op_base_3buff_stream_handler_fn_1_0_0_t opm_3buff_stream_fns[OMPI_OP_BASE_TYPE_MAX]; + }; } ompi_op_base_module_1_0_0_t; /** diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c index 911ccb3d818..e363bf94385 100644 --- a/ompi/mca/op/rocm/op_rocm_component.c +++ b/ompi/mca/op/rocm/op_rocm_component.c @@ -204,8 +204,8 @@ rocm_component_op_query(struct ompi_op_t *op, int *priority) module = OBJ_NEW(ompi_op_base_module_t); module->opm_device_enabled = true; for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { - module->opm_fns[i] = ompi_op_rocm_functions[op->o_f_to_c_index][i]; - module->opm_3buff_fns[i] = ompi_op_rocm_3buff_functions[op->o_f_to_c_index][i]; + module->opm_stream_fns[i] = ompi_op_rocm_functions[op->o_f_to_c_index][i]; + module->opm_3buff_stream_fns[i] = ompi_op_rocm_3buff_functions[op->o_f_to_c_index][i]; if( NULL != module->opm_fns[i] ) { OBJ_RETAIN(module); From 1f638097dd0b9be8b5cdf7912449dda1e3cfc77b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 18:25:36 +0000 Subject: [PATCH 69/74] Squash printfs Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/op_cuda_component.c | 32 +--------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c index 1f50dc9085b..5070e8a4c94 100644 --- a/ompi/mca/op/cuda/op_cuda_component.c +++ b/ompi/mca/op/cuda/op_cuda_component.c @@ -71,18 +71,6 @@ ompi_op_cuda_component_t mca_op_cuda_component = { */ static int cuda_component_open(void) { - /* We checked the flags during register, so if they are set to - * zero either the architecture is not suitable or the user disabled - * AVX support. - * - * A first level check to see what level of AVX is available on the - * hardware. - * - * Note that if this function returns non-OMPI_SUCCESS, then this - * component won't even be shown in ompi_info output (which is - * probably not what you want). - */ - printf("op cuda_component_open\n"); return OMPI_SUCCESS; } @@ -92,7 +80,6 @@ static int cuda_component_open(void) static int cuda_component_close(void) { if (mca_op_cuda_component.cu_num_devices > 0) { - //cuStreamDestroy(mca_op_cuda_component.cu_stream); free(mca_op_cuda_component.cu_max_threads_per_block); mca_op_cuda_component.cu_max_threads_per_block = NULL; free(mca_op_cuda_component.cu_max_blocks); @@ -144,7 +131,7 @@ cuda_component_init_query(bool enable_progress_threads, int num_devices; int rc; int prio_lo, prio_hi; - //memset(&mca_op_cuda_component, 0, sizeof(mca_op_cuda_component)); + // TODO: is this init needed here? 
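    /* Note (added for clarity, not in the original patch): cuInit() must be
     * called before any other CUDA driver API function and is idempotent,
     * so calling it again here is harmless even if the accelerator
     * component has already initialized the driver. */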
cuInit(0); CHECK(cuDeviceGetCount, (&num_devices)); mca_op_cuda_component.cu_num_devices = num_devices; @@ -163,9 +150,6 @@ cuda_component_init_query(bool enable_progress_threads, if (-1 < mca_op_cuda_component.cu_max_num_threads) { if (mca_op_cuda_component.cu_max_threads_per_block[i] >= mca_op_cuda_component.cu_max_num_threads) { mca_op_cuda_component.cu_max_threads_per_block[i] = mca_op_cuda_component.cu_max_num_threads; - } else { - printf("WARN: CUDA device %d does not support %d threads per block, falling back to %d\n", - i, mca_op_cuda_component.cu_max_num_threads, mca_op_cuda_component.cu_max_threads_per_block[i]); } } @@ -176,27 +160,13 @@ cuda_component_init_query(bool enable_progress_threads, /* fall-back to value that should work on every device */ mca_op_cuda_component.cu_max_blocks[i] = 512; } - printf("max threads %d max blocks %d\n", mca_op_cuda_component.cu_max_threads_per_block[i], mca_op_cuda_component.cu_max_blocks[i]); if (-1 < mca_op_cuda_component.cu_max_num_blocks) { if (mca_op_cuda_component.cu_max_blocks[i] >= mca_op_cuda_component.cu_max_num_blocks) { mca_op_cuda_component.cu_max_blocks[i] = mca_op_cuda_component.cu_max_num_blocks; - } else { - printf("WARN: CUDA device %d does not support %d blocks, falling back to %d\n", - i, mca_op_cuda_component.cu_max_num_blocks, mca_op_cuda_component.cu_max_blocks[i]); } } } -#if 0 - /* try to create a high-priority stream */ - rc = cuCtxGetStreamPriorityRange(&prio_lo, &prio_hi); - if (CUDA_SUCCESS != rc) { - cuStreamCreateWithPriority(&mca_op_cuda_component.cu_stream, CU_STREAM_NON_BLOCKING, prio_hi); - } else { - mca_op_cuda_component.cu_stream = 0; - } -#endif // 0 - printf("op cuda_component_init_query\n"); return OMPI_SUCCESS; } From 3d9f33aa63c40aac0b36ae21c9635fe979917df5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 19:30:38 +0000 Subject: [PATCH 70/74] Replace fprintf with show_help Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/Makefile.am | 6 +-- ompi/mca/op/cuda/help-ompi-mca-op-cuda.txt | 15 ++++++++ ompi/mca/op/cuda/op_cuda.h | 7 ++-- ompi/mca/op/rocm/Makefile.am | 2 + ompi/mca/op/rocm/help-ompi-mca-op-rocm.txt | 15 ++++++++ ompi/mca/op/rocm/op_rocm.h | 11 +++--- ompi/op/Makefile.am | 2 + ompi/op/help-ompi-op.txt | 15 ++++++++ ompi/op/op.h | 38 +------------------ .../help-mca-allocator-devicebucket.txt | 4 +- 10 files changed, 66 insertions(+), 49 deletions(-) create mode 100644 ompi/mca/op/cuda/help-ompi-mca-op-cuda.txt create mode 100644 ompi/mca/op/rocm/help-ompi-mca-op-rocm.txt create mode 100644 ompi/op/help-ompi-op.txt diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index a5040fa884b..509451ce0db 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -19,6 +19,8 @@ AM_CPPFLAGS = $(common_cuda_CPPFLAGS) +dist_ompidata_DATA = help-ompi-mca-op-cuda.txt + sources = op_cuda_component.c op_cuda.h op_cuda_functions.c op_cuda_impl.h #sources_extended = op_cuda_functions.cu cu_sources = op_cuda_impl.cu @@ -60,14 +62,12 @@ endif # The DSO should install itself in $(ompilibdir) (by default, # $prefix/lib/openmpi). 
-CUDADIR=/nfs/apps/spacks/2023-01-01/opt/spack/linux-centos7-x86_64/gcc-9.5.0/cuda-11.8.0-u2modnncfevx54ibr5dy27sxkirwsf7f - mcacomponentdir = $(ompilibdir) mcacomponent_LTLIBRARIES = $(component_install) mca_op_cuda_la_SOURCES = $(sources) mca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) mca_op_cuda_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(accelerator_cuda_LIBS) -L$(CUDADIR)/lib64 -lcudart + $(accelerator_cuda_LIBS) $(accelerator_cudart_LIBS) EXTRA_mca_op_cuda_la_SOURCES = $(cu_sources) # Specific information for static builds. diff --git a/ompi/mca/op/cuda/help-ompi-mca-op-cuda.txt b/ompi/mca/op/cuda/help-ompi-mca-op-cuda.txt new file mode 100644 index 00000000000..f999ebc939c --- /dev/null +++ b/ompi/mca/op/cuda/help-ompi-mca-op-cuda.txt @@ -0,0 +1,15 @@ +# -*- text -*- +# +# Copyright (c) 2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's CUDA operator component +# +[CUDA call failed] +"CUDA call %s failed: %s: %s\n" diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h index 8d1e5b52a06..ab349d48ee4 100644 --- a/ompi/mca/op/cuda/op_cuda.h +++ b/ompi/mca/op/cuda/op_cuda.h @@ -33,9 +33,10 @@ BEGIN_C_DECLS do { \ cudaError_t err = fn args; \ if (err != cudaSuccess) { \ - fprintf(stderr, "%s:%d: %s failed at line: %s: %s\n", \ - __FILE__, __LINE__, str(fn), cudaGetErrorName(err), \ - cudaGetErrorString(err)); \ + opal_show_help("help-ompi-mca-op-cuda.txt", \ + "CUDA call failed", true, \ + str(fn), cudaGetErrorName(err), \ + cudaGetErrorString(err)); \ ompi_mpi_abort(MPI_COMM_WORLD, 1); \ } \ } while (0) diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index c2e941dee96..b22ea7e6f13 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -19,6 +19,8 @@ AM_CPPFLAGS = $(common_rocm_CPPFLAGS) +dist_ompidata_DATA = help-ompi-mca-op-rocm.txt + sources = op_rocm_component.c op_rocm.h op_rocm_functions.c op_rocm_impl.h rocm_sources = op_rocm_impl.cpp diff --git a/ompi/mca/op/rocm/help-ompi-mca-op-rocm.txt b/ompi/mca/op/rocm/help-ompi-mca-op-rocm.txt new file mode 100644 index 00000000000..848afbb663d --- /dev/null +++ b/ompi/mca/op/rocm/help-ompi-mca-op-rocm.txt @@ -0,0 +1,15 @@ +# -*- text -*- +# +# Copyright (c) 2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's HIP operator component +# +[HIP call failed] +"HIP call %s failed: %s: %s\n" diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h index dba668f828a..0dfeabf689b 100644 --- a/ompi/mca/op/rocm/op_rocm.h +++ b/ompi/mca/op/rocm/op_rocm.h @@ -31,11 +31,12 @@ BEGIN_C_DECLS #define CHECK(fn, args) \ do { \ - hipError_t err = fn args; \ - if (err != hipSuccess) { \ - fprintf(stderr, "%s:%d: %s failed at line: %s: %s\n", \ - __FILE__, __LINE__, str(fn), hipGetErrorName(err), \ - hipGetErrorString(err)); \ + hipError_t err = fn args; \ + if (err != hipSuccess) { \ + opal_show_help("help-ompi-mca-op-rocm.txt", \ + "HIP call failed", true, \ + str(fn), hipGetErrorName(err), \ + hipGetErrorString(err)); \ ompi_mpi_abort(MPI_COMM_WORLD, 1); \ } \ } while (0) diff --git a/ompi/op/Makefile.am b/ompi/op/Makefile.am index 5599c31311b..f0ba89c5618 100644 --- a/ompi/op/Makefile.am +++ b/ompi/op/Makefile.am @@ -22,6 +22,8 @@ # This makefile.am does not stand on its own - it is included from # ompi/Makefile.am +dist_ompidata_DATA += op/help-ompi-op.txt + headers += op/op.h lib@OMPI_LIBMPI_NAME@_la_SOURCES += op/op.c diff --git a/ompi/op/help-ompi-op.txt b/ompi/op/help-ompi-op.txt new file mode 100644 index 00000000000..5cfb60b8f9f --- /dev/null +++ b/ompi/op/help-ompi-op.txt @@ -0,0 +1,15 @@ +# -*- text -*- +# +# Copyright (c) 2004-2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's allocator bucket support +# +[missing implementation] +ERROR: No suitable module for op %s on type %s found for device memory! diff --git a/ompi/op/op.h b/ompi/op/op.h index 0d6cebe96ca..578e144d6a8 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -482,42 +482,6 @@ static inline bool ompi_op_is_valid(ompi_op_t * op, ompi_datatype_t * ddt, return true; } - - -/** - * Check to see if an op supports device execution on the given datatype - * - * @param op The op to check - * @param ddt The datatype to check - * - * @returns true If the op supports devices on that datatype - * @returns false If the op does not support devices on that datatype - * - */ -static inline bool ompi_op_supports_device(const ompi_op_t * op, const ompi_datatype_t * ddt) -{ - /* Check: - - non-intrinsic ddt's cannot be invoked on intrinsic op's - - if intrinsic ddt invoked on intrinsic op: - - ensure the datatype is defined in the op map - - ensure we have a function pointer for that combination - */ - if (ompi_op_is_intrinsic(op)) { - if (ompi_datatype_is_predefined(ddt)) { - /* Intrinsic ddt on intrinsic op */ - if (NULL == op->o_device_op || - -1 == ompi_op_ddt_map[ddt->id] || - NULL == op->o_device_op->do_intrinsic.fns[ompi_op_ddt_map[ddt->id]]) { - return false; - } - } - } - - /* op supports device for the given datatype */ - return true; -} - - /** * Perform a reduction operation. 
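The hunk just below applies the same conversion inside ompi_op_reduce_stream(): the fprintf/abort fallback becomes an opal_show_help() call against the [missing implementation] topic added in help-ompi-op.txt. For the component-level CHECK() macros, the failure path now amounts to roughly the following fragment (a sketch of the macro expansion, not standalone code; the call name "cudaMalloc" is a made-up example for str(fn), and err stands for the captured error code):

    #include "opal/util/show_help.h"

    /* The trailing arguments fill the %s placeholders of the
     * "CUDA call failed" message in help-ompi-mca-op-cuda.txt,
     * and 'true' requests the standard error header. */
    opal_show_help("help-ompi-mca-op-cuda.txt", "CUDA call failed", true,
                   "cudaMalloc" /* example str(fn) */,
                   cudaGetErrorName(err), cudaGetErrorString(err));
    ompi_mpi_abort(MPI_COMM_WORLD, 1);
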
* @@ -642,7 +606,7 @@ static inline void ompi_op_reduce_stream(ompi_op_t * op, const void *source, (source_check_addr == 0 || (source_flags & MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY))) { /* nothing to be done, we won't need device-capable ops */ } else { - fprintf(stderr, "op: no suitable op %s module for type %s found for device memory!\n", op->o_name, dtype->name); + opal_show_help("help-ompi-op.txt", "missing implementation", true, op->o_name, dtype->name); abort(); } } diff --git a/opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt b/opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt index 27edbb92fa4..01c152fd26d 100644 --- a/opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt +++ b/opal/mca/allocator/devicebucket/help-mca-allocator-devicebucket.txt @@ -1,6 +1,8 @@ # -*- text -*- # -# Copyright (c) 2021 IBM Corporation. All rights reserved +# Copyright (c) 2004-2023 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow From c878c4f7bf4fa9ece3e64217e9cb4e4a3a8447aa Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 22:46:24 +0000 Subject: [PATCH 71/74] Squash compiler warnings Signed-off-by: Joseph Schuchart --- ompi/mca/coll/basic/coll_basic_allreduce.c | 4 ++-- ompi/op/op.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ompi/mca/coll/basic/coll_basic_allreduce.c b/ompi/mca/coll/basic/coll_basic_allreduce.c index d9bcd54758a..065d358a4af 100644 --- a/ompi/mca/coll/basic/coll_basic_allreduce.c +++ b/ompi/mca/coll/basic/coll_basic_allreduce.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
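In the allreduce hunk below, the only functional change is the (void**) cast on the temporary-buffer argument: opal_accelerator.mem_alloc() takes a void** output parameter, while tmpbuf is declared with a concrete pointer type, so passing &tmpbuf directly draws an incompatible-pointer-type warning. A minimal sketch of the pattern (the buffer type and size are placeholders, not taken from the file):

    int    rbuf_dev = 0;       /* device id reported by check_addr() */
    char  *tmpbuf   = NULL;    /* assumed declaration; any non-void pointer triggers the warning */
    size_t dsize    = 4096;    /* placeholder size */

    /* &tmpbuf has type char**; the explicit (void**) cast matches the
     * accelerator API and silences the warning this patch squashes. */
    if (OPAL_SUCCESS != opal_accelerator.mem_alloc(rbuf_dev, (void**)&tmpbuf, dsize)) {
        /* handle OMPI_ERR_OUT_OF_RESOURCE */
    }
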
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -109,7 +109,7 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count, } dsize = opal_datatype_span(&dtype->super, count, &gap); if (opal_accelerator.check_addr(rbuf, &rbuf_dev, NULL) > 0 && rbuf_dev >= 0) { - if (OPAL_SUCCESS != opal_accelerator.mem_alloc(rbuf_dev, &tmpbuf, dsize)) { + if (OPAL_SUCCESS != opal_accelerator.mem_alloc(rbuf_dev, (void**)&tmpbuf, dsize)) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; } rbuf_on_device = true; diff --git a/ompi/op/op.h b/ompi/op/op.h index 578e144d6a8..150ae5ebc0e 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -44,6 +44,7 @@ #include "opal/class/opal_object.h" #include "opal/util/printf.h" +#include "opal/util/show_help.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/mpi/fortran/base/fint_2_int.h" From 1c6667d26b19d6f94bff60028cfa5c9170695054 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 22:47:46 +0000 Subject: [PATCH 72/74] Clean up cuda and rocm op codes Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/Makefile.am | 2 +- ompi/mca/op/cuda/op_cuda_functions.c | 120 +++-------- ompi/mca/op/cuda/op_cuda_impl.cu | 294 ++------------------------- ompi/mca/op/cuda/op_cuda_impl.h | 257 +---------------------- ompi/mca/op/rocm/Makefile.am | 10 +- ompi/mca/op/rocm/configure.m4 | 2 +- ompi/mca/op/rocm/op_rocm_functions.c | 177 ++++------------ ompi/mca/op/rocm/op_rocm_impl.cpp | 290 ++------------------------ ompi/mca/op/rocm/op_rocm_impl.h | 11 +- 9 files changed, 117 insertions(+), 1046 deletions(-) diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index 509451ce0db..311b9db3112 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -79,6 +79,6 @@ noinst_LTLIBRARIES = $(component_noinst) libmca_op_cuda_la_SOURCES = $(sources) libmca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) libmca_op_cuda_la_LDFLAGS = -module -avoid-version\ - $(accelerator_cuda_LIBS) -L$(CUDADIR)/lib64 -lcudart + $(accelerator_cuda_LIBS) $(accelerator_cudart_LIBS) EXTRA_libmca_op_cuda_la_SOURCES = $(cu_sources) diff --git a/ompi/mca/op/cuda/op_cuda_functions.c b/ompi/mca/op/cuda/op_cuda_functions.c index 97af100abfd..26e54cb0851 100644 --- a/ompi/mca/op/cuda/op_cuda_functions.c +++ b/ompi/mca/op/cuda/op_cuda_functions.c @@ -69,13 +69,8 @@ static inline void device_op_pre(const void *orig_source1, if (NULL != orig_source2) { source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); - //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", - // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, *source2_device, *device); } - //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", - // target_rc, *target_device, source_rc, *source_device, *device); - if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { /* no buffers are on any device, select device 0 */ *device = 0; @@ -94,7 +89,6 @@ static inline void device_op_pre(const void *orig_source1, if (0 == target_rc) { // allocate memory on the device for the target buffer - //printf("copying target from device %d to host\n", *target_device); opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*target, orig_target, nbytes, *(CUstream*)stream->stream)); *target_device = -1; // mark target device as host @@ -102,16 +96,13 @@ 
static inline void device_op_pre(const void *orig_source1, if (0 == source1_rc || *device != *source1_device) { // allocate memory on the device for the source buffer - //printf("allocating source on device %d\n", *device); opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); if (0 == source1_rc) { /* copy from host to device */ - //printf("copying source from host to device %d\n", *device); CHECK(cuMemcpyHtoDAsync, ((CUdeviceptr)*source1, orig_source1, nbytes, *(CUstream*)stream->stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ - //printf("attempting cross-device copy for source\n"); CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source1, (CUdeviceptr)orig_source1, nbytes, *(CUstream*)stream->stream)); } } @@ -119,7 +110,6 @@ static inline void device_op_pre(const void *orig_source1, } if (NULL != source2_device && *target_device != *source2_device) { // allocate memory on the device for the source buffer - //printf("allocating source on device %d\n", *device); size_t nbytes; ompi_datatype_type_size(dtype, &nbytes); nbytes *= count; @@ -132,7 +122,6 @@ static inline void device_op_pre(const void *orig_source1, } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ - //printf("attempting cross-device copy for source\n"); CHECK(cuMemcpyDtoDAsync, ((CUdeviceptr)*source2, (CUdeviceptr)orig_source2, nbytes, *(CUstream*)stream->stream)); } } @@ -469,11 +458,12 @@ OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC(sum, c_float_complex, float _Complex, +=) -OP_FUNC(sum, c_double_complex, double _Complex, +=) OP_FUNC(sum, c_long_double_complex, long double _Complex, +=) #endif // 0 +FUNC_FUNC(sum, c_float_complex, cuFloatComplex) +FUNC_FUNC(sum, c_double_complex, cuDoubleComplex) + /************************************************************************* * Product *************************************************************************/ @@ -547,11 +537,12 @@ OP_FUNC(prod, c_short_float_complex, short float _Complex, *=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC(prod, c_float_complex, float _Complex, *=) -OP_FUNC(prod, c_double_complex, double _Complex, *=) OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) #endif // 0 +FUNC_FUNC(prod, c_float_complex, cuFloatComplex) +FUNC_FUNC(prod, c_double_complex, cuDoubleComplex) + /************************************************************************* * Logical AND *************************************************************************/ @@ -752,17 +743,6 @@ FUNC_FUNC(bxor, byte, char) * Max location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC(maxloc, 2real, >) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC(maxloc, 2double_precision, >) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC(maxloc, 2integer, >) -#endif -#endif // 0 LOC_FUNC(maxloc, float_int, >) LOC_FUNC(maxloc, double_int, >) LOC_FUNC(maxloc, long_int, >) @@ -773,17 +753,7 @@ LOC_FUNC(maxloc, long_double_int, >) /************************************************************************* * Min location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC(minloc, 2real, <) -#endif -#if 
OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC(minloc, 2double_precision, <) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC(minloc, 2integer, <) -#endif -#endif // 0 + LOC_FUNC(minloc, float_int, <) LOC_FUNC(minloc, double_int, <) LOC_FUNC(minloc, long_int, <) @@ -1091,6 +1061,9 @@ OP_FUNC_3BUF(sum, c_double_complex, double _Complex, +) OP_FUNC_3BUF(sum, c_long_double_complex, long double _Complex, +) #endif // 0 +FUNC_FUNC_3BUF(sum, c_float_complex, cuFloatComplex) +FUNC_FUNC_3BUF(sum, c_double_complex, cuDoubleComplex) + /************************************************************************* * Product *************************************************************************/ @@ -1162,11 +1135,12 @@ OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) -OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) #endif // 0 +FUNC_FUNC_3BUF(prod, c_float_complex, cuFloatComplex) +FUNC_FUNC_3BUF(prod, c_double_complex, cuDoubleComplex) + /************************************************************************* * Logical AND *************************************************************************/ @@ -1362,42 +1336,10 @@ FORT_INT_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) /* Byte */ FORT_INT_FUNC_3BUF(bxor, byte, char) -/************************************************************************* - * Min and max location "pair" datatypes - *************************************************************************/ - -/* -#if OMPI_HAVE_FORTRAN_REAL -LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) -#endif -LOC_STRUCT_3BUF(float_int, float, int) -LOC_STRUCT_3BUF(double_int, double, int) -LOC_STRUCT_3BUF(long_int, long, int) -LOC_STRUCT_3BUF(2int, int, int) -LOC_STRUCT_3BUF(short_int, short, int) -LOC_STRUCT_3BUF(long_double_int, long double, int) -*/ - /************************************************************************* * Max location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(maxloc, 2real, >) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(maxloc, 2double_precision, >) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(maxloc, 2integer, >) -#endif -#endif // 0 + LOC_FUNC_3BUF(maxloc, float_int, >) LOC_FUNC_3BUF(maxloc, double_int, >) LOC_FUNC_3BUF(maxloc, long_int, >) @@ -1408,17 +1350,7 @@ LOC_FUNC_3BUF(maxloc, long_double_int, >) /************************************************************************* * Min location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(minloc, 2real, <) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(minloc, 2double_precision, <) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(minloc, 2integer, <) -#endif -#endif // 0 + LOC_FUNC_3BUF(minloc, float_int, <) LOC_FUNC_3BUF(minloc, double_int, <) LOC_FUNC_3BUF(minloc, long_int, <) @@ -1578,21 +1510,19 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) #else #define 
SHORT_FLOAT_COMPLEX(name, ftype) NULL #endif -#define FLOAT_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_float_complex -#define DOUBLE_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_double_complex #define LONG_DOUBLE_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_long_double_complex #else #define SHORT_FLOAT_COMPLEX(name, ftype) NULL -#define FLOAT_COMPLEX(name, ftype) NULL -#define DOUBLE_COMPLEX(name, ftype) NULL #define LONG_DOUBLE_COMPLEX(name, ftype) NULL #endif // 0 +#define FLOAT_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_float_complex +#define DOUBLE_COMPLEX(name, ftype) ompi_op_cuda_##ftype##_##name##_c_double_complex #define COMPLEX(name, ftype) \ - [OMPI_OP_CUDA_TYPE_C_SHORT_FLOAT_COMPLEX] = SHORT_FLOAT_COMPLEX(name, ftype), \ - [OMPI_OP_CUDA_TYPE_C_FLOAT_COMPLEX] = FLOAT_COMPLEX(name, ftype), \ - [OMPI_OP_CUDA_TYPE_C_DOUBLE_COMPLEX] = DOUBLE_COMPLEX(name, ftype), \ - [OMPI_OP_CUDA_TYPE_C_LONG_DOUBLE_COMPLEX] = LONG_DOUBLE_COMPLEX(name, ftype) + [OMPI_OP_BASE_TYPE_C_SHORT_FLOAT_COMPLEX] = SHORT_FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_BASE_TYPE_C_FLOAT_COMPLEX] = FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_BASE_TYPE_C_DOUBLE_COMPLEX] = DOUBLE_COMPLEX(name, ftype), \ + [OMPI_OP_BASE_TYPE_C_LONG_DOUBLE_COMPLEX] = LONG_DOUBLE_COMPLEX(name, ftype) /** Byte ****************************************************************/ @@ -1666,14 +1596,14 @@ ompi_op_base_stream_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_ C_INTEGER(sum, 2buff), FORTRAN_INTEGER(sum, 2buff), FLOATING_POINT(sum, 2buff), - NULL, + COMPLEX(sum, 2buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { C_INTEGER(prod, 2buff), FORTRAN_INTEGER(prod, 2buff), FLOATING_POINT(prod, 2buff), - NULL, + COMPLEX(prod, 2buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] = { @@ -1752,14 +1682,14 @@ ompi_op_base_3buff_stream_handler_fn_t ompi_op_cuda_3buff_functions[OMPI_OP_BASE C_INTEGER(sum, 3buff), FORTRAN_INTEGER(sum, 3buff), FLOATING_POINT(sum, 3buff), - NULL, + COMPLEX(sum, 3buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { C_INTEGER(prod, 3buff), FORTRAN_INTEGER(prod, 3buff), FLOATING_POINT(prod, 3buff), - NULL, + COMPLEX(prod, 3buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] ={ diff --git a/ompi/mca/op/cuda/op_cuda_impl.cu b/ompi/mca/op/cuda/op_cuda_impl.cu index 65bdecea60e..79c82feaa19 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.cu +++ b/ompi/mca/op/cuda/op_cuda_impl.cu @@ -62,7 +62,8 @@ static inline __device__ T vprod(const T& a, const T& b) { /* TODO: missing support for * - short float (conditional on whether short float is available) - * - complex + * - some Fortran types + * - some complex types */ #define USE_VECTORS 1 @@ -430,7 +431,7 @@ OP_FUNC(sum, long_double, long double, +) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 #undef current_func #define current_func(a, b) __hadd2(a, b) -VFUNC_FUNC(sum, half, half, half2, 2, __hadd2, __hadd) +//VFUNC_FUNC(sum, half, half, half2, 2, __hadd2, __hadd) #endif // __CUDA_ARCH__ /* Complex */ @@ -439,6 +440,7 @@ VFUNC_FUNC(sum, half, half, half2, 2, __hadd2, __hadd) OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) #endif #endif // 0 #undef current_func @@ -447,7 +449,6 @@ FUNC_FUNC(sum, c_float_complex, cuFloatComplex) #undef current_func #define current_func(a, b) 
(cuCadd(a,b)) FUNC_FUNC(sum, c_double_complex, cuDoubleComplex) -//OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) /************************************************************************* * Product @@ -485,7 +486,7 @@ OP_FUNC(prod, long_double, long double, *) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530 #undef current_func #define current_func(a, b) __hmul2(a, b) -VFUNC_FUNC(prod, half, half, half2, 2, __hmul2, __hmul) +//VFUNC_FUNC(prod, half, half, half2, 2, __hmul2, __hmul) #endif // __CUDA_ARCH__ /* Complex */ @@ -522,10 +523,6 @@ FUNC_FUNC(land, uint64_t, uint64_t) FUNC_FUNC(land, long, long) FUNC_FUNC(land, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC(land, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC(land, bool, bool) @@ -721,14 +718,6 @@ LOC_FUNC(minloc, long_double_int, <) * * This macro is for minloc and maxloc */ -/* -#define LOC_STRUCT(type_name, type1, type2) \ - typedef struct { \ - type1 v; \ - type2 k; \ - } ompi_op_predefined_##type_name##_t; -*/ - #define LOC_FUNC_3BUF(name, type_name, op) \ static __global__ void \ ompi_op_cuda_3buff_##name##_##type_name##_kernel(const ompi_op_predefined_##type_name##_t *__restrict__ in1, \ @@ -788,25 +777,6 @@ FUNC_FUNC_3BUF(max, uint64_t, uint64_t) FUNC_FUNC_3BUF(max, long, long) FUNC_FUNC_3BUF(max, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(max, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC_3BUF(max, short_float, short float) @@ -816,24 +786,6 @@ FUNC_FUNC_3BUF(max, short_float, opal_short_float_t) FUNC_FUNC_3BUF(max, float, float) FUNC_FUNC_3BUF(max, double, double) FUNC_FUNC_3BUF(max, long_double, long double) -#if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) -#endif /************************************************************************* @@ -854,25 +806,6 @@ FUNC_FUNC_3BUF(min, uint64_t, uint64_t) FUNC_FUNC_3BUF(min, long, long) FUNC_FUNC_3BUF(min, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(min, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(min, 
fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC_3BUF(min, short_float, short float) @@ -882,24 +815,6 @@ FUNC_FUNC_3BUF(min, short_float, opal_short_float_t) FUNC_FUNC_3BUF(min, float, float) FUNC_FUNC_3BUF(min, double, double) FUNC_FUNC_3BUF(min, long_double, long double) -#if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) -#endif /************************************************************************* * Sum @@ -917,25 +832,6 @@ OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF(sum, long, long, +) OP_FUNC_3BUF(sum, ulong, unsigned long, +) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF(sum, fortran_integer8, ompi_fortran_integer8_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t, +) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) OP_FUNC_3BUF(sum, short_float, short float, +) @@ -945,24 +841,7 @@ OP_FUNC_3BUF(sum, short_float, opal_short_float_t, +) OP_FUNC_3BUF(sum, float, float, +) OP_FUNC_3BUF(sum, double, double, +) OP_FUNC_3BUF(sum, long_double, long double, +) -#if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t, +) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t, +) -#endif + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -970,14 +849,14 @@ OP_FUNC_3BUF(sum, c_short_float_complex, short float _Complex, +) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif +OP_FUNC_3BUF(sum, c_long_double_complex, cuLongDoubleComplex, +) #endif // 0 #undef current_func -#define current_func(a, b) (cuCmulf(a,b)) +#define current_func(a, b) (cuCaddf(a,b)) FUNC_FUNC_3BUF(sum, c_float_complex, cuFloatComplex) #undef current_func -#define current_func(a, b) (cuCmul(a,b)) +#define 
current_func(a, b) (cuCadd(a,b)) FUNC_FUNC_3BUF(sum, c_double_complex, cuDoubleComplex) -//OP_FUNC_3BUF(sum, c_long_double_complex, cuLongDoubleComplex, +) /************************************************************************* * Product @@ -995,25 +874,6 @@ OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF(prod, long, long, *) OP_FUNC_3BUF(prod, ulong, unsigned long, *) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t, *) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) OP_FUNC_3BUF(prod, short_float, short float, *) @@ -1023,24 +883,7 @@ OP_FUNC_3BUF(prod, short_float, opal_short_float_t, *) OP_FUNC_3BUF(prod, float, float, *) OP_FUNC_3BUF(prod, double, double, *) OP_FUNC_3BUF(prod, long_double, long double, *) -#if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t, *) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF(prod, fortran_real2, ompi_fortran_real2_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF(prod, fortran_real8, ompi_fortran_real8_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF(prod, fortran_real16, ompi_fortran_real16_t, *) -#endif + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -1048,10 +891,14 @@ OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) -OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) #endif // 0 +#undef current_func +#define current_func(a, b) (cuCmulf(a,b)) +FUNC_FUNC_3BUF(prod, c_float_complex, cuFloatComplex) +#undef current_func +#define current_func(a, b) (cuCmul(a,b)) +FUNC_FUNC_3BUF(prod, c_double_complex, cuDoubleComplex) /************************************************************************* * Logical AND @@ -1071,10 +918,6 @@ FUNC_FUNC_3BUF(land, uint64_t, uint64_t) FUNC_FUNC_3BUF(land, long, long) FUNC_FUNC_3BUF(land, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF(land, bool, bool) @@ -1096,10 +939,6 @@ FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lor, long, long) FUNC_FUNC_3BUF(lor, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF(lor, bool, bool) @@ -1121,10 +960,6 @@ FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lxor, long, long) FUNC_FUNC_3BUF(lxor, ulong, unsigned long) -/* Logical */ -#if 
OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF(lxor, bool, bool) @@ -1146,25 +981,6 @@ FUNC_FUNC_3BUF(band, uint64_t, uint64_t) FUNC_FUNC_3BUF(band, long, long) FUNC_FUNC_3BUF(band, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF(band, byte, char) @@ -1186,25 +1002,6 @@ FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bor, long, long) FUNC_FUNC_3BUF(bor, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF(bor, byte, char) @@ -1226,63 +1023,13 @@ FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bxor, long, long) FUNC_FUNC_3BUF(bxor, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(bxor, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF(bxor, byte, char) -/************************************************************************* - * Min and max location "pair" datatypes - *************************************************************************/ - -/* -#if OMPI_HAVE_FORTRAN_REAL -LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) -#endif -LOC_STRUCT_3BUF(float_int, float, int) -LOC_STRUCT_3BUF(double_int, double, int) -LOC_STRUCT_3BUF(long_int, long, int) -LOC_STRUCT_3BUF(2int, int, int) -LOC_STRUCT_3BUF(short_int, short, int) -LOC_STRUCT_3BUF(long_double_int, long double, int) -*/ - /************************************************************************* * Max location 
*************************************************************************/ -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(maxloc, 2real, >) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(maxloc, 2double_precision, >) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(maxloc, 2integer, >) -#endif LOC_FUNC_3BUF(maxloc, float_int, >) LOC_FUNC_3BUF(maxloc, double_int, >) LOC_FUNC_3BUF(maxloc, long_int, >) @@ -1294,15 +1041,6 @@ LOC_FUNC_3BUF(maxloc, long_double_int, >) * Min location *************************************************************************/ -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(minloc, 2real, <) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(minloc, 2double_precision, <) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(minloc, 2integer, <) -#endif LOC_FUNC_3BUF(minloc, float_int, <) LOC_FUNC_3BUF(minloc, double_int, <) LOC_FUNC_3BUF(minloc, long_int, <) diff --git a/ompi/mca/op/cuda/op_cuda_impl.h b/ompi/mca/op/cuda/op_cuda_impl.h index 10ecbd3d084..2c02c32c313 100644 --- a/ompi/mca/op/cuda/op_cuda_impl.h +++ b/ompi/mca/op/cuda/op_cuda_impl.h @@ -163,13 +163,11 @@ OP_FUNC_SIG(sum, long_double, long double, +=) OP_FUNC_SIG(sum, c_short_float_complex, short float _Complex, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) +OP_FUNC_SIG(sum, c_long_double_complex, long double _Complex, +=) #endif #endif // 0 FUNC_FUNC_SIG(sum, c_float_complex, cuFloatComplex) FUNC_FUNC_SIG(sum, c_double_complex, cuDoubleComplex) -//OP_FUNC_SIG(sum, c_float_complex, float _Complex, +=) -//OP_FUNC_SIG(sum, c_double_complex, double _Complex, +=) -//OP_FUNC_SIG(sum, c_long_double_complex, long double _Complex, +=) /************************************************************************* * Product @@ -208,11 +206,12 @@ OP_FUNC_SIG(prod, c_short_float_complex, short float _Complex, *=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_SIG(prod, c_float_complex, float _Complex, *=) -OP_FUNC_SIG(prod, c_double_complex, double _Complex, *=) OP_FUNC_SIG(prod, c_long_double_complex, long double _Complex, *=) #endif // 0 +FUNC_FUNC_SIG(prod, c_float_complex, cuFloatComplex) +FUNC_FUNC_SIG(prod, c_double_complex, cuDoubleComplex) + /************************************************************************* * Logical AND *************************************************************************/ @@ -422,25 +421,6 @@ FUNC_FUNC_3BUF_SIG(max, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(max, long, long) FUNC_FUNC_3BUF_SIG(max, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF_SIG(max, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF_SIG(max, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF_SIG(max, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF_SIG(max, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF_SIG(max, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF_SIG(max, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC_3BUF_SIG(max, short_float, short float) @@ -450,25 +430,6 @@ FUNC_FUNC_3BUF_SIG(max, short_float, opal_short_float_t) FUNC_FUNC_3BUF_SIG(max, float, float) FUNC_FUNC_3BUF_SIG(max, 
double, double) FUNC_FUNC_3BUF_SIG(max, long_double, long double) -#if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF_SIG(max, fortran_real, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF_SIG(max, fortran_double_precision, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF_SIG(max, fortran_real2, ompi_fortran_real2_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF_SIG(max, fortran_real4, ompi_fortran_real4_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF_SIG(max, fortran_real8, ompi_fortran_real8_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF_SIG(max, fortran_real16, ompi_fortran_real16_t) -#endif - /************************************************************************* * Min @@ -486,25 +447,6 @@ FUNC_FUNC_3BUF_SIG(min, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(min, long, long) FUNC_FUNC_3BUF_SIG(min, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF_SIG(min, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF_SIG(min, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF_SIG(min, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF_SIG(min, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF_SIG(min, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF_SIG(min, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC_3BUF_SIG(min, short_float, short float) @@ -514,24 +456,6 @@ FUNC_FUNC_3BUF_SIG(min, short_float, opal_short_float_t) FUNC_FUNC_3BUF_SIG(min, float, float) FUNC_FUNC_3BUF_SIG(min, double, double) FUNC_FUNC_3BUF_SIG(min, long_double, long double) -#if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF_SIG(min, fortran_real, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF_SIG(min, fortran_double_precision, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF_SIG(min, fortran_real2, ompi_fortran_real2_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF_SIG(min, fortran_real4, ompi_fortran_real4_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF_SIG(min, fortran_real8, ompi_fortran_real8_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF_SIG(min, fortran_real16, ompi_fortran_real16_t) -#endif /************************************************************************* * Sum @@ -549,25 +473,6 @@ OP_FUNC_3BUF_SIG(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF_SIG(sum, long, long, +) OP_FUNC_3BUF_SIG(sum, ulong, unsigned long, +) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF_SIG(sum, fortran_integer, ompi_fortran_integer_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF_SIG(sum, fortran_integer1, ompi_fortran_integer1_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF_SIG(sum, fortran_integer2, ompi_fortran_integer2_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF_SIG(sum, fortran_integer4, ompi_fortran_integer4_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF_SIG(sum, fortran_integer8, ompi_fortran_integer8_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF_SIG(sum, fortran_integer16, ompi_fortran_integer16_t, +) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) 
OP_FUNC_3BUF_SIG(sum, short_float, short float, +) @@ -577,24 +482,7 @@ OP_FUNC_3BUF_SIG(sum, short_float, opal_short_float_t, +) OP_FUNC_3BUF_SIG(sum, float, float, +) OP_FUNC_3BUF_SIG(sum, double, double, +) OP_FUNC_3BUF_SIG(sum, long_double, long double, +) -#if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF_SIG(sum, fortran_real, ompi_fortran_real_t, +) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF_SIG(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF_SIG(sum, fortran_real2, ompi_fortran_real2_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF_SIG(sum, fortran_real4, ompi_fortran_real4_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF_SIG(sum, fortran_real8, ompi_fortran_real8_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF_SIG(sum, fortran_real16, ompi_fortran_real16_t, +) -#endif + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -602,10 +490,10 @@ OP_FUNC_3BUF_SIG(sum, c_short_float_complex, short float _Complex, +) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_3BUF_SIG(sum, c_float_complex, float _Complex, +) -OP_FUNC_3BUF_SIG(sum, c_double_complex, double _Complex, +) OP_FUNC_3BUF_SIG(sum, c_long_double_complex, long double _Complex, +) #endif // 0 +FUNC_FUNC_3BUF_SIG(sum, c_float_complex, cuFloatComplex) +FUNC_FUNC_3BUF_SIG(sum, c_double_complex, cuDoubleComplex) /************************************************************************* * Product @@ -623,25 +511,6 @@ OP_FUNC_3BUF_SIG(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF_SIG(prod, long, long, *) OP_FUNC_3BUF_SIG(prod, ulong, unsigned long, *) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF_SIG(prod, fortran_integer, ompi_fortran_integer_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF_SIG(prod, fortran_integer1, ompi_fortran_integer1_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF_SIG(prod, fortran_integer2, ompi_fortran_integer2_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF_SIG(prod, fortran_integer4, ompi_fortran_integer4_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF_SIG(prod, fortran_integer8, ompi_fortran_integer8_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF_SIG(prod, fortran_integer16, ompi_fortran_integer16_t, *) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) OP_FUNC_3BUF_SIG(prod, short_float, short float, *) @@ -651,24 +520,7 @@ OP_FUNC_3BUF_SIG(prod, short_float, opal_short_float_t, *) OP_FUNC_3BUF_SIG(prod, float, float, *) OP_FUNC_3BUF_SIG(prod, double, double, *) OP_FUNC_3BUF_SIG(prod, long_double, long double, *) -#if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF_SIG(prod, fortran_real, ompi_fortran_real_t, *) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF_SIG(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF_SIG(prod, fortran_real2, ompi_fortran_real2_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF_SIG(prod, fortran_real4, ompi_fortran_real4_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF_SIG(prod, fortran_real8, ompi_fortran_real8_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF_SIG(prod, fortran_real16, ompi_fortran_real16_t, *) -#endif + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -680,6 +532,8 @@ OP_FUNC_3BUF_SIG(prod, c_float_complex, float _Complex, *) 
OP_FUNC_3BUF_SIG(prod, c_double_complex, double _Complex, *) OP_FUNC_3BUF_SIG(prod, c_long_double_complex, long double _Complex, *) #endif // 0 +FUNC_FUNC_3BUF_SIG(prod, c_float_complex, cuFloatComplex) +FUNC_FUNC_3BUF_SIG(prod, c_double_complex, cuDoubleComplex) /************************************************************************* * Logical AND @@ -697,10 +551,6 @@ FUNC_FUNC_3BUF_SIG(land, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(land, long, long) FUNC_FUNC_3BUF_SIG(land, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF_SIG(land, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF_SIG(land, bool, bool) @@ -720,10 +570,6 @@ FUNC_FUNC_3BUF_SIG(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(lor, long, long) FUNC_FUNC_3BUF_SIG(lor, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF_SIG(lor, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF_SIG(lor, bool, bool) @@ -743,10 +589,6 @@ FUNC_FUNC_3BUF_SIG(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(lxor, long, long) FUNC_FUNC_3BUF_SIG(lxor, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF_SIG(lxor, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF_SIG(lxor, bool, bool) @@ -766,25 +608,6 @@ FUNC_FUNC_3BUF_SIG(band, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(band, long, long) FUNC_FUNC_3BUF_SIG(band, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF_SIG(band, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF_SIG(band, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF_SIG(band, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF_SIG(band, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF_SIG(band, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF_SIG(band, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF_SIG(band, byte, char) @@ -804,25 +627,6 @@ FUNC_FUNC_3BUF_SIG(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(bor, long, long) FUNC_FUNC_3BUF_SIG(bor, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF_SIG(bor, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF_SIG(bor, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF_SIG(bor, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF_SIG(bor, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF_SIG(bor, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF_SIG(bor, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF_SIG(bor, byte, char) @@ -842,25 +646,6 @@ FUNC_FUNC_3BUF_SIG(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF_SIG(bxor, long, long) FUNC_FUNC_3BUF_SIG(bxor, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF_SIG(bxor, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF_SIG(bxor, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF_SIG(bxor, fortran_integer2, ompi_fortran_integer2_t) -#endif 
-#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF_SIG(bxor, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF_SIG(bxor, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF_SIG(bxor, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF_SIG(bxor, byte, char) @@ -868,17 +653,6 @@ FUNC_FUNC_3BUF_SIG(bxor, byte, char) * Max location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF_SIG(maxloc, 2real, >) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF_SIG(maxloc, 2double_precision, >) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF_SIG(maxloc, 2integer, >) -#endif -#endif // 0 LOC_FUNC_3BUF_SIG(maxloc, float_int, >) LOC_FUNC_3BUF_SIG(maxloc, double_int, >) LOC_FUNC_3BUF_SIG(maxloc, long_int, >) @@ -890,17 +664,6 @@ LOC_FUNC_3BUF_SIG(maxloc, long_double_int, >) * Min location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF_SIG(minloc, 2real, <) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF_SIG(minloc, 2double_precision, <) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF_SIG(minloc, 2integer, <) -#endif -#endif // 0 LOC_FUNC_3BUF_SIG(minloc, float_int, <) LOC_FUNC_3BUF_SIG(minloc, double_int, <) LOC_FUNC_3BUF_SIG(minloc, long_int, <) diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am index b22ea7e6f13..1b79e890f72 100644 --- a/ompi/mca/op/rocm/Makefile.am +++ b/ompi/mca/op/rocm/Makefile.am @@ -17,7 +17,7 @@ # First, list all .h and .c sources. It is necessary to list all .h # files so that they will be picked up in the distribution tarball. -AM_CPPFLAGS = $(common_rocm_CPPFLAGS) +AM_CPPFLAGS = $(op_rocm_CPPFLAGS) dist_ompidata_DATA = help-ompi-mca-op-rocm.txt @@ -25,12 +25,10 @@ sources = op_rocm_component.c op_rocm.h op_rocm_functions.c op_rocm_impl.h rocm_sources = op_rocm_impl.cpp HIPCC = hipcc -#HIPCCFLAGS= -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.5.0/include - .cpp.l$(OBJEXT): $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ - $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -O2 -std=c++17 -fvectorize -prefer-non-pic $(HIPCCFLAGS) -Wc,-fPIC,-g -c $< + $(LIBTOOLFLAGS) --mode=compile $(HIPCC) -O2 -std=c++17 -fvectorize -prefer-non-pic -Wc,-fPIC,-g -c $< # -o $($@.o:.lo) @@ -67,7 +65,7 @@ mcacomponent_LTLIBRARIES = $(component_install) mca_op_rocm_la_SOURCES = $(sources) mca_op_rocm_la_LIBADD = $(rocm_sources:.cpp=.lo) mca_op_rocm_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(accelerator_rocm_LIBS) $(HIPCCFLAGS) + $(op_rocm_LIBS) EXTRA_mca_op_rocm_la_SOURCES = $(rocm_sources) # Specific information for static builds. 
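Beyond the build-system cleanup, the functional core of this patch sits in op_cuda_functions.c and op_rocm_functions.c further down: the complex sum and prod handlers move from the disabled OP_FUNC variants to FUNC_FUNC instantiations whose current_func is the vendor complex helper (cuCaddf/cuCmulf on CUDA, hipCaddf/hipCmulf on ROCm, with the non-f variants for double precision). A sketch of what the generated 2-buffer kernel amounts to for MPI_SUM on MPI_C_FLOAT_COMPLEX (illustrative only; the real kernels come out of the FUNC_FUNC macro and its launch wrapper, and the grid-stride loop here is an assumption):

    #include <cuComplex.h>

    /* Element-wise in-place reduction: inout[i] = inout[i] + in[i],
     * using cuCaddf as the current_func of the FUNC_FUNC expansion.
     * Block and grid sizes come from the per-device limits cached
     * by the component. */
    __global__ void sum_c_float_complex_kernel(const cuFloatComplex *__restrict__ in,
                                               cuFloatComplex *__restrict__ inout,
                                               int n)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
             i < n;
             i += blockDim.x * gridDim.x) {
            inout[i] = cuCaddf(inout[i], in[i]);
        }
    }
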
@@ -79,6 +77,6 @@ noinst_LTLIBRARIES = $(component_noinst) libmca_op_rocm_la_SOURCES = $(sources) libmca_op_rocm_la_LIBADD = $(rocm_sources:.cpp=.lo) libmca_op_rocm_la_LDFLAGS = -module -avoid-version\ - $(accelerator_rocm_LIBS) ${HIPCCFLAGS} + $(op_rocm_LIBS) EXTRA_libmca_op_rocm_la_SOURCES = $(rocm_sources) diff --git a/ompi/mca/op/rocm/configure.m4 b/ompi/mca/op/rocm/configure.m4 index 79de7769fa9..ffd88698be0 100644 --- a/ompi/mca/op/rocm/configure.m4 +++ b/ompi/mca/op/rocm/configure.m4 @@ -23,7 +23,7 @@ AC_DEFUN([MCA_ompi_op_rocm_CONFIG],[ AC_CONFIG_FILES([ompi/mca/op/rocm/Makefile]) - OPAL_CHECK_CUDA([op_rocm]) + OPAL_CHECK_ROCM([op_rocm]) AS_IF([test "x$ROCM_SUPPORT" = "x1"], [$1], diff --git a/ompi/mca/op/rocm/op_rocm_functions.c b/ompi/mca/op/rocm/op_rocm_functions.c index fbb68dfdab4..43420dc18a7 100644 --- a/ompi/mca/op/rocm/op_rocm_functions.c +++ b/ompi/mca/op/rocm/op_rocm_functions.c @@ -67,16 +67,9 @@ static inline void device_op_pre(const void *orig_source1, source1_rc = opal_accelerator.check_addr(*source1, source1_device, &source1_flags); *device = *target_device; - // TODO - //printf("OUT - target device & rc %d %d source %d %d\n", *target_device, target_rc, *source1_device, source1_rc); if (NULL != orig_source2) { source2_rc = opal_accelerator.check_addr(*source2, source2_device, &source2_flags); - //printf("device_op_pre: target %p rc %d dev %d, source1 %p rc %d dev %d, source2 %p rc %d dev %d, device %d\n", - // orig_target, target_rc, *target_device, orig_source1, source1_rc, *source1_device, orig_source2, source2_rc, *source2_device, *device); - } - - //printf("device_op_pre: target rc %d dev %d, source rc %d dev %d, device %d\n", - // target_rc, *target_device, source_rc, *source_device, *device); + } if (0 == target_rc && 0 == source1_rc && 0 == source2_rc) { /* no buffers are on any device, select device 0 */ @@ -96,7 +89,6 @@ static inline void device_op_pre(const void *orig_source1, if (0 == target_rc) { // allocate memory on the device for the target buffer - //printf("copying target from device %d to host\n", *target_device); opal_accelerator.mem_alloc_stream(*device, target, nbytes, stream); CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*target, orig_target, nbytes, *(hipStream_t*)stream->stream)); *target_device = -1; // mark target device as host @@ -104,16 +96,13 @@ static inline void device_op_pre(const void *orig_source1, if (0 == source1_rc || *device != *source1_device) { // allocate memory on the device for the source buffer - //printf("allocating source on device %d\n", *device); opal_accelerator.mem_alloc_stream(*device, source1, nbytes, stream); if (0 == source1_rc) { /* copy from host to device */ - //printf("copying source from host to device %d\n", *device); - CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source1, orig_source1, nbytes, *(hipStream_t*)stream->stream)); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source1, (void*)orig_source1, nbytes, *(hipStream_t*)stream->stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? 
*/ - //printf("attempting cross-device copy for source\n"); CHECK(hipMemcpyDtoDAsync, ((hipDeviceptr_t)*source1, (hipDeviceptr_t)orig_source1, nbytes, *(hipStream_t*)stream->stream)); } } @@ -130,7 +119,7 @@ static inline void device_op_pre(const void *orig_source1, if (0 == source2_rc) { /* copy from host to device */ //printf("copying source from host to device %d\n", *device); - CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source2, orig_source2, nbytes, *(hipStream_t*)stream->stream)); + CHECK(hipMemcpyHtoDAsync, ((hipDeviceptr_t)*source2, (void*)orig_source2, nbytes, *(hipStream_t*)stream->stream)); } else { /* copy from one device to another device */ /* TODO: does this actually work? Can we enable P2P? */ @@ -297,14 +286,6 @@ FORT_INT_FUNC(max, fortran_integer8, ompi_fortran_integer8_t) FORT_INT_FUNC(max, fortran_integer16, ompi_fortran_integer16_t) #endif -#if 0 -/* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -FUNC_FUNC(max, short_float, short float) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -FUNC_FUNC(max, short_float, opal_short_float_t) -#endif -#endif // 0 FUNC_FUNC(max, float, float) FUNC_FUNC(max, double, double) FUNC_FUNC(max, long_double, long double) @@ -366,15 +347,6 @@ FORT_INT_FUNC(min, fortran_integer8, ompi_fortran_integer8_t) FORT_INT_FUNC(min, fortran_integer16, ompi_fortran_integer16_t) #endif -#if 0 -/* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -FUNC_FUNC(min, short_float, short float) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -FUNC_FUNC(min, short_float, opal_short_float_t) -#endif -#endif // 0 - FUNC_FUNC(min, float, float) FUNC_FUNC(min, double, double) FUNC_FUNC(min, long_double, long double) @@ -433,15 +405,6 @@ FORT_INT_FUNC(sum, fortran_integer8, ompi_fortran_integer8_t) FORT_INT_FUNC(sum, fortran_integer16, ompi_fortran_integer16_t) #endif -#if 0 -/* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -OP_FUNC(sum, short_float, short float, +=) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_FUNC(sum, short_float, opal_short_float_t, +=) -#endif -#endif // 0 - OP_FUNC(sum, float, float, +=) OP_FUNC(sum, double, double, +=) OP_FUNC(sum, long_double, long double, +=) @@ -470,10 +433,14 @@ OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC(sum, c_float_complex, float _Complex, +=) -OP_FUNC(sum, c_double_complex, double _Complex, +=) OP_FUNC(sum, c_long_double_complex, long double _Complex, +=) #endif // 0 +#undef current_func +#define current_func(a, b) (hipCaddf(a,b)) +FUNC_FUNC(sum, c_float_complex, hipFloatComplex) +#undef current_func +#define current_func(a, b) (hipCadd(a,b)) +FUNC_FUNC(sum, c_double_complex, hipDoubleComplex) /************************************************************************* * Product @@ -512,14 +479,6 @@ FORT_INT_FUNC(prod, fortran_integer16, ompi_fortran_integer16_t) #endif /* Floating point */ -#if 0 -#if defined(HAVE_SHORT_FLOAT) -OP_FUNC(prod, short_float, short float, *=) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_FUNC(prod, short_float, opal_short_float_t, *=) -#endif -#endif // 0 - OP_FUNC(prod, float, float, *=) OP_FUNC(prod, double, double, *=) OP_FUNC(prod, long_double, long double, *=) @@ -541,6 +500,7 @@ FORT_FLOAT_FUNC(prod, fortran_real8, ompi_fortran_real8_t) #if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C FORT_FLOAT_FUNC(prod, fortran_real16, ompi_fortran_real16_t) #endif + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -548,10 +508,14 @@ OP_FUNC(prod, 
c_short_float_complex, short float _Complex, *=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC(prod, c_float_complex, float _Complex, *=) -OP_FUNC(prod, c_double_complex, double _Complex, *=) OP_FUNC(prod, c_long_double_complex, long double _Complex, *=) #endif // 0 +#undef current_func +#define current_func(a, b) (hipCmulf(a,b)) +FUNC_FUNC(prod, c_float_complex, hipFloatComplex) +#undef current_func +#define current_func(a, b) (hipCmul(a,b)) +FUNC_FUNC(prod, c_double_complex, hipDoubleComplex) /************************************************************************* * Logical AND @@ -753,17 +717,6 @@ FUNC_FUNC(bxor, byte, char) * Max location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC(maxloc, 2real, >) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC(maxloc, 2double_precision, >) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC(maxloc, 2integer, >) -#endif -#endif // 0 LOC_FUNC(maxloc, float_int, >) LOC_FUNC(maxloc, double_int, >) LOC_FUNC(maxloc, long_int, >) @@ -774,17 +727,7 @@ LOC_FUNC(maxloc, long_double_int, >) /************************************************************************* * Min location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC(minloc, 2real, <) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC(minloc, 2double_precision, <) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC(minloc, 2integer, <) -#endif -#endif // 0 + LOC_FUNC(minloc, float_int, <) LOC_FUNC(minloc, double_int, <) LOC_FUNC(minloc, long_int, <) @@ -1087,10 +1030,14 @@ OP_FUNC_3BUF(sum, c_short_float_complex, short float _Complex, +) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_3BUF(sum, c_float_complex, float _Complex, +) -OP_FUNC_3BUF(sum, c_double_complex, double _Complex, +) OP_FUNC_3BUF(sum, c_long_double_complex, long double _Complex, +) #endif // 0 +#undef current_func +#define current_func(a, b) (hipCaddf(a,b)) +FUNC_FUNC_3BUF(sum, c_float_complex, hipFloatComplex) +#undef current_func +#define current_func(a, b) (hipCadd(a,b)) +FUNC_FUNC_3BUF(sum, c_double_complex, hipDoubleComplex) /************************************************************************* * Product @@ -1163,10 +1110,14 @@ OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) -OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) #endif // 0 +#undef current_func +#define current_func(a, b) (hipCmulf(a,b)) +FUNC_FUNC_3BUF(prod, c_float_complex, hipFloatComplex) +#undef current_func +#define current_func(a, b) (hipCmul(a,b)) +FUNC_FUNC_3BUF(prod, c_double_complex, hipDoubleComplex) /************************************************************************* * Logical AND @@ -1363,42 +1314,10 @@ FORT_INT_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) /* Byte */ FORT_INT_FUNC_3BUF(bxor, byte, char) -/************************************************************************* - * Min and max location "pair" datatypes - *************************************************************************/ - -/* -#if OMPI_HAVE_FORTRAN_REAL 
-LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) -#endif -LOC_STRUCT_3BUF(float_int, float, int) -LOC_STRUCT_3BUF(double_int, double, int) -LOC_STRUCT_3BUF(long_int, long, int) -LOC_STRUCT_3BUF(2int, int, int) -LOC_STRUCT_3BUF(short_int, short, int) -LOC_STRUCT_3BUF(long_double_int, long double, int) -*/ - /************************************************************************* * Max location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(maxloc, 2real, >) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(maxloc, 2double_precision, >) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(maxloc, 2integer, >) -#endif -#endif // 0 + LOC_FUNC_3BUF(maxloc, float_int, >) LOC_FUNC_3BUF(maxloc, double_int, >) LOC_FUNC_3BUF(maxloc, long_int, >) @@ -1409,17 +1328,7 @@ LOC_FUNC_3BUF(maxloc, long_double_int, >) /************************************************************************* * Min location *************************************************************************/ -#if 0 -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(minloc, 2real, <) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(minloc, 2double_precision, <) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(minloc, 2integer, <) -#endif -#endif // 0 + LOC_FUNC_3BUF(minloc, float_int, <) LOC_FUNC_3BUF(minloc, double_int, <) LOC_FUNC_3BUF(minloc, long_int, <) @@ -1562,7 +1471,7 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) #if OMPI_HAVE_FORTRAN_LOGICAL #define FORTRAN_LOGICAL(name, ftype) \ - ompi_op_rocm_##ftype##_##name##_fortran_logical /* OMPI_OP_CUDA_TYPE_LOGICAL */ + ompi_op_rocm_##ftype##_##name##_fortran_logical /* OMPI_OP_ROCM_TYPE_LOGICAL */ #else #define FORTRAN_LOGICAL(name, ftype) NULL #endif @@ -1579,21 +1488,19 @@ LOC_FUNC_3BUF(minloc, long_double_int, <) #else #define SHORT_FLOAT_COMPLEX(name, ftype) NULL #endif -#define FLOAT_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_float_complex -#define DOUBLE_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_double_complex #define LONG_DOUBLE_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_long_double_complex #else #define SHORT_FLOAT_COMPLEX(name, ftype) NULL -#define FLOAT_COMPLEX(name, ftype) NULL -#define DOUBLE_COMPLEX(name, ftype) NULL #define LONG_DOUBLE_COMPLEX(name, ftype) NULL #endif // 0 +#define FLOAT_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_float_complex +#define DOUBLE_COMPLEX(name, ftype) ompi_op_rocm_##ftype##_##name##_c_double_complex #define COMPLEX(name, ftype) \ - [OMPI_OP_CUDA_TYPE_C_SHORT_FLOAT_COMPLEX] = SHORT_FLOAT_COMPLEX(name, ftype), \ - [OMPI_OP_CUDA_TYPE_C_FLOAT_COMPLEX] = FLOAT_COMPLEX(name, ftype), \ - [OMPI_OP_CUDA_TYPE_C_DOUBLE_COMPLEX] = DOUBLE_COMPLEX(name, ftype), \ - [OMPI_OP_CUDA_TYPE_C_LONG_DOUBLE_COMPLEX] = LONG_DOUBLE_COMPLEX(name, ftype) + [OMPI_OP_BASE_TYPE_C_SHORT_FLOAT_COMPLEX] = SHORT_FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_BASE_TYPE_C_FLOAT_COMPLEX] = FLOAT_COMPLEX(name, ftype), \ + [OMPI_OP_BASE_TYPE_C_DOUBLE_COMPLEX] = DOUBLE_COMPLEX(name, ftype), \ + [OMPI_OP_BASE_TYPE_C_LONG_DOUBLE_COMPLEX] = LONG_DOUBLE_COMPLEX(name, ftype) /** Byte ****************************************************************/ @@ -1667,14 
+1574,14 @@ ompi_op_base_stream_handler_fn_t ompi_op_rocm_functions[OMPI_OP_BASE_FORTRAN_OP_ C_INTEGER(sum, 2buff), FORTRAN_INTEGER(sum, 2buff), FLOATING_POINT(sum, 2buff), - NULL, + COMPLEX(sum, 2buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { C_INTEGER(prod, 2buff), FORTRAN_INTEGER(prod, 2buff), FLOATING_POINT(prod, 2buff), - NULL, + COMPLEX(prod, 2buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] = { @@ -1753,14 +1660,14 @@ ompi_op_base_3buff_stream_handler_fn_t ompi_op_rocm_3buff_functions[OMPI_OP_BASE C_INTEGER(sum, 3buff), FORTRAN_INTEGER(sum, 3buff), FLOATING_POINT(sum, 3buff), - NULL, + COMPLEX(sum, 3buff), }, /* Corresponds to MPI_PROD */ [OMPI_OP_BASE_FORTRAN_PROD] = { C_INTEGER(prod, 3buff), FORTRAN_INTEGER(prod, 3buff), FLOATING_POINT(prod, 3buff), - NULL, + COMPLEX(prod, 3buff), }, /* Corresponds to MPI_LAND */ [OMPI_OP_BASE_FORTRAN_LAND] ={ diff --git a/ompi/mca/op/rocm/op_rocm_impl.cpp b/ompi/mca/op/rocm/op_rocm_impl.cpp index f5068f9a2a0..28142ce0e0b 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.cpp +++ b/ompi/mca/op/rocm/op_rocm_impl.cpp @@ -452,6 +452,7 @@ OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) #endif +OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) #endif // 0 #undef current_func #define current_func(a, b) (hipCaddf(a,b)) @@ -459,7 +460,6 @@ FUNC_FUNC(sum, c_float_complex, hipFloatComplex) #undef current_func #define current_func(a, b) (hipCadd(a,b)) FUNC_FUNC(sum, c_double_complex, hipDoubleComplex) -//OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) /************************************************************************* * Product @@ -500,6 +500,7 @@ OP_FUNC(sum, c_short_float_complex, short float _Complex, +=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC(c_short_float_complex, opal_short_float_t) #endif +OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) #endif // 0 #undef current_func #define current_func(a, b) (hipCmulf(a,b)) @@ -507,7 +508,6 @@ FUNC_FUNC(prod, c_float_complex, hipFloatComplex) #undef current_func #define current_func(a, b) (hipCmul(a,b)) FUNC_FUNC(prod, c_double_complex, hipDoubleComplex) -//OP_FUNC(sum, c_long_double_complex, cuLongDoubleComplex, +=) /************************************************************************* * Logical AND @@ -527,10 +527,6 @@ FUNC_FUNC(land, uint64_t, uint64_t) FUNC_FUNC(land, long, long) FUNC_FUNC(land, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC(land, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC(land, bool, bool) @@ -796,25 +792,6 @@ FUNC_FUNC_3BUF(max, uint64_t, uint64_t) FUNC_FUNC_3BUF(max, long, long) FUNC_FUNC_3BUF(max, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(max, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(max, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(max, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(max, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(max, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(max, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) 
FUNC_FUNC_3BUF(max, short_float, short float) @@ -824,25 +801,6 @@ FUNC_FUNC_3BUF(max, short_float, opal_short_float_t) FUNC_FUNC_3BUF(max, float, float) FUNC_FUNC_3BUF(max, double, double) FUNC_FUNC_3BUF(max, long_double, long double) -#if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF(max, fortran_real, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF(max, fortran_double_precision, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF(max, fortran_real2, ompi_fortran_real2_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF(max, fortran_real4, ompi_fortran_real4_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF(max, fortran_real8, ompi_fortran_real8_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF(max, fortran_real16, ompi_fortran_real16_t) -#endif - /************************************************************************* * Min @@ -862,25 +820,6 @@ FUNC_FUNC_3BUF(min, uint64_t, uint64_t) FUNC_FUNC_3BUF(min, long, long) FUNC_FUNC_3BUF(min, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(min, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(min, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(min, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(min, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(min, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(min, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Floating point */ #if defined(HAVE_SHORT_FLOAT) FUNC_FUNC_3BUF(min, short_float, short float) @@ -890,24 +829,6 @@ FUNC_FUNC_3BUF(min, short_float, opal_short_float_t) FUNC_FUNC_3BUF(min, float, float) FUNC_FUNC_3BUF(min, double, double) FUNC_FUNC_3BUF(min, long_double, long double) -#if OMPI_HAVE_FORTRAN_REAL -FUNC_FUNC_3BUF(min, fortran_real, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -FUNC_FUNC_3BUF(min, fortran_double_precision, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -FUNC_FUNC_3BUF(min, fortran_real2, ompi_fortran_real2_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -FUNC_FUNC_3BUF(min, fortran_real4, ompi_fortran_real4_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -FUNC_FUNC_3BUF(min, fortran_real8, ompi_fortran_real8_t) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -FUNC_FUNC_3BUF(min, fortran_real16, ompi_fortran_real16_t) -#endif /************************************************************************* * Sum @@ -925,25 +846,6 @@ OP_FUNC_3BUF(sum, uint64_t, uint64_t, +) OP_FUNC_3BUF(sum, long, long, +) OP_FUNC_3BUF(sum, ulong, unsigned long, +) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF(sum, fortran_integer, ompi_fortran_integer_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF(sum, fortran_integer1, ompi_fortran_integer1_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF(sum, fortran_integer2, ompi_fortran_integer2_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF(sum, fortran_integer4, ompi_fortran_integer4_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF(sum, fortran_integer8, ompi_fortran_integer8_t, +) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF(sum, fortran_integer16, ompi_fortran_integer16_t, +) -#endif /* Floating point */ #if 
defined(HAVE_SHORT_FLOAT) OP_FUNC_3BUF(sum, short_float, short float, +) @@ -953,24 +855,7 @@ OP_FUNC_3BUF(sum, short_float, opal_short_float_t, +) OP_FUNC_3BUF(sum, float, float, +) OP_FUNC_3BUF(sum, double, double, +) OP_FUNC_3BUF(sum, long_double, long double, +) -#if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF(sum, fortran_real, ompi_fortran_real_t, +) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF(sum, fortran_double_precision, ompi_fortran_double_precision_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF(sum, fortran_real2, ompi_fortran_real2_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF(sum, fortran_real4, ompi_fortran_real4_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF(sum, fortran_real8, ompi_fortran_real8_t, +) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF(sum, fortran_real16, ompi_fortran_real16_t, +) -#endif + /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) @@ -978,14 +863,14 @@ OP_FUNC_3BUF(sum, c_short_float_complex, short float _Complex, +) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_SUM_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif +OP_FUNC_3BUF(sum, c_long_double_complex, cuLongDoubleComplex, +) #endif // 0 #undef current_func -#define current_func(a, b) (hipCmulf(a,b)) +#define current_func(a, b) (hipCaddf(a,b)) FUNC_FUNC_3BUF(sum, c_float_complex, hipFloatComplex) #undef current_func -#define current_func(a, b) (hipCmul(a,b)) +#define current_func(a, b) (hipCadd(a,b)) FUNC_FUNC_3BUF(sum, c_double_complex, hipDoubleComplex) -//OP_FUNC_3BUF(sum, c_long_double_complex, cuLongDoubleComplex, +) /************************************************************************* * Product @@ -1003,52 +888,6 @@ OP_FUNC_3BUF(prod, uint64_t, uint64_t, *) OP_FUNC_3BUF(prod, long, long, *) OP_FUNC_3BUF(prod, ulong, unsigned long, *) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -OP_FUNC_3BUF(prod, fortran_integer, ompi_fortran_integer_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -OP_FUNC_3BUF(prod, fortran_integer1, ompi_fortran_integer1_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -OP_FUNC_3BUF(prod, fortran_integer2, ompi_fortran_integer2_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -OP_FUNC_3BUF(prod, fortran_integer4, ompi_fortran_integer4_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -OP_FUNC_3BUF(prod, fortran_integer8, ompi_fortran_integer8_t, *) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -OP_FUNC_3BUF(prod, fortran_integer16, ompi_fortran_integer16_t, *) -#endif -/* Floating point */ -#if defined(HAVE_SHORT_FLOAT) -OP_FUNC_3BUF(prod, short_float, short float, *) -#elif defined(HAVE_OPAL_SHORT_FLOAT_T) -OP_FUNC_3BUF(prod, short_float, opal_short_float_t, *) -#endif -OP_FUNC_3BUF(prod, float, float, *) -OP_FUNC_3BUF(prod, double, double, *) -OP_FUNC_3BUF(prod, long_double, long double, *) -#if OMPI_HAVE_FORTRAN_REAL -OP_FUNC_3BUF(prod, fortran_real, ompi_fortran_real_t, *) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -OP_FUNC_3BUF(prod, fortran_double_precision, ompi_fortran_double_precision_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL2 -OP_FUNC_3BUF(prod, fortran_real2, ompi_fortran_real2_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL4 -OP_FUNC_3BUF(prod, fortran_real4, ompi_fortran_real4_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL8 -OP_FUNC_3BUF(prod, fortran_real8, ompi_fortran_real8_t, *) -#endif -#if OMPI_HAVE_FORTRAN_REAL16 && OMPI_REAL16_MATCHES_C -OP_FUNC_3BUF(prod, fortran_real16, ompi_fortran_real16_t, *) -#endif /* Complex */ #if 0 #if defined(HAVE_SHORT_FLOAT__COMPLEX) 
@@ -1056,10 +895,14 @@ OP_FUNC_3BUF(prod, c_short_float_complex, short float _Complex, *) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC_3BUF(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_3BUF(prod, c_float_complex, float _Complex, *) -OP_FUNC_3BUF(prod, c_double_complex, double _Complex, *) OP_FUNC_3BUF(prod, c_long_double_complex, long double _Complex, *) #endif // 0 +#undef current_func +#define current_func(a, b) (hipCmulf(a,b)) +FUNC_FUNC_3BUF(prod, c_float_complex, hipFloatComplex) +#undef current_func +#define current_func(a, b) (hipCmul(a,b)) +FUNC_FUNC_3BUF(prod, c_double_complex, hipDoubleComplex) /************************************************************************* * Logical AND @@ -1079,10 +922,6 @@ FUNC_FUNC_3BUF(land, uint64_t, uint64_t) FUNC_FUNC_3BUF(land, long, long) FUNC_FUNC_3BUF(land, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(land, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF(land, bool, bool) @@ -1104,10 +943,6 @@ FUNC_FUNC_3BUF(lor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lor, long, long) FUNC_FUNC_3BUF(lor, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(lor, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF(lor, bool, bool) @@ -1129,10 +964,6 @@ FUNC_FUNC_3BUF(lxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(lxor, long, long) FUNC_FUNC_3BUF(lxor, ulong, unsigned long) -/* Logical */ -#if OMPI_HAVE_FORTRAN_LOGICAL -FUNC_FUNC_3BUF(lxor, fortran_logical, ompi_fortran_logical_t) -#endif /* C++ bool */ FUNC_FUNC_3BUF(lxor, bool, bool) @@ -1154,25 +985,6 @@ FUNC_FUNC_3BUF(band, uint64_t, uint64_t) FUNC_FUNC_3BUF(band, long, long) FUNC_FUNC_3BUF(band, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(band, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(band, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(band, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(band, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(band, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(band, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF(band, byte, char) @@ -1194,25 +1006,6 @@ FUNC_FUNC_3BUF(bor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bor, long, long) FUNC_FUNC_3BUF(bor, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(bor, fortran_integer, ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(bor, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(bor, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(bor, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(bor, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(bor, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF(bor, byte, char) @@ -1234,63 +1027,13 @@ FUNC_FUNC_3BUF(bxor, uint64_t, uint64_t) FUNC_FUNC_3BUF(bxor, long, long) FUNC_FUNC_3BUF(bxor, ulong, unsigned long) -/* Fortran integer */ -#if OMPI_HAVE_FORTRAN_INTEGER -FUNC_FUNC_3BUF(bxor, fortran_integer, 
ompi_fortran_integer_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER1 -FUNC_FUNC_3BUF(bxor, fortran_integer1, ompi_fortran_integer1_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER2 -FUNC_FUNC_3BUF(bxor, fortran_integer2, ompi_fortran_integer2_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER4 -FUNC_FUNC_3BUF(bxor, fortran_integer4, ompi_fortran_integer4_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER8 -FUNC_FUNC_3BUF(bxor, fortran_integer8, ompi_fortran_integer8_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER16 -FUNC_FUNC_3BUF(bxor, fortran_integer16, ompi_fortran_integer16_t) -#endif /* Byte */ FUNC_FUNC_3BUF(bxor, byte, char) -/************************************************************************* - * Min and max location "pair" datatypes - *************************************************************************/ - -/* -#if OMPI_HAVE_FORTRAN_REAL -LOC_STRUCT_3BUF(2real, ompi_fortran_real_t, ompi_fortran_real_t) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_STRUCT_3BUF(2double_precision, ompi_fortran_double_precision_t, ompi_fortran_double_precision_t) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_STRUCT_3BUF(2integer, ompi_fortran_integer_t, ompi_fortran_integer_t) -#endif -LOC_STRUCT_3BUF(float_int, float, int) -LOC_STRUCT_3BUF(double_int, double, int) -LOC_STRUCT_3BUF(long_int, long, int) -LOC_STRUCT_3BUF(2int, int, int) -LOC_STRUCT_3BUF(short_int, short, int) -LOC_STRUCT_3BUF(long_double_int, long double, int) -*/ - /************************************************************************* * Max location *************************************************************************/ -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(maxloc, 2real, >) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(maxloc, 2double_precision, >) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(maxloc, 2integer, >) -#endif LOC_FUNC_3BUF(maxloc, float_int, >) LOC_FUNC_3BUF(maxloc, double_int, >) LOC_FUNC_3BUF(maxloc, long_int, >) @@ -1302,15 +1045,6 @@ LOC_FUNC_3BUF(maxloc, long_double_int, >) * Min location *************************************************************************/ -#if OMPI_HAVE_FORTRAN_REAL -LOC_FUNC_3BUF(minloc, 2real, <) -#endif -#if OMPI_HAVE_FORTRAN_DOUBLE_PRECISION -LOC_FUNC_3BUF(minloc, 2double_precision, <) -#endif -#if OMPI_HAVE_FORTRAN_INTEGER -LOC_FUNC_3BUF(minloc, 2integer, <) -#endif LOC_FUNC_3BUF(minloc, float_int, <) LOC_FUNC_3BUF(minloc, double_int, <) LOC_FUNC_3BUF(minloc, long_int, <) diff --git a/ompi/mca/op/rocm/op_rocm_impl.h b/ompi/mca/op/rocm/op_rocm_impl.h index 688218b8068..9beec67d9ef 100644 --- a/ompi/mca/op/rocm/op_rocm_impl.h +++ b/ompi/mca/op/rocm/op_rocm_impl.h @@ -202,10 +202,10 @@ OP_FUNC_SIG(prod, c_short_float_complex, short float _Complex, *=) #elif defined(HAVE_OPAL_SHORT_FLOAT_COMPLEX_T) COMPLEX_PROD_FUNC(c_short_float_complex, opal_short_float_t) #endif -OP_FUNC_SIG(prod, c_float_complex, float _Complex, *=) -OP_FUNC_SIG(prod, c_double_complex, double _Complex, *=) OP_FUNC_SIG(prod, c_long_double_complex, long double _Complex, *=) #endif // 0 +FUNC_FUNC_SIG(prod, c_float_complex, hipFloatComplex) +FUNC_FUNC_SIG(prod, c_double_complex, hipDoubleComplex) /************************************************************************* * Logical AND @@ -344,9 +344,6 @@ LOC_STRUCT(2int, int, int) LOC_STRUCT(short_int, short, int) LOC_STRUCT(long_double_int, long double, int) LOC_STRUCT(ulong, unsigned long, int) -/* compat types for Fortran */ -LOC_STRUCT(2real, float, float) -LOC_STRUCT(2double_precision, double, double) 
/************************************************************************* * Max location @@ -600,6 +597,8 @@ OP_FUNC_3BUF_SIG(sum, c_float_complex, float _Complex, +) OP_FUNC_3BUF_SIG(sum, c_double_complex, double _Complex, +) OP_FUNC_3BUF_SIG(sum, c_long_double_complex, long double _Complex, +) #endif // 0 +FUNC_FUNC_3BUF_SIG(sum, c_float_complex, hipFloatComplex) +FUNC_FUNC_3BUF_SIG(sum, c_double_complex, hipDoubleComplex) /************************************************************************* * Product @@ -674,6 +673,8 @@ OP_FUNC_3BUF_SIG(prod, c_float_complex, float _Complex, *) OP_FUNC_3BUF_SIG(prod, c_double_complex, double _Complex, *) OP_FUNC_3BUF_SIG(prod, c_long_double_complex, long double _Complex, *) #endif // 0 +FUNC_FUNC_3BUF_SIG(prod, c_float_complex, hipFloatComplex) +FUNC_FUNC_3BUF_SIG(prod, c_double_complex, hipDoubleComplex) /************************************************************************* * Logical AND From 7bb4b95eb48616bfe816786e61ca304a7f408957 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 23:05:57 +0000 Subject: [PATCH 73/74] Minor tweak to CUDA op configury Signed-off-by: Joseph Schuchart --- ompi/mca/op/cuda/Makefile.am | 6 +++--- ompi/mca/op/cuda/configure.m4 | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am index 311b9db3112..5e68ddf5854 100644 --- a/ompi/mca/op/cuda/Makefile.am +++ b/ompi/mca/op/cuda/Makefile.am @@ -17,7 +17,7 @@ # First, list all .h and .c sources. It is necessary to list all .h # files so that they will be picked up in the distribution tarball. -AM_CPPFLAGS = $(common_cuda_CPPFLAGS) +AM_CPPFLAGS = $(op_cuda_CPPFLAGS) $(op_cudart_CPPFLAGS) dist_ompidata_DATA = help-ompi-mca-op-cuda.txt @@ -67,7 +67,7 @@ mcacomponent_LTLIBRARIES = $(component_install) mca_op_cuda_la_SOURCES = $(sources) mca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) mca_op_cuda_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ - $(accelerator_cuda_LIBS) $(accelerator_cudart_LIBS) + $(op_cuda_LIBS) $(op_cudart_LIBS) EXTRA_mca_op_cuda_la_SOURCES = $(cu_sources) # Specific information for static builds. 
@@ -79,6 +79,6 @@ noinst_LTLIBRARIES = $(component_noinst) libmca_op_cuda_la_SOURCES = $(sources) libmca_op_cuda_la_LIBADD = $(cu_sources:.cu=.lo) libmca_op_cuda_la_LDFLAGS = -module -avoid-version\ - $(accelerator_cuda_LIBS) $(accelerator_cudart_LIBS) + $(op_cuda_LIBS) $(op_cudart_LIBS) EXTRA_libmca_op_cuda_la_SOURCES = $(cu_sources) diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4 index 91617ba4ecb..0974e3aaf31 100644 --- a/ompi/mca/op/cuda/configure.m4 +++ b/ompi/mca/op/cuda/configure.m4 @@ -24,6 +24,7 @@ AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[ AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile]) OPAL_CHECK_CUDA([op_cuda]) + OPAL_CHECK_CUDART([op_cudart]) AS_IF([test "x$CUDA_SUPPORT" = "x1"], [$1], @@ -33,4 +34,8 @@ AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[ AC_SUBST([op_cuda_LDFLAGS]) AC_SUBST([op_cuda_LIBS]) + AC_SUBST([op_cudart_CPPFLAGS]) + AC_SUBST([op_cudart_LDFLAGS]) + AC_SUBST([op_cudart_LIBS]) + ])dnl From d1382c360ea91c1c1269cd46f9ddc197d30f2cb4 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 7 Nov 2023 19:06:59 -0500 Subject: [PATCH 74/74] Fix rebase errors Signed-off-by: Joseph Schuchart --- opal/mca/accelerator/rocm/accelerator_rocm_component.c | 4 ++-- opal/mca/accelerator/rocm/accelerator_rocm_module.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index 4358bb345f3..86d71c0034b 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -33,7 +33,6 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576; /* Initialization lock for lazy rocm initialization */ static opal_mutex_t accelerator_rocm_init_lock; static bool accelerator_rocm_init_complete = false; -static int checkmem; hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL; @@ -171,6 +170,7 @@ static int accelerator_rocm_component_register(void) int opal_accelerator_rocm_lazy_init() { + int prio_hi, prio_lo; int err = OPAL_SUCCESS; /* Double checked locking to avoid having to @@ -191,7 +191,7 @@ int opal_accelerator_rocm_lazy_init() /* Create stream for use in cuMemcpyAsync synchronous copies */ hipStream_t memcpy_stream; err = hipStreamCreate(&memcpy_stream); - if (OPAL_UNLIKELY(result != hipSuccess)) { + if (OPAL_UNLIKELY(err != hipSuccess)) { opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true, OPAL_PROC_MY_HOSTNAME, err); goto out; diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 8180dc24409..ee31233199f 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -139,11 +139,11 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 static int mca_accelerator_rocm_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) { - int delayed_init = opal_accelerator_rocm_delayed_init(); + int delayed_init = opal_accelerator_rocm_lazy_init(); if (OPAL_UNLIKELY(0 != delayed_init)) { return delayed_init; } - *stream = &opal_accelerator_rocm_default_stream; + *stream = &opal_accelerator_rocm_default_stream.base; return OPAL_SUCCESS; } @@ -359,7 +359,7 @@ static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, v hipError_t result; void *ptr; - int delayed_init = opal_accelerator_rocm_delayed_init(); + int delayed_init = opal_accelerator_rocm_lazy_init(); if 
(OPAL_UNLIKELY(0 != delayed_init)) { return delayed_init; } @@ -649,7 +649,7 @@ static int mca_accelerator_rocm_mem_alloc_stream( //#if HIP_VERSION >= ??? //TODO hipError_t result; - int delayed_init = opal_accelerator_rocm_delayed_init(); + int delayed_init = opal_accelerator_rocm_lazy_init(); if (OPAL_UNLIKELY(0 != delayed_init)) { return delayed_init; } @@ -734,7 +734,7 @@ static int mca_accelerator_rocm_get_num_devices(int *num_devices) static int mca_accelerator_rocm_get_mem_bw(int device, float *bw) { - int delayed_init = opal_accelerator_rocm_delayed_init(); + int delayed_init = opal_accelerator_rocm_lazy_init(); if (OPAL_UNLIKELY(0 != delayed_init)) { return delayed_init; }
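
Note on the complex sum/prod wiring added in the hunks above: Open MPI's op kernels are instantiated from a macro template whose binary operator is whatever current_func expands to at the point of instantiation. The following is a minimal standalone HIP sketch of that pattern, not the actual OMPI FUNC_FUNC macro; FUNC_FUNC_SKETCH and the generated op_* kernel names are illustrative only, and the real macros additionally generate the host-side launch wrappers referenced by the dispatch tables.

    #include <hip/hip_runtime.h>
    #include <hip/hip_complex.h>

    /* Kernel template: applies current_func element-wise to (inout, in).
     * current_func is resolved when the macro is instantiated, so each
     * instantiation below picks up the definition directly above it. */
    #define FUNC_FUNC_SKETCH(name, type_name, type)                              \
    __global__ void op_##name##_##type_name(const type *in, type *inout, int n)  \
    {                                                                             \
        int i = blockIdx.x * blockDim.x + threadIdx.x;                            \
        if (i < n) inout[i] = current_func(inout[i], in[i]);                       \
    }

    /* MPI_SUM on single-precision complex: hipCaddf from hip_complex.h */
    #undef  current_func
    #define current_func(a, b) (hipCaddf(a, b))
    FUNC_FUNC_SKETCH(sum, c_float_complex, hipFloatComplex)

    /* MPI_PROD on double-precision complex: hipCmul from hip_complex.h */
    #undef  current_func
    #define current_func(a, b) (hipCmul(a, b))
    FUNC_FUNC_SKETCH(prod, c_double_complex, hipDoubleComplex)

This is also why the patch corrects the 3-buffer complex sum instantiation from hipCmulf to hipCaddf: the operator is selected solely by the current_func definition in effect when the macro expands, so a stale definition silently produces the wrong reduction.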