diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h
index 26978d0867e..2b17884fef7 100644
--- a/ompi/datatype/ompi_datatype.h
+++ b/ompi/datatype/ompi_datatype.h
@@ -508,5 +508,11 @@ ompi_datatype_consolidate_free(ompi_datatype_consolidate_t *dtmod)
  */
 #define OMPI_DATATYPE_CONSOLIDATE_THRESHOLD 250
 
+/* First data byte of (userbuf, dt): userbuf + dt's true_lb; pass this to memory-type checks. */
+static inline void *BUF_START(void *userbuf, MPI_Datatype dt)
+{
+    return (char *) userbuf + dt->super.true_lb;
+}
+
 END_C_DECLS
 #endif  /* OMPI_DATATYPE_H_HAS_BEEN_INCLUDED */
diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c
index 416c9c7fa8f..79376f8504b 100644
--- a/ompi/mca/coll/cuda/coll_cuda_allreduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_allreduce.c
@@ -3,6 +3,7 @@
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -41,7 +42,7 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
 
     bufsize = opal_datatype_span(&dtype->super, count, &gap);
 
-    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
+    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs(BUF_START((char *)sbuf, dtype), NULL))) {
         sbuf1 = (char*)malloc(bufsize);
         if (NULL == sbuf1) {
             return OMPI_ERR_OUT_OF_RESOURCE;
@@ -50,7 +51,7 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
         sbuf = sbuf1 - gap;
     }
 
-    if (opal_cuda_check_bufs(rbuf, NULL)) {
+    if (opal_cuda_check_bufs(BUF_START(rbuf, dtype), NULL)) {
         rbuf1 = (char*)malloc(bufsize);
         if (NULL == rbuf1) {
             if (NULL != sbuf1) free(sbuf1);
diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c
index 5f736697fe0..2fcfc5ebdc4 100644
--- a/ompi/mca/coll/cuda/coll_cuda_exscan.c
+++ b/ompi/mca/coll/cuda/coll_cuda_exscan.c
@@ -3,6 +3,7 @@
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -33,7 +34,7 @@ int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
 
     bufsize = opal_datatype_span(&dtype->super, count, &gap);
 
-    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
+    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs(BUF_START((char *)sbuf, dtype), NULL))) {
         sbuf1 = (char*)malloc(bufsize);
         if (NULL == sbuf1) {
             return OMPI_ERR_OUT_OF_RESOURCE;
@@ -42,7 +43,7 @@ int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
         sbuf = sbuf1 - gap;
     }
 
-    if (opal_cuda_check_bufs(rbuf, NULL)) {
+    if (opal_cuda_check_bufs(BUF_START(rbuf, dtype), NULL)) {
         rbuf1 = (char*)malloc(bufsize);
         if (NULL == rbuf1) {
             if (NULL != sbuf1) free(sbuf1);
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c
index 5d82667b6bb..6ad5d54dc59 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c
@@ -3,6 +3,7 @@
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -43,7 +44,7 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
     bufsize = opal_datatype_span(&dtype->super, count, &gap);
 
 
-    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
+    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs(BUF_START((char *)sbuf, dtype), NULL))) {
         sbuf1 = (char*)malloc(bufsize);
         if (NULL == sbuf1) {
             return OMPI_ERR_OUT_OF_RESOURCE;
@@ -53,7 +54,7 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
         sbuf = sbuf1 - gap;
     }
 
-    if (opal_cuda_check_bufs(rbuf, NULL)) {
+    if (opal_cuda_check_bufs(BUF_START(rbuf, dtype), NULL)) {
         rbuf1 = (char*)malloc(bufsize);
         if (NULL == rbuf1) {
             if (NULL != sbuf1) free(sbuf1);
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
index 907257b0da8..cd1ba5894fe 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
@@ -3,6 +3,7 @@
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -47,7 +48,7 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
 
     sbufsize = rbufsize * ompi_comm_size(comm);
 
-    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
+    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs(BUF_START((char *)sbuf, dtype), NULL))) {
         sbuf1 = (char*)malloc(sbufsize);
         if (NULL == sbuf1) {
             return OMPI_ERR_OUT_OF_RESOURCE;
@@ -56,7 +57,7 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
         sbuf = sbuf1 - gap;
     }
 
-    if (opal_cuda_check_bufs(rbuf, NULL)) {
+    if (opal_cuda_check_bufs(BUF_START(rbuf, dtype), NULL)) {
         rbuf1 = (char*)malloc(rbufsize);
         if (NULL == rbuf1) {
             if (NULL != sbuf1) free(sbuf1);
diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c
index 4e7300c12f8..24e95753d90 100644
--- a/ompi/mca/coll/cuda/coll_cuda_scan.c
+++ b/ompi/mca/coll/cuda/coll_cuda_scan.c
@@ -3,6 +3,7 @@
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -40,7 +41,7 @@ int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count,
 
     bufsize = opal_datatype_span(&dtype->super, count, &gap);
 
-    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) {
+    if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs(BUF_START((char *)sbuf, dtype), NULL))) {
         sbuf1 = (char*)malloc(bufsize);
         if (NULL == sbuf1) {
             return OMPI_ERR_OUT_OF_RESOURCE;
@@ -49,7 +50,7 @@ int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count,
         sbuf = sbuf1 - gap;
     }
 
-    if (opal_cuda_check_bufs(rbuf, NULL)) {
+    if (opal_cuda_check_bufs(BUF_START(rbuf, dtype), NULL)) {
         rbuf1 = (char*)malloc(bufsize);
         if (NULL == rbuf1) {
             if (NULL != sbuf1) free(sbuf1);
diff --git a/ompi/mca/coll/libnbc/nbc_ialltoall.c b/ompi/mca/coll/libnbc/nbc_ialltoall.c
index ebed3c4c3b2..78a57cd8155 100644
--- a/ompi/mca/coll/libnbc/nbc_ialltoall.c
+++ b/ompi/mca/coll/libnbc/nbc_ialltoall.c
@@ -10,7 +10,7 @@
  * Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2014-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
- * Copyright (c) 2017      IBM Corporation.  All rights reserved.
+ * Copyright (c) 2017-2021 IBM Corporation. All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
  * $COPYRIGHT$
  *
@@ -146,7 +146,7 @@ static int nbc_alltoall_init(const void* sendbuf, int sendcount, MPI_Datatype se
 
     /* phase 1 - rotate n data blocks upwards into the tmpbuffer */
 #if OPAL_CUDA_SUPPORT
-    if (NBC_Type_intrinsic(sendtype) && !(opal_cuda_check_bufs((char *)sendbuf, (char *)recvbuf))) {
+    if (NBC_Type_intrinsic(sendtype) && !(opal_cuda_check_bufs(BUF_START((char *)sendbuf, sendtype), BUF_START((char *)recvbuf, recvtype)))) {
 #else
     if (NBC_Type_intrinsic(sendtype)) {
 #endif /* OPAL_CUDA_SUPPORT */
diff --git a/ompi/mca/common/ompio/common_ompio_buffer.c b/ompi/mca/common/ompio/common_ompio_buffer.c
index dbd7e30e6b4..48eaff52682 100644
--- a/ompi/mca/common/ompio/common_ompio_buffer.c
+++ b/ompi/mca/common/ompio/common_ompio_buffer.c
@@ -10,6 +10,7 @@
  *  Copyright (c) 2004-2005 The Regents of the University of California.
  *                          All rights reserved.
  *  Copyright (c) 2008-2019 University of Houston. All rights reserved.
+ *  Copyright (c) 2021      IBM Corporation. All rights reserved.
  *  $COPYRIGHT$
  *
  *  Additional copyrights may follow
@@ -39,8 +40,9 @@ static void* mca_common_ompio_buffer_alloc_seg ( void *ctx, size_t *size );
 static void mca_common_ompio_buffer_free_seg ( void *ctx, void *buf );
 
 #if OPAL_CUDA_SUPPORT
-void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf, int *is_gpu, 
-				      int *is_managed)
+void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf,
+                                      struct ompi_datatype_t *datatype,
+                                      int *is_gpu, int *is_managed)
 {
     opal_convertor_t    convertor;  
     
@@ -48,6 +50,7 @@ void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf, int *is
     *is_managed=0;
     
     convertor.flags=0;
+    convertor.pDesc = &datatype->super;
     if ( opal_cuda_check_one_buf ( (char *)buf, &convertor ) ) {
         *is_gpu = 1;
         if ( convertor.flags & CONVERTOR_CUDA_UNIFIED ){
diff --git a/ompi/mca/common/ompio/common_ompio_buffer.h b/ompi/mca/common/ompio/common_ompio_buffer.h
index 2e5e7fcbb4c..480fa34ed54 100644
--- a/ompi/mca/common/ompio/common_ompio_buffer.h
+++ b/ompi/mca/common/ompio/common_ompio_buffer.h
@@ -11,6 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2008-2019 University of Houston. All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -60,6 +61,7 @@
 
 #if OPAL_CUDA_SUPPORT
 void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf, 
+                                      struct ompi_datatype_t *datatype,
 				      int *is_gpu, int *is_managed);
 #endif
 int mca_common_ompio_buffer_alloc_init ( void );
diff --git a/ompi/mca/common/ompio/common_ompio_file_read.c b/ompi/mca/common/ompio/common_ompio_file_read.c
index 695b291fe76..a8b32bf0ead 100644
--- a/ompi/mca/common/ompio/common_ompio_file_read.c
+++ b/ompi/mca/common/ompio/common_ompio_file_read.c
@@ -12,6 +12,7 @@
  *  Copyright (c) 2008-2019 University of Houston. All rights reserved.
  *  Copyright (c) 2018      Research Organization for Information Science
  *                          and Technology (RIST). All rights reserved.
+ *  Copyright (c) 2021      IBM Corporation. All rights reserved.
  *  $COPYRIGHT$
  *
  *  Additional copyrights may follow
@@ -92,7 +93,7 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
     opal_convertor_t convertor;
 #if OPAL_CUDA_SUPPORT
     int is_gpu, is_managed;
-    mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+    mca_common_ompio_check_gpu_buf ( fh, buf, datatype, &is_gpu, &is_managed);
     if ( is_gpu && !is_managed ) {
         need_to_copy = true;
     }
@@ -271,7 +272,7 @@ int mca_common_ompio_file_iread (ompio_file_t *fh,
     
 #if OPAL_CUDA_SUPPORT
         int is_gpu, is_managed;
-        mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+        mca_common_ompio_check_gpu_buf ( fh, buf, datatype, &is_gpu, &is_managed);
         if ( is_gpu && !is_managed ) {
             need_to_copy = true;
         }
diff --git a/ompi/mca/common/ompio/common_ompio_file_write.c b/ompi/mca/common/ompio/common_ompio_file_write.c
index 066afb8844a..ea4d93d17ea 100644
--- a/ompi/mca/common/ompio/common_ompio_file_write.c
+++ b/ompi/mca/common/ompio/common_ompio_file_write.c
@@ -12,6 +12,7 @@
  * Copyright (c) 2008-2019 University of Houston. All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -73,7 +74,7 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
 
 #if OPAL_CUDA_SUPPORT
     int is_gpu, is_managed;
-    mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+    mca_common_ompio_check_gpu_buf ( fh, buf, datatype, &is_gpu, &is_managed);
     if ( is_gpu && !is_managed ) {
         need_to_copy = true;
     }
@@ -246,7 +247,7 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
 
 #if OPAL_CUDA_SUPPORT
         int is_gpu, is_managed;
-        mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+        mca_common_ompio_check_gpu_buf ( fh, buf, datatype, &is_gpu, &is_managed);
         if ( is_gpu && !is_managed ) {
             need_to_copy = true;
         }
diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
index e08265b42bc..e5a243cfa3a 100644
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@@ -15,6 +15,7 @@
  * Copyright (c) 2013-2018 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2017      Intel, Inc. All rights reserved
+ * Copyright (c) 2021      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -580,6 +581,11 @@ int32_t opal_convertor_prepare_for_recv(opal_convertor_t *convertor,
     convertor->flags |= CONVERTOR_RECV;
 #if OPAL_CUDA_SUPPORT
     if (!(convertor->flags & CONVERTOR_SKIP_CUDA_INIT)) {
+        /* Set pDesc here, ahead of CONVERTOR_PREPARE, so that the
+         * cuda code can figure out what offset from pUserBuf to
+         * look at.
+         */
+        convertor->pDesc = (opal_datatype_t*)datatype;
         mca_cuda_convertor_init(convertor, pUserBuf);
     }
 #endif
@@ -622,6 +628,7 @@ int32_t opal_convertor_prepare_for_send(opal_convertor_t *convertor,
     convertor->flags |= CONVERTOR_SEND;
 #if OPAL_CUDA_SUPPORT
     if (!(convertor->flags & CONVERTOR_SKIP_CUDA_INIT)) {
+        convertor->pDesc = (opal_datatype_t*)datatype; /* cuda init reads pDesc to offset pUserBuf */
         mca_cuda_convertor_init(convertor, pUserBuf);
     }
 #endif
diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index 2fdc4b100e3..bd2f325a294 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -14,6 +14,7 @@
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
+ * Copyright (c) 2021      IBM Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -1698,6 +1699,25 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
     CUmemorytype memType = 0;
     CUdeviceptr dbuf = (CUdeviceptr) pUserBuf;
     CUcontext ctx = NULL, memCtx = NULL;
+
+    /*
+     * If a convertor is provided and has its datatype descriptor (pDesc)
+     * set, offset pUserBuf by the datatype's true_lb to find the address
+     * of the actual data, as pUserBuf by itself isn't a meaningful address
+     * when used with MPI datatypes.  If convertor (or its pDesc) is NULL,
+     * pUserBuf is used directly.
+     */
+    if (NULL != convertor && NULL != convertor->pDesc) {
+        dbuf = (CUdeviceptr)((const char*)pUserBuf + convertor->pDesc->true_lb);
+        /* Only the first data byte is checked; count and true_ub are not
+         * used to locate the top byte or any interior bytes, because it
+         * is unclear what this function should report if the MPI
+         * datatype spanned multiple types of memory.  Such datatypes are
+         * easy to construct, so the likely answer is to document that
+         * they are not allowed.
+         */
+    }
+
 #if OPAL_CUDA_GET_ATTRIBUTES
     uint32_t isManaged = 0;
     /* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -2102,6 +2122,10 @@ void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf)
 /* Checks the type of pointer
  *
  * @param dest   One pointer to check
+ *               Both pointers must be the real addresses to check: if a
+ *               user buffer and an MPI datatype are involved, the caller
+ *               must already have offset the user buffer to where the
+ *               data actually begins.
  * @param source Another pointer to check
  */
 bool opal_cuda_check_bufs(char *dest, char *src)
@@ -2132,6 +2156,9 @@ bool opal_cuda_check_bufs(char *dest, char *src)
 /* Checks the type of pointer
  *
  * @param buf   check one pointer providing a convertor.
+ *              When a convertor is provided, buf should be the user buffer
+ *              itself (not pre-offset): the convertor's datatype is used to
+ *              locate the offset where the data actually begins.
  *  Provides aditional information, e.g. managed vs. unmanaged GPU buffer
  */
 bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor)