From 2d4b87ae263eba75e7740312429d9e31bf14ffb4 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 23 Jan 2019 14:52:21 +0900 Subject: [PATCH 1/2] opal/datatype: fix opal_convertor_raw correctly handle the case in which iovec is full and the last accessed element of the datatype is the beginning of a loop Refs. open-mpi/ompi#6285 Thanks Axel Huebl for reporting this Signed-off-by: Gilles Gouaillardet cherry-pick #6295 into 3.1 Signed-off-by: George Bosilca --- opal/datatype/opal_convertor_raw.c | 21 +- test/datatype/Makefile.am | 12 +- test/datatype/ddt_raw2.c | 329 +++++++++++++++++++++++++++++ 3 files changed, 353 insertions(+), 9 deletions(-) create mode 100644 test/datatype/ddt_raw2.c diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 09019388127..272a38ad786 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -5,8 +5,8 @@ * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017 Research Organization for Information Science - * and Technology (RIST). All rights reserved. + * Copyright (c) 2017-2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -170,9 +170,18 @@ opal_convertor_raw( opal_convertor_t* pConvertor, ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)(pElem + pElem->loop.items); if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - uint32_t i; - source_base += end_loop->first_elem_disp; - for( i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { + ptrdiff_t offset = end_loop->first_elem_disp; + source_base += offset; + for(size_t i = count_desc; i > 0; i--, index++ ) { + if (index >= *iov_count) { + dt_elem_desc_t* nElem = pElem + 1; + while (nElem->elem.common.type == OPAL_DATATYPE_LOOP) { + nElem++; + } + assert(OPAL_DATATYPE_END_LOOP != nElem->elem.common.type); + offset = nElem->elem.disp; + break; + } OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf, pConvertor->pDesc, pConvertor->count ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; @@ -181,7 +190,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, raw_data += end_loop->size; count_desc--; } - source_base -= end_loop->first_elem_disp; + source_base -= offset; if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index cd867134a4f..8efd0344ecc 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -4,8 +4,8 @@ # reserved. # Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. -# Copyright (c) 2014-2015 Research Organization for Information Science -# and Technology (RIST). All rights reserved. +# Copyright (c) 2014-2019 Research Organization for Information Science +# and Technology (RIST). All rights reserved. # Copyright (c) 2016 IBM Corporation. All rights reserved. # $COPYRIGHT$ # @@ -15,7 +15,7 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack external32 + MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 MPI_CHECKS = to_self endif TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS) @@ -40,6 +40,12 @@ ddt_raw_LDADD = \ $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +ddt_raw2_SOURCES = ddt_raw2.c ddt_lib.c ddt_lib.h +ddt_raw2_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) +ddt_raw2_LDADD = \ + $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + ddt_pack_SOURCES = ddt_pack.c ddt_pack_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) ddt_pack_LDADD = \ diff --git a/test/datatype/ddt_raw2.c b/test/datatype/ddt_raw2.c new file mode 100644 index 00000000000..1c132b4d55b --- /dev/null +++ b/test/datatype/ddt_raw2.c @@ -0,0 +1,329 @@ +#include "ompi_config.h" +#include "ddt_lib.h" +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_internal.h" +#include "opal/runtime/opal.h" + +#include +#include +#ifdef HAVE_SYS_TIME_H +#include +#endif +#include + + +int mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, + int count, + struct iovec **iov, + uint32_t *iovec_count, + int increment) +{ + + + + opal_convertor_t *convertor; + size_t remaining_length = 0; + uint32_t i; + uint32_t temp_count; + struct iovec *temp_iov=NULL; + size_t temp_data; + + + convertor = opal_convertor_create( opal_local_arch, 0 ); + + if (OMPI_SUCCESS != opal_convertor_prepare_for_send (convertor, + &(datatype->super), + count, + NULL)) { + opal_output (1, "Cannot attach the datatype to a convertor\n"); + return OMPI_ERROR; + } + + if ( 0 == datatype->super.size ) { + *iovec_count = 0; + *iov = NULL; + return OMPI_SUCCESS; + } + + remaining_length = count * datatype->super.size; + + temp_count = increment; + temp_iov = (struct iovec*)malloc(temp_count * sizeof(struct iovec)); + if (NULL == temp_iov) { + opal_output (1, "OUT OF MEMORY\n"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + while (0 == opal_convertor_raw(convertor, + temp_iov, + &temp_count, + &temp_data)) { + *iovec_count = *iovec_count + temp_count; + *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); + if (NULL == *iov) { + opal_output(1, "OUT OF MEMORY\n"); + free(temp_iov); + return OMPI_ERR_OUT_OF_RESOURCE; + } + for (i=0 ; i 0 ) { + *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); + if (NULL == *iov) { + opal_output(1, "OUT OF MEMORY\n"); + free(temp_iov); + return OMPI_ERR_OUT_OF_RESOURCE; + } + } + for (i=0 ; isuper.flags = 3332; + datatype->super.id = 0; + datatype->super.bdt_used = 512; + datatype->super.size = 31684; + datatype->super.true_lb = 4; + datatype->super.true_ub = 218288; + datatype->super.lb = 0; + datatype->super.ub = 218344; + datatype->super.nbElems = 31684; + datatype->super.align = 1; + datatype->super.loops = 1146; + datatype->super.desc.length = 3351; + datatype->super.desc.used = 184; + datatype->super.desc.desc = descs; + datatype->super.opt_desc.length = 3351; + datatype->super.opt_desc.used = 184; + datatype->super.opt_desc.desc = descs; + + uint32_t iovec_count = 0; + struct iovec * iov = NULL; + mca_common_ompio_decode_datatype ( datatype, 1, &iov, &iovec_count, 300); + uint32_t iovec_count2 = 0; + struct iovec * iov2 = NULL; + mca_common_ompio_decode_datatype ( datatype, 1, &iov2, &iovec_count2, 100); + + assert(iovec_count == iovec_count2); + // assert(iov[100].iov_base == iov2[100].iov_base); + // assert(iov[100].iov_len == iov2[100].iov_len); + for (int i=0; i Date: Tue, 29 Jan 2019 19:44:05 -0500 Subject: [PATCH 2/2] Provide a better fix for #6285. The issue was a little complicated due to the internal stack used in the convertor. The main issue was that in the case where we run out of iov space to save the raw description of the data while hanbdling a repetition (loop), instead of saving the current position and bailing out directly we reading of the next predefined type element. It worked in most cases, except the one identified by the HDF5 test. However, the biggest issue here was the drop in performance for all ensuing calls to the convertor pack/unpack, as instead of handling contiguous loops as a whole (and minimizing the number of memory copies) we copied data description by data description. Signed-off-by: George Bosilca --- opal/datatype/opal_convertor_raw.c | 22 +++++------ test/datatype/ddt_raw2.c | 60 ++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 272a38ad786..d74ae43851d 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -102,7 +102,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, /* now here we have a basic datatype */ OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, pConvertor->pDesc, pConvertor->count ); - DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %lu}\n", + DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n", index, (void*)source_base, (unsigned long)blength ); ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; iov[index].iov_len = blength; @@ -115,7 +115,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, for( i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, pConvertor->pDesc, pConvertor->count ); - DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %lu}\n", + DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n", index, (void*)source_base, (unsigned long)blength ); ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; iov[index].iov_len = blength; @@ -172,16 +172,7 @@ opal_convertor_raw( opal_convertor_t* pConvertor, if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { ptrdiff_t offset = end_loop->first_elem_disp; source_base += offset; - for(size_t i = count_desc; i > 0; i--, index++ ) { - if (index >= *iov_count) { - dt_elem_desc_t* nElem = pElem + 1; - while (nElem->elem.common.type == OPAL_DATATYPE_LOOP) { - nElem++; - } - assert(OPAL_DATATYPE_END_LOOP != nElem->elem.common.type); - offset = nElem->elem.disp; - break; - } + for(size_t i = MIN(count_desc, *iov_count - index); i > 0; i--, index++ ) { OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf, pConvertor->pDesc, pConvertor->count ); iov[index].iov_base = (IOVBASE_TYPE *) source_base; @@ -189,6 +180,10 @@ opal_convertor_raw( opal_convertor_t* pConvertor, source_base += pElem->loop.extent; raw_data += end_loop->size; count_desc--; + DO_DEBUG( opal_output( 0, "raw contig loop generate iov[%d] = {base %p, length %" PRIsize_t "}" + "space %lu [pos_desc %d]\n", + index, iov[index].iov_base, iov[index].iov_len, + (unsigned long)raw_data, pos_desc ); ); } source_base -= offset; if( 0 == count_desc ) { /* completed */ @@ -196,6 +191,9 @@ opal_convertor_raw( opal_convertor_t* pConvertor, goto update_loop_description; } } + if( index == *iov_count ) { /* all iov have been filled, we need to bail out */ + goto complete_loop; + } local_disp = (ptrdiff_t)source_base - local_disp; PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, pStack->disp + local_disp); diff --git a/test/datatype/ddt_raw2.c b/test/datatype/ddt_raw2.c index 1c132b4d55b..cc78e23006a 100644 --- a/test/datatype/ddt_raw2.c +++ b/test/datatype/ddt_raw2.c @@ -1,3 +1,17 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + #include "ompi_config.h" #include "ddt_lib.h" #include "opal/datatype/opal_convertor.h" @@ -12,11 +26,12 @@ #include -int mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, - int count, - struct iovec **iov, - uint32_t *iovec_count, - int increment) +static int +mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype, + int count, + struct iovec **iov, + uint32_t *iovec_count, + int increment) { @@ -310,20 +325,35 @@ int main (int argc, char *argv[]) { datatype->super.opt_desc.used = 184; datatype->super.opt_desc.desc = descs; - uint32_t iovec_count = 0; - struct iovec * iov = NULL; - mca_common_ompio_decode_datatype ( datatype, 1, &iov, &iovec_count, 300); - uint32_t iovec_count2 = 0; - struct iovec * iov2 = NULL; - mca_common_ompio_decode_datatype ( datatype, 1, &iov2, &iovec_count2, 100); + /* Get the entire raw description of the datatype in a single call */ + uint32_t iovec_count_300 = 0; + struct iovec * iov_300 = NULL; + mca_common_ompio_decode_datatype ( datatype, 1, &iov_300, &iovec_count_300, 300); + /* Get the raw description of the datatype 10 elements at the time. This stresses some + * of the execution paths in the convertor raw. + */ + uint32_t iovec_count_10 = 0; + struct iovec * iov_10 = NULL; + mca_common_ompio_decode_datatype ( datatype, 1, &iov_10, &iovec_count_10, 10); + /* Get the raw description of the datatype one element at the time. This stresses all + * execution paths in the convertor raw. + */ + uint32_t iovec_count_1 = 0; + struct iovec * iov_1 = NULL; + mca_common_ompio_decode_datatype ( datatype, 1, &iov_1, &iovec_count_1, 1); + - assert(iovec_count == iovec_count2); + assert(iovec_count_300 == iovec_count_10); + assert(iovec_count_300 == iovec_count_1); // assert(iov[100].iov_base == iov2[100].iov_base); // assert(iov[100].iov_len == iov2[100].iov_len); - for (int i=0; i