diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index fd7816e3ea7..67059a8c851 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -91,8 +91,8 @@ AS_IF([test "$opal_check_cuda_happy" = "yes"], # If we have CUDA support, check to see if we have support for SYNC_MEMOPS # which was first introduced in CUDA 6.0. AS_IF([test "$opal_check_cuda_happy"="yes"], - AC_CHECK_DECL([CU_POINTER_ATTRIBUTE_SYNC_MEMOPS], [CUDA_SYNC_MEMOPS=1], [CUDA_SYNC_MEMOPS=0], - [#include <$opal_cuda_incdir/cuda.h>]), + [AC_CHECK_DECL([CU_POINTER_ATTRIBUTE_SYNC_MEMOPS], [CUDA_SYNC_MEMOPS=1], [CUDA_SYNC_MEMOPS=0], + [#include <$opal_cuda_incdir/cuda.h>])], []) # If we have CUDA support, check to see if we have CUDA 6.0 or later. diff --git a/configure.ac b/configure.ac index d671a1c0843..5b7b7e0dcf2 100644 --- a/configure.ac +++ b/configure.ac @@ -1369,10 +1369,13 @@ OPAL_SETUP_WRAPPER_FINAL # autoconf macro defines in mpi.h. Since AC sometimes changes whether # things are defined as null tokens or an integer result, two projects # with different versions of AC can cause problems. -if test $ac_cv_header_stdc = yes; then - AC_DEFINE(OPAL_STDC_HEADERS, 1, - [Do not use outside of mpi.h. Define to 1 if you have the ANSI C header files.]) -fi + +# According to the autoconf 2.67 documentation the AC_HEADER_STDC macro, +# and therefore the ac_cv_header_stdc cache variable, is obsolescent, as +# current systems have conforming header files. Instead of removing the +# protection completely, let's just make sure it is always on. +AC_DEFINE(OPAL_STDC_HEADERS, 1, + [Do not use outside of mpi.h. Define to 1 if you have the ANSI C header files.]) if test $ac_cv_header_sys_time_h = yes ; then AC_DEFINE(OPAL_HAVE_SYS_TIME_H, 1, [Do not use outside of mpi.h. 
Define to 1 if you have the header file.]) diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 853e5b1632f..3931d99d175 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -483,7 +483,8 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor ) pConvertor->remote_size = pConvertor->local_size; if( OPAL_UNLIKELY(datatype->bdt_used & pConvertor->master->hetero_mask) ) { pConvertor->flags &= (~CONVERTOR_HOMOGENEOUS); - if (!(pConvertor->flags & CONVERTOR_SEND && pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS)) { + /* Can we use the optimized description? */ + if (pConvertor->flags & OPAL_DATATYPE_OPTIMIZED_RESTRICTED) { pConvertor->use_desc = &(datatype->desc); } if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) { diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 5e34b9955b0..1e86a456127 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -75,11 +75,18 @@ BEGIN_C_DECLS * We should make the difference here between the predefined contiguous and non contiguous * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. */ -#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ - OPAL_DATATYPE_FLAG_CONTIGUOUS | \ - OPAL_DATATYPE_FLAG_NO_GAPS | \ - OPAL_DATATYPE_FLAG_DATA | \ - OPAL_DATATYPE_FLAG_COMMITTED) +#define OPAL_DATATYPE_FLAG_BASIC \ + (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS \ + | OPAL_DATATYPE_FLAG_DATA | OPAL_DATATYPE_FLAG_COMMITTED) +/* + * If during the datatype optimization process we collapse contiguous elements with + * different types, we cannot use this optimized description for any communication + * in a heterogeneous setting, especially not for the external32 support. + * + * A datatype with this flag cannot use the optimized description in heterogeneous + * setups. 
+ */ +#define OPAL_DATATYPE_OPTIMIZED_RESTRICTED 0x1000 /** * The number of supported entries in the data-type definition and the diff --git a/opal/datatype/opal_datatype_dump.c b/opal/datatype/opal_datatype_dump.c index 27903db657e..b27deb0a673 100644 --- a/opal/datatype/opal_datatype_dump.c +++ b/opal/datatype/opal_datatype_dump.c @@ -62,17 +62,39 @@ int opal_datatype_contain_basic_datatypes( const opal_datatype_t* pData, char* p int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t length ) { int index = 0; - if( length < 22 ) return 0; - index = snprintf( ptr, 22, "-----------[---][---]" ); /* set everything to - */ - if( usflags & OPAL_DATATYPE_FLAG_COMMITTED ) ptr[1] = 'c'; - if( usflags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) ptr[2] = 'C'; - if( usflags & OPAL_DATATYPE_FLAG_OVERLAP ) ptr[3] = 'o'; - if( usflags & OPAL_DATATYPE_FLAG_USER_LB ) ptr[4] = 'l'; - if( usflags & OPAL_DATATYPE_FLAG_USER_UB ) ptr[5] = 'u'; - if( usflags & OPAL_DATATYPE_FLAG_PREDEFINED ) ptr[6] = 'P'; - if( !(usflags & OPAL_DATATYPE_FLAG_NO_GAPS) ) ptr[7] = 'G'; - if( usflags & OPAL_DATATYPE_FLAG_DATA ) ptr[8] = 'D'; - if( (usflags & OPAL_DATATYPE_FLAG_BASIC) == OPAL_DATATYPE_FLAG_BASIC ) ptr[9] = 'B'; + if (length < 22) { + return 0; + } + index = snprintf(ptr, 22, "-----------[---][---]"); /* set everything to - */ + if (usflags & OPAL_DATATYPE_FLAG_COMMITTED) { + ptr[1] = 'c'; + } + if (usflags & OPAL_DATATYPE_FLAG_CONTIGUOUS) { + ptr[2] = 'C'; + } + if (usflags & OPAL_DATATYPE_FLAG_OVERLAP) { + ptr[3] = 'o'; + } + if (usflags & OPAL_DATATYPE_FLAG_USER_LB) { + ptr[4] = 'l'; + } + if (usflags & OPAL_DATATYPE_FLAG_USER_UB) { + ptr[5] = 'u'; + } + if (usflags & OPAL_DATATYPE_FLAG_PREDEFINED) { + ptr[6] = 'P'; + } + if (!(usflags & OPAL_DATATYPE_FLAG_NO_GAPS)) { + ptr[7] = 'G'; + } + if (usflags & OPAL_DATATYPE_FLAG_DATA) { + ptr[8] = 'D'; + } + if ((usflags & OPAL_DATATYPE_FLAG_BASIC) == OPAL_DATATYPE_FLAG_BASIC) { + ptr[9] = 'B'; + } else if (usflags & 
OPAL_DATATYPE_OPTIMIZED_RESTRICTED) { + ptr[9] = 'H'; /* optimized description restricted to homogeneous cases */ + } /* We know nothing about the upper level language or flags! */ /* ... */ return index; diff --git a/opal/datatype/opal_datatype_internal.h b/opal/datatype/opal_datatype_internal.h index 7015e99b4ff..1b5f62919c3 100644 --- a/opal/datatype/opal_datatype_internal.h +++ b/opal/datatype/opal_datatype_internal.h @@ -36,51 +36,16 @@ extern int opal_datatype_dfd; -# define DDT_DUMP_STACK( PSTACK, STACK_POS, PDESC, NAME ) \ - opal_datatype_dump_stack( (PSTACK), (STACK_POS), (PDESC), (NAME) ) -# if defined(ACCEPT_C99) -# define DUMP( ARGS... ) opal_output(opal_datatype_dfd, __VA_ARGS__) -# else -# if defined(__GNUC__) && !defined(__STDC__) -# define DUMP(ARGS...) opal_output( opal_datatype_dfd, ARGS) -# else -static inline void DUMP( char* fmt, ... ) -{ - va_list list; +# define DDT_DUMP_STACK(PSTACK, STACK_POS, PDESC, NAME) \ + opal_datatype_dump_stack((PSTACK), (STACK_POS), (PDESC), (NAME)) + +# define DUMP(...) opal_output(opal_datatype_dfd, __VA_ARGS__) - va_start( list, fmt ); - opal_output_vverbose( 0, opal_datatype_dfd, fmt, list ); - va_end( list ); -} -# endif /* __GNUC__ && !__STDC__ */ -# endif /* ACCEPT_C99 */ #else -# define DDT_DUMP_STACK( PSTACK, STACK_POS, PDESC, NAME ) -# if defined(ACCEPT_C99) -# define DUMP(ARGS...) -# else -# if defined(__GNUC__) && !defined(__STDC__) -# define DUMP(ARGS...) -# else - /* If we do not compile with PGI, mark the parameter as unused */ -# if !defined(__PGI) -# define __opal_attribute_unused_tmp__ __opal_attribute_unused__ -# else -# define __opal_attribute_unused_tmp__ -# endif -static inline void DUMP( char* fmt __opal_attribute_unused_tmp__, ... ) -{ -#if defined(__PGI) - /* Some compilers complain if we have "..." 
arguments and no - corresponding va_start() */ - va_list arglist; - va_start(arglist, fmt); - va_end(arglist); -#endif -} -# undef __opal_attribute_unused_tmp__ -# endif /* __GNUC__ && !__STDC__ */ -# endif /* ACCEPT_C99 */ + +# define DDT_DUMP_STACK(PSTACK, STACK_POS, PDESC, NAME) +# define DUMP(...) + #endif /* VERBOSE */ diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 2e661b95daa..dfc57064802 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -87,10 +87,12 @@ opal_datatype_optimize_short( opal_datatype_t* pData, compress.blocklen = pData->desc.desc[pos_desc + index].elem.blocklen; for( uint32_t i = index+1; i < loop->items; i++ ) { current = &pData->desc.desc[pos_desc + i].elem; - assert(1 == current->count); - if( (current->common.type == OPAL_DATATYPE_LOOP) || - compress.common.type != current->common.type ) { - compress.common.type = OPAL_DATATYPE_UINT1; + assert(1 == current->count); + if ((current->common.type == OPAL_DATATYPE_LOOP) + || compress.common.type != current->common.type) { + compress.common.type = OPAL_DATATYPE_UINT1; + compress.common.flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; + pData->flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; compress.blocklen = end_loop->size; break; } @@ -174,12 +176,14 @@ opal_datatype_optimize_short( opal_datatype_t* pData, /* are the two elements compatible: aka they have very similar values and they * can be merged together by increasing the count, and/or changing the extent. 
*/ - if( (last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) == - (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size) ) { - ddt_elem_desc_t save = last; /* safekeep the type and blocklen */ - if( last.common.type != current->common.type ) { - last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size; - last.common.type = OPAL_DATATYPE_UINT1; + if ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) + == (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size)) { + ddt_elem_desc_t save = last; /* safekeep the type and blocklen */ + if (last.common.type != current->common.type) { + last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size; + last.common.type = OPAL_DATATYPE_UINT1; + last.common.flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; + pData->flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; } if( (last.extent * (ptrdiff_t)last.count + last.disp) == current->disp ) { @@ -225,9 +229,14 @@ opal_datatype_optimize_short( opal_datatype_t* pData, if( last.common.type == current->common.type ) { last.blocklen += current->blocklen; } else { - last.blocklen = ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) + - (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size)); - last.common.type = OPAL_DATATYPE_UINT1; + last.blocklen = ((last.blocklen + * opal_datatype_basicDatatypes[last.common.type]->size) + + (current->blocklen + * opal_datatype_basicDatatypes[current->common.type] + ->size)); + last.common.type = OPAL_DATATYPE_UINT1; + last.common.flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; + pData->flags |= OPAL_DATATYPE_OPTIMIZED_RESTRICTED; } last.extent += current->extent; if( current->count != 1 ) { diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index b5225017a59..b4e03a9bea4 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -383,42 +383,83 
@@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, */ static inline void -pack_predefined_heterogeneous( opal_convertor_t* CONVERTOR, - const dt_elem_desc_t* ELEM, - size_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) +pack_predefined_heterogeneous(opal_convertor_t *CONVERTOR, + const dt_elem_desc_t *ELEM, size_t *COUNT, + unsigned char **memory, + unsigned char **packed, size_t *SPACE) { - const opal_convertor_master_t* master = (CONVERTOR)->master; - const ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _source = (*SOURCE) + _elem->disp; - ptrdiff_t advance; - size_t _count = *(COUNT); - size_t _r_blength; - - _r_blength = master->remote_sizes[_elem->common.type]; - if( (_count * _r_blength) > *(SPACE) ) { - _count = (*(SPACE) / _r_blength); - if( 0 == _count ) return; /* nothing to do */ + const opal_convertor_master_t *master = (CONVERTOR)->master; + const ddt_elem_desc_t *_elem = &((ELEM)->elem); + size_t cando_count = *(COUNT), do_now_bytes; + size_t local_elem_size = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t remote_elem_size = master->remote_sizes[_elem->common.type]; + size_t blocklen_bytes = remote_elem_size; + unsigned char *_memory = (*memory) + _elem->disp; + unsigned char *_packed = *packed; + ptrdiff_t advance = 0; + + assert(0 == (cando_count % _elem->blocklen)); /* no partials here */ + assert(*(COUNT) <= ((size_t) _elem->count * _elem->blocklen)); + + if ((remote_elem_size * cando_count) > *(SPACE)) + cando_count = (*SPACE) / blocklen_bytes; + + /* preemptively update the number of COUNT we will return. 
*/ + *(COUNT) -= cando_count; + + if (_elem->blocklen == 1) { + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _memory, *SPACE, _elem->extent, + _packed, *SPACE, remote_elem_size, + &advance); + _memory += cando_count * _elem->extent; + _packed += cando_count * remote_elem_size; + goto update_and_return; } - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, (_count * _elem->extent), (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack [l %s r %s] memcpy( %p, %p, %lu ) => space %lu\n", - ((ptrdiff_t)(opal_datatype_basicDatatypes[_elem->common.type]->size) == _elem->extent) ? "cont" : "----", - ((ptrdiff_t)_r_blength == _elem->extent) ? "cont" : "----", - (void*)*(DESTINATION), (void*)_source, (unsigned long)_r_blength, - (unsigned long)(*(SPACE)) ); ); - master->pFunctions[_elem->common.type]( CONVERTOR, _count, - _source, *SPACE, _elem->extent, - *DESTINATION, *SPACE, _r_blength, - &advance ); - _r_blength *= _count; /* update the remote length to encompass all the elements */ - *(SOURCE) += _count * _elem->extent; - *(DESTINATION) += _r_blength; - *(SPACE) -= _r_blength; - *(COUNT) -= _count; + if ((1 < _elem->count) && (_elem->blocklen <= cando_count)) { + blocklen_bytes = remote_elem_size * _elem->blocklen; + + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "pack 2. 
memcpy( %p, %p, %lu ) => space %lu\n", + (void *) _packed, (void *) _memory, (unsigned long) blocklen_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, _elem->blocklen, + _memory, *SPACE, local_elem_size, + _packed, *SPACE, remote_elem_size, + &advance); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); + } + + /** + * As an epilog do anything left from the last blocklen. + */ + if (0 != cando_count) { + assert((cando_count < _elem->blocklen) + || ((1 == _elem->count) && (cando_count <= _elem->blocklen))); + do_now_bytes = cando_count * remote_elem_size; + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void *) _packed, (void *) _memory, (unsigned long) do_now_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _memory, *SPACE, local_elem_size, + _packed, *SPACE, remote_elem_size, + &advance); + _memory += cando_count * local_elem_size; /* user memory is read with the local element size as stride, not the remote one */ + _packed += do_now_bytes; + } + +update_and_return: + *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } int32_t diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index efed62451ac..26a5810dc01 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -142,7 +142,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, } } } - *out_size = iov_idx; /* we only reach this line after the for loop succesfully complete */ + *out_size = iov_idx; /* we only reach this line after the for loop successfully complete */ *max_data = pConv->bConverted - 
initial_bytes_converted; if( pConv->bConverted == pConv->local_size ) pConv->flags |= 
CONVERTOR_COMPLETED; return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */ @@ -161,60 +161,70 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, * of the exponent or mantissa). */ static inline void -opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem, - unsigned char* partial_data, - ptrdiff_t start_position, size_t length, - unsigned char** user_buffer ) +opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_t *pElem, + size_t *COUNT, unsigned char **packed, + unsigned char **memory, size_t *SPACE) { char unused_byte = 0x7F, saved_data[16]; unsigned char temporary[16], *temporary_buffer = temporary; - unsigned char* user_data = *user_buffer + pElem->elem.disp; - size_t count_desc = 1; + unsigned char *user_data = *memory + pElem->elem.disp; size_t data_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + unsigned char *partial_data = *packed; + ptrdiff_t start_position = pConvertor->partial_length; + size_t length = data_length - start_position; + size_t count_desc = 1; + dt_elem_desc_t single_elem = { .elem = { .common = pElem->elem.common, .count = 1, .blocklen = 1, + .extent = data_length, /* advance by a full data element */ + .disp = 0 /* right where the pointer is */ } }; + if( *SPACE < length ) { + length = *SPACE; + } DO_DEBUG( opal_output( 0, "unpack partial data start %lu end %lu data_length %lu user %p\n" "\tbConverted %lu total_length %lu count %ld\n", - (unsigned long)start_position, (unsigned long)start_position + length, (unsigned long)data_length, (void*)*user_buffer, - (unsigned long)pConvertor->bConverted, (unsigned long)pConvertor->local_size, pConvertor->count ); ); - - /* Find a byte that is not used in the partial buffer */ + (unsigned long)start_position, (unsigned long)start_position + length, + (unsigned long)data_length, (void*)*memory, + (unsigned long)pConvertor->bConverted, + (unsigned long)pConvertor->local_size, pConvertor->count 
); ); + COMPUTE_CSUM( partial_data, length, pConvertor ); + + /* Find a byte value that is not used in the partial buffer. We use it as a marker + * to identify what has not been modified by the unpack call. */ find_unused_byte: - for(size_t i = 0; i < length; i++ ) { + for (size_t i = 0; i < length; i++ ) { if( unused_byte == partial_data[i] ) { unused_byte--; goto find_unused_byte; } } - /* Copy and fill the rest of the buffer with the unused byte */ + /* Prepare a full element of the predefined type, by populating an entire type + * with the unused byte and then put the partial data at the right position. */ memset( temporary, unused_byte, data_length ); MEMCPY( temporary + start_position, partial_data, length ); + /* Save the original content of the user memory */ #if OPAL_CUDA_SUPPORT /* In the case where the data is being unpacked from device memory, need to - * use the special host to device memory copy. Note this code path was only - * seen on large receives of noncontiguous data via buffered sends. */ + * use the special host to device memory copy. */ pConvertor->cbmemcpy(saved_data, user_data, data_length, pConvertor ); #else - /* Save the content of the user memory */ MEMCPY( saved_data, user_data, data_length ); #endif /* Then unpack the data into the user memory */ - UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - temporary_buffer, *user_buffer, data_length ); + UNPACK_PREDEFINED_DATATYPE(pConvertor, &single_elem, count_desc, temporary_buffer, user_data, + data_length); - /* reload the length as it is reset by the macro */ + /* reload the length and user buffer as they have been updated by the macro */ data_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + user_data = *memory + pElem->elem.disp; - /* For every occurence of the unused byte move data from the saved - * buffer back into the user memory. - */ + /* Rebuild the data by pulling back the unmodified bytes from the original + * content in the user memory. 
*/ #if OPAL_CUDA_SUPPORT /* Need to copy the modified user_data again so we can see which - * bytes need to be converted back to their original values. Note - * this code path was only seen on large receives of noncontiguous - * data via buffered sends. */ + * bytes need to be converted back to their original values. */ { char resaved_data[16]; pConvertor->cbmemcpy(resaved_data, user_data, data_length, pConvertor ); @@ -229,6 +239,16 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle user_data[i] = saved_data[i]; } #endif + pConvertor->partial_length = (pConvertor->partial_length + length) % data_length; + *SPACE -= length; + *packed += length; + if (0 == pConvertor->partial_length) { + (*COUNT)--; /* we have enough to complete one full predefined type */ + *memory += data_length; + if (0 == (*COUNT % pElem->elem.blocklen)) { + *memory += pElem->elem.extent - (pElem->elem.blocklen * data_length); + } + } } /* The pack/unpack functions need a cleanup. I have to create a proper interface to access @@ -257,8 +277,8 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, size_t iov_len_local; uint32_t iov_count; - DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", - (void*)pConvertor, (void*)iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, iov[%u] = {%p, %lu} )\n", + (void*)pConvertor, *out_size, (void*)iov[0].iov_base, (unsigned long)iov[0].iov_len ); ); description = pConvertor->use_desc->desc; @@ -283,28 +303,26 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; - if( 0 != pConvertor->partial_length ) { - size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - size_t missing_length = element_length - pConvertor->partial_length; - - assert( 
pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ); - COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); - opal_unpack_partial_datatype( pConvertor, pElem, - iov_ptr, - pConvertor->partial_length, (size_t)(element_length - pConvertor->partial_length), - &conv_ptr ); - --count_desc; - if( 0 == count_desc ) { - conv_ptr = pConvertor->pBaseBuf + pStack->disp; - pos_desc++; /* advance to the next data */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + /* Deal with all types of partial predefined datatype unpacking, including when + * unpacking a partial predefined element and when unpacking a part smaller than + * the blocklen. + */ + if (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) { + if (0 != pConvertor->partial_length) { /* partial predefined element */ + assert( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ); + opal_unpack_partial_predefined( pConvertor, pElem, &count_desc, + &iov_ptr, &conv_ptr, &iov_len_local ); + if (0 == count_desc) { /* the end of the vector ? 
*/ + assert( 0 == pConvertor->partial_length ); + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS(description, pos_desc, pElem, count_desc); + goto next_vector; + } + if( 0 == iov_len_local ) + goto complete_loop; } - iov_ptr += missing_length; - iov_len_local -= missing_length; - pConvertor->partial_length = 0; /* nothing more inside */ - } - if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - if( ((size_t)pElem->elem.count * pElem->elem.blocklen) != count_desc ) { + if (((size_t) pElem->elem.count * pElem->elem.blocklen) != count_desc) { /* we have a partial (less than blocklen) basic datatype */ int rc = UNPACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc, iov_ptr, conv_ptr, iov_len_local ); @@ -318,8 +336,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, } } - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + while (1) { + next_vector: + while (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) { /* we have a basic datatype (working on full blocks) */ UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, iov_ptr, conv_ptr, iov_len_local ); @@ -380,20 +399,13 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, } complete_loop: assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { + if( (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) && (0 != iov_len_local) ) { unsigned char* temp = conv_ptr; /* We have some partial data here. Let's copy it into the convertor * and keep it hot until the next round. 
*/ assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); - COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); - - opal_unpack_partial_datatype( pConvertor, pElem, - iov_ptr, 0, iov_len_local, - &temp ); - - pConvertor->partial_length = iov_len_local; - iov_len_local = 0; + opal_unpack_partial_predefined(pConvertor, pElem, &count_desc, &iov_ptr, &temp, &iov_len_local); } iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ @@ -426,10 +438,88 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, * 1 if everything went fine and the data was completly converted * -1 something wrong occurs. */ -int32_t -opal_unpack_general_function( opal_convertor_t* pConvertor, - struct iovec* iov, uint32_t* out_size, - size_t* max_data ) +static inline void +unpack_predefined_heterogeneous(opal_convertor_t *CONVERTOR, + const dt_elem_desc_t *ELEM, size_t *COUNT, + unsigned char **memory, + unsigned char **packed, size_t *SPACE) +{ + const opal_convertor_master_t *master = (CONVERTOR)->master; + const ddt_elem_desc_t *_elem = &((ELEM)->elem); + size_t cando_count = *(COUNT), do_now_bytes; + size_t local_elem_size = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t remote_elem_size = master->remote_sizes[_elem->common.type]; + size_t blocklen_bytes = remote_elem_size; + unsigned char *_memory = (*memory) + _elem->disp; + unsigned char *_packed = *packed; + ptrdiff_t advance = 0; + + assert(0 == (cando_count % _elem->blocklen)); /* no partials here */ + assert(*(COUNT) <= ((size_t) _elem->count * _elem->blocklen)); + + if ((remote_elem_size * cando_count) > *(SPACE)) + cando_count = (*SPACE) / blocklen_bytes; + + /* preemptively update the number of COUNT we will return. 
*/ + *(COUNT) -= cando_count; + + if (_elem->blocklen == 1) { + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _packed, *SPACE, remote_elem_size, + _memory, *SPACE, _elem->extent, + &advance); + _memory += cando_count * _elem->extent; + _packed += cando_count * remote_elem_size; /* the packed stream is read with the remote element size as stride */ + goto update_and_return; + } + + if ((1 < _elem->count) && (_elem->blocklen <= cando_count)) { + blocklen_bytes = remote_elem_size * _elem->blocklen; + + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void *) _packed, (void *) _memory, (unsigned long) blocklen_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, _elem->blocklen, + _packed, *SPACE, remote_elem_size, + _memory, *SPACE, local_elem_size, + &advance); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); + } + + /** + * As an epilog do anything left from the last blocklen. + */ + if (0 != cando_count) { + assert((cando_count < _elem->blocklen) + || ((1 == _elem->count) && (cando_count <= _elem->blocklen))); + do_now_bytes = cando_count * remote_elem_size; + OPAL_DATATYPE_SAFEGUARD_POINTER(_memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count); + DO_DEBUG(opal_output(0, "unpack 3. 
memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void *) _packed, (void *) _memory, (unsigned long) do_now_bytes, + (unsigned long) (*(SPACE) - (_packed - *(packed))));); + master->pFunctions[_elem->common.type](CONVERTOR, cando_count, + _packed, *SPACE, remote_elem_size, + _memory, *SPACE, local_elem_size, + &advance); + _memory += cando_count * local_elem_size; /* user memory is written with the local element size as stride, not the remote one */ + _packed += do_now_bytes; + } + +update_and_return: + *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; +} + +int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec *iov, + uint32_t *out_size, size_t *max_data) { dt_stack_t* pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ @@ -443,10 +533,6 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, uint32_t iov_count; size_t iov_len_local; - const opal_convertor_master_t* master = pConvertor->master; - ptrdiff_t advance; /* number of bytes that we should advance the buffer */ - size_t rc; - DO_DEBUG( opal_output( 0, "opal_convertor_general_unpack( %p, {%p, %lu}, %d )\n", (void*)pConvertor, (void*)iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); @@ -477,45 +563,48 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ type = description[pos_desc].elem.common.type; - OPAL_DATATYPE_SAFEGUARD_POINTER( conv_ptr + pElem->elem.disp, pData->size, pConvertor->pBaseBuf, - pData, pConvertor->count ); - DO_DEBUG( opal_output( 0, "unpack (%p, %ld) -> (%p:%ld, %" PRIsize_t ", %ld) type %s\n", - (void*)iov_ptr, iov_len_local, - (void*)pConvertor->pBaseBuf, conv_ptr + pElem->elem.disp - pConvertor->pBaseBuf, - count_desc, description[pos_desc].elem.extent, - opal_datatype_basicDatatypes[type]->name ); ); - rc = 
master->pFunctions[type]( pConvertor, count_desc, - iov_ptr, iov_len_local, 
opal_datatype_basicDatatypes[type]->size, - conv_ptr + pElem->elem.disp, - (pConvertor->pDesc->ub - pConvertor->pDesc->lb) * pConvertor->count, - description[pos_desc].elem.extent, &advance ); - iov_len_local -= advance; /* decrease the available space in the buffer */ - iov_ptr += advance; /* increase the pointer to the buffer */ - count_desc -= rc; /* compute leftovers */ - if( 0 == count_desc ) { /* completed */ + OPAL_DATATYPE_SAFEGUARD_POINTER(conv_ptr + pElem->elem.disp, pData->size, + pConvertor->pBaseBuf, pData, pConvertor->count); + DO_DEBUG(opal_output(0, + "unpack (%p, %ld) -> (%p:%ld, %" PRIsize_t ", %ld) type %s\n", + (void *) iov_ptr, iov_len_local, (void *) pConvertor->pBaseBuf, + conv_ptr + pElem->elem.disp - pConvertor->pBaseBuf, count_desc, + description[pos_desc].elem.extent, + opal_datatype_basicDatatypes[type]->name);); + unpack_predefined_heterogeneous(pConvertor, pElem, &count_desc, &conv_ptr, &iov_ptr, + &iov_len_local); +#if 0 + rc = master->pFunctions[type](pConvertor, count_desc, iov_ptr, iov_len_local, + opal_datatype_basicDatatypes[type]->size, + conv_ptr + pElem->elem.disp, + (pConvertor->pDesc->ub - pConvertor->pDesc->lb) + * pConvertor->count, + description[pos_desc].elem.extent, &advance); + iov_len_local -= advance; /* decrease the available space in the buffer */ + iov_ptr += advance; /* increase the pointer to the buffer */ + count_desc -= rc; /* compute leftovers */ +#endif + if (0 == count_desc) { /* completed */ conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); if( 0 == iov_len_local ) goto complete_loop; /* escape if we're done */ continue; } +#if 0 conv_ptr += rc * description[pos_desc].elem.extent; - assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - assert( 0 == iov_len_local ); - if( 0 != iov_len_local ) { - unsigned char* temp = conv_ptr; +#endif + assert(pElem->elem.common.type < 
OPAL_DATATYPE_MAX_PREDEFINED); + assert(0 == iov_len_local); + if (0 != iov_len_local) { + unsigned char *temp = conv_ptr; /* We have some partial data here. Let's copy it into the convertor * and keep it hot until the next round. */ - assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); - COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); - - opal_unpack_partial_datatype( pConvertor, pElem, - iov_ptr, 0, iov_len_local, - &temp ); - - pConvertor->partial_length = iov_len_local; - iov_len_local = 0; + assert(iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + opal_unpack_partial_predefined(pConvertor, pElem, &count_desc, &iov_ptr, + &temp, &iov_len_local); + assert( 0 == iov_len_local ); } goto complete_loop; } diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index 79068729a14..33db8378829 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -27,9 +27,10 @@ #endif /** - * This function deals only with partial elements. The COUNT points however to the whole leftover count, - * but this function is only expected to operate on an amount less than blength, that would allow the rest - * of the pack process to handle only entire blength blocks (plus the left over). + * This function deals only with partial elements. The COUNT points however to + * the whole leftover count, but this function is only expected to operate on + * an amount less than blength, that would allow the rest of the pack process + * to handle only entire blength blocks (plus the left over). * * Return 1 if we are now aligned on a block, 0 otherwise. */ @@ -49,6 +50,8 @@ unpack_partial_blocklen( opal_convertor_t* CONVERTOR, assert( *(COUNT) <= ((size_t)(_elem->count * _elem->blocklen)) ); + if( (*SPACE) < do_now_bytes ) /* Can we do anything ? */ + return 0; /** * First check if we already did something on this element ? 
The COUNT is the number * of remaining predefined types in the current elem, not how many predefined types @@ -67,8 +70,9 @@ unpack_partial_blocklen( opal_convertor_t* CONVERTOR, OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + DO_DEBUG( opal_output( 0, "unpack memcpy( %p [%ld], %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, _memory - CONVERTOR->pBaseBuf, + (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); *(memory) += (ptrdiff_t)do_now_bytes; if( do_now == left_in_block ) /* compensate if completed a blocklen */ @@ -100,15 +104,17 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, if( (blocklen_bytes * cando_count) > *(SPACE) ) cando_count = (*SPACE) / blocklen_bytes; - /* premptively update the number of COUNT we will return. */ + /* preemptively update the number of COUNT we will return. 
*/ *(COUNT) -= cando_count; - if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ - for(; cando_count > 0; cando_count--) { + if (1 == _elem->blocklen) { /* Do as many full blocklen as possible */ + for (; cando_count > 0; cando_count--) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n", - (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + DO_DEBUG( opal_output( 0, "unpack memcpy( %p [%ld], %p [%ld], %lu ) => space %lu [blen = 1]\n", + (void*)_memory, _memory - CONVERTOR->pBaseBuf, + (void*)_packed, _packed - *packed, + (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); _packed += blocklen_bytes; _memory += _elem->extent; @@ -122,8 +128,10 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, do { /* Do as many full blocklen as possible */ OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + DO_DEBUG( opal_output( 0, "unpack 2. 
memcpy( %p [%ld], %p [%ld], %lu ) => space %lu\n", + (void*)_memory, _memory - CONVERTOR->pBaseBuf, + (void*)_packed, _packed - *packed, + (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); _packed += blocklen_bytes; _memory += _elem->extent; @@ -140,8 +148,10 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", - (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p [%ld], %p [%ld], %lu ) => space %lu [epilog]\n", + (void*)_memory, _memory - CONVERTOR->pBaseBuf, + (void*)_packed, _packed - *packed, + (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); _memory += do_now_bytes; _packed += do_now_bytes; diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 4366724a523..dea204bb1f9 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -15,7 +15,7 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data + MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw ddt_raw2 unpack_ooo ddt_pack external32 large_data partial MPI_CHECKS = to_self endif TESTS = opal_datatype_test unpack_hetero $(MPI_TESTS) @@ -96,5 +96,11 @@ unpack_hetero_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) unpack_hetero_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +partial_SOURCES = partial.c +partial_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) +partial_LDADD = \ + 
$(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + distclean: rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile diff --git a/test/datatype/external32.c b/test/datatype/external32.c index d09938510ba..9d47e60950a 100644 --- a/test/datatype/external32.c +++ b/test/datatype/external32.c @@ -33,13 +33,29 @@ int check_vector( void* send_buffer, void* packed, static int pack_unpack_datatype( void* send_data, ompi_datatype_t *datatype, int count, void* recv_data, checker_t validator, void *validator_arg ); -static void dump_hex(void* what, size_t length); - -static void dump_hex(void* what, size_t length) +static void +dump_hex(const char* msg, const void* vbuf, int nbytes, + int start_from, int stop_at, int vals_per_line) { - size_t i; - for( i = 0; i < length; i++ ) { - printf("%02x", (unsigned int)(((unsigned char*)what)[i])); + const char* buf = (const char*)vbuf; + + if( -1 == stop_at ) stop_at = nbytes; + + for (int i = (start_from / vals_per_line) * vals_per_line; i < nbytes; ++i) { + if( i >= stop_at ) return; + if (0 == (i % vals_per_line)) { + if( NULL == msg) printf("\n"); + else printf("\n%s", msg); + } else { + if (i % 4 == 0) { + printf(" "); + } + } + printf(" "); + if( i < start_from ) + printf(" "); + else + printf("%02x", *((unsigned char *)(buf + i))); } } @@ -131,7 +147,8 @@ static int pack_unpack_datatype( void* send_data, ompi_datatype_t *datatype, int return -1; } - printf("packed %ld bytes into a %ld bytes buffer ", position, buffer_size); dump_hex(buffer, position); printf("\n"); + printf("packed %ld bytes into a %ld bytes buffer ", position, buffer_size); + dump_hex(NULL, buffer, position, 0, -1, 24); printf("\n"); position = 0; error = ompi_datatype_unpack_external("external32", buffer, buffer_size, &position, @@ -155,12 +172,14 @@ int main(int argc, char *argv[]) if( verbose ) { printf("send data %08x %08x \n", send_data[0], send_data[1]); - printf("data "); 
dump_hex(&send_data, sizeof(int32_t) * 2); printf("\n"); + printf("data "); + dump_hex(NULL, &send_data, sizeof(int32_t) * 2, 0, -1, 24); printf("\n"); } (void)pack_unpack_datatype( send_data, &ompi_mpi_int32_t.dt, 2, recv_data, check_contiguous, (void*)&ompi_mpi_int32_t.dt ); if( verbose ) { - printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 2); printf("\n"); + printf("recv "); + dump_hex(NULL, &recv_data, sizeof(int32_t) * 2, 0, -1, 24); printf("\n"); printf("recv data %08x %08x \n", recv_data[0], recv_data[1]); } if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) { @@ -175,12 +194,14 @@ int main(int argc, char *argv[]) if( verbose ) { printf("send data %08x %08x \n", send_data[0], send_data[1]); - printf("data "); dump_hex(&send_data, sizeof(int16_t) * 2); printf("\n"); + printf("data "); + dump_hex(NULL, &send_data, sizeof(int16_t) * 2, 0, -1, 24); printf("\n"); } (void)pack_unpack_datatype( send_data, &ompi_mpi_int16_t.dt, 2, recv_data, check_contiguous, (void*)&ompi_mpi_int16_t.dt ); if( verbose ) { - printf("recv "); dump_hex(&recv_data, sizeof(int16_t) * 2); printf("\n"); + printf("recv "); + dump_hex(NULL, &recv_data, sizeof(int16_t) * 2, 0, -1, 24); printf("\n"); printf("recv data %08x %08x \n", recv_data[0], recv_data[1]); } if( (send_data[0] != recv_data[0]) || (send_data[1] != recv_data[1]) ) { @@ -208,16 +229,18 @@ int main(int argc, char *argv[]) if( verbose ) { printf("send data %08x %x08x %08x \n", send_data[0], send_data[1], send_data[2]); - printf("data "); dump_hex(&send_data, sizeof(int32_t) * 3); printf("\n"); + printf("data "); dump_hex(NULL, &send_data, sizeof(int32_t) * 3, 0, -1, 24); printf("\n"); } (void)pack_unpack_datatype( send_data, ddt, 1, recv_data, check_vector, (void*)&ompi_mpi_int32_t.dt ); if( verbose ) { - printf("recv "); dump_hex(&recv_data, sizeof(int32_t) * 3); printf("\n"); + printf("recv "); dump_hex(NULL, &recv_data, sizeof(int32_t) * 3, 0, -1, 24); printf("\n"); printf("recv data %08x %08x 
%08x \n", recv_data[0], recv_data[1], recv_data[2]); } ompi_datatype_destroy(&ddt); if( (send_data[0] != recv_data[0]) || (send_data[2] != recv_data[2]) ) { printf("Error during external32 pack/unack for vector types (MPI_INT32_T)\n"); + printf("[0]: %d ? %d | [2]: %d ? %d ([1]: %d ? %d)\n", send_data[0], recv_data[0], + send_data[2], recv_data[2], send_data[1], recv_data[1]); exit(-1); } } diff --git a/test/datatype/partial.c b/test/datatype/partial.c new file mode 100644 index 00000000000..c064db7193d --- /dev/null +++ b/test/datatype/partial.c @@ -0,0 +1,171 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018-2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2018 Triad National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "opal/datatype/opal_convertor.h" +#include "ompi/datatype/ompi_datatype.h" +#include "opal/datatype/opal_datatype_checksum.h" +#include "opal/runtime/opal.h" + +#include +#include +#include + +#define TYPE_COUNT 3 +#define TYPE_BLEN 2 +#define TYPE_STRIDE 4 + +#define CONT_COUNT 2 + +#define COUNT 3 + +#define CHUNK ((TYPE_BLEN*8)*2-4) + +/** + * Print how many elements on both sides of ptr. 
+ */
+static void show_neighborhood(double* ptr, int how_many, bool show_hex)
+{
+    int i;
+
+    printf("%12p: ", (void*)ptr);
+    for( i = -how_many; i < how_many; i++ ) {  /* window is ptr[-how_many] .. ptr[how_many - 1], no bounds check */
+        if( 0 == i ) {
+            printf(" <%g> ", ptr[i]);  /* the element at ptr itself is shown between <> */
+        } else {
+            printf(" %g ", ptr[i]);
+        }
+    }
+    if( show_hex ) {
+        unsigned char* cptr = (unsigned char*)ptr;  /* unsigned: a signed char would sign-extend under %02x */
+        printf("\n : ");
+        for( i = -how_many; i < how_many; i++ ) {
+            if( 0 == i ) printf(" <");
+            for( int j = 0; j < (int)sizeof(double); j++ ) {
+                printf("%02x", (unsigned int)cptr[i * (int)sizeof(double) + j]);  /* signed index: i may be negative */
+            }
+            if( 0 == i ) printf("> ");
+            else printf(" ");
+        }
+    }
+    printf("\n\n");
+}
+
+/**
+ * -------G---[---][---] OPAL_LOOP_S 19 times the next 2 elements extent 18432
+ * -cC---P-DB-[---][---] OPAL_FLOAT8 count 72 disp 0x80 (128) blen 16 extent 256 (size 9216)
+ * -------G---[---][---] OPAL_LOOP_E prev 2 elements first elem displacement 128 size of data 9216
+ * -------G---[---][---] OPAL_LOOP_E prev 3 elements first elem displacement 128 size of data 175104
+ */
+
+int main( int argc, char* argv[] )
+{
+    opal_datatype_t* vector;
+    ompi_datatype_t* base;
+    uint32_t iov_count;
+    size_t max_data, size, length;
+    struct iovec iov[2];
+    opal_convertor_t* convertor;
+    ptrdiff_t extent, base_extent;
+    double *array, *packed;
+    char* bpacked;
+    int i, j;
+
+    opal_init_util (NULL, NULL);
+    ompi_datatype_init();
+
+    ompi_datatype_create_vector(TYPE_COUNT, TYPE_BLEN, TYPE_STRIDE, MPI_DOUBLE, &base);
+    ompi_datatype_create_contiguous(CONT_COUNT, base, &vector);
+
+    opal_datatype_commit( vector );
+
+    ompi_datatype_dump(vector);
+
+    opal_datatype_type_size(vector, &size);
+    opal_datatype_type_extent(vector, &extent);
+    opal_datatype_type_extent(base, &base_extent);
+
+    array = (double*)malloc( extent * COUNT );
+    packed = (double*)malloc( size * COUNT );
+    bpacked = (char*)packed;
+
+    /**
+     * Fill the packed source with (i % TYPE_BLEN) and poison the sparse destination.
+     */
+    for( i = 0; i < (TYPE_BLEN * TYPE_COUNT * CONT_COUNT * COUNT); i++ ) {
+        packed[i] = (double)(i % TYPE_BLEN);
+    }
+    memset(array, TYPE_BLEN + 1, extent * COUNT);  /* memset(dst, byte value, length): cover the whole destination */
+
+    /**
+     * Unpack the packed buffer into the sparse array, at most CHUNK bytes per
+     * call, and after every call re-check all the elements unpacked so far.
+     */
+    convertor = opal_convertor_create( opal_local_arch, 0 );
+    opal_convertor_prepare_for_recv( convertor, vector, COUNT, array );
+
+    for( length = 0; length < (size * COUNT); ) {
+        iov[0].iov_base = bpacked + length;
+        iov[0].iov_len = CHUNK;
+        max_data = iov[0].iov_len;
+
+        iov_count = 1;
+        opal_convertor_unpack( convertor, iov, &iov_count, &max_data );
+        length += max_data;  /* max_data is added as the number of bytes consumed by this call */
+
+        int idx = 0, checked = 0;
+        for( int m = 0; m < COUNT; m++ ) {
+            char* mptr = (char*)array + m * extent;
+            for( int k = 0; k < CONT_COUNT; k++ ) {
+                char* kptr = mptr + k * base_extent;
+                for( j = 0; j < TYPE_COUNT; j++ ) {
+                    double* jarray = (double*)kptr + j * TYPE_STRIDE;
+                    for( i = 0; i < TYPE_BLEN; i++ ) {
+                        checked += sizeof(double);
+                        if( (size_t)checked > length )  /* this element is not completely unpacked yet */
+                            goto next_iteration;
+                        if( jarray[i] != (double)(idx % TYPE_BLEN) ) {
+                            fprintf(stderr, "\n\n\nError during check for the %d element, length %" PRIsize_t " (chunk %d)\n",
+                                    idx, length, CHUNK);
+                            fprintf(stderr, "Error at position %d [%d:%d:%d:%d] found %g expected %g\n\n\n",
+                                    idx, m, k, j, i, jarray[i], (double)(idx % TYPE_BLEN));
+                            show_neighborhood(jarray + i, 4, true);
+                            exit(-1);
+                        }
+                        idx++;
+                    }
+                }
+            }
+        }
+next_iteration:
+        /* nothing special to do here, just move to the next conversion */
+        continue;
+    }
+
+    OBJ_RELEASE(convertor);
+
+    /**
+     * The datatype is not useful anymore
+     */
+    OBJ_RELEASE(vector);
+
+    free(array);
+    free(packed);
+
+    /* clean up all data allocations */
+    ompi_datatype_finalize();
+    opal_finalize_util ();
+
+    return 0;
+}
diff --git a/test/datatype/unpack_ooo.c b/test/datatype/unpack_ooo.c
index 58ef8a95774..febc78bc924 100644
--- a/test/datatype/unpack_ooo.c
+++ b/test/datatype/unpack_ooo.c
@@ -27,6 +27,7 @@
#define N 331 uint32_t remote_arch = 0xffffffff; +bool report_all_errors = true; struct foo_t { int i[3]; @@ -38,26 +39,28 @@ struct pfoo_t { double d[2]; } pfoo = {0}, *pbar = NULL; -static void print_hex(void* ptr, int count, int space) +static void print_hex(void* ptr, int count, char* prolog, char* epilog) /* hex-dump count bytes at ptr to stderr; prolog is written before the bytes and epilog after them, NULL skips either one */ { - for( int i = 0; i < count; i++ ) { + if ( NULL != prolog) fprintf(stderr, "%s", prolog); + for ( int i = 0; i < count; i++ ) { fprintf(stderr, "%02x", (unsigned int)(((unsigned char*)ptr)[i])); } - if(space) fprintf(stderr, " "); + if (NULL != epilog) fprintf(stderr, "%s", epilog); } -static void print_bar_pbar(struct foo_t* bar, struct pfoo_t* pbar) +static void print_bar_pbar(struct foo_t* _bar, struct pfoo_t* _pbar) /* dump the raw bytes of one unpacked (_bar) and one packed (_pbar) record; the bracketed i[1] and d[1] fields have no counterpart in _pbar */ { - print_hex(&bar->i[0], sizeof(int), 1); - print_hex(&bar->i[2], sizeof(int), 1); - print_hex(&bar->d[0], sizeof(double), 1); - print_hex(&bar->d[2], sizeof(double), 1); - fprintf(stderr, "\n"); - print_hex(&pbar->i[0], sizeof(int), 1); - print_hex(&pbar->i[1], sizeof(int), 1); - print_hex(&pbar->d[0], sizeof(double), 1); - print_hex(&pbar->d[1], sizeof(double), 1); - fprintf(stderr, "\n"); + print_hex(&_bar->i[0], sizeof(int), NULL, " "); + print_hex(&_bar->i[1], sizeof(int), "[", "] "); + print_hex(&_bar->i[2], sizeof(int), NULL, " "); + print_hex(&_bar->d[0], sizeof(double), NULL, " "); + print_hex(&_bar->d[1], sizeof(double), "[", "] "); + print_hex(&_bar->d[2], sizeof(double), NULL, "\n"); + + print_hex(&_pbar->i[0], sizeof(int), NULL, " "); + print_hex(&_pbar->i[1], sizeof(int), NULL, " "); + print_hex(&_pbar->d[0], sizeof(double), NULL, " "); + print_hex(&_pbar->d[1], sizeof(double), NULL, "\n"); } static void print_stack(opal_convertor_t* conv) @@ -72,7 +75,7 @@ static void print_stack(opal_convertor_t* conv) printf("\n"); } -static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { +static int testcase(ompi_datatype_t * newtype, size_t arr[][2]) { int i, j, errors = 0; struct iovec a; unsigned int iov_count; @@ -99,7 
+102,7 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { return OMPI_ERROR; } - for (i=0; arr[i][0] != 0; i++) { + for ( i = 0; 0 != arr[i][0]; i++) { /* add some garbage before and after the source data */ a.iov_base = malloc(arr[i][0]+2048); if (NULL == a.iov_base) { @@ -129,11 +132,36 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { bar[j].d[0] != pbar[j].d[0] || bar[j].d[1] != 0.0 || bar[j].d[2] != pbar[j].d[1]) { - if(0 == errors) { + if(0 == errors || report_all_errors) { + ptrdiff_t displ; + char* error_location = "in gaps"; + if (bar[j].i[0] != pbar[j].i[0]) { + displ = (char*)&bar[j].i[0] - (char*)&bar[0]; + error_location = "i[0]"; + } else if (bar[j].i[2] != pbar[j].i[1]) { + displ = (char*)&bar[j].i[1] - (char*)&bar[0]; + error_location = "i[2]"; + } else if (bar[j].d[0] != pbar[j].d[0]) { + displ = (char*)&bar[j].d[0] - (char*)&bar[0]; + error_location = "d[0]"; + } else if (bar[j].d[2] != pbar[j].d[1]) { + displ = (char*)&bar[j].d[1] - (char*)&bar[0]; + error_location = "d[2]"; + } else { + displ = (char*)&bar[j] - (char*)&bar[0]; + } + for (i = 0; 0 != arr[i][0]; i++) { + if( (displ >= arr[i][1]) && (displ <= (arr[i][1] + arr[i][0])) ) { + fprintf(stderr, "Problem encountered %li bytes into the %d unpack [%"PRIsize_t":%"PRIsize_t"]\n", + displ - arr[i][1], i, arr[i][1], arr[i][0]); + break; + } + } + (void)opal_datatype_dump(&newtype->super); - fprintf(stderr, "ERROR ! position=%d/%d, ptr = %p" + fprintf(stderr, "ERROR ! 
struct %d/%d in field %s, ptr = %p" " got (%d,%d,%d,%g,%g,%g) expected (%d,%d,%d,%g,%g,%g)\n", - j, N, (void*)&bar[j], + j, N, error_location, (void*)&bar[j], bar[j].i[0], bar[j].i[1], bar[j].i[2], @@ -147,6 +175,7 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { 0.0, pbar[j].d[1]); print_bar_pbar(&bar[j], &pbar[j]); + if( report_all_errors ) fprintf(stderr, "\n\n"); } errors++; } @@ -198,13 +227,13 @@ static int unpack_ooo(void) */ size_t test1[9][2] = { {992, 0}, - {1325, 992}, - {992, 2317}, - {992, 3309}, - {992, 4301}, - {992, 5293}, - {992, 6285}, - {667, 7277}, + {1325, 0 + 992}, + {992, 992 + 1325 /* = 2317 */}, + {992, 2317 + 992 /* = 3309 */}, + {992, 3309 + 992 /* = 4301 */}, + {992, 4301 + 992 /* = 5293 */}, + {992, 5293 + 992 /* = 6285 */}, + {667, 6285 + 992 /* = 7277 */}, {0, -1}, };