diff --git a/cmake/SetupChaiOptions.cmake b/cmake/SetupChaiOptions.cmake index e2d65fa0..4e1e7c6b 100644 --- a/cmake/SetupChaiOptions.cmake +++ b/cmake/SetupChaiOptions.cmake @@ -13,12 +13,14 @@ option(CHAI_DISABLE_RM "Make ManagedArray a thin wrapper" Off) mark_as_advanced(CHAI_DISABLE_RM) option(CHAI_ENABLE_UM "Use CUDA unified (managed) memory" Off) +option(CHAI_THIN_GPU_ALLOCATE "Single memory space model" Off) option(CHAI_ENABLE_PINNED "Use pinned host memory" Off) option(CHAI_ENABLE_RAJA_PLUGIN "Build plugin to set RAJA execution spaces" Off) option(CHAI_ENABLE_GPU_ERROR_CHECKING "Enable GPU error checking" On) option(CHAI_ENABLE_MANAGED_PTR "Enable managed_ptr" On) option(CHAI_DEBUG "Enable Debug Logging." Off) option(CHAI_ENABLE_RAJA_NESTED_TEST "Enable raja-chai-nested-tests, which fails to build on Debug CUDA builds." On) +option(CHAI_ENABLE_MANAGED_PTR_ON_GPU "Enable managed_ptr on GPU" On) option(CHAI_ENABLE_TESTS "Enable CHAI tests" On) option(CHAI_ENABLE_BENCHMARKS "Enable benchmarks" On) @@ -35,6 +37,10 @@ option(CHAI_ENABLE_COPY_HEADERS "Enable CHAI copy headers" Off) set(ENABLE_CUDA Off CACHE BOOL "Enable CUDA") -if (CHAI_ENABLE_UM AND NOT ENABLE_CUDA) - message(FATAL_ERROR "Option CHAI_ENABLE_UM requires ENABLE_CUDA") +if (CHAI_ENABLE_UM AND NOT ENABLE_CUDA AND NOT CHAI_THIN_GPU_ALLOCATE) + message(FATAL_ERROR "Option CHAI_ENABLE_UM requires ENABLE_CUDA or CHAI_THIN_GPU_ALLOCATE") +endif() + +if (CHAI_THIN_GPU_ALLOCATE AND NOT CHAI_DISABLE_RM) + message(FATAL_ERROR "Option CHAI_THIN_GPU_ALLOCATE requires CHAI_DISABLE_RM") endif() diff --git a/src/chai/ArrayManager.cpp b/src/chai/ArrayManager.cpp index b3290d2d..49f6447b 100644 --- a/src/chai/ArrayManager.cpp +++ b/src/chai/ArrayManager.cpp @@ -9,8 +9,10 @@ #include "chai/config.hpp" #if defined(CHAI_ENABLE_CUDA) +#if !defined(CHAI_THIN_GPU_ALLOCATE) #include "cuda_runtime_api.h" #endif +#endif #include "umpire/ResourceManager.hpp" @@ -172,6 +174,12 @@ void ArrayManager::setExecutionSpace(ExecutionSpace space) m_synced_since_last_kernel = false; } +#if defined(CHAI_THIN_GPU_ALLOCATE) + if (chai::CPU == space) { + syncIfNeeded(); + } +#endif + m_current_execution_space = space; } @@ -226,6 +234,34 @@ void ArrayManager::resetTouch(PointerRecord* pointer_record) } } + +/* Not all GPU platform runtimes (notably HIP), will give you asynchronous copies to the device by default, so we leverage + * umpire's API for asynchronous copies using camp resources in this method, based off of the CHAI destination space + * */ +static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager & manager, ExecutionSpace dst_space, ExecutionSpace src_space) { + +#ifdef CHAI_ENABLE_CUDA + camp::resources::Resource device_resource(camp::resources::Cuda::get_default()); +#elif defined(CHAI_ENABLE_HIP) + camp::resources::Resource device_resource(camp::resources::Hip::get_default()); +#else + camp::resources::Resource device_resource(camp::resources::Host::get_default()); +#endif + + camp::resources::Resource host_resource(camp::resources::Host::get_default()); + if (dst_space == GPU || src_space == GPU) { + // Do the copy using the device resource + manager.copy(dst_pointer, src_pointer, device_resource); + } else { + // Do the copy using the host resource + manager.copy(dst_pointer, src_pointer, host_resource); + } + // Ensure device to host copies are synchronous + if (dst_space == CPU && src_space == GPU) { + device_resource.wait(); + } +} + void ArrayManager::move(PointerRecord* record, ExecutionSpace space) { if (space == NONE) { @@ -253,7 +289,9 @@ void ArrayManager::move(PointerRecord* record, ExecutionSpace space) } #endif - void* src_pointer = record->m_pointers[record->m_last_space]; + ExecutionSpace prev_space = record->m_last_space; + + void* src_pointer = record->m_pointers[prev_space]; void* dst_pointer = record->m_pointers[space]; if (!dst_pointer) { @@ -267,7 +305,7 @@ void ArrayManager::move(PointerRecord* record, ExecutionSpace space) } else if (dst_pointer != src_pointer) { // Exclude the copy if src and dst are the same (can happen for PINNED memory) { - m_resource_manager.copy(dst_pointer, src_pointer); + chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space); } callback(record, ACTION_MOVE, space); @@ -285,7 +323,6 @@ void ArrayManager::allocate( pointer_record->m_pointers[space] = alloc.allocate(size); callback(pointer_record, ACTION_ALLOC, space); - registerPointer(pointer_record, space); CHAI_LOG(Debug, "Allocated array at: " << pointer_record->m_pointers[space]); @@ -449,32 +486,32 @@ PointerRecord* ArrayManager::makeManaged(void* pointer, PointerRecord* ArrayManager::deepCopyRecord(PointerRecord const* record) { - PointerRecord* copy = new PointerRecord{}; + PointerRecord* new_record = new PointerRecord{}; const size_t size = record->m_size; - copy->m_size = size; - copy->m_user_callback = [] (const PointerRecord*, Action, ExecutionSpace) {}; + new_record->m_size = size; + new_record->m_user_callback = [] (const PointerRecord*, Action, ExecutionSpace) {}; const ExecutionSpace last_space = record->m_last_space; - copy->m_last_space = last_space; + new_record->m_last_space = last_space; for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { - copy->m_allocators[space] = record->m_allocators[space]; + new_record->m_allocators[space] = record->m_allocators[space]; } - allocate(copy, last_space); + allocate(new_record, last_space); for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { - copy->m_owned[space] = true; - copy->m_touched[space] = false; + new_record->m_owned[space] = true; + new_record->m_touched[space] = false; } - copy->m_touched[last_space] = true; + new_record->m_touched[last_space] = true; - void* dst_pointer = copy->m_pointers[last_space]; + void* dst_pointer = new_record->m_pointers[last_space]; void* src_pointer = record->m_pointers[last_space]; - m_resource_manager.copy(dst_pointer, src_pointer); + chai::copy(dst_pointer, src_pointer, m_resource_manager, last_space, last_space); - return copy; + return new_record; } std::unordered_map diff --git a/src/chai/ArrayManager.inl b/src/chai/ArrayManager.inl index 06a5253c..d1f0a728 100644 --- a/src/chai/ArrayManager.inl +++ b/src/chai/ArrayManager.inl @@ -17,8 +17,10 @@ #include "umpire/ResourceManager.hpp" #if defined(CHAI_ENABLE_UM) +#if !defined(CHAI_THIN_GPU_ALLOCATE) #include #endif +#endif namespace chai { diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp index ac6b709c..efed354b 100644 --- a/src/chai/ManagedArray.hpp +++ b/src/chai/ManagedArray.hpp @@ -81,7 +81,7 @@ class ManagedArray : public CHAICopyable /*! * \brief Default constructor creates a ManagedArray with no allocations. */ - CHAI_HOST_DEVICE ManagedArray( + ManagedArray( std::initializer_list spaces, std::initializer_list allocators); @@ -98,7 +98,7 @@ class ManagedArray : public CHAICopyable */ CHAI_HOST_DEVICE ManagedArray(size_t elems, ExecutionSpace space = get_default_space()); - CHAI_HOST_DEVICE ManagedArray( + ManagedArray( size_t elems, std::initializer_list spaces, std::initializer_list allocators, @@ -273,7 +273,7 @@ class ManagedArray : public CHAICopyable CHAI_HOST_DEVICE ManagedArray(T* data, ArrayManager* array_manager, - size_t m_elems, + size_t elems, PointerRecord* pointer_record); ManagedArray& operator=(ManagedArray const & other) = default; @@ -411,16 +411,15 @@ class ManagedArray : public CHAICopyable m_active_pointer = other.m_active_pointer; m_active_base_pointer = other.m_active_base_pointer; m_resource_manager = other.m_resource_manager; - m_elems = other.m_elems; + m_size = other.m_size; m_offset = other.m_offset; m_pointer_record = other.m_pointer_record; m_is_slice = other.m_is_slice; #ifndef CHAI_DISABLE_RM #if !defined(CHAI_DEVICE_COMPILE) - // if we can, ensure elems is based off the pointer_record size to protect against - // casting leading to incorrect size info in m_elems. + // if we can, ensure elems is based off the pointer_record size out of paranoia if (m_pointer_record != nullptr && !m_is_slice) { - m_elems = m_pointer_record->m_size / sizeof(T); + m_size = m_pointer_record->m_size; } #endif #endif @@ -444,7 +443,7 @@ class ManagedArray : public CHAICopyable typename std::enable_if::type = 0> CHAI_HOST bool initInner(size_t start = 0) { - for (size_t i = start; i < m_elems; ++i) { + for (size_t i = start; i < m_size/sizeof(T); ++i) { m_active_base_pointer[i] = nullptr; } return true; @@ -473,7 +472,7 @@ class ManagedArray : public CHAICopyable /*! * Number of elements in the ManagedArray. */ - mutable size_t m_elems = 0; + mutable size_t m_size = 0; mutable size_t m_offset = 0; /*! @@ -582,7 +581,7 @@ CHAI_INLINE CHAI_HOST_DEVICE ManagedArray ManagedArray::slice( size_t offs slice.m_active_base_pointer = m_active_base_pointer; slice.m_offset = offset + m_offset; slice.m_active_pointer = m_active_base_pointer + slice.m_offset; - slice.m_elems = elems; + slice.m_size = elems*sizeof(T); slice.m_is_slice = true; } return slice; diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 2f5f3445..ad97e76e 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -18,7 +18,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(): m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), - m_elems(0), + m_size(0), m_offset(0), m_pointer_record(nullptr), m_is_slice(false) @@ -31,12 +31,11 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(): template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray( +ManagedArray::ManagedArray( std::initializer_list spaces, std::initializer_list allocators): ManagedArray() { -#if !defined(CHAI_DEVICE_COMPILE) m_pointer_record = new PointerRecord(); int i = 0; for (int s = CPU; s < NUM_EXECUTION_SPACES; ++s) { @@ -46,7 +45,6 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray( for (const auto& space : spaces) { m_pointer_record->m_allocators[space] = allocators.begin()[i++].getId(); } -#endif } @@ -64,16 +62,14 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray( template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray( +ManagedArray::ManagedArray( size_t elems, std::initializer_list spaces, std::initializer_list allocators, ExecutionSpace space): ManagedArray(spaces, allocators) { -#if !defined(CHAI_DEVICE_COMPILE) this->allocate(elems, space); -#endif } template @@ -89,7 +85,7 @@ CHAI_HOST ManagedArray::ManagedArray(PointerRecord* record, ExecutionSpace sp m_active_pointer(static_cast(record->m_pointers[space])), m_active_base_pointer(static_cast(record->m_pointers[space])), m_resource_manager(nullptr), - m_elems(record->m_size/sizeof(T)), + m_size(record->m_size), m_offset(0), m_pointer_record(record), m_is_slice(false) @@ -107,16 +103,16 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other): m_active_pointer(other.m_active_pointer), m_active_base_pointer(other.m_active_base_pointer), m_resource_manager(other.m_resource_manager), - m_elems(other.m_elems), + m_size(other.m_size), m_offset(other.m_offset), m_pointer_record(other.m_pointer_record), m_is_slice(other.m_is_slice) { #if !defined(CHAI_DEVICE_COMPILE) - if (m_active_base_pointer || m_elems > 0 ) { - // we only update m_elems if we are not null and we have a pointer record + if (m_active_base_pointer || m_size > 0 ) { + // we only update m_size if we are not null and we have a pointer record if (m_pointer_record && !m_is_slice) { - m_elems = m_pointer_record->m_size/sizeof(T); + m_size = m_pointer_record->m_size; } move(m_resource_manager->getExecutionSpace()); } @@ -129,7 +125,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, ArrayManager* array_mana m_active_pointer(data), m_active_base_pointer(data), m_resource_manager(array_manager), - m_elems(elems), + m_size(elems*sizeof(T)), m_offset(0), m_pointer_record(pointer_record), m_is_slice(false) @@ -139,7 +135,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, ArrayManager* array_mana m_resource_manager = ArrayManager::getInstance(); } if (m_pointer_record == &ArrayManager::s_null_record || m_pointer_record==nullptr) { - m_pointer_record = m_resource_manager->makeManaged((void *) data, sizeof(T)*m_elems,ExecutionSpace(CPU),true); + m_pointer_record = m_resource_manager->makeManaged((void *) data, m_size,ExecutionSpace(CPU),true); } registerTouch(CPU); #endif @@ -166,8 +162,8 @@ CHAI_HOST void ManagedArray::allocate( } m_pointer_record->m_user_callback = cback; - m_elems = elems; - m_pointer_record->m_size = sizeof(T)*elems; + m_size = elems*sizeof(T); + m_pointer_record->m_size = m_size; if (space != NONE) { m_resource_manager->allocate(m_pointer_record, space); @@ -217,16 +213,16 @@ CHAI_HOST void ManagedArray::reallocate(size_t elems) { if(!m_is_slice) { if (elems > 0) { - if (m_elems == 0 && m_active_base_pointer == nullptr) { + if (m_size == 0 && m_active_base_pointer == nullptr) { return allocate(elems, CPU); } - CHAI_LOG(Debug, "Reallocating array of size " << m_elems << " with new size" << elems); + CHAI_LOG(Debug, "Reallocating array of size " << m_size << " bytes with new size" << elems*sizeof(T) << "bytes."); if (m_pointer_record == &ArrayManager::s_null_record) { - m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_elems*sizeof(T),CPU,true); + m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_size,CPU,true); } - size_t old_size = m_elems; + size_t old_size = m_size; - m_elems = elems; + m_size = elems*sizeof(T); m_active_base_pointer = static_cast(m_resource_manager->reallocate(m_active_base_pointer, elems, m_pointer_record)); @@ -235,13 +231,13 @@ CHAI_HOST void ManagedArray::reallocate(size_t elems) // if T is a CHAICopyable, then it is important to initialize all the new // ManagedArrays to nullptr at allocation, since it is extremely easy to // trigger a moveInnerImpl, which expects inner values to be initialized. - if (initInner(old_size)) { + if (initInner(old_size/sizeof(T))) { // if we are active on the GPU, we need to send any newly initialized inner members to the device - if (m_pointer_record->m_last_space == GPU && old_size < m_elems) { + if (m_pointer_record->m_last_space == GPU && old_size < m_size) { umpire::ResourceManager & umpire_rm = umpire::ResourceManager::getInstance(); - void *src = (T*)m_pointer_record->m_pointers[CPU] + old_size; - void *dst = (T*)m_pointer_record->m_pointers[GPU] + old_size; - umpire_rm.copy(dst,src,(m_elems-old_size)*sizeof(T)); + void *src = (void *)(((char *)(m_pointer_record->m_pointers[CPU])) + old_size); + void *dst = (void *)(((char *)(m_pointer_record->m_pointers[GPU])) + old_size); + umpire_rm.copy(dst,src,m_size-old_size); } } @@ -262,13 +258,13 @@ CHAI_HOST void ManagedArray::free(ExecutionSpace space) m_resource_manager = ArrayManager::getInstance(); } if (m_pointer_record == &ArrayManager::s_null_record) { - m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_elems*sizeof(T),space,true); + m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_size,space,true); } m_resource_manager->free(m_pointer_record, space); m_active_pointer = nullptr; m_active_base_pointer = nullptr; - m_elems = 0; + m_size = 0; m_offset = 0; // The call to m_resource_manager::free, above, has deallocated m_pointer_record if space == NONE. if (space == NONE) { @@ -289,7 +285,7 @@ CHAI_HOST void ManagedArray::reset() template CHAI_INLINE CHAI_HOST_DEVICE size_t ManagedArray::size() const { - return m_elems; + return m_size/sizeof(T); } template @@ -297,7 +293,7 @@ CHAI_INLINE CHAI_HOST void ManagedArray::registerTouch(ExecutionSpace space) { if (m_active_pointer && (m_pointer_record == nullptr || m_pointer_record == &ArrayManager::s_null_record)) { CHAI_LOG(Warning,"registerTouch called on ManagedArray with nullptr pointer record."); - m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_elems*sizeof(T),space,true); + m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_size,space,true); } m_resource_manager->registerTouch(m_pointer_record, space); } @@ -450,12 +446,12 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, CHAIDISAMBIGUATE, bool ) m_active_base_pointer(data), #if !defined(CHAI_DEVICE_COMPILE) m_resource_manager(ArrayManager::getInstance()), - m_elems(m_resource_manager->getSize((void *)m_active_base_pointer)/sizeof(T)), + m_size(m_resource_manager->getSize((void *)m_active_base_pointer)), m_offset(0), m_pointer_record(m_resource_manager->getPointerRecord((void *)data)), #else m_resource_manager(nullptr), - m_elems(0), + m_size(0), m_offset(0), m_pointer_record(nullptr), #endif @@ -497,7 +493,7 @@ T* ManagedArray::data() const { move(CPU); } - if (m_elems == 0 && !m_is_slice) { + if (m_size == 0 && !m_is_slice) { return nullptr; } @@ -520,7 +516,7 @@ const T* ManagedArray::cdata() const { move(CPU, false); } - if (m_elems == 0 && !m_is_slice) { + if (m_size == 0 && !m_is_slice) { return nullptr; } @@ -536,7 +532,7 @@ T* ManagedArray::data(ExecutionSpace space, bool do_move) const { return nullptr; } - if (m_elems == 0 && !m_is_slice) { + if (m_size == 0 && !m_is_slice) { return nullptr; } @@ -600,7 +596,7 @@ ManagedArray& ManagedArray::operator= (std::nullptr_t) { m_active_pointer = nullptr; m_active_base_pointer = nullptr; - m_elems = 0; + m_size = 0; m_offset = 0; #if !defined(CHAI_DEVICE_COMPILE) m_pointer_record = &ArrayManager::s_null_record; @@ -653,21 +649,21 @@ CHAI_INLINE CHAI_HOST_DEVICE bool ManagedArray::operator== (std::nullptr_t from) const { - return m_active_pointer == from || m_elems == 0; + return m_active_pointer == from || m_size == 0; } template CHAI_INLINE CHAI_HOST_DEVICE bool ManagedArray::operator!= (std::nullptr_t from) const { - return m_active_pointer != from && m_elems > 0; + return m_active_pointer != from && m_size > 0; } template CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::operator bool () const { - return m_elems > 0; + return m_size > 0; } template diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index 5ae0b1ad..329ff18c 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -10,13 +10,15 @@ #include "ManagedArray.hpp" #if defined(CHAI_ENABLE_UM) +#if !defined(CHAI_THIN_GPU_ALLOCATE) #include #endif +#endif namespace chai { template -CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray( +CHAI_INLINE ManagedArray::ManagedArray( std::initializer_list spaces, std::initializer_list allocators) : ManagedArray() @@ -32,14 +34,14 @@ CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray( template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray( +ManagedArray::ManagedArray( size_t elems, std::initializer_list spaces, std::initializer_list allocators, ExecutionSpace space) : ManagedArray(spaces, allocators) { - m_elems = elems; + m_size = elems*sizeof(T); this->allocate(elems, space); } @@ -50,9 +52,11 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray() = default; template CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray(size_t elems, ExecutionSpace space) : - m_elems(elems) + m_size(elems*sizeof(T)) { +#ifndef CHAI_DEVICE_COMPILE this->allocate(elems, space); +#endif } @@ -61,7 +65,7 @@ CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) : m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), - m_elems(0), + m_size(0), m_offset(0), m_pointer_record(nullptr), m_is_slice(false) @@ -75,7 +79,7 @@ CHAI_HOST ManagedArray::ManagedArray(PointerRecord* record, ExecutionSpace sp m_active_pointer(static_cast(record->m_pointers[space])), m_active_base_pointer(static_cast(record->m_pointers[space])), m_resource_manager(nullptr), - m_elems(record->m_size/sizeof(T)), + m_size(record->m_size), m_offset(0), m_pointer_record(nullptr), m_is_slice(!record->m_owned[space]) @@ -95,7 +99,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, m_active_pointer(data), m_active_base_pointer(data), m_resource_manager(array_manager), - m_elems(elems), + m_size(elems*sizeof(T)), m_pointer_record(pointer_record) { } @@ -115,18 +119,29 @@ CHAI_HOST_DEVICE T* ManagedArray::getActivePointer() const template CHAI_HOST_DEVICE T* ManagedArray::data() const { +#if !defined(CHAI_DEVICE_COMPILE) && defined(CHAI_THIN_GPU_ALLOCATE) + ArrayManager::getInstance()->syncIfNeeded(); +#endif return m_active_pointer; } template CHAI_HOST_DEVICE const T* ManagedArray::cdata() const { +#if !defined(CHAI_DEVICE_COMPILE) && defined(CHAI_THIN_GPU_ALLOCATE) + ArrayManager::getInstance()->syncIfNeeded(); +#endif return m_active_pointer; } template -T* ManagedArray::data(ExecutionSpace /*space*/, bool /*do_move*/) const +T* ManagedArray::data(ExecutionSpace space, bool do_move) const { +#if defined(CHAI_THIN_GPU_ALLOCATE) + if (do_move && space != chai::GPU) { + ArrayManager::getInstance()->syncIfNeeded(); + } +#endif return m_active_pointer; } @@ -160,10 +175,12 @@ CHAI_HOST void ManagedArray::allocate(size_t elems, << " in space " << space); - m_elems = elems; + m_size = elems*sizeof(T); - #if defined(CHAI_ENABLE_UM) - gpuMallocManaged(&m_active_pointer, sizeof(T) * elems); + #if defined(CHAI_THIN_GPU_ALLOCATE) + m_active_pointer = (T*) chai::ArrayManager::getInstance()->getAllocator(chai::GPU).allocate(m_size); + #elif defined(CHAI_ENABLE_UM) + gpuMallocManaged(&m_active_pointer, m_size); #else // not CHAI_ENABLE_UM m_active_pointer = static_cast(malloc(sizeof(T) * elems)); #endif @@ -173,7 +190,7 @@ CHAI_HOST void ManagedArray::allocate(size_t elems, } else { m_active_pointer = nullptr; - m_elems = 0; + m_size = 0; } } else { @@ -187,16 +204,25 @@ CHAI_INLINE CHAI_HOST void ManagedArray::reallocate(size_t new_elems) { if (!m_is_slice) { - CHAI_LOG(Debug, "Reallocating array of size " << m_elems + CHAI_LOG(Debug, "Reallocating array of size " << m_size*sizeof(T) << " with new size" - << elems); + << new_elems*sizeof(T)); T* new_ptr = nullptr; - #if defined(CHAI_ENABLE_UM) + #if defined(CHAI_THIN_GPU_ALLOCATE) + auto allocator = chai::ArrayManager::getInstance()->getAllocator(chai::GPU); + if (new_elems > 0) { + new_ptr = (T*) allocator.allocate(sizeof(T) * new_elems); + ArrayManager::getInstance()->syncIfNeeded(); + chai::gpuMemcpy(new_ptr, m_active_pointer, std::min(m_size, new_elems*sizeof(T)), gpuMemcpyDefault); + registerTouch(chai::GPU); + } + allocator.deallocate(m_active_pointer); + #elif defined(CHAI_ENABLE_UM) if (new_elems > 0) { gpuMallocManaged(&new_ptr, sizeof(T) * new_elems); - gpuMemcpy(new_ptr, m_active_pointer, sizeof(T) * m_elems, gpuMemcpyDefault); + gpuMemcpy(new_ptr, m_active_pointer, std::min(new_elems*sizeof(T), m_size), gpuMemcpyDefault); } gpuFree(m_active_pointer); #else // not CHAI_ENABLE_UM @@ -208,7 +234,7 @@ CHAI_HOST void ManagedArray::reallocate(size_t new_elems) } #endif - m_elems = new_elems; + m_size= new_elems*sizeof(T); m_active_pointer = new_ptr; m_active_base_pointer = m_active_pointer; @@ -224,14 +250,19 @@ CHAI_INLINE CHAI_HOST void ManagedArray::free(ExecutionSpace space) { if (!m_is_slice) { if (space == CPU || space == NONE) { -#if defined(CHAI_ENABLE_UM) - gpuFree(m_active_pointer); +#if defined(CHAI_THIN_GPU_ALLOCATE) + if (m_active_pointer) { + auto allocator = chai::ArrayManager::getInstance()->getAllocator(chai::GPU); + allocator.deallocate((void *)m_active_pointer); + } +#elif defined(CHAI_ENABLE_UM) + chai::gpuFree(m_active_pointer); #else ::free((void *)m_active_pointer); #endif m_active_pointer = nullptr; m_active_base_pointer = nullptr; - m_elems = 0; + m_size = 0; } } else { @@ -251,7 +282,11 @@ template CHAI_INLINE CHAI_HOST_DEVICE typename ManagedArray::T_non_const ManagedArray< T>::pick(size_t i) const { -#ifdef CHAI_ENABLE_UM +#if defined(CHAI_THIN_GPU_ALLOCATE) +#if !defined(CHAI_DEVICE_COMPILE) + ArrayManager::getInstance()->syncIfNeeded(); +#endif +#elif defined(CHAI_ENABLE_UM) synchronize(); #endif return (T_non_const)m_active_pointer[i]; @@ -260,7 +295,11 @@ CHAI_INLINE CHAI_HOST_DEVICE typename ManagedArray::T_non_const ManagedArray< template CHAI_INLINE CHAI_HOST_DEVICE void ManagedArray::set(size_t i, T val) const { -#if defined(CHAI_ENABLE_UM) +#if defined(CHAI_THIN_GPU_ALLOCATE) +#if !defined(CHAI_DEVICE_COMPILE) + ArrayManager::getInstance()->syncIfNeeded(); +#endif +#elif defined(CHAI_ENABLE_UM) synchronize(); #endif m_active_pointer[i] = val; @@ -269,7 +308,11 @@ CHAI_INLINE CHAI_HOST_DEVICE void ManagedArray::set(size_t i, T val) const template CHAI_INLINE CHAI_HOST_DEVICE void ManagedArray::incr(size_t i) const { -#if defined(CHAI_ENABLE_UM) +#if defined(CHAI_THIN_GPU_ALLOCATE) +#if !defined(CHAI_DEVICE_COMPILE) + ArrayManager::getInstance()->syncIfNeeded(); +#endif +#elif defined(CHAI_ENABLE_UM) synchronize(); #endif ++m_active_pointer[i]; @@ -278,22 +321,29 @@ CHAI_INLINE CHAI_HOST_DEVICE void ManagedArray::incr(size_t i) const template CHAI_INLINE CHAI_HOST_DEVICE void ManagedArray::decr(size_t i) const { -#if defined(CHAI_ENABLE_UM) +#if defined(CHAI_THIN_GPU_ALLOCATE) +#if !defined(CHAI_DEVICE_COMPILE) + ArrayManager::getInstance()->syncIfNeeded(); +#endif +#elif defined(CHAI_ENABLE_UM) synchronize(); #endif --m_active_pointer[i]; } -#endif +#endif // CHAI_ENABLE_PICK template CHAI_INLINE CHAI_HOST_DEVICE size_t ManagedArray::size() const { - return m_elems; + return m_size/sizeof(T); } template -CHAI_INLINE CHAI_HOST void ManagedArray::registerTouch(ExecutionSpace) +CHAI_INLINE CHAI_HOST void ManagedArray::registerTouch(ExecutionSpace space) { +#if defined(CHAI_THIN_GPU_ALLOCATE) + chai::ArrayManager::getInstance()->setExecutionSpace(space) ; +#endif } template @@ -322,7 +372,7 @@ ManagedArray::ManagedArray(T* data, CHAIDISAMBIGUATE, bool) : m_active_pointer(data), m_active_base_pointer(data), m_resource_manager(nullptr), - m_elems(-1), + m_size(-1), m_pointer_record(nullptr), m_offset(0), m_is_slice(false) @@ -337,7 +387,7 @@ ManagedArray::operator typename std:: { return ManagedArray(const_cast(m_active_pointer), m_resource_manager, - m_elems, + m_size/sizeof(T), nullptr); } @@ -358,7 +408,7 @@ CHAI_INLINE CHAI_HOST_DEVICE ManagedArray& ManagedArray::operator=(std::nu { m_active_pointer = from; m_active_base_pointer = from; - m_elems = 0; + m_size = 0; m_is_slice = false; return *this; } diff --git a/src/chai/config.hpp.in b/src/chai/config.hpp.in index 8325184c..d1aabd6f 100644 --- a/src/chai/config.hpp.in +++ b/src/chai/config.hpp.in @@ -12,10 +12,12 @@ #cmakedefine CHAI_ENABLE_HIP #cmakedefine CHAI_ENABLE_IMPLICIT_CONVERSIONS #cmakedefine CHAI_DISABLE_RM +#cmakedefine CHAI_THIN_GPU_ALLOCATE #cmakedefine CHAI_ENABLE_UM #cmakedefine CHAI_DEBUG #cmakedefine CHAI_ENABLE_GPU_ERROR_CHECKING #cmakedefine CHAI_ENABLE_MANAGED_PTR +#cmakedefine CHAI_ENABLE_MANAGED_PTR_ON_GPU #cmakedefine CHAI_ENABLE_RAJA_PLUGIN #cmakedefine CHAI_ENABLE_GPU_SIMULATION_MODE #cmakedefine CHAI_ENABLE_PINNED diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index a527437c..e236619d 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -4,14 +4,14 @@ // // SPDX-License-Identifier: BSD-3-Clause ////////////////////////////////////////////////////////////////////////////// -#ifndef MANAGED_PTR_H_ -#define MANAGED_PTR_H_ +#ifndef CHAI_MANAGED_PTR_HPP +#define CHAI_MANAGED_PTR_HPP #include "chai/config.hpp" #if defined(CHAI_ENABLE_MANAGED_PTR) -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) || defined(CHAI_THIN_GPU_ALLOCATE) #include "chai/ArrayManager.hpp" #endif @@ -26,7 +26,7 @@ namespace chai { -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) template CHAI_HOST void destroy_on_device(T* gpuPointer); #endif @@ -143,7 +143,9 @@ namespace chai { managed_ptr(std::initializer_list spaces, std::initializer_list pointers) : m_cpu_pointer(nullptr), +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) m_gpu_pointer(nullptr), +#endif m_pointer_record(new managed_ptr_record()) { static_assert(std::is_convertible::value, @@ -161,7 +163,7 @@ namespace chai { case CPU: m_cpu_pointer = pointers.begin()[i++]; break; -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) case GPU: m_gpu_pointer = pointers.begin()[i++]; break; @@ -191,7 +193,9 @@ namespace chai { std::initializer_list pointers, std::function callback) : m_cpu_pointer(nullptr), +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) m_gpu_pointer(nullptr), +#endif m_pointer_record(new managed_ptr_record(callback)) { static_assert(std::is_convertible::value, @@ -209,7 +213,7 @@ namespace chai { case CPU: m_cpu_pointer = pointers.begin()[i++]; break; -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) case GPU: m_gpu_pointer = pointers.begin()[i++]; break; @@ -235,7 +239,9 @@ namespace chai { /// CHAI_HOST_DEVICE managed_ptr(const managed_ptr& other) noexcept : m_cpu_pointer(other.m_cpu_pointer), +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) m_gpu_pointer(other.m_gpu_pointer), +#endif m_pointer_record(other.m_pointer_record) { #if !defined(CHAI_DEVICE_COMPILE) @@ -257,7 +263,9 @@ namespace chai { template CHAI_HOST_DEVICE managed_ptr(const managed_ptr& other) noexcept : m_cpu_pointer(other.m_cpu_pointer), +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) m_gpu_pointer(other.m_gpu_pointer), +#endif m_pointer_record(other.m_pointer_record) { static_assert(std::is_convertible::value, @@ -298,7 +306,7 @@ namespace chai { case CPU: m_cpu_pointer = pointers.begin()[i++]; break; -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) case GPU: m_gpu_pointer = pointers.begin()[i++]; break; @@ -326,7 +334,9 @@ namespace chai { CHAI_HOST_DEVICE managed_ptr& operator=(const managed_ptr& other) noexcept { if (this != &other) { m_cpu_pointer = other.m_cpu_pointer; +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) m_gpu_pointer = other.m_gpu_pointer; +#endif m_pointer_record = other.m_pointer_record; #if !defined(CHAI_DEVICE_COMPILE) @@ -354,7 +364,9 @@ namespace chai { "U* must be convertible to T*."); m_cpu_pointer = other.m_cpu_pointer; +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) m_gpu_pointer = other.m_gpu_pointer; +#endif m_pointer_record = other.m_pointer_record; #if !defined(CHAI_DEVICE_COMPILE) @@ -370,11 +382,13 @@ namespace chai { /// Returns the CPU or GPU pointer depending on the calling context. /// CHAI_HOST_DEVICE inline T* get() const { +#if defined(CHAI_DEVICE_COMPILE) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) + return m_gpu_pointer; +#else #if !defined(CHAI_DEVICE_COMPILE) move(); +#endif return m_cpu_pointer; -#else - return m_gpu_pointer; #endif } @@ -394,7 +408,7 @@ namespace chai { switch (space) { case CPU: return m_cpu_pointer; -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) case GPU: return m_gpu_pointer; #endif @@ -409,10 +423,13 @@ namespace chai { /// Returns the CPU or GPU pointer depending on the calling context. /// CHAI_HOST_DEVICE inline T* operator->() const { +#if defined(CHAI_DEVICE_COMPILE) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) + return m_gpu_pointer; +#else #if !defined(CHAI_DEVICE_COMPILE) + move(); +#endif return m_cpu_pointer; -#else - return m_gpu_pointer; #endif } @@ -422,10 +439,13 @@ namespace chai { /// Returns the CPU or GPU reference depending on the calling context. /// CHAI_HOST_DEVICE inline T& operator*() const { +#if defined(CHAI_DEVICE_COMPILE) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) + return *m_gpu_pointer; +#else #if !defined(CHAI_DEVICE_COMPILE) + move(); +#endif return *m_cpu_pointer; -#else - return *m_gpu_pointer; #endif } @@ -496,7 +516,7 @@ namespace chai { delete pointer; m_cpu_pointer = nullptr; break; -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) case GPU: { if (pointer) { @@ -524,7 +544,7 @@ namespace chai { delete pointer; m_cpu_pointer = nullptr; break; -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) case GPU: { if (pointer) { @@ -548,7 +568,9 @@ namespace chai { private: T* m_cpu_pointer = nullptr; /// The CPU pointer +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) T* m_gpu_pointer = nullptr; /// The GPU pointer +#endif managed_ptr_record* m_pointer_record = nullptr; /// The pointer record /// Needed for the converting constructor @@ -567,7 +589,7 @@ namespace chai { /// with the ACTION_MOVE event. /// CHAI_HOST void move() const { -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) if (m_pointer_record) { ExecutionSpace newSpace = ArrayManager::getInstance()->getExecutionSpace(); @@ -720,7 +742,7 @@ namespace chai { return arg.get(); } -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) /// /// @author Alan Dayton @@ -847,7 +869,7 @@ namespace chai { template CHAI_HOST T* make_on_host(Args&&... args) { -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Get the ArrayManager and save the current execution space chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); @@ -860,7 +882,7 @@ namespace chai { // Create on the host T* cpuPointer = new T(detail::processArguments(args)...); -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Set the execution space back to the previous value arrayManager->setExecutionSpace(currentSpace); #endif @@ -885,7 +907,7 @@ namespace chai { typename F, typename... Args> CHAI_HOST T* make_on_host_from_factory(F f, Args&&... args) { -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Get the ArrayManager and save the current execution space chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); @@ -898,7 +920,7 @@ namespace chai { // Create the object on the device T* cpuPointer = f(args...); -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Set the execution space back to the previous value arrayManager->setExecutionSpace(currentSpace); #endif @@ -919,21 +941,21 @@ namespace chai { delete cpuPointer; } -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) /// /// @author Alan Dayton /// /// Creates a new T on the device. /// - /// @param[in] args The arguments to T's constructor + /// @param[in] args The arguments to T's constructor /// /// @return The device pointer to the new T /// template CHAI_HOST T* make_on_device(Args... args) { -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Get the ArrayManager and save the current execution space chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); @@ -948,9 +970,9 @@ namespace chai { gpuMalloc((void**)(&gpuBuffer), sizeof(T*)); // Create the object on the device -#if defined(__CUDACC__) +#if defined(__CUDACC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) detail::make_on_device<<<1, 1>>>(gpuBuffer, args...); -#elif defined(__HIPCC__) +#elif defined(__HIPCC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) hipLaunchKernelGGL(detail::make_on_device, 1, 1, 0, 0, gpuBuffer, args...); #endif @@ -965,7 +987,7 @@ namespace chai { free(cpuBuffer); gpuFree(gpuBuffer); -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Set the execution space back to the previous value arrayManager->setExecutionSpace(currentSpace); #endif @@ -988,7 +1010,7 @@ namespace chai { typename F, typename... Args> CHAI_HOST T* make_on_device_from_factory(F f, Args&&... args) { -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Get the ArrayManager and save the current execution space chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); @@ -1003,9 +1025,9 @@ namespace chai { gpuMalloc((void**)(&gpuBuffer), sizeof(T*)); // Create the object on the device -#if defined(__CUDACC__) +#if defined(__CUDACC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) detail::make_on_device_from_factory<<<1, 1>>>(gpuBuffer, f, args...); -#elif defined(__HIPCC__) +#elif defined(__HIPCC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) hipLaunchKernelGGL(detail::make_on_device_from_factory, 1, 1, 0, 0, gpuBuffer, f, args...); #endif @@ -1020,7 +1042,7 @@ namespace chai { free(cpuBuffer); gpuFree(gpuBuffer); -#ifndef CHAI_DISABLE_RM +#if !defined(CHAI_DISABLE_RM) // Set the execution space back to the previous value arrayManager->setExecutionSpace(currentSpace); #endif @@ -1038,9 +1060,9 @@ namespace chai { /// template CHAI_HOST void destroy_on_device(T* gpuPointer) { -#if defined(__CUDACC__) +#if defined(__CUDACC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) detail::destroy_on_device<<<1, 1>>>(gpuPointer); -#elif defined(__HIPCC__) +#elif defined(__HIPCC__) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) hipLaunchKernelGGL(detail::destroy_on_device, 1, 1, 0, 0, gpuPointer); #endif } @@ -1058,7 +1080,7 @@ namespace chai { template CHAI_HOST managed_ptr make_managed(Args... args) { -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) // Construct on the GPU first to take advantage of asynchrony T* gpuPointer = make_on_device(args...); #endif @@ -1067,7 +1089,7 @@ namespace chai { T* cpuPointer = make_on_host(args...); // Construct and return the managed_ptr -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) return managed_ptr({CPU, GPU}, {cpuPointer, gpuPointer}); #else return managed_ptr({CPU}, {cpuPointer}); @@ -1098,7 +1120,7 @@ namespace chai { static_assert(std::is_convertible::value, "F does not return a pointer that is convertible to T*."); -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) // Construct on the GPU first to take advantage of asynchrony T* gpuPointer = make_on_device_from_factory(f, args...); #endif @@ -1107,7 +1129,7 @@ namespace chai { T* cpuPointer = make_on_host_from_factory(f, args...); // Construct and return the managed_ptr -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) return managed_ptr({CPU, GPU}, {cpuPointer, gpuPointer}); #else return managed_ptr({CPU}, {cpuPointer}); @@ -1127,7 +1149,7 @@ namespace chai { CHAI_HOST managed_ptr static_pointer_cast(const managed_ptr& other) noexcept { T* cpuPointer = static_cast(other.get()); -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) T* gpuPointer = static_cast(other.get(GPU, false)); return managed_ptr(other, {CPU, GPU}, {cpuPointer, gpuPointer}); @@ -1149,7 +1171,7 @@ namespace chai { CHAI_HOST managed_ptr dynamic_pointer_cast(const managed_ptr& other) noexcept { T* cpuPointer = dynamic_cast(other.get()); -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) T* gpuPointer = nullptr; if (cpuPointer) { @@ -1175,7 +1197,7 @@ namespace chai { CHAI_HOST managed_ptr const_pointer_cast(const managed_ptr& other) noexcept { T* cpuPointer = const_cast(other.get()); -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) T* gpuPointer = const_cast(other.get(GPU, false)); return managed_ptr(other, {CPU, GPU}, {cpuPointer, gpuPointer}); @@ -1197,7 +1219,7 @@ namespace chai { CHAI_HOST managed_ptr reinterpret_pointer_cast(const managed_ptr& other) noexcept { T* cpuPointer = reinterpret_cast(other.get()); -#if defined(CHAI_GPUCC) +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) T* gpuPointer = reinterpret_cast(other.get(GPU, false)); return managed_ptr(other, {CPU, GPU}, {cpuPointer, gpuPointer}); @@ -1301,7 +1323,9 @@ namespace chai { template void swap(managed_ptr& lhs, managed_ptr& rhs) noexcept { std::swap(lhs.m_cpu_pointer, rhs.m_cpu_pointer); +#if defined(CHAI_GPUCC) && defined(CHAI_ENABLE_MANAGED_PTR_ON_GPU) std::swap(lhs.m_gpu_pointer, rhs.m_gpu_pointer); +#endif std::swap(lhs.m_pointer_record, rhs.m_pointer_record); } } // namespace chai diff --git a/src/util/forall.hpp b/src/util/forall.hpp index 6ee7ee91..8ddcf73d 100644 --- a/src/util/forall.hpp +++ b/src/util/forall.hpp @@ -12,8 +12,10 @@ #include "chai/config.hpp" #if defined(CHAI_ENABLE_UM) +#if !defined(CHAI_THIN_GPU_ALLOCATE) #include #endif +#endif struct sequential { }; @@ -41,7 +43,9 @@ void forall(sequential, int begin, int end, LOOP_BODY body) chai::ArrayManager* rm = chai::ArrayManager::getInstance(); #if defined(CHAI_ENABLE_UM) +#if !defined(CHAI_THIN_GPU_ALLOCATE) cudaDeviceSynchronize(); +#endif #endif rm->setExecutionSpace(chai::CPU);