Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions cmake/SetupChaiOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ option(CHAI_DISABLE_RM "Make ManagedArray a thin wrapper" Off)
mark_as_advanced(CHAI_DISABLE_RM)

option(CHAI_ENABLE_UM "Use CUDA unified (managed) memory" Off)
option(CHAI_THIN_GPU_ALLOCATE "Single memory space model" Off)
option(CHAI_ENABLE_PINNED "Use pinned host memory" Off)
option(CHAI_ENABLE_RAJA_PLUGIN "Build plugin to set RAJA execution spaces" Off)
option(CHAI_ENABLE_GPU_ERROR_CHECKING "Enable GPU error checking" On)
option(CHAI_ENABLE_MANAGED_PTR "Enable managed_ptr" On)
option(CHAI_DEBUG "Enable Debug Logging." Off)
option(CHAI_ENABLE_RAJA_NESTED_TEST "Enable raja-chai-nested-tests, which fails to build on Debug CUDA builds." On)
option(CHAI_ENABLE_MANAGED_PTR_ON_GPU "Enable managed_ptr on GPU" On)

option(CHAI_ENABLE_TESTS "Enable CHAI tests" On)
option(CHAI_ENABLE_BENCHMARKS "Enable benchmarks" On)
Expand All @@ -35,6 +37,10 @@ option(CHAI_ENABLE_COPY_HEADERS "Enable CHAI copy headers" Off)

set(ENABLE_CUDA Off CACHE BOOL "Enable CUDA")

if (CHAI_ENABLE_UM AND NOT ENABLE_CUDA)
message(FATAL_ERROR "Option CHAI_ENABLE_UM requires ENABLE_CUDA")
if (CHAI_ENABLE_UM AND NOT ENABLE_CUDA AND NOT CHAI_THIN_GPU_ALLOCATE)
message(FATAL_ERROR "Option CHAI_ENABLE_UM requires ENABLE_CUDA or CHAI_THIN_GPU_ALLOCATE")
endif()

# CHAI_THIN_GPU_ALLOCATE (single memory space model) requires CHAI_DISABLE_RM
# (ManagedArray as a thin wrapper); abort configuration otherwise.
if (CHAI_THIN_GPU_ALLOCATE)
  if (NOT CHAI_DISABLE_RM)
    message(FATAL_ERROR "Option CHAI_THIN_GPU_ALLOCATE requires CHAI_DISABLE_RM")
  endif()
endif()
67 changes: 52 additions & 15 deletions src/chai/ArrayManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
#include "chai/config.hpp"

#if defined(CHAI_ENABLE_CUDA)
#if !defined(CHAI_THIN_GPU_ALLOCATE)
#include "cuda_runtime_api.h"
#endif
#endif

#include "umpire/ResourceManager.hpp"

Expand Down Expand Up @@ -172,6 +174,12 @@ void ArrayManager::setExecutionSpace(ExecutionSpace space)
m_synced_since_last_kernel = false;
}

#if defined(CHAI_THIN_GPU_ALLOCATE)
if (chai::CPU == space) {
syncIfNeeded();
}
#endif

m_current_execution_space = space;
}

Expand Down Expand Up @@ -226,6 +234,34 @@ void ArrayManager::resetTouch(PointerRecord* pointer_record)
}
}


/* Not all GPU platform runtimes (notably HIP), will give you asynchronous copies to the device by default, so we leverage
* umpire's API for asynchronous copies using camp resources in this method, based off of the CHAI destination space
* */
static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager & manager, ExecutionSpace dst_space, ExecutionSpace src_space) {

#ifdef CHAI_ENABLE_CUDA
camp::resources::Resource device_resource(camp::resources::Cuda::get_default());
#elif defined(CHAI_ENABLE_HIP)
camp::resources::Resource device_resource(camp::resources::Hip::get_default());
#else
camp::resources::Resource device_resource(camp::resources::Host::get_default());
#endif

camp::resources::Resource host_resource(camp::resources::Host::get_default());
if (dst_space == GPU || src_space == GPU) {
// Do the copy using the device resource
manager.copy(dst_pointer, src_pointer, device_resource);
} else {
// Do the copy using the host resource
manager.copy(dst_pointer, src_pointer, host_resource);
}
// Ensure device to host copies are synchronous
if (dst_space == CPU && src_space == GPU) {
device_resource.wait();
}
}

void ArrayManager::move(PointerRecord* record, ExecutionSpace space)
{
if (space == NONE) {
Expand Down Expand Up @@ -253,7 +289,9 @@ void ArrayManager::move(PointerRecord* record, ExecutionSpace space)
}
#endif

void* src_pointer = record->m_pointers[record->m_last_space];
ExecutionSpace prev_space = record->m_last_space;

void* src_pointer = record->m_pointers[prev_space];
void* dst_pointer = record->m_pointers[space];

if (!dst_pointer) {
Expand All @@ -267,7 +305,7 @@ void ArrayManager::move(PointerRecord* record, ExecutionSpace space)
} else if (dst_pointer != src_pointer) {
// Exclude the copy if src and dst are the same (can happen for PINNED memory)
{
m_resource_manager.copy(dst_pointer, src_pointer);
chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space);
}

callback(record, ACTION_MOVE, space);
Expand All @@ -285,7 +323,6 @@ void ArrayManager::allocate(

pointer_record->m_pointers[space] = alloc.allocate(size);
callback(pointer_record, ACTION_ALLOC, space);

registerPointer(pointer_record, space);

CHAI_LOG(Debug, "Allocated array at: " << pointer_record->m_pointers[space]);
Expand Down Expand Up @@ -449,32 +486,32 @@ PointerRecord* ArrayManager::makeManaged(void* pointer,

PointerRecord* ArrayManager::deepCopyRecord(PointerRecord const* record)
{
PointerRecord* copy = new PointerRecord{};
PointerRecord* new_record = new PointerRecord{};
const size_t size = record->m_size;
copy->m_size = size;
copy->m_user_callback = [] (const PointerRecord*, Action, ExecutionSpace) {};
new_record->m_size = size;
new_record->m_user_callback = [] (const PointerRecord*, Action, ExecutionSpace) {};

const ExecutionSpace last_space = record->m_last_space;
copy->m_last_space = last_space;
new_record->m_last_space = last_space;
for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
copy->m_allocators[space] = record->m_allocators[space];
new_record->m_allocators[space] = record->m_allocators[space];
}

allocate(copy, last_space);
allocate(new_record, last_space);

for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
copy->m_owned[space] = true;
copy->m_touched[space] = false;
new_record->m_owned[space] = true;
new_record->m_touched[space] = false;
}

copy->m_touched[last_space] = true;
new_record->m_touched[last_space] = true;

void* dst_pointer = copy->m_pointers[last_space];
void* dst_pointer = new_record->m_pointers[last_space];
void* src_pointer = record->m_pointers[last_space];

m_resource_manager.copy(dst_pointer, src_pointer);
chai::copy(dst_pointer, src_pointer, m_resource_manager, last_space, last_space);

return copy;
return new_record;
}

std::unordered_map<void*, const PointerRecord*>
Expand Down
2 changes: 2 additions & 0 deletions src/chai/ArrayManager.inl
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
#include "umpire/ResourceManager.hpp"

#if defined(CHAI_ENABLE_UM)
#if !defined(CHAI_THIN_GPU_ALLOCATE)
#include <cuda_runtime_api.h>
#endif
#endif

namespace chai {

Expand Down
19 changes: 9 additions & 10 deletions src/chai/ManagedArray.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class ManagedArray : public CHAICopyable
/*!
* \brief Default constructor creates a ManagedArray with no allocations.
*/
CHAI_HOST_DEVICE ManagedArray(
ManagedArray(
std::initializer_list<chai::ExecutionSpace> spaces,
std::initializer_list<umpire::Allocator> allocators);

Expand All @@ -98,7 +98,7 @@ class ManagedArray : public CHAICopyable
*/
CHAI_HOST_DEVICE ManagedArray(size_t elems, ExecutionSpace space = get_default_space());

CHAI_HOST_DEVICE ManagedArray(
ManagedArray(
size_t elems,
std::initializer_list<chai::ExecutionSpace> spaces,
std::initializer_list<umpire::Allocator> allocators,
Expand Down Expand Up @@ -273,7 +273,7 @@ class ManagedArray : public CHAICopyable

CHAI_HOST_DEVICE ManagedArray(T* data,
ArrayManager* array_manager,
size_t m_elems,
size_t elems,
PointerRecord* pointer_record);

ManagedArray<T>& operator=(ManagedArray const & other) = default;
Expand Down Expand Up @@ -411,16 +411,15 @@ class ManagedArray : public CHAICopyable
m_active_pointer = other.m_active_pointer;
m_active_base_pointer = other.m_active_base_pointer;
m_resource_manager = other.m_resource_manager;
m_elems = other.m_elems;
m_size = other.m_size;
m_offset = other.m_offset;
m_pointer_record = other.m_pointer_record;
m_is_slice = other.m_is_slice;
#ifndef CHAI_DISABLE_RM
#if !defined(CHAI_DEVICE_COMPILE)
// if we can, ensure elems is based off the pointer_record size to protect against
// casting leading to incorrect size info in m_elems.
// if we can, ensure m_size is based off the pointer_record size out of paranoia
if (m_pointer_record != nullptr && !m_is_slice) {
m_elems = m_pointer_record->m_size / sizeof(T);
m_size = m_pointer_record->m_size;
}
#endif
#endif
Expand All @@ -444,7 +443,7 @@ class ManagedArray : public CHAICopyable
typename std::enable_if<B, int>::type = 0>
CHAI_HOST bool initInner(size_t start = 0)
{
for (size_t i = start; i < m_elems; ++i) {
for (size_t i = start; i < m_size/sizeof(T); ++i) {
m_active_base_pointer[i] = nullptr;
}
return true;
Expand Down Expand Up @@ -473,7 +472,7 @@ class ManagedArray : public CHAICopyable
/*!
* Size of the ManagedArray in bytes; the number of elements is m_size / sizeof(T).
*/
mutable size_t m_elems = 0;
mutable size_t m_size = 0;
mutable size_t m_offset = 0;

/*!
Expand Down Expand Up @@ -582,7 +581,7 @@ CHAI_INLINE CHAI_HOST_DEVICE ManagedArray<T> ManagedArray<T>::slice( size_t offs
slice.m_active_base_pointer = m_active_base_pointer;
slice.m_offset = offset + m_offset;
slice.m_active_pointer = m_active_base_pointer + slice.m_offset;
slice.m_elems = elems;
slice.m_size = elems*sizeof(T);
slice.m_is_slice = true;
}
return slice;
Expand Down
Loading