Skip to content

Commit

Permalink
[Acxxel] Remove setActiveDeviceForThread
Browse files Browse the repository at this point in the history
Summary:
After experimenting with CUDA, I realized that we really only need to
set the active context right before creating an object such as a stream
or a device memory allocation. When we go on to use these objects later,
it is fine if the context that created them is no longer active,
operations with those objects will succeed anyway.

Since it turns out that we don't have to check the active context for
every operation, it makes sense to hide this active context from users
(by removing the "ActiveDeviceForThread" setter and getter) and to
change the Acxxel API to explicitly pass in the device ID to create
objects.

This change improves the Acxxel API and greatly simplifies the CUDA and
OpenCL implementations because they no longer require thread_local data.

Reviewers: jlebar, jprice

Subscribers: mgorny, parallel_libs-commits

Differential Revision: https://reviews.llvm.org/D26050

llvm-svn: 285372
  • Loading branch information
henline committed Oct 28, 2016
1 parent 3066514 commit bdc410b
Show file tree
Hide file tree
Showing 7 changed files with 232 additions and 249 deletions.
1 change: 1 addition & 0 deletions parallel-libs/acxxel/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.1)

option(ACXXEL_ENABLE_UNIT_TESTS "enable acxxel unit tests" ON)
option(ACXXEL_ENABLE_MULTI_DEVICE_UNIT_TESTS "enable acxxel multi-device unit tests" OFF)
option(ACXXEL_ENABLE_EXAMPLES "enable acxxel examples" OFF)
option(ACXXEL_ENABLE_DOXYGEN "enable Doxygen for acxxel" OFF)
option(ACXXEL_ENABLE_CUDA "enable CUDA for acxxel" ON)
Expand Down
83 changes: 33 additions & 50 deletions parallel-libs/acxxel/acxxel.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,7 @@ class Program {
/// All operations enqueued on a Stream are serialized, but operations enqueued
/// on different Streams may run concurrently.
///
/// Each Platform has a notion of the currently active device on a particular
/// thread (see Platform::getActiveDeviceForThread and
/// Platform::setActiveDeviceForThread). Each Stream is associated with a
/// specific, fixed device, set to the current thread's active device when the
/// Stream is created. Whenver a thread enqueues commands onto a Stream, its
/// active device must match the Stream's device.
/// Each Stream is associated with a specific, fixed device.
class Stream {
public:
Stream(const Stream &) = delete;
Expand Down Expand Up @@ -447,10 +442,16 @@ class Event {
private:
// Only a platform can make an event.
friend class Platform;
Event(Platform *APlatform, void *AHandle, HandleDestructor Destructor)
: ThePlatform(APlatform), TheHandle(AHandle, Destructor) {}
Event(Platform *APlatform, int DeviceIndex, void *AHandle,
HandleDestructor Destructor)
: ThePlatform(APlatform), TheDeviceIndex(DeviceIndex),
TheHandle(AHandle, Destructor) {}

Platform *ThePlatform;

// The index of the device on which the event can be enqueued.
int TheDeviceIndex;

std::unique_ptr<void, HandleDestructor> TheHandle;
};

Expand All @@ -470,29 +471,21 @@ class Platform {
/// Gets the number of devices for this platform in this system.
virtual Expected<int> getDeviceCount() = 0;

/// Sets the active device for this platform in this thread.
virtual Status setActiveDeviceForThread(int DeviceIndex) = 0;

/// Gets the currently active device for this platform in this thread.
virtual int getActiveDeviceForThread() = 0;
/// Creates a stream on the given device for the platform.
virtual Expected<Stream> createStream(int DeviceIndex = 0) = 0;

/// Creates a stream for the platform.
///
/// The created Stream is associated with the active device for this thread.
virtual Expected<Stream> createStream() = 0;

/// Creates an event for the platform.
///
/// The created Event is associated with the active device for this thread.
virtual Expected<Event> createEvent() = 0;
/// Creates an event on the given device for the platform.
virtual Expected<Event> createEvent(int DeviceIndex = 0) = 0;

/// Allocates owned device memory.
///
/// \warning This function only allocates space in device memory, it does not
/// call the constructor of T.
template <typename T>
Expected<DeviceMemory<T>> mallocD(ptrdiff_t ElementCount) {
Expected<void *> MaybePointer = rawMallocD(ElementCount * sizeof(T));
Expected<DeviceMemory<T>> mallocD(ptrdiff_t ElementCount,
int DeviceIndex = 0) {
Expected<void *> MaybePointer =
rawMallocD(ElementCount * sizeof(T), DeviceIndex);
if (MaybePointer.isError())
return MaybePointer.getError();
return DeviceMemory<T>(this, MaybePointer.getValue(), ElementCount,
Expand All @@ -505,12 +498,14 @@ class Platform {
/// pointer to a __device__ variable, this function returns a DeviceMemorySpan
/// referencing the device memory that stores that __device__ variable.
template <typename ElementType>
Expected<DeviceMemorySpan<ElementType>> getSymbolMemory(ElementType *Symbol) {
Expected<void *> MaybeAddress = rawGetDeviceSymbolAddress(Symbol);
Expected<DeviceMemorySpan<ElementType>> getSymbolMemory(ElementType *Symbol,
int DeviceIndex = 0) {
Expected<void *> MaybeAddress =
rawGetDeviceSymbolAddress(Symbol, DeviceIndex);
if (MaybeAddress.isError())
return MaybeAddress.getError();
ElementType *Address = static_cast<ElementType *>(MaybeAddress.getValue());
Expected<ptrdiff_t> MaybeSize = rawGetDeviceSymbolSize(Symbol);
Expected<ptrdiff_t> MaybeSize = rawGetDeviceSymbolSize(Symbol, DeviceIndex);
if (MaybeSize.isError())
return MaybeSize.getError();
ptrdiff_t Size = MaybeSize.getValue();
Expand Down Expand Up @@ -584,8 +579,8 @@ class Platform {

/// \}

virtual Expected<Program>
createProgramFromSource(Span<const char> Source) = 0;
virtual Expected<Program> createProgramFromSource(Span<const char> Source,
int DeviceIndex = 0) = 0;

protected:
friend class Stream;
Expand All @@ -597,15 +592,15 @@ class Platform {
void *getEventHandle(Event &Event) { return Event.TheHandle.get(); }

// Pass along access to Stream constructor to subclasses.
Stream constructStream(Platform *APlatform, void *AHandle,
Stream constructStream(Platform *APlatform, int DeviceIndex, void *AHandle,
HandleDestructor Destructor) {
return Stream(APlatform, getActiveDeviceForThread(), AHandle, Destructor);
return Stream(APlatform, DeviceIndex, AHandle, Destructor);
}

// Pass along access to Event constructor to subclasses.
Event constructEvent(Platform *APlatform, void *AHandle,
Event constructEvent(Platform *APlatform, int DeviceIndex, void *AHandle,
HandleDestructor Destructor) {
return Event(APlatform, AHandle, Destructor);
return Event(APlatform, DeviceIndex, AHandle, Destructor);
}

// Pass along access to Program constructor to subclasses.
Expand All @@ -623,28 +618,16 @@ class Platform {
virtual Expected<float> getSecondsBetweenEvents(void *StartEvent,
void *EndEvent) = 0;

virtual Expected<void *> rawMallocD(ptrdiff_t ByteCount) = 0;
virtual Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) = 0;
virtual HandleDestructor getDeviceMemoryHandleDestructor() = 0;
virtual void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
size_t ByteOffset) = 0;
virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) = 0;

virtual Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol) = 0;
virtual Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol) = 0;

virtual Status rawCopyDToD(const void *DeviceSrc,
ptrdiff_t DeviceSrcByteOffset, void *DeviceDst,
ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount) = 0;
virtual Status rawCopyDToH(const void *DeviceSrc,
ptrdiff_t DeviceSrcByteOffset, void *HostDst,
ptrdiff_t ByteCount) = 0;
virtual Status rawCopyHToD(const void *HostSrc, void *DeviceDst,
ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount) = 0;

virtual Status rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
ptrdiff_t ByteCount, char ByteValue) = 0;
virtual Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol,
int DeviceIndex) = 0;
virtual Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol,
int DeviceIndex) = 0;

virtual Status rawRegisterHostMem(const void *Memory,
ptrdiff_t ByteCount) = 0;
Expand Down
131 changes: 47 additions & 84 deletions parallel-libs/acxxel/cuda_acxxel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@ namespace acxxel {

namespace {

/// Index of active device for this thread.
thread_local int ActiveDeviceIndex = 0;

static std::string getCUErrorMessage(CUresult Result) {
if (!Result)
return "success";
Expand Down Expand Up @@ -85,39 +82,25 @@ class CUDAPlatform : public Platform {

Expected<int> getDeviceCount() override;

Status setActiveDeviceForThread(int DeviceIndex) override;

int getActiveDeviceForThread() override;

Expected<Stream> createStream() override;
Expected<Stream> createStream(int DeviceIndex) override;

Status streamSync(void *Stream) override;

Status streamWaitOnEvent(void *Stream, void *Event) override;

Expected<Event> createEvent() override;
Expected<Event> createEvent(int DeviceIndex) override;

protected:
Expected<void *> rawMallocD(ptrdiff_t ByteCount) override;
Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) override;
HandleDestructor getDeviceMemoryHandleDestructor() override;
void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
size_t ByteOffset) override;
virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) override;

Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol) override;
Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol) override;

Status rawCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount) override;
Status rawCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
void *HostDst, ptrdiff_t ByteCount) override;
Status rawCopyHToD(const void *HostSrc, void *DeviceDst,
ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount) override;

Status rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset, ptrdiff_t ByteCount,
char ByteValue) override;
Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol,
int DeviceIndex) override;
Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol,
int DeviceIndex) override;

Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override;
HandleDestructor getUnregisterHostMemoryHandleDestructor() override;
Expand All @@ -141,7 +124,8 @@ class CUDAPlatform : public Platform {

Status addStreamCallback(Stream &Stream, StreamCallback Callback) override;

Expected<Program> createProgramFromSource(Span<const char> Source) override;
Expected<Program> createProgramFromSource(Span<const char> Source,
int DeviceIndex) override;

Status enqueueEvent(void *Event, void *Stream) override;
bool eventIsDone(void *Event) override;
Expand All @@ -163,6 +147,14 @@ class CUDAPlatform : public Platform {
explicit CUDAPlatform(const std::vector<CUcontext> &Contexts)
: TheContexts(Contexts) {}

Status setContext(int DeviceIndex) {
if (DeviceIndex < 0 ||
static_cast<size_t>(DeviceIndex) >= TheContexts.size())
return Status("invalid deivce index " + std::to_string(DeviceIndex));
return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]),
"cuCtxSetCurrent");
}

// Vector of contexts for each device.
std::vector<CUcontext> TheContexts;
};
Expand Down Expand Up @@ -191,17 +183,6 @@ Expected<CUDAPlatform> CUDAPlatform::create() {
return CUDAPlatform(Contexts);
}

Status CUDAPlatform::setActiveDeviceForThread(int DeviceIndex) {
if (static_cast<size_t>(DeviceIndex) >= TheContexts.size())
return Status("invalid device index for SetActiveDevice: " +
std::to_string(DeviceIndex));
ActiveDeviceIndex = DeviceIndex;
return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]),
"setActiveDeviceForThread cuCtxSetCurrent");
}

int CUDAPlatform::getActiveDeviceForThread() { return ActiveDeviceIndex; }

Expected<int> CUDAPlatform::getDeviceCount() {
int Count = 0;
if (CUresult Result = cuDeviceGetCount(&Count))
Expand All @@ -214,12 +195,15 @@ static void cudaDestroyStream(void *H) {
"cuStreamDestroy");
}

Expected<Stream> CUDAPlatform::createStream() {
Expected<Stream> CUDAPlatform::createStream(int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
unsigned int Flags = CU_STREAM_DEFAULT;
CUstream Handle;
if (CUresult Result = cuStreamCreate(&Handle, Flags))
return getCUError(Result, "cuStreamCreate");
return constructStream(this, Handle, cudaDestroyStream);
return constructStream(this, DeviceIndex, Handle, cudaDestroyStream);
}

Status CUDAPlatform::streamSync(void *Stream) {
Expand All @@ -239,12 +223,15 @@ static void cudaDestroyEvent(void *H) {
logCUWarning(cuEventDestroy(static_cast<CUevent_st *>(H)), "cuEventDestroy");
}

Expected<Event> CUDAPlatform::createEvent() {
Expected<Event> CUDAPlatform::createEvent(int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
unsigned int Flags = CU_EVENT_DEFAULT;
CUevent Handle;
if (CUresult Result = cuEventCreate(&Handle, Flags))
return getCUError(Result, "cuEventCreate");
return constructEvent(this, Handle, cudaDestroyEvent);
return constructEvent(this, DeviceIndex, Handle, cudaDestroyEvent);
}

Status CUDAPlatform::enqueueEvent(void *Event, void *Stream) {
Expand Down Expand Up @@ -272,7 +259,11 @@ Expected<float> CUDAPlatform::getSecondsBetweenEvents(void *StartEvent,
return Milliseconds * 1e-6;
}

Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount) {
Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
if (!ByteCount)
return nullptr;
CUdeviceptr Pointer;
Expand All @@ -298,14 +289,22 @@ void CUDAPlatform::rawDestroyDeviceMemorySpanHandle(void *) {
// Do nothing for this platform.
}

Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol) {
Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
void *Address;
if (cudaError_t Status = cudaGetSymbolAddress(&Address, Symbol))
return getCUDAError(Status, "cudaGetSymbolAddress");
return Address;
}

Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol) {
Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
size_t Size;
if (cudaError_t Status = cudaGetSymbolSize(&Size, Symbol))
return getCUDAError(Status, "cudaGetSymbolSize");
Expand All @@ -320,45 +319,6 @@ static void *offsetVoidPtr(void *Ptr, ptrdiff_t ByteOffset) {
return static_cast<void *>(static_cast<char *>(Ptr) + ByteOffset);
}

Status CUDAPlatform::rawCopyDToD(const void *DeviceSrc,
ptrdiff_t DeviceSrcByteOffset, void *DeviceDst,
ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount) {
return getCUError(cuMemcpyDtoD(reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
DeviceDst, DeviceDstByteOffset)),
reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
DeviceSrc, DeviceSrcByteOffset)),
ByteCount),
"cuMemcpyDtoD");
}

Status CUDAPlatform::rawCopyDToH(const void *DeviceSrc,
ptrdiff_t DeviceSrcByteOffset, void *HostDst,
ptrdiff_t ByteCount) {
return getCUError(
cuMemcpyDtoH(HostDst, reinterpret_cast<CUdeviceptr>(
offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset)),
ByteCount),
"cuMemcpyDtoH");
}

Status CUDAPlatform::rawCopyHToD(const void *HostSrc, void *DeviceDst,
ptrdiff_t DeviceDstByteOffset,
ptrdiff_t ByteCount) {
return getCUError(cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
DeviceDst, DeviceDstByteOffset)),
HostSrc, ByteCount),
"cuMemcpyHtoD");
}

Status CUDAPlatform::rawMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
ptrdiff_t ByteCount, char ByteValue) {
return getCUError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(
offsetVoidPtr(DeviceDst, ByteOffset)),
ByteValue, ByteCount),
"cuMemsetD8");
}

Status CUDAPlatform::rawRegisterHostMem(const void *Memory,
ptrdiff_t ByteCount) {
unsigned int Flags = 0;
Expand Down Expand Up @@ -468,8 +428,11 @@ static void cudaDestroyProgram(void *H) {
logCUWarning(cuModuleUnload(static_cast<CUmod_st *>(H)), "cuModuleUnload");
}

Expected<Program>
CUDAPlatform::createProgramFromSource(Span<const char> Source) {
Expected<Program> CUDAPlatform::createProgramFromSource(Span<const char> Source,
int DeviceIndex) {
Status S = setContext(DeviceIndex);
if (S.isError())
return S;
CUmodule Module;
constexpr int LogBufferSizeBytes = 1024;
char InfoLogBuffer[LogBufferSizeBytes];
Expand Down

0 comments on commit bdc410b

Please sign in to comment.