Skip to content

Commit

Permalink
[OpenMP][CUDA] Fix the issue that P2P memcpy doesn't work
Browse files Browse the repository at this point in the history
This patch fixes the issue that P2P memcpy doesn't work. The root cause is we didn't set current context when calling the API function. In addition, a matrix to track the states of each pair of devices is also added such that we only need to query and configure the device once.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D122764
  • Loading branch information
shiltian committed Jun 28, 2022
1 parent fd26d86 commit 2695e23
Showing 1 changed file with 67 additions and 29 deletions.
96 changes: 67 additions & 29 deletions openmp/libomptarget/plugins/cuda/src/rtl.cpp
Expand Up @@ -355,6 +355,10 @@ class DeviceRTLTy {
/// devices.
std::vector<bool> InitializedFlags;

enum class PeerAccessState : uint8_t { Unkown, Yes, No };
std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
std::mutex PeerAccessMatrixLock;

/// A class responsible for interacting with device native runtime library to
/// allocate and free memory.
class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
Expand Down Expand Up @@ -520,6 +524,9 @@ class DeviceRTLTy {
Modules.resize(NumberOfDevices);
StreamPool.resize(NumberOfDevices);
EventPool.resize(NumberOfDevices);
PeerAccessMatrix.resize(NumberOfDevices);
for (auto &V : PeerAccessMatrix)
V.resize(NumberOfDevices, PeerAccessState::Unkown);

// Get environment variables regarding teams
if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
Expand Down Expand Up @@ -1015,48 +1022,77 @@ class DeviceRTLTy {
}

int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
int64_t Size, __tgt_async_info *AsyncInfo) const {
int64_t Size, __tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");

CUresult Err;
CUstream Stream = getStream(SrcDevId, AsyncInfo);

// If they are two devices, we try peer to peer copy first
if (SrcDevId != DstDevId) {
int CanAccessPeer = 0;
Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
if (Err != CUDA_SUCCESS) {
REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
", dst = %" PRId32 "\n",
std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);

switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
case PeerAccessState::No: {
REPORT("Peer access from %" PRId32 " to %" PRId32
" is not supported. Fall back to D2D memcpy.\n",
SrcDevId, DstDevId);
CUDA_ERR_STRING(Err);
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}
case PeerAccessState::Unkown: {
int CanAccessPeer = 0;
Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
if (Err != CUDA_SUCCESS) {
REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
SrcDevId, DstDevId);
CUDA_ERR_STRING(Err);
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}

if (!CanAccessPeer) {
DP("P2P memcpy not supported so fall back to D2D memcpy");
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}
if (!CanAccessPeer) {
REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
"memcpy.\n",
SrcDevId, DstDevId);
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}

Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
if (Err != CUDA_SUCCESS) {
REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
", dst = %" PRId32 "\n",
SrcDevId, DstDevId);
Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
if (Err != CUDA_SUCCESS) {
REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
SrcDevId, DstDevId);
CUDA_ERR_STRING(Err);
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}

PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;

LLVM_FALLTHROUGH;
}
case PeerAccessState::Yes: {
Err = cuMemcpyPeerAsync(
(CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
if (Err == CUDA_SUCCESS)
return OFFLOAD_SUCCESS;

DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
". Fall back to D2D memcpy.\n",
DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
CUDA_ERR_STRING(Err);

return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
}

Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
Size, Stream);
if (Err == CUDA_SUCCESS)
return OFFLOAD_SUCCESS;

DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
CUDA_ERR_STRING(Err);
default:
REPORT("Unknown PeerAccessState %d.\n",
int(PeerAccessMatrix[SrcDevId][DstDevId]));
return OFFLOAD_FAIL;
}
}

return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
Expand Down Expand Up @@ -1598,8 +1634,10 @@ int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
assert(AsyncInfo && "AsyncInfo is nullptr");
// NOTE: We don't need to set context for data exchange as the device contexts
// are passed to CUDA function directly.

if (DeviceRTL.setContext(src_dev_id) != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;

return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
AsyncInfo);
}
Expand Down

0 comments on commit 2695e23

Please sign in to comment.