-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[Offload] Add device info for shared memory #167817
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-offload Author: Kevin Sala Penades (kevinsala) Changes: This will be needed by #152831. Full diff: https://github.com/llvm/llvm-project/pull/167817.diff 8 Files Affected:
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index e9c154818c4a1..a918cff6de26e 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -43,6 +43,7 @@ def ol_device_info_t : Enum {
TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">,
TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">,
TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">,
+ TaggedEtor<"WORK_GROUP_SHARED_MEM_SIZE", "uint64_t", "The maximum size of shared memory per work group in bytes">,
];
list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 84bc414396811..844ba18e3080c 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -495,6 +495,14 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
return Info.write(static_cast<uint32_t>(Value));
}
+ case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: {
+ // Uint64 values
+ if (!std::holds_alternative<uint64_t>(Entry->Value))
+ return makeError(ErrorCode::BACKEND_FAILURE,
+ "plugin returned incorrect type");
+ return Info.write(std::get<uint64_t>(Entry->Value));
+ }
+
case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: {
// {x, y, z} triples
@@ -590,6 +598,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits);
case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
case OL_DEVICE_INFO_GLOBAL_MEM_SIZE:
+ case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE:
return Info.write<uint64_t>(0);
default:
return createOffloadError(ErrorCode::INVALID_ENUMERATION,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 04b394452a448..17d2586dd2d14 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2186,6 +2186,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;
+ // Retrieve the size of the group memory.
+ for (const auto *Pool : AllMemoryPools) {
+ if (Pool->isGroup()) {
+ size_t Size = 0;
+ if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+ return Err;
+ MaxBlockSharedMemSize = Size;
+ break;
+ }
+ }
+
return Plugin::success();
}
@@ -2923,6 +2934,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (Status == HSA_STATUS_SUCCESS)
Info.add("Cacheline Size", TmpUInt);
+ Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes",
+ DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE);
+
Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Max Clock Freq", TmpUInt, "MHz",
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 2135e0608323e..b900f1b728736 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -794,6 +794,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Get the unique identifier of the device.
const char *getDeviceUid() const { return DeviceUid.c_str(); }
+ /// Get the total shared memory per block that can be used in any kernel.
+ uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
@@ -1251,6 +1254,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Internal representation for OMPT device (initialize & finalize)
std::atomic<bool> OmptInitialized;
#endif
+
+ /// The total per-block native shared memory that a kernel may use.
+ uint32_t MaxBlockSharedMemSize = 0;
};
/// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 45e580e7e0cd7..c8e26790f9f41 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -379,6 +379,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Err;
HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
+ uint32_t MaxSharedMem;
+ if (auto Err = getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
+ return Err;
+ MaxBlockSharedMemSize = MaxSharedMem;
+
return Plugin::success();
}
@@ -1089,10 +1095,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (Res == CUDA_SUCCESS)
Info.add("Total Constant Memory", TmpInt, "bytes");
- Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
- TmpInt);
- if (Res == CUDA_SUCCESS)
- Info.add("Max Shared Memory per Block", TmpInt, "bytes");
+ Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes",
+ DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE);
Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
if (Res == CUDA_SUCCESS)
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 42ffb97d6d77c..75247760a4af3 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -205,6 +205,9 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B"));
OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
"Global Mem Size", "B"));
+ OFFLOAD_ERR(printDeviceValue<uint64_t>(
+ S, D, OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE,
+ "Work Group Shared Mem Size", "B"));
OFFLOAD_ERR(
(printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG,
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
index 30eafee026316..b0d8ea7faea5e 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
@@ -217,6 +217,11 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t,
OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0);
OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t,
OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t,
+ OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE,
+ 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t,
+ OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE);
TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) {
ol_device_type_t DeviceType;
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
index 79a18c1d133dc..11d20004e91fb 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
@@ -71,6 +71,8 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t,
OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t,
OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t,
+ OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE);
TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
size_t Size = 0;
|
ro-i
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM in general, but probably best to see if Joseph has any further comments :)
|
All review comments have been addressed now. |
This reverts commit 1a86f0a.
This reverts commit 1a86f0a.
This reverts commit 1a86f0a.
This reverts commit 1a86f0a.
This will be needed by #152831