From 7154919a8e9cfa7e33ccbccec4093c8f303e24eb Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Wed, 12 Nov 2025 20:13:25 -0800 Subject: [PATCH 1/3] [Offload] Add device info for shared memory --- offload/liboffload/API/Device.td | 1 + offload/liboffload/src/OffloadImpl.cpp | 9 +++++++++ offload/plugins-nextgen/amdgpu/src/rtl.cpp | 14 ++++++++++++++ .../common/include/PluginInterface.h | 6 ++++++ offload/plugins-nextgen/cuda/src/rtl.cpp | 12 ++++++++---- .../tools/deviceinfo/llvm-offload-device-info.cpp | 3 +++ .../OffloadAPI/device/olGetDeviceInfo.cpp | 5 +++++ .../OffloadAPI/device/olGetDeviceInfoSize.cpp | 2 ++ 8 files changed, 48 insertions(+), 4 deletions(-) diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index e9c154818c4a1..a918cff6de26e 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -43,6 +43,7 @@ def ol_device_info_t : Enum { TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">, TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">, TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">, + TaggedEtor<"WORK_GROUP_SHARED_MEM_SIZE", "uint64_t", "The maximum size of shared memory per work group in bytes">, ]; list fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor); list native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>); diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 84bc414396811..844ba18e3080c 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -495,6 +495,14 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(static_cast(Value)); } + case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: { + // Uint64 values + if (!std::holds_alternative(Entry->Value)) + return makeError(ErrorCode::BACKEND_FAILURE, + "plugin returned incorrect type"); + return Info.write(std::get(Entry->Value)); + } + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: { // {x, y, z} triples @@ -590,6 +598,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.write(std::numeric_limits::digits); case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: + case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: return Info.write(0); default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 04b394452a448..17d2586dd2d14 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2186,6 +2186,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = checkIfAPU()) return Err; + // Retrieve the size of the group memory. + for (const auto *Pool : AllMemoryPools) { + if (Pool->isGroup()) { + size_t Size = 0; + if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size)) + return Err; + MaxBlockSharedMemSize = Size; + break; + } + } + return Plugin::success(); } @@ -2923,6 +2934,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (Status == HSA_STATUS_SUCCESS) Info.add("Cacheline Size", TmpUInt); + Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE); + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); if (Status == HSA_STATUS_SUCCESS) Info.add("Max Clock Freq", TmpUInt, "MHz", diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 2135e0608323e..b900f1b728736 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -794,6 +794,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Get the unique identifier of the device. const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Get the total shared memory per block that can be used in any kernel. + uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -1251,6 +1254,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Internal representation for OMPT device (initialize & finalize) std::atomic OmptInitialized; #endif + + /// The total per-block native shared memory that a kernel may use. + uint32_t MaxBlockSharedMemSize = 0; }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 45e580e7e0cd7..c8e26790f9f41 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -379,6 +379,12 @@ struct CUDADeviceTy : public GenericDeviceTy { return Err; HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize); + uint32_t MaxSharedMem; + if (auto Err = getDeviceAttr( + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem)) + return Err; + MaxBlockSharedMemSize = MaxSharedMem; + return Plugin::success(); } @@ -1089,10 +1095,8 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) Info.add("Total Constant Memory", TmpInt, "bytes"); - Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - TmpInt); - if (Res == CUDA_SUCCESS) - Info.add("Max Shared Memory per Block", TmpInt, "bytes"); + Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt); if (Res == CUDA_SUCCESS) diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 42ffb97d6d77c..75247760a4af3 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -205,6 +205,9 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B")); OFFLOAD_ERR(printDeviceValue(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE, "Global Mem Size", "B")); + OFFLOAD_ERR(printDeviceValue( + S, D, OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE, + "Work Group Shared Mem Size", "B")); OFFLOAD_ERR( (printDeviceValue( S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG, diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 30eafee026316..b0d8ea7faea5e 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -217,6 +217,11 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0); OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE); +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t, + OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE, + 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t, + OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE); TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) { ol_device_type_t DeviceType; diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index 79a18c1d133dc..11d20004e91fb 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -71,6 +71,8 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE); OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t, + OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE); TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) { size_t Size = 0; From a41ad516516b1da40b7aaf99a4d29615ff32e683 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Wed, 12 Nov 2025 21:06:26 -0800 Subject: [PATCH 2/3] Fix review comment --- offload/liboffload/src/OffloadImpl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 844ba18e3080c..84c86998c12e6 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -496,7 +496,6 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, } case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: { - // Uint64 values if (!std::holds_alternative(Entry->Value)) return makeError(ErrorCode::BACKEND_FAILURE, "plugin returned incorrect type"); From 6745dd52e446e71b60ada48d79bef68b12923839 Mon Sep 17 00:00:00 2001 From: Kevin Sala Date: Thu, 13 Nov 2025 09:56:08 -0800 Subject: [PATCH 3/3] Fix review comments --- offload/liboffload/API/Device.td | 2 +- offload/liboffload/src/OffloadImpl.cpp | 4 ++-- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 7 +++---- offload/plugins-nextgen/common/include/PluginInterface.h | 7 ++++--- offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +- offload/tools/deviceinfo/llvm-offload-device-info.cpp | 6 +++--- offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp | 4 ++-- .../unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index a918cff6de26e..6ada191089674 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -43,7 +43,7 @@ def ol_device_info_t : Enum { TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">, TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">, TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">, - TaggedEtor<"WORK_GROUP_SHARED_MEM_SIZE", "uint64_t", "The maximum size of shared memory per work group in bytes">, + TaggedEtor<"WORK_GROUP_LOCAL_MEM_SIZE", "uint64_t", "The maximum size of local shared memory per work group in bytes">, ]; list fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor); list native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>); diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 84c86998c12e6..eab9627217ca8 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -495,7 +495,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(static_cast(Value)); } - case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: { + case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE: { if (!std::holds_alternative(Entry->Value)) return makeError(ErrorCode::BACKEND_FAILURE, "plugin returned incorrect type"); @@ -597,7 +597,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.write(std::numeric_limits::digits); case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: - case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: + case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE: return Info.write(0); default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 17d2586dd2d14..51965093cb949 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2189,10 +2189,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // Retrieve the size of the group memory. for (const auto *Pool : AllMemoryPools) { if (Pool->isGroup()) { - size_t Size = 0; - if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size)) + if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, + MaxBlockSharedMemSize)) return Err; - MaxBlockSharedMemSize = Size; break; } } @@ -2935,7 +2934,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Info.add("Cacheline Size", TmpUInt); Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes", - DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE); + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); if (Status == HSA_STATUS_SUCCESS) diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index b900f1b728736..1d52c960b7fde 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -794,8 +794,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Get the unique identifier of the device. const char *getDeviceUid() const { return DeviceUid.c_str(); } - /// Get the total shared memory per block that can be used in any kernel. - uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; } + /// Get the total shared memory per block (in bytes) that can be used in any + /// kernel. + size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; } /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. @@ -1256,7 +1257,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { #endif /// The total per-block native shared memory that a kernel may use. - uint32_t MaxBlockSharedMemSize = 0; + size_t MaxBlockSharedMemSize = 0; }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index c8e26790f9f41..a27c6f3de0cd3 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1096,7 +1096,7 @@ struct CUDADeviceTy : public GenericDeviceTy { Info.add("Total Constant Memory", TmpInt, "bytes"); Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes", - DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE); + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt); if (Res == CUDA_SUCCESS) diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 75247760a4af3..74af3bfb13303 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -205,9 +205,9 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B")); OFFLOAD_ERR(printDeviceValue(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE, "Global Mem Size", "B")); - OFFLOAD_ERR(printDeviceValue( - S, D, OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE, - "Work Group Shared Mem Size", "B")); + OFFLOAD_ERR( + printDeviceValue(S, D, OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE, + "Work Group Shared Mem Size", "B")); OFFLOAD_ERR( (printDeviceValue( S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG, diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index b0d8ea7faea5e..ba29fb153682d 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -218,10 +218,10 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t, OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE); OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t, - OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE, + OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE, 0); OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t, - OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE); + OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE); TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) { ol_device_type_t DeviceType; diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index 11d20004e91fb..2c375eb555a48 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -72,7 +72,7 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t, OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE); OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t, - OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE); + OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE); TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) { size_t Size = 0;