Skip to content

Conversation

@kevinsala
Copy link
Contributor

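Add an `OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE` device info query that reports the maximum size of shared memory per work group in bytes. This will be needed by #152831.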

@llvmbot
Copy link
Member

llvmbot commented Nov 13, 2025

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-offload

Author: Kevin Sala Penades (kevinsala)

Changes

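Add an `OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE` device info query that reports the maximum size of shared memory per work group in bytes. This will be needed by #152831.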


Full diff: https://github.com/llvm/llvm-project/pull/167817.diff

8 Files Affected:

  • (modified) offload/liboffload/API/Device.td (+1)
  • (modified) offload/liboffload/src/OffloadImpl.cpp (+9)
  • (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+14)
  • (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+6)
  • (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+8-4)
  • (modified) offload/tools/deviceinfo/llvm-offload-device-info.cpp (+3)
  • (modified) offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp (+5)
  • (modified) offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp (+2)
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index e9c154818c4a1..a918cff6de26e 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -43,6 +43,7 @@ def ol_device_info_t : Enum {
     TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">,
     TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">,
     TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">,
+    TaggedEtor<"WORK_GROUP_SHARED_MEM_SIZE", "uint64_t", "The maximum size of shared memory per work group in bytes">,
   ];
   list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
   list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 84bc414396811..844ba18e3080c 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -495,6 +495,14 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
     return Info.write(static_cast<uint32_t>(Value));
   }
 
+  case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: {
+    // Uint64 values
+    if (!std::holds_alternative<uint64_t>(Entry->Value))
+      return makeError(ErrorCode::BACKEND_FAILURE,
+                       "plugin returned incorrect type");
+    return Info.write(std::get<uint64_t>(Entry->Value));
+  }
+
   case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: {
     // {x, y, z} triples
@@ -590,6 +598,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
     return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits);
   case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
   case OL_DEVICE_INFO_GLOBAL_MEM_SIZE:
+  case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE:
     return Info.write<uint64_t>(0);
   default:
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 04b394452a448..17d2586dd2d14 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2186,6 +2186,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = checkIfAPU())
       return Err;
 
+    // Retrieve the size of the group memory.
+    for (const auto *Pool : AllMemoryPools) {
+      if (Pool->isGroup()) {
+        size_t Size = 0;
+        if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+          return Err;
+        MaxBlockSharedMemSize = Size;
+        break;
+      }
+    }
+
     return Plugin::success();
   }
 
@@ -2923,6 +2934,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Cacheline Size", TmpUInt);
 
+    Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes",
+             DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE);
+
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Max Clock Freq", TmpUInt, "MHz",
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 2135e0608323e..b900f1b728736 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -794,6 +794,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Get the unique identifier of the device.
   const char *getDeviceUid() const { return DeviceUid.c_str(); }
 
+  /// Get the total shared memory per block that can be used in any kernel.
+  uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
   /// Set the context of the device if needed, before calling device-specific
   /// functions. Plugins may implement this function as a no-op if not needed.
   virtual Error setContext() = 0;
@@ -1251,6 +1254,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Internal representation for OMPT device (initialize & finalize)
   std::atomic<bool> OmptInitialized;
 #endif
+
+  /// The total per-block native shared memory that a kernel may use.
+  uint32_t MaxBlockSharedMemSize = 0;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 45e580e7e0cd7..c8e26790f9f41 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -379,6 +379,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
       return Err;
     HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
 
+    uint32_t MaxSharedMem;
+    if (auto Err = getDeviceAttr(
+            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
+      return Err;
+    MaxBlockSharedMemSize = MaxSharedMem;
+
     return Plugin::success();
   }
 
@@ -1089,10 +1095,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_SUCCESS)
       Info.add("Total Constant Memory", TmpInt, "bytes");
 
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
-                           TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Max Shared Memory per Block", TmpInt, "bytes");
+    Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes",
+             DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
     if (Res == CUDA_SUCCESS)
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 42ffb97d6d77c..75247760a4af3 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -205,6 +205,9 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
       S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B"));
   OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
                                          "Global Mem Size", "B"));
+  OFFLOAD_ERR(printDeviceValue<uint64_t>(
+      S, D, OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE,
+      "Work Group Shared Mem Size", "B"));
   OFFLOAD_ERR(
       (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
           S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG,
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
index 30eafee026316..b0d8ea7faea5e 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
@@ -217,6 +217,11 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t,
                                     OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0);
 OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t,
                                  OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t,
+                                    OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE,
+                                    0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t,
+                                 OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE);
 
 TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) {
   ol_device_type_t DeviceType;
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
index 79a18c1d133dc..11d20004e91fb 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
@@ -71,6 +71,8 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t,
                             OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
 OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t,
                             OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t,
+                            OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE);
 
 TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
   size_t Size = 0;

Copy link
Contributor

@ro-i ro-i left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM in general, but probably best to see if Joseph has any further comments :)

@kevinsala
Copy link
Contributor Author

All review comments have been addressed now.

@kevinsala kevinsala merged commit 1a86f0a into llvm:main Nov 13, 2025
10 checks passed
ronlieb added a commit to ROCm/llvm-project that referenced this pull request Nov 13, 2025
ronlieb added a commit to ROCm/llvm-project that referenced this pull request Nov 14, 2025
ronlieb added a commit to ROCm/llvm-project that referenced this pull request Nov 14, 2025
ronlieb pushed a commit to ROCm/llvm-project that referenced this pull request Nov 14, 2025
ronlieb added a commit to ROCm/llvm-project that referenced this pull request Nov 14, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants