diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index e9c154818c4a1..6ada191089674 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -43,6 +43,7 @@ def ol_device_info_t : Enum { TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">, TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">, TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">, + TaggedEtor<"WORK_GROUP_LOCAL_MEM_SIZE", "uint64_t", "The maximum size of local shared memory per work group in bytes">, ]; list fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor); list native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>); diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 84bc414396811..eab9627217ca8 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -495,6 +495,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, return Info.write(static_cast(Value)); } + case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE: { + if (!std::holds_alternative(Entry->Value)) + return makeError(ErrorCode::BACKEND_FAILURE, + "plugin returned incorrect type"); + return Info.write(std::get(Entry->Value)); + } + case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION: case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: { // {x, y, z} triples @@ -590,6 +597,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.write(std::numeric_limits::digits); case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: case OL_DEVICE_INFO_GLOBAL_MEM_SIZE: + case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE: return Info.write(0); default: return createOffloadError(ErrorCode::INVALID_ENUMERATION, diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 04b394452a448..51965093cb949 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2186,6 +2186,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = checkIfAPU()) return Err; + // Retrieve the size of the group memory. + for (const auto *Pool : AllMemoryPools) { + if (Pool->isGroup()) { + if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, + MaxBlockSharedMemSize)) + return Err; + break; + } + } + return Plugin::success(); } @@ -2923,6 +2933,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (Status == HSA_STATUS_SUCCESS) Info.add("Cacheline Size", TmpUInt); + Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); if (Status == HSA_STATUS_SUCCESS) Info.add("Max Clock Freq", TmpUInt, "MHz", diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 2135e0608323e..1d52c960b7fde 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -794,6 +794,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Get the unique identifier of the device. const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Get the total shared memory per block (in bytes) that can be used in any + /// kernel. + size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -1251,6 +1255,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Internal representation for OMPT device (initialize & finalize) std::atomic OmptInitialized; #endif + + /// The total per-block native shared memory that a kernel may use. + size_t MaxBlockSharedMemSize = 0; }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 45e580e7e0cd7..a27c6f3de0cd3 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -379,6 +379,12 @@ struct CUDADeviceTy : public GenericDeviceTy { return Err; HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize); + uint32_t MaxSharedMem; + if (auto Err = getDeviceAttr( + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem)) + return Err; + MaxBlockSharedMemSize = MaxSharedMem; + return Plugin::success(); } @@ -1089,10 +1095,8 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) Info.add("Total Constant Memory", TmpInt, "bytes"); - Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - TmpInt); - if (Res == CUDA_SUCCESS) - Info.add("Max Shared Memory per Block", TmpInt, "bytes"); + Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt); if (Res == CUDA_SUCCESS) diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 42ffb97d6d77c..74af3bfb13303 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -205,6 +205,9 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B")); OFFLOAD_ERR(printDeviceValue(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE, "Global Mem Size", "B")); + OFFLOAD_ERR( + printDeviceValue(S, D, OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE, + "Work Group Shared Mem Size", "B")); OFFLOAD_ERR( (printDeviceValue( S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG, diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 30eafee026316..ba29fb153682d 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -217,6 +217,11 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0); OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE); +OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t, + OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE, + 0); +OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t, + OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE); TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) { ol_device_type_t DeviceType; diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index 79a18c1d133dc..2c375eb555a48 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -71,6 +71,8 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE); OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t, OL_DEVICE_INFO_GLOBAL_MEM_SIZE); +OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t, + OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE); TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) { size_t Size = 0;