diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h index 573a2ef8fc200..64a1d3308aed0 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h @@ -288,6 +288,7 @@ typedef enum { HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, } hsa_executable_symbol_info_t; typedef struct hsa_code_object_s { diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index a529c379844e9..9b1f0204ad49d 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -436,6 +436,7 @@ struct AMDGPUKernelTy : public GenericKernelTy { {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &KernelObject}, {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &ArgsSize}, {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &GroupSize}, + {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &DynamicStack}, {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &PrivateSize}}; for (auto &Info : RequiredInfos) { @@ -485,6 +486,9 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// @return 56 for cov4 and 256 for cov5 uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; } + /// Indicates whether or not we need to set up our own private segment size. + bool usesDynamicStack() const { return DynamicStack; } + private: /// The kernel object to execute. uint64_t KernelObject; @@ -493,6 +497,7 @@ struct AMDGPUKernelTy : public GenericKernelTy { uint32_t ArgsSize; uint32_t GroupSize; uint32_t PrivateSize; + bool DynamicStack; /// The size of implicit kernel arguments. uint32_t ImplicitArgsSize; @@ -621,7 +626,8 @@ struct AMDGPUQueueTy { /// signal and can define an optional input signal (nullptr if none). Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, uint32_t NumThreads, uint64_t NumBlocks, - uint32_t GroupSize, AMDGPUSignalTy *OutputSignal, + uint32_t GroupSize, uint64_t StackSize, + AMDGPUSignalTy *OutputSignal, AMDGPUSignalTy *InputSignal) { assert(OutputSignal && "Invalid kernel output signal"); @@ -658,7 +664,8 @@ struct AMDGPUQueueTy { Packet->grid_size_x = NumBlocks * NumThreads; Packet->grid_size_y = 1; Packet->grid_size_z = 1; - Packet->private_segment_size = Kernel.getPrivateSize(); + Packet->private_segment_size = + Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize(); Packet->group_segment_size = GroupSize; Packet->kernel_object = Kernel.getKernelObject(); Packet->kernarg_address = KernelArgs; @@ -1124,7 +1131,7 @@ struct AMDGPUStreamTy { /// the kernel args buffer to the specified memory manager. Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, uint32_t NumThreads, uint64_t NumBlocks, - uint32_t GroupSize, + uint32_t GroupSize, uint64_t StackSize, AMDGPUMemoryManagerTy &MemoryManager) { if (Queue == nullptr) return Plugin::error("Target queue was nullptr"); @@ -1147,7 +1154,8 @@ struct AMDGPUStreamTy { // Push the kernel with the output signal and an input signal (optional) return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks, - GroupSize, OutputSignal, InputSignal); + GroupSize, StackSize, OutputSignal, + InputSignal); } /// Push an asynchronous memory copy between pinned memory buffers. @@ -2574,10 +2582,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Getters and setters for stack and heap sizes. Error getDeviceStackSize(uint64_t &Value) override { - Value = 0; + Value = StackSize; return Plugin::success(); } Error setDeviceStackSize(uint64_t Value) override { + StackSize = Value; return Plugin::success(); } Error getDeviceHeapSize(uint64_t &Value) override { @@ -2728,6 +2737,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// The current size of the global device memory pool (managed by us). uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */; + + /// The current size of the stack that will be used in cases where it could + /// not be statically determined. + uint64_t StackSize = 16 * 1024 /* 16 KB */; }; Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) { @@ -3100,6 +3113,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, GroupSize += MaxDynCGroupMem; } + uint64_t StackSize; + if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) + return Err; + // Initialize implicit arguments. utils::AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast( @@ -3138,7 +3155,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Push the kernel launch into the stream. return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, - GroupSize, ArgsMemoryManager); + GroupSize, StackSize, ArgsMemoryManager); } Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h index f09ae24163dfc..9174ecaab08ca 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -864,6 +864,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { return 0; } + virtual Error getDeviceStackSize(uint64_t &V) = 0; + private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, @@ -882,7 +884,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output /// value to zero for the getters. - virtual Error getDeviceStackSize(uint64_t &V) = 0; virtual Error setDeviceStackSize(uint64_t V) = 0; virtual Error getDeviceHeapSize(uint64_t &V) = 0; virtual Error setDeviceHeapSize(uint64_t V) = 0;