From e412342219be578041e6a4f4e5e029c760762431 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Mon, 17 Nov 2025 11:51:29 +0000 Subject: [PATCH 1/2] Allow amdgpu-waves-per-eu to lower occ. target --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 38 +++++---- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 12 +++ ...-work-group-size-overrides-waves-per-eu.ll | 61 -------------- .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 25 +++--- .../test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll | 4 +- ...ves-per-eu-hints-lower-occupancy-target.ll | 83 +++++++++++++++++++ 7 files changed, 135 insertions(+), 92 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll create mode 100644 llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index c7528f993da1e..5325fc22e603a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -190,23 +190,29 @@ std::pair AMDGPUSubtarget::getEffectiveWavesPerEU( // sizes limits the achievable maximum, and we aim to support enough waves per // EU so that we can concurrently execute all waves of a single workgroup of // maximum size on a CU. - std::pair Default = { + std::pair WavesPerEU = { getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second), getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second}; - Default.first = std::min(Default.first, Default.second); - - // Make sure requested minimum is within the default range and lower than the - // requested maximum. The latter must not violate target specification. - if (RequestedWavesPerEU.first < Default.first || - RequestedWavesPerEU.first > Default.second || - RequestedWavesPerEU.first > RequestedWavesPerEU.second || - RequestedWavesPerEU.second > getMaxWavesPerEU()) - return Default; - - // We cannot exceed maximum occupancy implied by flat workgroup size and LDS. - RequestedWavesPerEU.second = - std::min(RequestedWavesPerEU.second, Default.second); - return RequestedWavesPerEU; + WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second); + + // Requested minimum must not violate subtarget's specifications and be no + // greater than maximum. + if (RequestedWavesPerEU.first && + (RequestedWavesPerEU.first < getMinWavesPerEU() || + RequestedWavesPerEU.first > RequestedWavesPerEU.second)) + return WavesPerEU; + // Requested maximum must not violate subtarget's specifications. + if (RequestedWavesPerEU.second > getMaxWavesPerEU()) + return WavesPerEU; + + // A requested maximum may limit both the final minimum and maximum, but + // not increase them. A requested minimum can either decrease or increase the + // default minimum. + WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second); + if (RequestedWavesPerEU.first) + WavesPerEU.first = RequestedWavesPerEU.first; + WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second); + return WavesPerEU; } std::pair @@ -225,7 +231,7 @@ std::pair AMDGPUSubtarget::getWavesPerEU(std::pair FlatWorkGroupSizes, unsigned LDSBytes, const Function &F) const { // Default minimum/maximum number of waves per execution unit. - std::pair Default(1, getMaxWavesPerEU()); + std::pair Default(0, getMaxWavesPerEU()); // Requested minimum/maximum number of waves per execution unit. std::pair Requested = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index ed03ef21b6dda..2133d7297ac75 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -150,7 +150,9 @@ class AMDGPUSubtarget { /// Returns the target minimum/maximum number of waves per EU. This is based /// on the minimum/maximum number of \p RequestedWavesPerEU and further /// limited by the maximum achievable occupancy derived from the range of \p - /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. + /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A minimum + /// requested waves/EU value of 0 indicates an intent to not restrict the + /// minimum target occupancy. std::pair getEffectiveWavesPerEU(std::pair RequestedWavesPerEU, std::pair FlatWorkGroupSizes, diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 41bce31c6ebc0..4379fc7385d4f 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -225,3 +225,15 @@ entry: ret void } attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"} + +; At most 2 waves per execution unit. +; CHECK-LABEL: {{^}}empty_at_most_2: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 21 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 85 +define amdgpu_kernel void @empty_at_most_2() #13 { +entry: + ret void +} +attributes #13 = {"amdgpu-waves-per-eu"="0,2"} diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll deleted file mode 100644 index 67061bcb2a785..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll +++ /dev/null @@ -1,61 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s - -; Both of these kernels have the same value for -; amdgpu-flat-work-group-size, except one explicitly sets it. This is -; a program visible property which should always take precedence over -; the amdgpu-waves-per-eu optimization hint. -; -; The range is incompatible with the amdgpu-waves-per-eu value, so the -; flat work group size should take precedence implying a requirement -; to support 1024 size workgroups (which exceeds the available LDS -; amount). - -; CHECK-NOT: @no_flat_workgroup_size.stack -; CHECK-NOT: @explicit_default_workgroup_size.stack - -; CHECK-LABEL: @no_flat_workgroup_size( -; CHECK: alloca [5 x i32] -; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4 -define amdgpu_kernel void @no_flat_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { -entry: - %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, ptr addrspace(1) %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 - store i32 4, ptr addrspace(5) %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 - %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 - store i32 5, ptr addrspace(5) %arrayidx3, align 4 - %2 = load i32, ptr addrspace(5) %stack, align 4 - store i32 %2, ptr addrspace(1) %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 - %3 = load i32, ptr addrspace(5) %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 - store i32 %3, ptr addrspace(1) %arrayidx13 - ret void -} - -; CHECK-LABEL: @explicit_default_workgroup_size( -; CHECK: alloca [5 x i32] -; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4 -define amdgpu_kernel void @explicit_default_workgroup_size(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 { -entry: - %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, ptr addrspace(1) %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 - store i32 4, ptr addrspace(5) %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 - %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 - store i32 5, ptr addrspace(5) %arrayidx3, align 4 - %2 = load i32, ptr addrspace(5) %stack, align 4 - store i32 %2, ptr addrspace(1) %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 - %3 = load i32, ptr addrspace(5) %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 - store i32 %3, ptr addrspace(1) %arrayidx13 - ret void -} - -attributes #0 = { "amdgpu-waves-per-eu"="1,1" } -attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 0be314772abdb..3b5a7cd3707b6 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s -; Check propagation of amdgpu-flat-work-group-size attribute. +; Check propagation of amdgpu-waves-per-eu attribute. ; Called from a single kernel with 1,8 define internal void @default_to_1_8_a() { @@ -216,30 +216,31 @@ define internal i32 @bitcasted_function() { ret i32 0 } -define internal void @called_from_invalid_bounds_0() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_0 +define internal void @called_without_min_waves() { +; CHECK-LABEL: define internal void @called_without_min_waves ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: ret void ; ret void } -define internal void @called_from_invalid_bounds_1() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_1 +define internal void @called_from_invalid_bounds() { +; CHECK-LABEL: define internal void @called_from_invalid_bounds ; CHECK-SAME: () #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void } -; Invalid range for amdgpu-waves-per-eu -define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { -; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8 +; Kernel does not specify a minimum desired occupancy so will end up with the +; default minimum (and the requested maximum). +define amdgpu_kernel void @kernel_without_min_waves() #9 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_without_min_waves ; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_0() +; CHECK-NEXT: call void @called_without_min_waves() ; CHECK-NEXT: ret void ; - call void @called_from_invalid_bounds_0() + call void @called_without_min_waves() ret void } @@ -247,10 +248,10 @@ define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123 ; CHECK-SAME: () #[[ATTR11:[0-9]+]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_1() +; CHECK-NEXT: call void @called_from_invalid_bounds() ; CHECK-NEXT: ret void ; - call void @called_from_invalid_bounds_1() + call void @called_from_invalid_bounds() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll index 9b992a35c3303..b14bb8c292ad7 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll @@ -575,5 +575,5 @@ define amdgpu_kernel void @use1024vgprs_codegen(ptr %p) #1281 { ret void } -attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="1" } -attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" "amdgpu-waves-per-eu"="1" } +attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" } +attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" } diff --git a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll new file mode 100644 index 0000000000000..b0e386cba5aff --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll @@ -0,0 +1,83 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s + +; All kernels have the same value for amdgpu-flat-work-group-size, except the +; second and third kernels explicitly set it. The first and second kernel also +; have the same final waves/EU range, except the second kernel explicitly sets +; it with amdgpu-waves-per-eu. +; +; The third kernel hints the compiler that a maximum occupancy of 1 is desired +; with amdgpu-waves-per-eu, so the alloca promotion pass is free to use more LDS +; space than when limiting itself to support the maximum default occupancy of +; 10. This does not break the ABI requirement to support the full possible range +; of workgroup sizes + +; CHECK-NOT: @no_attributes.stack +; CHECK-NOT: @explicit_default_workgroup_size_and_waves.stack + +; CHECK-LABEL: @no_attributes( +; CHECK: alloca [5 x i32] +; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4 +define amdgpu_kernel void @no_attributes(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) { +entry: + %stack = alloca [5 x i32], align 4, addrspace(5) + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 + ret void +} + +; CHECK-LABEL: @explicit_default_workgroup_size_and_waves( +; CHECK: alloca [5 x i32] +; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4 +define amdgpu_kernel void @explicit_default_workgroup_size_and_waves(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { +entry: + %stack = alloca [5 x i32], align 4, addrspace(5) + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 + ret void +} + +; CHECK-LABEL: @explicit_low_occupancy_requested( +; CHECK-NOT: alloca [5 x i32] +define amdgpu_kernel void @explicit_low_occupancy_requested(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 { +entry: + %stack = alloca [5 x i32], align 4, addrspace(5) + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="4,10" "amdgpu-flat-work-group-size"="1,1024" } +attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" } From ae902de6f383a5b3f98992d2ff56be4926168bbb Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <11032120+lucas-rami@users.noreply.github.com> Date: Mon, 17 Nov 2025 13:14:21 +0100 Subject: [PATCH 2/2] Clarify test comment --- .../AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll index b0e386cba5aff..46733a8757ad8 100644 --- a/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll +++ b/llvm/test/CodeGen/AMDGPU/waves-per-eu-hints-lower-occupancy-target.ll @@ -3,13 +3,14 @@ ; All kernels have the same value for amdgpu-flat-work-group-size, except the ; second and third kernels explicitly set it. The first and second kernel also ; have the same final waves/EU range, except the second kernel explicitly sets -; it with amdgpu-waves-per-eu. +; it with amdgpu-waves-per-eu. As a result the first and second kernels are +; treated identically. ; ; The third kernel hints the compiler that a maximum occupancy of 1 is desired ; with amdgpu-waves-per-eu, so the alloca promotion pass is free to use more LDS ; space than when limiting itself to support the maximum default occupancy of ; 10. This does not break the ABI requirement to support the full possible range -; of workgroup sizes +; of workgroup sizes as specified by amdgpu-flat-work-group-size. ; CHECK-NOT: @no_attributes.stack ; CHECK-NOT: @explicit_default_workgroup_size_and_waves.stack