diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 5d197d8b4373e1..3fa80e56f288dd 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -856,6 +856,8 @@ The AMDGPU backend supports the following LLVM IR attributes. "amdgpu-flat-work-group-size"="min,max" Specify the minimum and maximum flat work group sizes that will be specified when the kernel is dispatched. Generated by the ``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_. + The implied default value is 1,1024. + "amdgpu-implicitarg-num-bytes"="n" Number of kernel argument bytes to add to the kernel argument block size for the implicit arguments. This varies by OS and language (for OpenCL see @@ -866,7 +868,12 @@ The AMDGPU backend supports the following LLVM IR attributes. ``amdgpu_num_vgpr`` CLANG attribute [CLANG-ATTR]_. "amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per execution unit. Generated by the ``amdgpu_waves_per_eu`` - CLANG attribute [CLANG-ATTR]_. + CLANG attribute [CLANG-ATTR]_. This is an optimization hint, + and the backend may not be able to satisfy the request. If + the specified range is incompatible with the function's + "amdgpu-flat-work-group-size" value, the implied occupancy + bounds by the workgroup size takes precedence. + "amdgpu-ieee" true/false. Specify whether the function expects the IEEE field of the mode register to be set on entry. Overrides the default for the calling convention. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1d3b12380bb966..d19431a59dbff8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -544,8 +544,6 @@ std::pair AMDGPUSubtarget::getWavesPerEU( unsigned MinImpliedByFlatWorkGroupSize = getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second); Default.first = MinImpliedByFlatWorkGroupSize; - bool RequestedFlatWorkGroupSize = - F.hasFnAttribute("amdgpu-flat-work-group-size"); // Requested minimum/maximum number of waves per execution unit. std::pair Requested = AMDGPU::getIntegerPairAttribute( @@ -562,8 +560,7 @@ std::pair AMDGPUSubtarget::getWavesPerEU( // Make sure requested values are compatible with values implied by requested // minimum/maximum flat work group sizes. - if (RequestedFlatWorkGroupSize && - Requested.first < MinImpliedByFlatWorkGroupSize) + if (Requested.first < MinImpliedByFlatWorkGroupSize) return Default; return Requested; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index c55fff82f117aa..ecff0fb2b1ceb0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -16,7 +16,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40 ; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80 -; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 @@ -158,10 +157,23 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:272 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:276 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:280 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:284 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:540 ; 4-byte Folded Spill ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292 ; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296 @@ -218,38 +230,40 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500 ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504 ; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508 -; GCN-NEXT: s_waitcnt vmcnt(60) -; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21] -; GCN-NEXT: s_waitcnt vmcnt(57) -; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16 -; GCN-NEXT: s_waitcnt vmcnt(54) -; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32 -; GCN-NEXT: s_waitcnt vmcnt(51) -; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48 -; GCN-NEXT: s_waitcnt vmcnt(48) -; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64 -; GCN-NEXT: s_waitcnt vmcnt(45) -; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80 -; GCN-NEXT: s_waitcnt vmcnt(42) -; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96 -; GCN-NEXT: s_waitcnt vmcnt(39) -; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112 -; GCN-NEXT: s_waitcnt vmcnt(36) -; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128 -; GCN-NEXT: s_waitcnt vmcnt(33) -; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144 -; GCN-NEXT: s_waitcnt vmcnt(30) -; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160 -; GCN-NEXT: s_waitcnt vmcnt(27) -; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176 -; GCN-NEXT: s_waitcnt vmcnt(24) -; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(21) -; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208 -; GCN-NEXT: s_waitcnt vmcnt(18) -; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21] +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[20:21] offset:32 +; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[20:21] offset:48 +; GCN-NEXT: global_store_dwordx4 v0, v[16:19], s[20:21] offset:64 +; GCN-NEXT: global_store_dwordx4 v0, v[20:23], s[20:21] offset:80 +; GCN-NEXT: global_store_dwordx4 v0, v[24:27], s[20:21] offset:96 +; GCN-NEXT: global_store_dwordx4 v0, v[28:31], s[20:21] offset:112 +; GCN-NEXT: global_store_dwordx4 v0, v[32:35], s[20:21] offset:128 +; GCN-NEXT: global_store_dwordx4 v0, v[36:39], s[20:21] offset:144 +; GCN-NEXT: global_store_dwordx4 v0, v[40:43], s[20:21] offset:160 +; GCN-NEXT: global_store_dwordx4 v0, v[44:47], s[20:21] offset:176 +; GCN-NEXT: global_store_dwordx4 v0, v[48:51], s[20:21] offset:192 +; GCN-NEXT: global_store_dwordx4 v0, v[52:55], s[20:21] offset:208 +; GCN-NEXT: global_store_dwordx4 v0, v[56:59], s[20:21] offset:224 +; GCN-NEXT: global_store_dwordx4 v0, v[60:63], s[20:21] offset:240 ; GCN-NEXT: s_endpgm %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx @@ -257,4 +271,4 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ret void } -attributes #0 = { "amdgpu-waves-per-eu"="1,10" } +attributes #0 = { "amdgpu-flat-workgroup-size"="1,256" "amdgpu-waves-per-eu"="1,10" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index e2e5e3369bd9a5..3d4acd79132dee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -108,5 +108,5 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, declare i32 @llvm.amdgcn.workitem.id.x() #1 -attributes #0 = { "amdgpu-waves-per-eu"="1,10" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" } attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index a5e97205de2137..1edec164ef2653 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -10,7 +10,7 @@ define amdgpu_kernel void @empty_exactly_1() #0 { entry: ret void } -attributes #0 = {"amdgpu-waves-per-eu"="1,1"} +attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" } ; Exactly 5 waves per execution unit. ; CHECK-LABEL: {{^}}empty_exactly_5: @@ -84,7 +84,7 @@ define amdgpu_kernel void @empty_at_most_5() #6 { entry: ret void } -attributes #6 = {"amdgpu-waves-per-eu"="1,5"} +attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64"} ; At most 10 waves per execution unit. ; CHECK-LABEL: {{^}}empty_at_most_10: diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll new file mode 100644 index 00000000000000..997cbc2ea46101 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll @@ -0,0 +1,63 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s + +; Both of these kernels have the same value for +; amdgpu-flat-work-group-size, except one explicitly sets it. This is +; a program visible property which should always take precedence over +; the amdgpu-waves-per-eu optimization hint. +; +; The range is incompatible with the amdgpu-waves-per-eu value, so the +; flat work group size should take precedence implying a requirement +; to support 1024 size workgroups (which exceeds the available LDS +; amount). + +; CHECK-NOT: @no_flat_workgroup_size.stack +; CHECK-NOT: @explicit_default_workgroup_size.stack + +; CHECK-LABEL: @no_flat_workgroup_size( +; CHECK: alloca [5 x i32] +; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4 +define amdgpu_kernel void @no_flat_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +entry: + %stack = alloca [5 x i32], align 4, addrspace(5) + %0 = load i32, i32 addrspace(1)* %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 + store i32 %2, i32 addrspace(1)* %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 + store i32 %3, i32 addrspace(1)* %arrayidx13 + ret void +} + +; CHECK-LABEL: @explicit_default_workgroup_size( +; CHECK: alloca [5 x i32] +; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4 +define amdgpu_kernel void @explicit_default_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { +entry: + %stack = alloca [5 x i32], align 4, addrspace(5) + %0 = load i32, i32 addrspace(1)* %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 + store i32 %2, i32 addrspace(1)* %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 + store i32 %3, i32 addrspace(1)* %arrayidx13 + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="1,1" } +attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll index 25e0376dd7eea9..a503a79827fe20 100644 --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -392,7 +392,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 { ret void } -attributes #0 = { "amdgpu-waves-per-eu"="2,3" } +attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" } attributes #1 = { "amdgpu-waves-per-eu"="18,18" } attributes #2 = { "amdgpu-waves-per-eu"="19,19" } attributes #3 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll index a313664cb0e5eb..437e3a78ad5931 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll @@ -585,5 +585,5 @@ bb: ; Function Attrs: nounwind readnone declare float @llvm.fmuladd.f32(float, float, float) #1 -attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" } +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll index bc26b3fa19a67c..8ad70cf95c092e 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll @@ -588,4 +588,4 @@ bb: declare float @llvm.fmuladd.f32(float, float, float) #0 attributes #0 = { nounwind readnone } -attributes #1 = { "amdgpu-waves-per-eu"="1,1" } +attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll index 4750cf2020d9f5..9a56e85decebf9 100644 --- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll +++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll @@ -107,5 +107,5 @@ attributes #1 = { nounwind readnone } attributes #2 = { nounwind "target-cpu"="tahiti" } attributes #3 = { nounwind "target-cpu"="bonaire" } attributes #4 = { nounwind "target-cpu"="fiji" } -attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" } -attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" } +attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" } +attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }