Skip to content

Commit

Permalink
AMDGPU: Don't consider whether amdgpu-flat-work-group-size was set
Browse files Browse the repository at this point in the history
It should be semantically identical if it was set to the same value as
the default. Also improve the documentation.
  • Loading branch information
arsenm committed Oct 22, 2021
1 parent cd824f9 commit 8d4b74a
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 51 deletions.
9 changes: 8 additions & 1 deletion llvm/docs/AMDGPUUsage.rst
Expand Up @@ -856,6 +856,8 @@ The AMDGPU backend supports the following LLVM IR attributes.
"amdgpu-flat-work-group-size"="min,max" Specify the minimum and maximum flat work group sizes that
will be specified when the kernel is dispatched. Generated
by the ``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
The implied default value is 1,1024.

"amdgpu-implicitarg-num-bytes"="n" Number of kernel argument bytes to add to the kernel
argument block size for the implicit arguments. This
varies by OS and language (for OpenCL see
Expand All @@ -866,7 +868,12 @@ The AMDGPU backend supports the following LLVM IR attributes.
``amdgpu_num_vgpr`` CLANG attribute [CLANG-ATTR]_.
"amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per
execution unit. Generated by the ``amdgpu_waves_per_eu``
CLANG attribute [CLANG-ATTR]_.
CLANG attribute [CLANG-ATTR]_. This is an optimization hint,
and the backend may not be able to satisfy the request. If
the specified range is incompatible with the function's
"amdgpu-flat-work-group-size" value, the implied occupancy
bounds by the workgroup size takes precedence.

"amdgpu-ieee" true/false. Specify whether the function expects the IEEE field of the
mode register to be set on entry. Overrides the default for
the calling convention.
Expand Down
5 changes: 1 addition & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -544,8 +544,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
unsigned MinImpliedByFlatWorkGroupSize =
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
Default.first = MinImpliedByFlatWorkGroupSize;
bool RequestedFlatWorkGroupSize =
F.hasFnAttribute("amdgpu-flat-work-group-size");

// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
Expand All @@ -562,8 +560,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(

// Make sure requested values are compatible with values implied by requested
// minimum/maximum flat work group sizes.
if (RequestedFlatWorkGroupSize &&
Requested.first < MinImpliedByFlatWorkGroupSize)
if (Requested.first < MinImpliedByFlatWorkGroupSize)
return Default;

return Requested;
Expand Down
90 changes: 52 additions & 38 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
Expand Up @@ -16,7 +16,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80
; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
Expand Down Expand Up @@ -158,10 +157,23 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:512 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:516 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:520 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:524 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:272
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:276
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:280
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:284
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:528 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:532 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:536 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:540 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296
Expand Down Expand Up @@ -218,43 +230,45 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508
; GCN-NEXT: s_waitcnt vmcnt(60)
; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21]
; GCN-NEXT: s_waitcnt vmcnt(57)
; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16
; GCN-NEXT: s_waitcnt vmcnt(54)
; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32
; GCN-NEXT: s_waitcnt vmcnt(51)
; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48
; GCN-NEXT: s_waitcnt vmcnt(48)
; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64
; GCN-NEXT: s_waitcnt vmcnt(45)
; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80
; GCN-NEXT: s_waitcnt vmcnt(42)
; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96
; GCN-NEXT: s_waitcnt vmcnt(39)
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112
; GCN-NEXT: s_waitcnt vmcnt(36)
; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128
; GCN-NEXT: s_waitcnt vmcnt(33)
; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144
; GCN-NEXT: s_waitcnt vmcnt(30)
; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160
; GCN-NEXT: s_waitcnt vmcnt(27)
; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176
; GCN-NEXT: s_waitcnt vmcnt(24)
; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192
; GCN-NEXT: s_waitcnt vmcnt(21)
; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208
; GCN-NEXT: s_waitcnt vmcnt(18)
; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:512 ; 4-byte Folded Reload
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:516 ; 4-byte Folded Reload
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:520 ; 4-byte Folded Reload
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:524 ; 4-byte Folded Reload
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21]
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:528 ; 4-byte Folded Reload
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:532 ; 4-byte Folded Reload
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:536 ; 4-byte Folded Reload
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:540 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21] offset:16
; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[20:21] offset:32
; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[20:21] offset:48
; GCN-NEXT: global_store_dwordx4 v0, v[16:19], s[20:21] offset:64
; GCN-NEXT: global_store_dwordx4 v0, v[20:23], s[20:21] offset:80
; GCN-NEXT: global_store_dwordx4 v0, v[24:27], s[20:21] offset:96
; GCN-NEXT: global_store_dwordx4 v0, v[28:31], s[20:21] offset:112
; GCN-NEXT: global_store_dwordx4 v0, v[32:35], s[20:21] offset:128
; GCN-NEXT: global_store_dwordx4 v0, v[36:39], s[20:21] offset:144
; GCN-NEXT: global_store_dwordx4 v0, v[40:43], s[20:21] offset:160
; GCN-NEXT: global_store_dwordx4 v0, v[44:47], s[20:21] offset:176
; GCN-NEXT: global_store_dwordx4 v0, v[48:51], s[20:21] offset:192
; GCN-NEXT: global_store_dwordx4 v0, v[52:55], s[20:21] offset:208
; GCN-NEXT: global_store_dwordx4 v0, v[56:59], s[20:21] offset:224
; GCN-NEXT: global_store_dwordx4 v0, v[60:63], s[20:21] offset:240
; GCN-NEXT: s_endpgm
%vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
%insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
store <64 x i32> %insert, <64 x i32> addrspace(1)* %out.ptr
ret void
}

attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
attributes #0 = { "amdgpu-flat-workgroup-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
Expand Up @@ -108,5 +108,5 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
attributes #1 = { nounwind readnone speculatable willreturn }
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
Expand Up @@ -10,7 +10,7 @@ define amdgpu_kernel void @empty_exactly_1() #0 {
entry:
ret void
}
attributes #0 = {"amdgpu-waves-per-eu"="1,1"}
attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" }

; Exactly 5 waves per execution unit.
; CHECK-LABEL: {{^}}empty_exactly_5:
Expand Down Expand Up @@ -84,7 +84,7 @@ define amdgpu_kernel void @empty_at_most_5() #6 {
entry:
ret void
}
attributes #6 = {"amdgpu-waves-per-eu"="1,5"}
attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64"}

; At most 10 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_10:
Expand Down
@@ -0,0 +1,63 @@
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s

; Both of these kernels have the same value for
; amdgpu-flat-work-group-size, except one explicitly sets it. This is
; a program visible property which should always take precedence over
; the amdgpu-waves-per-eu optimization hint.
;
; The range is incompatible with the amdgpu-waves-per-eu value, so the
; flat work group size should take precedence implying a requirement
; to support 1024 size workgroups (which exceeds the available LDS
; amount).

; CHECK-NOT: @no_flat_workgroup_size.stack
; CHECK-NOT: @explicit_default_workgroup_size.stack

; CHECK-LABEL: @no_flat_workgroup_size(
; CHECK: alloca [5 x i32]
; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
define amdgpu_kernel void @no_flat_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
%0 = load i32, i32 addrspace(1)* %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
store i32 %2, i32 addrspace(1)* %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
%3 = load i32, i32 addrspace(5)* %arrayidx12
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
}

; CHECK-LABEL: @explicit_default_workgroup_size(
; CHECK: alloca [5 x i32]
; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
define amdgpu_kernel void @explicit_default_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
%0 = load i32, i32 addrspace(1)* %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
store i32 %2, i32 addrspace(1)* %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
%3 = load i32, i32 addrspace(5)* %arrayidx12
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
}

attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
Expand Up @@ -392,7 +392,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
ret void
}

attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" }
attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
Expand Up @@ -585,5 +585,5 @@ bb:
; Function Attrs: nounwind readnone
declare float @llvm.fmuladd.f32(float, float, float) #1

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { nounwind readnone }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
Expand Up @@ -588,4 +588,4 @@ bb:
declare float @llvm.fmuladd.f32(float, float, float) #0

attributes #0 = { nounwind readnone }
attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/target-cpu.ll
Expand Up @@ -107,5 +107,5 @@ attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "target-cpu"="tahiti" }
attributes #3 = { nounwind "target-cpu"="bonaire" }
attributes #4 = { nounwind "target-cpu"="fiji" }
attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" }
attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" }
attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }
attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }

0 comments on commit 8d4b74a

Please sign in to comment.