Skip to content

Commit

Permalink
AMDGPU: Switch backend default max workgroup size to 1024
Browse files Browse the repository at this point in the history
Previously this would default to 256, not the maximum supported size
of 1024. Using a maximum lower than the hardware maximum requires
language runtimes to enforce this limit for correctness, which no
language has correctly done. Switch the default to the conservatively
correct maximum, and force frontends to opt-in to the more optimal 256
default maximum.

I don't really understand why the changes in occupancy-levels.ll
increased the computed occupancy, which I expected to decrease. I'm
not sure if these tests should be forcing the old maximum.
  • Loading branch information
arsenm committed Nov 13, 2019
1 parent 25c5da5 commit 4b47213
Show file tree
Hide file tree
Showing 12 changed files with 58 additions and 24 deletions.
8 changes: 1 addition & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,11 +343,6 @@ AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
switch (CC) {
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return std::make_pair(getWavefrontSize() * 2,
std::max(getWavefrontSize() * 4, 256u));
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_LS:
case CallingConv::AMDGPU_HS:
Expand All @@ -356,13 +351,12 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
case CallingConv::AMDGPU_PS:
return std::make_pair(1, getWavefrontSize());
default:
return std::make_pair(1, 16 * getWavefrontSize());
return std::make_pair(1u, getMaxFlatWorkGroupSize());
}
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
const Function &F) const {
// FIXME: 1024 if function.
// Default minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> Default =
getDefaultFlatWorkGroupSize(F.getCallingConv());
Expand Down
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; OPT-LABEL: @pointer_typed_alloca(
; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) #1 {
entry:
%A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5)
store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
Expand Down Expand Up @@ -556,7 +556,8 @@ entry:
ret void
}

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { nounwind "amdgpu-flat-work-group-size"="1,256" }

; HSAOPT: !0 = !{}
; HSAOPT: !1 = !{i32 0, i32 257}
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias
ret void
}

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind convergent }

Expand Down
19 changes: 17 additions & 2 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
; CHECK: ---
; CHECK: amdhsa.kernels:

; CHECK: - .args:
; CHECK: - .args:
; CHECK: .group_segment_fixed_size: 0
; CHECK: .kernarg_segment_align: 8
; CHECK: .kernarg_segment_size: 24
; CHECK: .max_flat_workgroup_size: 256
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
; WAVE64: .sgpr_count: 8
Expand All @@ -33,6 +33,20 @@ entry:
ret void
}

; CHECK: - .args:
; CHECK: .max_flat_workgroup_size: 256
define amdgpu_kernel void @test_max_flat_workgroup_size(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) #2 {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%r.val = fadd half %a.val, %b.val
store half %r.val, half addrspace(1)* %r
ret void
}

; CHECK: .name: num_spilled_sgprs
; GFX700: .sgpr_spill_count: 40
; GFX803: .sgpr_spill_count: 24
Expand Down Expand Up @@ -149,3 +163,4 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {

attributes #0 = { "amdgpu-num-sgpr"="14" }
attributes #1 = { "amdgpu-num-vgpr"="20" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
26 changes: 25 additions & 1 deletion llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 8
; CHECK: NumVGPRs: 6
; CHECK: MaxFlatWorkGroupSize: 256
; CHECK: MaxFlatWorkGroupSize: 1024
define amdgpu_kernel void @test(
half addrspace(1)* %r,
half addrspace(1)* %a,
Expand All @@ -31,6 +31,29 @@ entry:
ret void
}

; CHECK-LABEL: - Name: test_max_flat_workgroup_size
; CHECK: SymbolName: 'test_max_flat_workgroup_size@kd'
; CHECK: CodeProps:
; CHECK: KernargSegmentSize: 24
; CHECK: GroupSegmentFixedSize: 0
; CHECK: PrivateSegmentFixedSize: 0
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 8
; CHECK: NumVGPRs: 6
; CHECK: MaxFlatWorkGroupSize: 256
define amdgpu_kernel void @test_max_flat_workgroup_size(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) #2 {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%r.val = fadd half %a.val, %b.val
store half %r.val, half addrspace(1)* %r
ret void
}

; CHECK-LABEL: - Name: num_spilled_sgprs
; CHECK: SymbolName: 'num_spilled_sgprs@kd'
; CHECK: CodeProps:
Expand Down Expand Up @@ -144,3 +167,4 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {

attributes #0 = { "amdgpu-num-sgpr"="14" }
attributes #1 = { "amdgpu-num-vgpr"="20" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ entry:

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { norecurse nounwind }
attributes #0 = { norecurse nounwind "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { nounwind readnone }

!0 = !{i32 0, i32 1024}
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,8 @@ define amdgpu_kernel void @used_lds_6552() {
}

; GCN-LABEL: {{^}}used_lds_6556:
; GFX9: ; Occupancy: 9
; GFX1010W64: ; Occupancy: 19
; GFX9: ; Occupancy: 10
; GFX1010W64: ; Occupancy: 20
; GFX1010W32: ; Occupancy: 20
@lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
define amdgpu_kernel void @used_lds_6556() {
Expand All @@ -273,9 +273,9 @@ define amdgpu_kernel void @used_lds_6556() {
}

; GCN-LABEL: {{^}}used_lds_13112:
; GFX9: ; Occupancy: 4
; GFX1010W64: ; Occupancy: 9
; GFX1010W32: ; Occupancy: 19
; GFX9: ; Occupancy: 10
; GFX1010W64: ; Occupancy: 20
; GFX1010W32: ; Occupancy: 20
@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
define amdgpu_kernel void @used_lds_13112() {
%p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
Original file line number Diff line number Diff line change
Expand Up @@ -300,4 +300,4 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; OPT: !0 = !{i32 0, i32 257}
; OPT: !1 = !{i32 0, i32 256}

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ entry:
ret void
}

attributes #0 = { nounwind }
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,4 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)

declare i32* @get_unknown_pointer() #0

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -201,4 +201,4 @@ for.body: ; preds = %for.body, %for.body

declare i32* @get_unknown_pointer() #0

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,5 @@ bb:
ret void
}

attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { norecurse nounwind }

0 comments on commit 4b47213

Please sign in to comment.