[AMDGPU] Introduce more scratch registers in the ABI.
The AMDGPU target has a convention that defines all VGPRs
(except the initial 32 argument registers) as callee-saved.
This convention is not always efficient: a callee that needs
many registers ends up emitting a large number of spills, even
though its caller may require only a few.

This patch revises the ABI by introducing more scratch registers
that a callee can freely use.
The 256 VGPRs are now split into:
  32 argument registers,
  112 scratch registers, and
  112 callee-saved registers.
The scratch registers and the CSRs are intermixed at regular
intervals (a split boundary of 8) to obtain better occupancy.

Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr

Reviewed By: arsenm, t-tye

Differential Revision: https://reviews.llvm.org/D76356
cdevadas committed May 5, 2020
1 parent 6fb7e9a commit 375cec4
Showing 22 changed files with 399 additions and 230 deletions.
22 changes: 19 additions & 3 deletions llvm/docs/AMDGPUUsage.rst
@@ -6507,11 +6507,27 @@ On exit from a function:
* FLAT_SCRATCH
* EXEC
* GFX6-8: M0
* All SGPR and VGPR registers except the clobbered registers of SGPR4-31 and
VGPR0-31.
* All SGPR registers except the clobbered registers of SGPR4-31.
* VGPR40-47
VGPR56-63
VGPR72-79
VGPR88-95
VGPR104-111
VGPR120-127
VGPR136-143
VGPR152-159
VGPR168-175
VGPR184-191
VGPR200-207
VGPR216-223
VGPR232-239
VGPR248-255
*Except for the argument registers, the clobbered and the preserved
VGPRs are intermixed at regular intervals in order to get better
occupancy.*

For the AMDGPU backend, an inter-procedural register allocation (IPRA)
optimization may mark some of clobbered SGPR4-31 and VGPR0-31 registers as
optimization may mark some of clobbered SGPR and VGPR registers as
preserved if it can be determined that the called function does not change
their value.

20 changes: 19 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;

def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
// The CSRs & scratch-registers are interleaved at a split boundary of 8.
(add (sequence "VGPR%u", 40, 47),
(sequence "VGPR%u", 56, 63),
(sequence "VGPR%u", 72, 79),
(sequence "VGPR%u", 88, 95),
(sequence "VGPR%u", 104, 111),
(sequence "VGPR%u", 120, 127),
(sequence "VGPR%u", 136, 143),
(sequence "VGPR%u", 152, 159),
(sequence "VGPR%u", 168, 175),
(sequence "VGPR%u", 184, 191),
(sequence "VGPR%u", 200, 207),
(sequence "VGPR%u", 216, 223),
(sequence "VGPR%u", 232, 239),
(sequence "VGPR%u", 248, 255))
>;

def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
@@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
>;

def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
(add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
(add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;

// Calling convention for leaf functions
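
A quick consistency check (an editorial aside, not part of the patch):
the fourteen callee-saved ranges defined above follow a simple stride
pattern, which this Python snippet verifies against both the .rst list
and the TableGen definition:

  # Callee-saved blocks are 8 registers wide and start every 16
  # registers, beginning at VGPR40.
  csr_ranges = [(start, start + 7) for start in range(40, 256, 16)]
  assert csr_ranges[0] == (40, 47) and csr_ranges[-1] == (248, 255)
  assert len(csr_ranges) == 14           # 14 blocks
  assert 8 * len(csr_ranges) == 112      # 112 callee-saved VGPRs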
13 changes: 0 additions & 13 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -727,9 +727,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GPRIDX-NEXT: s_mov_b32 s18, 0
; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000
; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000
; GPRIDX-NEXT: s_mov_b32 s16, s18
; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000
@@ -793,9 +790,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
; GPRIDX-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; GPRIDX-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GPRIDX-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
@@ -816,9 +810,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: s_mov_b32 s8, s18
; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0
; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0
; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; MOVREL-NEXT: v_mov_b32_e32 v34, s19
; MOVREL-NEXT: v_mov_b32_e32 v33, s18
; MOVREL-NEXT: v_mov_b32_e32 v32, s17
@@ -868,10 +859,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; MOVREL-NEXT: s_waitcnt vmcnt(0)
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
; MOVREL-NEXT: s_setpc_b64 s[30:31]
entry:
12 changes: 2 additions & 10 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -744,17 +744,13 @@ entry:

; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8

; GCN: s_getpc_b64

; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
@@ -766,15 +762,11 @@ entry:

; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
; GCN-NOT: s32
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}}
; GCN: s_getpc_b64
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -13,15 +13,15 @@ define void @use_vcc() #1 {
}

; GCN-LABEL: {{^}}indirect_use_vcc:
; GCN: v_writelane_b32 v32, s33, 2
; GCN: v_writelane_b32 v32, s30, 0
; GCN: v_writelane_b32 v32, s31, 1
; GCN: v_writelane_b32 v40, s33, 2
; GCN: v_writelane_b32 v40, s30, 0
; GCN: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s5, v32, 1
; GCN: v_readlane_b32 s33, v32, 2
; GCN: v_readlane_b32 s4, v40, 0
; GCN: v_readlane_b32 s5, v40, 1
; GCN: v_readlane_b32 s33, v40, 2
; GCN: ; NumSgprs: 36
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_vcc() #1 {
call void @use_vcc()
ret void
@@ -32,7 +32,7 @@ define void @indirect_use_vcc() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_vcc()
ret void
@@ -50,7 +50,7 @@ define void @use_flat_scratch() #1 {
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; CI: ; NumSgprs: 38
; VI: ; NumSgprs: 40
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
ret void
@@ -61,7 +61,7 @@ define void @indirect_use_flat_scratch() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_flat_scratch()
ret void
@@ -76,31 +76,31 @@ define void @use_10_vgpr() #1 {
}

; GCN-LABEL: {{^}}indirect_use_10_vgpr:
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_10_vgpr() #0 {
call void @use_10_vgpr()
ret void
}

; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
; GCN: is_dynamic_callstack = 0
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
call void @indirect_use_10_vgpr()
ret void
}

; GCN-LABEL: {{^}}use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @use_40_vgpr() #1 {
call void asm sideeffect "", "~{v39}"() #0
; GCN-LABEL: {{^}}use_50_vgpr:
; GCN: ; NumVgprs: 50
define void @use_50_vgpr() #1 {
call void asm sideeffect "", "~{v49}"() #0
ret void
}

; GCN-LABEL: {{^}}indirect_use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @indirect_use_40_vgpr() #0 {
call void @use_40_vgpr()
; GCN-LABEL: {{^}}indirect_use_50_vgpr:
; GCN: ; NumVgprs: 50
define void @indirect_use_50_vgpr() #0 {
call void @use_50_vgpr()
ret void
}

60 changes: 30 additions & 30 deletions llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -23,22 +23,22 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_

; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: buffer_store_dword
; GCN: v_writelane_b32 v32, s33, 4
; GCN: v_writelane_b32 v32, s34, 0
; GCN: v_writelane_b32 v32, s35, 1
; GCN: v_writelane_b32 v32, s30, 2
; GCN: v_writelane_b32 v32, s31, 3
; GCN: v_writelane_b32 v40, s33, 4
; GCN: v_writelane_b32 v40, s34, 0
; GCN: v_writelane_b32 v40, s35, 1
; GCN: v_writelane_b32 v40, s30, 2
; GCN: v_writelane_b32 v40, s31, 3

; GCN: s_swappc_b64
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v32, 2
; GCN-DAG: v_readlane_b32 s5, v32, 3
; GCN: v_readlane_b32 s35, v32, 1
; GCN: v_readlane_b32 s34, v32, 0
; GCN-DAG: v_readlane_b32 s4, v40, 2
; GCN-DAG: v_readlane_b32 s5, v40, 3
; GCN: v_readlane_b32 s35, v40, 1
; GCN: v_readlane_b32 s34, v40, 0

; GCN: v_readlane_b32 s33, v32, 4
; GCN: v_readlane_b32 s33, v40, 4
; GCN: buffer_load_dword
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
@@ -49,16 +49,16 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
}

; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
; GCN: buffer_store_dword v32
; GCN: v_writelane_b32 v32, s33, 4
; GCN: buffer_store_dword v40
; GCN: v_writelane_b32 v40, s33, 4

; GCN: s_mov_b32 s33, s32
; GCN: s_add_u32 s32, s32, 0x400
; GCN: s_swappc_b64
; GCN-NEXT: s_swappc_b64

; GCN: v_readlane_b32 s33, v32, 4
; GCN: buffer_load_dword v32,
; GCN: v_readlane_b32 s33, v40, 4
; GCN: buffer_load_dword v40,
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
@@ -115,9 +115,9 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace
}

; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v32, v31
; GCN: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: v_mov_b32_e32 v31, v32
; GCN-NEXT: v_mov_b32_e32 v31, v40
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
call void @external_void_func_void()
@@ -177,31 +177,31 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
ret void
}

; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: {{.*}}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}

; GCN-NOT: v32
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v32
; GCN-NOT: v40

; GCN: ;;#ASMSTART
; GCN-NEXT: ; def v32
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND

; GCN: s_swappc_b64 s[30:31], s[4:5]

; GCN-NOT: v32
; GCN-NOT: v40

; GCN: ;;#ASMSTART
; GCN-NEXT: ; use v32
; GCN-NEXT: ; use v40
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 {
%v32 = call i32 asm sideeffect "; def $0", "={v32}"()
define amdgpu_kernel void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 {
%v40 = call i32 asm sideeffect "; def $0", "={v40}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v32}"(i32 %v32)
call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
ret void
}

@@ -255,12 +255,12 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {

; GCN-LABEL: {{^}}callee_saved_sgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v32, s40
; GCN: v_writelane_b32 v40, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v32
; GCN: v_readlane_b32 s40, v40
; GCN-NOT: s40
define void @callee_saved_sgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
@@ -287,19 +287,19 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
; First call preserved VGPR is used so it can't be used for SGPR spills.
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v33, s40
; GCN: v_writelane_b32 v41, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v33
; GCN: v_readlane_b32 s40, v41
; GCN-NOT: s40
define void @callee_saved_sgpr_vgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
%v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
call void asm sideeffect "; use $0", "v"(i32 %v40) #0
ret void
}

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -64,11 +64,11 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NEXT: v_mov_b32_e32 v1, s35
; GCN-NEXT: global_store_dword v[0:1], v32, off
; GCN-NEXT: global_store_dword v[0:1], v40, off
; GCN-NEXT: s_endpgm
call void @func(i32 0)
store i32 0, i32 addrspace(1)* %ptr
