[AMDGPU] Introduce more scratch registers in the ABI.
The AMDGPU target has a convention that defines all VGPRs
(except the initial 32 argument registers) as callee-saved.
This convention is not always efficient: a callee that needs
many registers ends up emitting a large number of spills, even
though its caller may require only a few.

This patch revises the ABI by introducing more scratch registers
that a callee can freely use.
The 256 VGPRs are now split into:
  32 argument registers,
  112 scratch registers, and
  112 callee-saved registers.
The scratch registers and the CSRs are intermixed at regular
intervals (a split boundary of 8) to obtain better occupancy.

Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr

Reviewed By: arsenm, t-tye

Differential Revision: https://reviews.llvm.org/D76356
cdevadas committed May 5, 2020
1 parent 6fb7e9a commit 375cec4
Showing 22 changed files with 399 additions and 230 deletions.
22 changes: 19 additions & 3 deletions llvm/docs/AMDGPUUsage.rst
@@ -6507,11 +6507,27 @@ On exit from a function:
* FLAT_SCRATCH
* EXEC
* GFX6-8: M0
* All SGPR and VGPR registers except the clobbered registers of SGPR4-31 and
VGPR0-31.
* All SGPR registers except the clobbered registers of SGPR4-31.
* VGPR40-47
VGPR56-63
VGPR72-79
VGPR88-95
VGPR104-111
VGPR120-127
VGPR136-143
VGPR152-159
VGPR168-175
VGPR184-191
VGPR200-207
VGPR216-223
VGPR232-239
VGPR248-255
*Except for the argument registers, the clobbered and the preserved
VGPRs are intermixed at regular intervals in order to get better
occupancy.*

For the AMDGPU backend, an inter-procedural register allocation (IPRA)
optimization may mark some of clobbered SGPR4-31 and VGPR0-31 registers as
optimization may mark some of clobbered SGPR and VGPR registers as
preserved if it can be determined that the called function does not change
their value.

20 changes: 19 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;

def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
// The CSRs & scratch-registers are interleaved at a split boundary of 8.
(add (sequence "VGPR%u", 40, 47),
(sequence "VGPR%u", 56, 63),
(sequence "VGPR%u", 72, 79),
(sequence "VGPR%u", 88, 95),
(sequence "VGPR%u", 104, 111),
(sequence "VGPR%u", 120, 127),
(sequence "VGPR%u", 136, 143),
(sequence "VGPR%u", 152, 159),
(sequence "VGPR%u", 168, 175),
(sequence "VGPR%u", 184, 191),
(sequence "VGPR%u", 200, 207),
(sequence "VGPR%u", 216, 223),
(sequence "VGPR%u", 232, 239),
(sequence "VGPR%u", 248, 255))
>;

def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
@@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
>;

def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
(add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
(add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;

// Calling convention for leaf functions
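
A quick consistency check (an editorial aside, not part of the patch):
the fourteen callee-saved ranges defined above follow a simple stride
pattern, which this Python snippet verifies against both the .rst list
and the TableGen definition:

  # Callee-saved blocks are 8 registers wide and start every 16
  # registers, beginning at VGPR40.
  csr_ranges = [(start, start + 7) for start in range(40, 256, 16)]
  assert csr_ranges[0] == (40, 47) and csr_ranges[-1] == (248, 255)
  assert len(csr_ranges) == 14           # 14 blocks
  assert 8 * len(csr_ranges) == 112      # 112 callee-saved VGPRs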
13 changes: 0 additions & 13 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -727,9 +727,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GPRIDX-NEXT: s_mov_b32 s18, 0
; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000
; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000
; GPRIDX-NEXT: s_mov_b32 s16, s18
; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000
@@ -793,9 +790,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
; GPRIDX-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; GPRIDX-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GPRIDX-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
@@ -816,9 +810,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: s_mov_b32 s8, s18
; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0
; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0
; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; MOVREL-NEXT: v_mov_b32_e32 v34, s19
; MOVREL-NEXT: v_mov_b32_e32 v33, s18
; MOVREL-NEXT: v_mov_b32_e32 v32, s17
@@ -868,10 +859,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; MOVREL-NEXT: s_waitcnt vmcnt(0)
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
; MOVREL-NEXT: s_setpc_b64 s[30:31]
entry:
12 changes: 2 additions & 10 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -744,17 +744,13 @@ entry:

; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8

; GCN: s_getpc_b64

; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
@@ -766,15 +762,11 @@ entry:

; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
; GCN-NOT: s32
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}}
; GCN: s_getpc_b64
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -13,15 +13,15 @@ define void @use_vcc() #1 {
}

; GCN-LABEL: {{^}}indirect_use_vcc:
; GCN: v_writelane_b32 v32, s33, 2
; GCN: v_writelane_b32 v32, s30, 0
; GCN: v_writelane_b32 v32, s31, 1
; GCN: v_writelane_b32 v40, s33, 2
; GCN: v_writelane_b32 v40, s30, 0
; GCN: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s5, v32, 1
; GCN: v_readlane_b32 s33, v32, 2
; GCN: v_readlane_b32 s4, v40, 0
; GCN: v_readlane_b32 s5, v40, 1
; GCN: v_readlane_b32 s33, v40, 2
; GCN: ; NumSgprs: 36
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_vcc() #1 {
call void @use_vcc()
ret void
@@ -32,7 +32,7 @@ define void @indirect_use_vcc() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_vcc()
ret void
@@ -50,7 +50,7 @@ define void @use_flat_scratch() #1 {
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; CI: ; NumSgprs: 38
; VI: ; NumSgprs: 40
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
ret void
@@ -61,7 +61,7 @@ define void @indirect_use_flat_scratch() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_flat_scratch()
ret void
@@ -76,31 +76,31 @@ define void @use_10_vgpr() #1 {
}

; GCN-LABEL: {{^}}indirect_use_10_vgpr:
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_10_vgpr() #0 {
call void @use_10_vgpr()
ret void
}

; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
; GCN: is_dynamic_callstack = 0
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
call void @indirect_use_10_vgpr()
ret void
}

; GCN-LABEL: {{^}}use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @use_40_vgpr() #1 {
call void asm sideeffect "", "~{v39}"() #0
; GCN-LABEL: {{^}}use_50_vgpr:
; GCN: ; NumVgprs: 50
define void @use_50_vgpr() #1 {
call void asm sideeffect "", "~{v49}"() #0
ret void
}

; GCN-LABEL: {{^}}indirect_use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @indirect_use_40_vgpr() #0 {
call void @use_40_vgpr()
; GCN-LABEL: {{^}}indirect_use_50_vgpr:
; GCN: ; NumVgprs: 50
define void @indirect_use_50_vgpr() #0 {
call void @use_50_vgpr()
ret void
}

60 changes: 30 additions & 30 deletions llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -23,22 +23,22 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_

; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: buffer_store_dword
; GCN: v_writelane_b32 v32, s33, 4
; GCN: v_writelane_b32 v32, s34, 0
; GCN: v_writelane_b32 v32, s35, 1
; GCN: v_writelane_b32 v32, s30, 2
; GCN: v_writelane_b32 v32, s31, 3
; GCN: v_writelane_b32 v40, s33, 4
; GCN: v_writelane_b32 v40, s34, 0
; GCN: v_writelane_b32 v40, s35, 1
; GCN: v_writelane_b32 v40, s30, 2
; GCN: v_writelane_b32 v40, s31, 3

; GCN: s_swappc_b64
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v32, 2
; GCN-DAG: v_readlane_b32 s5, v32, 3
; GCN: v_readlane_b32 s35, v32, 1
; GCN: v_readlane_b32 s34, v32, 0
; GCN-DAG: v_readlane_b32 s4, v40, 2
; GCN-DAG: v_readlane_b32 s5, v40, 3
; GCN: v_readlane_b32 s35, v40, 1
; GCN: v_readlane_b32 s34, v40, 0

; GCN: v_readlane_b32 s33, v32, 4
; GCN: v_readlane_b32 s33, v40, 4
; GCN: buffer_load_dword
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
@@ -49,16 +49,16 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
}

; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
; GCN: buffer_store_dword v32
; GCN: v_writelane_b32 v32, s33, 4
; GCN: buffer_store_dword v40
; GCN: v_writelane_b32 v40, s33, 4

; GCN: s_mov_b32 s33, s32
; GCN: s_add_u32 s32, s32, 0x400
; GCN: s_swappc_b64
; GCN-NEXT: s_swappc_b64

; GCN: v_readlane_b32 s33, v32, 4
; GCN: buffer_load_dword v32,
; GCN: v_readlane_b32 s33, v40, 4
; GCN: buffer_load_dword v40,
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
@@ -115,9 +115,9 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace
}

; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v32, v31
; GCN: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: v_mov_b32_e32 v31, v32
; GCN-NEXT: v_mov_b32_e32 v31, v40
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
call void @external_void_func_void()
@@ -177,31 +177,31 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
ret void
}

; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: {{.*}}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}

; GCN-NOT: v32
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v32
; GCN-NOT: v40

; GCN: ;;#ASMSTART
; GCN-NEXT: ; def v32
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND

; GCN: s_swappc_b64 s[30:31], s[4:5]

; GCN-NOT: v32
; GCN-NOT: v40

; GCN: ;;#ASMSTART
; GCN-NEXT: ; use v32
; GCN-NEXT: ; use v40
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 {
%v32 = call i32 asm sideeffect "; def $0", "={v32}"()
define amdgpu_kernel void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 {
%v40 = call i32 asm sideeffect "; def $0", "={v40}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v32}"(i32 %v32)
call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
ret void
}

@@ -255,12 +255,12 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {

; GCN-LABEL: {{^}}callee_saved_sgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v32, s40
; GCN: v_writelane_b32 v40, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v32
; GCN: v_readlane_b32 s40, v40
; GCN-NOT: s40
define void @callee_saved_sgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
@@ -287,19 +287,19 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
; First call preserved VGPR is used so it can't be used for SGPR spills.
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v33, s40
; GCN: v_writelane_b32 v41, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v33
; GCN: v_readlane_b32 s40, v41
; GCN-NOT: s40
define void @callee_saved_sgpr_vgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
%v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
call void asm sideeffect "; use $0", "v"(i32 %v40) #0
ret void
}

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -64,11 +64,11 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NEXT: v_mov_b32_e32 v1, s35
; GCN-NEXT: global_store_dword v[0:1], v32, off
; GCN-NEXT: global_store_dword v[0:1], v40, off
; GCN-NEXT: s_endpgm
call void @func(i32 0)
store i32 0, i32 addrspace(1)* %ptr
