-
Notifications
You must be signed in to change notification settings. Fork: 15.3k
Use register pair for PC spill #169098
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/slinder1/amdgpu-cfi-4
Are you sure you want to change the base?
Use register pair for PC spill #169098
Conversation
|
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu — Author: Scott Linder (slinder1). Changes: Patch is 1.26 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/169098.diff — 65 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 40eeeb8a8630d..057a5e2df8bf7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -268,11 +268,19 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
std::vector<CalleeSavedInfo> CSI;
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ Register RetAddrReg = TRI->getReturnAddressReg(MF);
+ bool SpillRetAddrReg = false;
for (unsigned I = 0; CSRegs[I]; ++I) {
MCRegister Reg = CSRegs[I];
if (SavedRegs.test(Reg)) {
+ if (Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub0) ||
+ Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub1)) {
+ SpillRetAddrReg = true;
+ continue;
+ }
+
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
@@ -283,6 +291,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
}
}
+ // Return address uses a register pair. Add the super register to the
+ // CSI list so that it's easier to identify the entire spill and CFI
+ // can be emitted appropriately.
+ if (SpillRetAddrReg) {
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64);
+ int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+ TRI->getSpillAlign(*RC), true);
+ CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+ CalleeSavedFIs.push_back(JunkFI);
+ }
+
if (!CSI.empty()) {
for (MachineBasicBlock *SaveBlock : SaveBlocks)
insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index b84b31cd2702c..023398377de94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() {
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
-; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: s_mov_b32 s32, s33
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7e6f500181ec6..2c1beb8468576 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -238,8 +238,8 @@ define void @func_caller_stack() #2 {
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -277,8 +277,8 @@ define void @func_caller_stack() #2 {
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
; MUBUF-NEXT: s_mov_b32 s32, s33
; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:56
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
; FLATSCR-NEXT: s_mov_b32 s32, s33
; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 72766f47030cc..35591cd602992 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -244,8 +244,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], 0
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 3194581fa4213..0e24430e7be20 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 {
; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
; FIXEDABI-NEXT: s_mov_b32 s32, s33
; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index 149b0cb4e052d..b6e65c8842904 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; DAGISEL-NEXT: s_clause 0x1
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
; GISEL-NEXT: s_clause 0x1
; GISEL-NEXT: scratch_load_b32 v41, off, s33
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: v_readlane_b32 s30, v42, 0
+; GISEL-NEXT: v_readlane_b32 s31, v42, 1
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v42, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; DAGISEL-NEXT: s_wait_alu 0xfffe
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
; DAGISEL-NEXT: s_mov_b32 s32, s33
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
; GISEL-NEXT: s_wait_alu 0xfffe
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: s_mov_b32 s32, s33
; GISEL-NEXT: v_readlane_b32 s0, v40, 2
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index c794168c40075..1e2a46fcefa36 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -7466,42 +7466,42 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[4:5]
; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v20, s30, 0
-; SI-NEXT: v_writelane_b32 v20, s31, 1
-; SI-NEXT: v_writelane_b32 v20, s34, 2
-; SI-NEXT: v_writelane_b32 v20, s35, 3
-; SI-NEXT: v_writelane_b32 v20, s36, 4
-; SI-NEXT: v_writelane_b32 v20, s37, 5
-; SI-NEXT: v_writelane_b32 v20, s38, 6
-; SI-NEXT: v_writelane_b32 v20, s39, 7
-; SI-NEXT: v_writelane_b32 v20, s48, 8
-; SI-NEXT: v_writelane_b32 v20, s49, 9
-; SI-NEXT: v_writelane_b32 v20, s50, 10
-; SI-NEXT: v_writelane_b32 v20, s51, 11
-; SI-NEXT: v_writelane_b32 v20, s52, 12
-; SI-NEXT: v_writelane_b32 v20, s53, 13
-; SI-NEXT: v_writelane_b32 v20, s54, 14
-; SI-NEXT: v_writelane_b32 v20, s55, 15
-; SI-NEXT: v_writelane_b32 v20, s64, 16
-; SI-NEXT: v_writelane_b32 v20, s65, 17
-; SI-NEXT: v_writelane_b32 v20, s66, 18
-; SI-NEXT: v_writelane_b32 v20, s67, 19
-; SI-NEXT: v_writelane_b32 v20, s68, 20
-; SI-NEXT: v_writelane_b32 v20, s69, 21
-; SI-NEXT: v_writelane_b32 v20, s70, 22
-; SI-NEXT: v_writelane_b32 v20, s71, 23
-; SI-NEXT: v_writelane_b32 v20, s80, 24
-; SI-NEXT: v_writelane_b32 v20, s81, 25
-; SI-NEXT: v_writelane_b32 v20, s82, 26
-; SI-NEXT: v_writelane_b32 v20, s83, 27
-; SI-NEXT: v_writelane_b32 v20, s84, 28
-; SI-NEXT: v_writelane_b32 v20, s85, 29
-; SI-NEXT: v_writelane_b32 v20, s86, 30
-; SI-NEXT: v_writelane_b32 v20, s87, 31
-; SI-NEXT: v_writelane_b32 v20, s96, 32
-; SI-NEXT: v_writelane_b32 v20, s97, 33
+; SI-NEXT: v_writelane_b32 v20, s34, 0
+; SI-NEXT: v_writelane_b32 v20, s35, 1
+; SI-NEXT: v_writelane_b32 v20, s36, 2
+; SI-NEXT: v_writelane_b32 v20, s37, 3
+; SI-NEXT: v_writelane_b32 v20, s38, 4
+; SI-NEXT: v_writelane_b32 v20, s39, 5
+; SI-NEXT: v_writelane_b32 v20, s48, 6
+; SI-NEXT: v_writelane_b32 v20, s49, 7
+; SI-NEXT: v_writelane_b32 v20, s50, 8
+; SI-NEXT: v_writelane_b32 v20, s51, 9
+; SI-NEXT: v_writelane_b32 v20, s52, 10
+; SI-NEXT: v_writelane_b32 v20, s53, 11
+; SI-NEXT: v_writelane_b32 v20, s54, 12
+; SI-NEXT: v_writelane_b32 v20, s55, 13
+; SI-NEXT: v_writelane_b32 v20, s64, 14
+; SI-NEXT: v_writelane_b32 v20, s65, 15
+; SI-NEXT: v_writelane_b32 v20, s66, 16
+; SI-NEXT: v_writelane_b32 v20, s67, 17
+; SI-NEXT: v_writelane_b32 v20, s68, 18
+; SI-NEXT: v_writelane_b32 v20, s69, 19
+; SI-NEXT: v_writelane_b32 v20, s70, 20
+; SI-NEXT: v_writelane_b32 v20, s71, 21
+; SI-NEXT: v_writelane_b32 v20, s80, 22
+; SI-NEXT: v_writelane_b32 v20, s81, 23
+; SI-NEXT: v_writelane_b32 v20, s82, 24
+; SI-NEXT: v_writelane_b32 v20, s83, 25
+; SI-NEXT: v_writelane_b32 v20, s84, 26
+; SI-NEXT: v_writelane_b32 v20, s85, 27
+; SI-NEXT: v_writelane_b32 v20, s86, 28
+; SI-NEXT: v_writelane_b32 v20, s87, 29
+; SI-NEXT: v_writelane_b32 v20, s96, 30
+; SI-NEXT: v_writelane_b32 v20, s97, 31
+; SI-NEXT: v_writelane_b32 v20, s98, 32
+; SI-NEXT: v_writelane_b32 v20, s99, 33
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: v_writelane_b32 v20, s98, 34
+; SI-NEXT: v_writelane_b32 v20, s30, 34
; SI-NEXT: v_readfirstlane_b32 s44, v1
; SI-NEXT: v_readfirstlane_b32 s45, v2
; SI-NEXT: v_readfirstlane_b32 s42, v3
@@ -7521,7 +7521,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_readfirstlane_b32 s4, v17
; SI-NEXT: s_and_b64 s[46:47], vcc, exec
; SI-NEXT: v_readfirstlane_b32 s5, v18
-; SI-NEXT: v_writelane_b32 v20, s99, 35
+; SI-NEXT: v_writelane_b32 v20, s31, 35
; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
; SI-NEXT: s_cbranch_scc0 .LBB13_4
@@ -8391,6 +8391,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v20, 34
; SI-NEXT: v_readlane_b32 s19, v22, 11
; SI-NEXT: v_readlane_b32 s17, v22, 17
; SI-NEXT: v_readlane_b32 s15, v22, 23
@@ -8398,42 +8399,41 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; SI-NEXT: v_readlane_b32 s11, v22, 35
; SI-NEXT: v_readlane_b32 s9, v22, 41
; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v20, 35
-; SI-NEXT: v_readlane_b32 s98, v20, 34
-; SI-NEXT: v_readlane_b32 s97, v20, 33
-; SI-NEXT: v_readlane_b32 s96, v20, 32
-; SI-NEXT: v_readlane_b32 s87, v20, 31
-; SI-NEXT: v_readlane_b32 s86, v20, 30
-; SI-NEXT: v_readlane_b32 s85, v20, 29
-; SI-NEXT: v_readlane_b32 s84, v20, 28
-; SI-NEXT: v_readlane_b32 s83, v20, 27
-; SI-NEXT: v_readlane_b32 s82, v20, 26
-; SI-NEXT: v_readlane_b32 s81, v20, 25
-; SI-NEXT: v_readlane_b32 s80, v20, 24
-; SI-NEXT: v_readlane_b32 s71, v20, 23
-; SI-NEXT: v_readlane_b32 s70, v20, 22
-; SI-NEXT: v_readlane_b32 s69, v20, 21
-; SI-NEXT: v_readlane_b32 s68, v20, 20
-; SI-NEXT: v_readlane_b32 s67, v20, 19
-; SI-NEXT: v_readlane_b32 s66, v20, 18
-; SI-NEXT: v_readlane_b32 s65, v20, 17
-; SI-NEXT: v_readlane_b32 s64, v20, 16
-; SI-NEXT: v_readlane_b32 s55, v20, 15
-; SI-NEXT: v_readlane_b32 s54, v20, 14
-; SI-NEXT: v_readlane_b32 s53, v20, 13
-; SI-NEXT: v_readlane_b32 s52, v20, 12
-; SI-NEXT: v_readlane_b32 s51, v20, 11
-; SI-NEXT: v_readlane_b32 s50, v20, 10
-; SI-NEXT: v_readlane_b32 s49, v20, 9
-; SI-NEXT: v_readlane_b32 s48, v20, 8
-; SI-NEXT: v_readlane_b32 s39, v20, 7
-; SI-NEXT: v_readlane_b32 s38, v20, 6
-; SI-NEXT: v_readlane_b32 s37, v20, 5
-; SI-NEXT: v_readlane_b32 s36, v20, 4
-; SI-NEXT: v_readlane_b32 s35, v20, 3
-; SI-NEXT: v_readlane_b32 s34, v20, 2
-; SI-NEXT: v_readlane_b32 s31, v20, 1
-; SI-NEXT: v_readlane_b32 s30, v20, 0
+; SI-NEXT: v_readlane_b32 s31, v20, 35
+; SI-NEXT: v_readlane_b32 s99, v20, 33
+; SI-NEXT: v_readlane_b32 s98, v20, 32
+; SI-NEXT: v_readlane_b32 s97, v20, 31
+; SI-NEXT: v_readlane_b32 s96, v20, 30
+; SI-NEXT: v_readlane_b32 s87, v20, 29
+; SI-NEXT: v_readlane_b32 s86, v20, 28
+; SI-NEXT: v_readlane_b32 s85, v20, 27
+; SI-NEXT: v_readlane_b32 s84, v20, 26
+; SI-NEXT: v_readlane_b32 s83, v20, 25
+; SI-NEXT: v_readlane_b32 s82, v20, 24
+; SI-NEXT: v_readlane_b32 s81, v20, 23
+; SI-NEXT: v_readlane_b32 s80, v20, 22
+; SI-NEXT: v_readlane_b32 s71, v20, 21
+; SI-NEXT: v_readlane_b32 s70, v20, 20
+; SI-NEXT: v_readlane_b32 s69, v20, 19
+; SI-NEXT: v_readlane_b32 s68, v20, 18
+; SI-NEXT: v_readlane_b32 s67, v20, 17
+; SI-NEXT: v_readlane_b32 s66, v20, 16
+; SI-NEXT: v_readlane_b32 s65, v20, 15
+; SI-NEXT: v_readlane_b32 s64, v20, 14
+; SI-NEXT: v_readlane_b32 s55, v20, 13
+; SI-NEXT: v_readlane_b32 s54, v20, 12
+; SI-NEXT: v_readlane_b32 s53, v20, 11
+; SI-NEXT: v_readlane_b32 s52, v20, 10
+; SI-NEXT: v_readlane_b32 s51, v20, 9
+; SI-NEXT: v_readlane_b32 s50, v20, 8
+; SI-NEXT: v_readlane_b32 s49, v20, 7
+; SI-NEXT: v_readlane_b32 s48, v20, 6
+; SI-NEXT: v_readlane_b32 s39, v20, 5
+; SI-NEXT: v_readlane_b32 s38, v20, 4
+; SI-NEXT: v_readlane_b32 s37, v20, 3
+; SI-NEXT: v_readlane_b32 s36, v20, 2
+; SI-NEXT: v_readlane_b32 s35, v20, 1
+; SI-NEXT: v_readlane_b32 s34, v20, 0
; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -8630,38 +8630,38 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_writelane_b32 v20, s31, 1
-; VI-NEXT: v_writelane_b32 v20, s34, 2
-; VI-NEXT: v_writelane_b32 v20, s35, 3
-; VI-NEXT: v_writelane_b32 v20, s36, 4
-; VI-NEXT: v_writelane_b32 v20, s37, 5
-; VI-NEXT: v_writelane_b32 v20, s38, 6
-; VI-NEXT: v_writelane_b32 v20, s39, 7
-; VI-NEXT: v_writelane_b32 v20, s48, 8
-; VI-NEXT: v_writelane_b32 v20, s49, 9
-; VI-NEXT: v_writelane_b32 v20, s50, 10
-; VI-NEXT: v_writelane_b32 v20, s51, 11
-; VI-NEXT: v_writelane_b32 v20, s52, 12
-; VI-NEXT: v_writelane_b32 v20, s53, 13
-; VI-NEXT: v_writelane_b32 v20, s54, 14
-; VI-NEXT: v_writelane_b32 v20, s55, 15
-; VI-NEXT: v_writelane_b32 v20, s64, 16
-; VI-NEXT: v_writelane_b32 v20, s65, 17
-; VI-NEXT: v_writelane_b32 v20, s66, 18
-; VI-NEXT: v_writelane_b32 v20, s67, 19
-; VI-NEXT: v_writelane_b32 v20, s68, 20
-; VI-NEXT: v_writelane_b32 v20, s69, 21
-; VI-NEXT: v_writelane_b32 v20, s70, 22
-; VI-NEXT: v_writelane_b32 v20, s71, 23
-; VI-NEXT: v_writelane_b32 v20, s80, 24
-; VI-NEXT: v_writelane_b32 v20, s81, 25
-; VI-NEXT: v_writelane_b32 v20, s82, 26
-; VI-NEXT: v_writelane_b32 v20, s83, 27
-; VI-NEXT: v_writelane_b32 v20, s84, 28
-; VI-NEXT: v_writelane_b32 v20, s85, 29
+; VI-NEXT: v_writelane_b32 v20, s34, 0
+; VI-NEXT: v_writelane_b32 v20, s35, 1
+; VI-NEXT: v_writelane_b32 v20, s36, 2
+; VI-NEXT: v_writelane_b32 v20, s37, 3
+; VI-NEXT: v_writelane_b32 v20, s38, 4
+; VI-NEXT: v_writelane_b32 v20, s39, 5
+; VI-NEXT: v_writelane_b32 v20, s48, 6
+; VI-NEXT: v_writelane_b32 v20, s49, 7
+; VI-NEXT: v_writelane_b32 v20, s50, 8
+; VI-NEXT: v_writelane_b32 v20, s51, 9
+; VI-NEXT: v_writelane_b32 v20, s52, 10
+; VI-NEXT: v_writelane_b32 v20, s53, 11
+; VI-NEXT: v_writelane_b32 v20, s54, 12
+; VI-NEXT: v_writelane_b32 v20, s55, 13
+; VI-NEXT: v_writelane_b32 v20, s64, 14
+; VI-NEXT: v_writelane_b32 v20, s65, 15
+; VI-NEXT: v_writelane_b32 v20, s66, 16
+; VI-NEXT: v_writelane_b32 v20, s67, 17
+; VI-NEXT: v_writelane_b32 v20, s68, 18
+; VI-NEXT: v_writelane_b32 v20, s69, 19
+; VI-NEXT: v_writelane_b32 v20, s70, 20
+; VI-NEXT: v_writelane_b32 v20, s71, 21
+; VI-NEXT: v_writelane_b32 v20, s80, 22
+; VI-NEXT: v_writelane_b32 v20, s81, 23
+; VI-NEXT: v_writelane_b32 v20, s82, 24
+; VI-NEXT: v_writelane_b32 v20, s83, 25
+; VI-NEXT: v_writelane_b32 v20, s84, 26
+; VI-NEXT: v_writelane_b32 v20, s85, 27
+; VI-NEXT: v_writelane_b32 v20, s86, 28
+; VI-NEXT: v_writelane_b32 v20, s87, 29
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_writelane_b32 v20, s86, 30
+; VI-NEXT: v_writelane_b32 v20, s30, 30
; VI-NEXT: v_readfirstlane_b32 s44, v1
; VI-NEXT: v_readfirstlane_b32 s45, v2
; VI-NEXT: v_readfirstlane_b32 s42, v3
@@ -8681,7 +8681,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; VI-NEXT: v_readfirstlane_b32 s4, v17
; VI-NEXT: s_and_b64 s[46:47], vcc, exec
; VI-NEXT: v_readfirstlane_b32 s5, v18
-; VI-NEXT: v_writelane_b32 v20, s87, 31
+; VI-NEXT: v_writelane_b32 v20, s31, 31
; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spil...
[truncated]
|
e811d05 to
e38cf3c
Compare
17e3b5d to
4bce3ac
Compare
|
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
e38cf3c to
32bd3c3
Compare
4bce3ac to
b46525b
Compare
🐧 Linux x64 Test Results
Failed Tests (click on a test name to see its output):

Clang:
- Clang.Driver/amdgpu-unwind.cl

LLVM:
- LLVM.CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
- LLVM.CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
- LLVM.CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
- LLVM.CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
- LLVM.CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
- LLVM.CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
- LLVM.CodeGen/AMDGPU/call-args-inreg-bfloat.ll
- LLVM.CodeGen/AMDGPU/call-args-inreg.ll
- LLVM.CodeGen/AMDGPU/call-skip.ll
- LLVM.CodeGen/AMDGPU/cc-entry.ll
- LLVM.CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
- LLVM.CodeGen/AMDGPU/ds-read2-write2-debug-info.ll
- LLVM.CodeGen/AMDGPU/dynamic_stackalloc.ll
- LLVM.CodeGen/AMDGPU/gfx-callable-return-types.ll
- LLVM.CodeGen/AMDGPU/nofpclass-call.ll
- LLVM.CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
- LLVM.CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
- LLVM.CodeGen/AMDGPU/stack-realign.ll
- LLVM.CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
- LLVM.CodeGen/AMDGPU/whole-wave-functions.ll

If these failures are unrelated to your changes (for example, tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the
| if (Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub0) || | ||
| Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub1)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull the getSubRegs out of the loop
| } | ||
| } | ||
|
|
||
| // Return address uses a register pair. Add the super register to the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this really special case the return address? Could we compact the representation by emitting CFI for the largest possible tuple covering?

No description provided.