Greedy: Take copy hints involving subregisters #159570

Conversation
Previously this would only accept full copy hints. This relaxes this to accept some subregister copies. Specifically, this now accepts:

- Copies to/from physical registers if there is a compatible super register
- Subreg-to-subreg copies

This has the potential to repeatedly add the same hint to the hint vector, but not sure if that's a real problem.
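The rule reads as a small predicate over the two ends of a COPY. Below is a minimal standalone sketch of that acceptance test, assuming simplified stand-in types rather than the LLVM MachineOperand/TargetRegisterInfo API; the struct and field names are invented for illustration.

```cpp
#include <cstdio>

// Simplified stand-in for the two ends of a COPY; not the LLVM types.
struct CopyEnds {
  unsigned SubReg = 0;      // subreg index on the end being allocated (0 = none)
  unsigned OtherSubReg = 0; // subreg index on the other end
  bool OtherIsPhysical = false;
  bool HasCompatibleSuperReg = false; // stands in for TRI->getMatchingSuperReg()
};

// Acceptance rule after this patch: full copies on the other end are always
// taken; subregister copies are taken only in the two relaxed cases.
bool acceptAsHint(const CopyEnds &C) {
  if (C.OtherSubReg == 0)
    return true;                    // other end is a full register
  if (C.OtherIsPhysical)
    return C.HasCompatibleSuperReg; // hint the matching super register
  return C.SubReg == C.OtherSubReg; // virtual: identical subreg indices only
}

int main() {
  CopyEnds SubToSub{/*SubReg=*/1, /*OtherSubReg=*/1, false, false};
  CopyEnds Mismatch{/*SubReg=*/1, /*OtherSubReg=*/2, false, false};
  std::printf("subreg-to-subreg: %d, mismatched: %d\n",
              acceptAsHint(SubToSub), acceptAsHint(Mismatch));
}
```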
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-regalloc

Author: Matt Arsenault (arsenm)

Changes

Previously this would only accept full copy hints. This relaxes this to accept some subregister copies. Specifically, this now accepts:

- Copies to/from physical registers if there is a compatible super register
- Subreg-to-subreg copies

This has the potential to repeatedly add the same hint to the hint vector, but not sure if that's a real problem.

Patch is 29.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159570.diff

9 Files Affected:
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index fa384b296f2e6..7c8444fc93af4 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2387,19 +2387,42 @@ void RAGreedy::initializeCSRCost() {
/// The results are stored into \p Out.
/// \p Out is not cleared before being populated.
void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
- if (!TII->isFullCopyInstr(Instr))
+ if (!Instr.isCopy())
continue;
+
// Look for the other end of the copy.
Register OtherReg = Instr.getOperand(0).getReg();
+ unsigned OtherSubReg = Instr.getOperand(0).getSubReg();
+ unsigned SubReg = Instr.getOperand(1).getSubReg();
+
if (OtherReg == Reg) {
OtherReg = Instr.getOperand(1).getReg();
+ OtherSubReg = Instr.getOperand(1).getSubReg();
+ SubReg = Instr.getOperand(0).getSubReg();
if (OtherReg == Reg)
continue;
}
+
// Get the current assignment.
MCRegister OtherPhysReg =
OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
+ if (OtherSubReg) {
+ if (OtherReg.isPhysical()) {
+ MCRegister Tuple =
+ TRI->getMatchingSuperReg(OtherPhysReg, OtherSubReg, RC);
+ if (!Tuple)
+ continue;
+ OtherPhysReg = Tuple;
+ } else {
+ // TODO: There should be a hinting mechanism for subregisters
+ if (SubReg != OtherSubReg)
+ continue;
+ }
+ }
+
// Push the collected information.
Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
OtherPhysReg));
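For the physical-register branch, note that the hint recorded is not the copied physical register itself but the containing register found by TRI->getMatchingSuperReg. A toy model of that lookup, with an invented register file of 4-wide tuples standing in for a real target's register classes:

```cpp
#include <cstdio>

// Invented register file: scalars r0, r1, ... and 4-wide tuples r[k:k+3].
// SubIdx 1..4 selects a tuple's first..fourth lane (0 = no subregister).
// matchingTuple4 plays the role of TRI->getMatchingSuperReg(PhysReg, SubIdx,
// RC) on a real target; the encoding is made up for the example.
int matchingTuple4(int PhysReg, unsigned SubIdx) {
  if (SubIdx < 1 || SubIdx > 4)
    return -1;
  int Base = PhysReg - (int)(SubIdx - 1);
  return Base >= 0 ? Base : -1; // -1: no compatible super register exists
}

int main() {
  // For a copy like "$r6 = COPY %x.lane0", %x can now be hinted to the
  // whole tuple r[6:9] instead of receiving no hint at all:
  int Base = matchingTuple4(/*PhysReg=*/6, /*SubIdx=*/1);
  std::printf("hint tuple: r[%d:%d]\n", Base, Base + 3);
}
```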
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 1ce7179774349..be08c4e33f072 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -159246,7 +159246,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60
-; GFX9-NEXT: v_mov_b32_e32 v33, v60
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v60
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
@@ -159259,7 +159259,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v33
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61
; GFX9-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 09d3c3b01b809..bca39d06e941c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7398,7 +7398,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
@@ -7413,7 +7413,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v19
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index ddd1ce66c013a..f44a0b0ac2c65 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3851,9 +3851,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11
; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; VI-DS128-NEXT: v_mov_b32_e32 v31, v15
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3864,17 +3864,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
+; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
-; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
@@ -3944,7 +3943,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
@@ -3992,8 +3991,8 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v15
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -4004,17 +4003,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
+; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
-; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
@@ -4890,7 +4888,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -4901,14 +4899,13 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10
-; VI-DS128-NEXT: v_mov_b32_e32 v23, v15
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
@@ -4986,7 +4983,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18
@@ -5031,15 +5028,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
-; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v15
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
+; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index 8878e9b65a088..a81d9a458e23a 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -101,7 +101,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[10:13]
+; CHECK-NEXT: ; def v[6:9]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: ;;#ASMSTART
@@ -142,7 +142,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -306,10 +306,10 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[10:13]
+; CHECK-NEXT: ; def v[8:11]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[14:17]
+; CHECK-NEXT: ; def v[12:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
@@ -349,9 +349,9 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; CHECK-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; CHECK-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index 1267bcd1e0717..461b4d0e02cb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -415,8 +415,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmv4r.v v8, v24
+; RV32-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
@@ -726,8 +725,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: mul a4, a4, a5
; RV64-NEXT: add a4, sp, a4
; RV64-NEXT: addi a4, a4, 32
-; RV64-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmv4r.v v8, v24
+; RV64-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
; RV64-NEXT: csrr a4, vlenb
; RV64-NEXT: slli a4, a4, 4
; RV64-NEXT: add a4, sp, a4
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index 0bfa68298f6b5..0a11501905b81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -8831,8 +8831,7 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a1, a0, .LBB286_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
@@ -9460,8 +9459,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -9832,8 +9830,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -10347,8 +10344,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a1, a0, .LBB298_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
@@ -10975,8 +10971,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB303_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -11343,8 +11338,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB306_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -11453,12 +11447,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv4r.v v8, v24
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
@@ -11580,12 +11573,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: sltu a3, a0, a2
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index ff416dbe3f1a0..4fc1e06a14983 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -175,33 +175,30 @@ define void @vst3_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.f32 s0, s5
; CHECK-NEXT: vmov.f32 s2, s14
; CHECK-NEXT: vmov.f32 s3, s6
-; CHECK-NEXT: vmov.f32 s26, s10
; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [sp, #160] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s26, s10
; CHECK-NEXT: vmov.f32 s20, s8
+; CHECK-NEXT: vmov.32 q6[1], r3
; CHECK-NEXT: vmov.f32 s23, s9
; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s16, s2
-; CHECK-NEXT: vmov.32 q6[1], r3
-; CHECK-NEXT: vmov.f32 s19, s3
; CHECK-NEXT: vstrw.32 q6, [r1, #16]
+; CHECK-NEXT: vmov.f32 s19, s3
+; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s17, s31
; CHECK-NEXT: vstrw.3...
[truncated]
Not a regalloc expert by any means but makes sense to me. LGTM
As with Luke, not an expert, but this does look reasonable. LGTM
    if (OtherReg == Reg) {
      OtherReg = Instr.getOperand(1).getReg();
      OtherSubReg = Instr.getOperand(1).getSubReg();
      SubReg = Instr.getOperand(0).getSubReg();
      if (OtherReg == Reg)
        continue;
    }
This is starting to look ugly so I created #159724 to try to simplify it.
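The cleanup being referred to would presumably factor the operand swap above into a helper along these lines. This is a hypothetical sketch with invented names, not the contents of #159724:

```cpp
#include <cstdio>
#include <optional>
#include <utility>

struct CopyEnd {
  unsigned Reg = 0;
  unsigned SubReg = 0;
};

// Orient a COPY's two ends relative to the register being allocated:
// returns (thisEnd, otherEnd), or nullopt for a self-copy of Reg.
std::optional<std::pair<CopyEnd, CopyEnd>>
orientCopy(unsigned Reg, CopyEnd Dst, CopyEnd Src) {
  if (Dst.Reg != Reg)
    return std::make_pair(Src, Dst); // Reg is the source operand
  if (Src.Reg == Reg)
    return std::nullopt;             // both ends are Reg: nothing to hint
  return std::make_pair(Dst, Src);
}

int main() {
  if (auto Ends = orientCopy(5, /*Dst=*/{7, 1}, /*Src=*/{5, 2}))
    std::printf("other end: reg %u, subreg %u\n",
                Ends->second.Reg, Ends->second.SubReg);
}
```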
Previously this would only accept full copy hints. This relaxes
this to accept some subregister copies. Specifically, this now
accepts:

- Copies to/from physical registers if there is a compatible super register
- Subreg-to-subreg copies

This has the potential to repeatedly add the same hint to the
hint vector, but not sure if that's a real problem.