diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index fa384b296f2e6..7c8444fc93af4 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2387,19 +2387,42 @@ void RAGreedy::initializeCSRCost() {
 /// The results are stored into \p Out.
 /// \p Out is not cleared before being populated.
 void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
   for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
-    if (!TII->isFullCopyInstr(Instr))
+    if (!Instr.isCopy())
       continue;
+
     // Look for the other end of the copy.
     Register OtherReg = Instr.getOperand(0).getReg();
+    unsigned OtherSubReg = Instr.getOperand(0).getSubReg();
+    unsigned SubReg = Instr.getOperand(1).getSubReg();
+
     if (OtherReg == Reg) {
       OtherReg = Instr.getOperand(1).getReg();
+      OtherSubReg = Instr.getOperand(1).getSubReg();
+      SubReg = Instr.getOperand(0).getSubReg();
       if (OtherReg == Reg)
         continue;
     }
+
     // Get the current assignment.
     MCRegister OtherPhysReg =
         OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
+    if (OtherSubReg) {
+      if (OtherReg.isPhysical()) {
+        MCRegister Tuple =
+            TRI->getMatchingSuperReg(OtherPhysReg, OtherSubReg, RC);
+        if (!Tuple)
+          continue;
+        OtherPhysReg = Tuple;
+      } else {
+        // TODO: There should be a hinting mechanism for subregisters
+        if (SubReg != OtherSubReg)
+          continue;
+      }
+    }
+
     // Push the collected information.
     Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
                            OtherPhysReg));
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 1ce7179774349..be08c4e33f072 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -159246,7 +159246,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v61
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v50, 8, v61
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v56, 16, v60
-; GFX9-NEXT:    v_mov_b32_e32 v33, v60
+; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v60
 ; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
@@ -159259,7 +159259,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v58
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v58, 8, v58
-; GFX9-NEXT:    v_lshrrev_b32_e32 v55, 8, v33
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v61
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 09d3c3b01b809..bca39d06e941c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7398,7 +7398,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v20
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
@@ -7413,7 +7413,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v18
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v5
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v19
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v19
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v21
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index ddd1ce66c013a..f44a0b0ac2c65 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3851,9 +3851,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT:    v_and_b32_e32 v32, 0xffff, v24
 ; VI-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:80
 ; VI-DS128-NEXT:    ds_read_b128 v[55:58], v0 offset:96
-; VI-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v31, 16, v11
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v10
-; VI-DS128-NEXT:    v_mov_b32_e32 v31, v15
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
 ; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v48, 16, v26
@@ -3864,17 +3864,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT:    v_and_b32_e32 v53, 0xffff, v25
 ; VI-DS128-NEXT:    v_and_b32_e32 v51, 0xffff, v24
 ; VI-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:112
-; VI-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
 ; VI-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v11
 ; VI-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v10
+; VI-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v9
 ; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
 ; VI-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v25
 ; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v24
 ; VI-DS128-NEXT:    v_mov_b32_e32 v24, s0
-; VI-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v9
 ; VI-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v8
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v42, 16, v39
 ; VI-DS128-NEXT:    v_lshrrev_b32_e32 v40, 16, v38
@@ -3944,7 +3943,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-DS128-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DS128-NEXT:    ds_read_b128 v[20:23], v0 offset:32
 ; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v31, 16, v11
 ; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
@@ -3992,8 +3991,8 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:80
 ; GFX9-DS128-NEXT:    ds_read_b128 v[55:58], v0 offset:96
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v10
-; GFX9-DS128-NEXT:    v_mov_b32_e32 v31, v15
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
 ; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v48, 16, v26
@@ -4004,17 +4003,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-DS128-NEXT:    v_and_b32_e32 v53, 0xffff, v25
 ; GFX9-DS128-NEXT:    v_and_b32_e32 v51, 0xffff, v24
 ; GFX9-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:112
-; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
 ; GFX9-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v11
 ; GFX9-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v10
 ; GFX9-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX9-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v8
 ; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
 ; GFX9-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v25
 ; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v24
 ; GFX9-DS128-NEXT:    v_mov_b32_e32 v24, s0
-; GFX9-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v8
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v42, 16, v39
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v40, 16, v38
 ; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v46, 16, v37
@@ -4890,7 +4888,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT:    s_waitcnt lgkmcnt(2)
 ; VI-DS128-NEXT:    v_ashrrev_i32_e32 v53, 16, v40
 ; VI-DS128-NEXT:    v_bfe_i32 v52, v40, 0, 16
-; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v11
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v23, 16, v11
 ; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
 ; VI-DS128-NEXT:    v_ashrrev_i32_e32 v47, 16, v39
 ; VI-DS128-NEXT:    v_ashrrev_i32_e32 v45, 16, v38
@@ -4901,14 +4899,13 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; VI-DS128-NEXT:    ds_read_b128 v[37:40], v32 offset:112
 ; VI-DS128-NEXT:    v_mov_b32_e32 v32, s0
 ; VI-DS128-NEXT:    v_ashrrev_i32_e32 v21, 16, v10
-; VI-DS128-NEXT:    v_mov_b32_e32 v23, v15
 ; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v9
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
 ; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v38
 ; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v37
 ; VI-DS128-NEXT:    v_bfe_i32 v2, v38, 0, 16
 ; VI-DS128-NEXT:    v_bfe_i32 v0, v37, 0, 16
-; VI-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
 ; VI-DS128-NEXT:    v_bfe_i32 v22, v11, 0, 16
 ; VI-DS128-NEXT:    v_bfe_i32 v20, v10, 0, 16
 ; VI-DS128-NEXT:    v_bfe_i32 v14, v9, 0, 16
@@ -4986,7 +4983,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-DS128-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DS128-NEXT:    ds_read_b128 v[24:27], v32 offset:32
 ; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v11
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v23, 16, v11
 ; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
 ; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
@@ -5031,15 +5028,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
 ; GFX9-DS128-NEXT:    v_bfe_i32 v50, v37, 0, 16
 ; GFX9-DS128-NEXT:    ds_read_b128 v[37:40], v32 offset:112
 ; GFX9-DS128-NEXT:    v_mov_b32_e32 v32, s0
-; GFX9-DS128-NEXT:    v_mov_b32_e32 v23, v15
 ; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v9
 ; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
+; GFX9-DS128-NEXT:    v_bfe_i32 v22, v11, 0, 16
 ; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v38
 ; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v37
 ; GFX9-DS128-NEXT:    v_bfe_i32 v2, v38, 0, 16
 ; GFX9-DS128-NEXT:    v_bfe_i32 v0, v37, 0, 16
-; GFX9-DS128-NEXT:    v_bfe_i32 v22, v11, 0, 16
 ; GFX9-DS128-NEXT:    v_bfe_i32 v20, v10, 0, 16
 ; GFX9-DS128-NEXT:    v_bfe_i32 v14, v9, 0, 16
 ; GFX9-DS128-NEXT:    v_bfe_i32 v12, v8, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index 8878e9b65a088..a81d9a458e23a 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -101,7 +101,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[10:13]
+; CHECK-NEXT:    ; def v[6:9]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -142,7 +142,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[10:13], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, v[6:9], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -306,10 +306,10 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    v_accvgpr_read_b32 v5, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v4, a0
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[10:13]
+; CHECK-NEXT:    ; def v[8:11]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[14:17]
+; CHECK-NEXT:    ; def v[12:15]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
@@ -349,9 +349,9 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[10:13], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, v[8:11], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[14:17], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, v[12:15], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index 1267bcd1e0717..461b4d0e02cb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -415,8 +415,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT:    vmv4r.v v8, v24
+; RV32-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
@@ -726,8 +725,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; RV64-NEXT:    mul a4, a4, a5
 ; RV64-NEXT:    add a4, sp, a4
 ; RV64-NEXT:    addi a4, a4, 32
-; RV64-NEXT:    vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT:    vmv4r.v v8, v24
+; RV64-NEXT:    vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
 ; RV64-NEXT:    csrr a4, vlenb
 ; RV64-NEXT:    slli a4, a4, 4
 ; RV64-NEXT:    add a4, sp, a4
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index 0bfa68298f6b5..0a11501905b81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -8831,8 +8831,7 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v4, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB291_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
@@ -9832,8 +9830,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v4, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16, v0.t
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB294_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
@@ -10347,8 +10344,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v4, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB303_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
@@ -11343,8 +11338,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT:    vmv.v.v v4, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v24, v0.t
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB306_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
@@ -11453,12 +11447,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
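
Note on the RegAllocGreedy.cpp hunk above: collectHintInfo previously collected hints only from full copies; it now also considers subregister copies, either by translating a physical-register hint into the covering super-register tuple of the virtual register's class, or, for virtual-to-virtual copies, by keeping the hint only when both operands use the same subregister index. The sketch below restates that decision logic as a standalone function. It is illustrative only, not part of the patch: plain unsigned values stand in for LLVM's Register/MCRegister, and the hypothetical resolveSuperReg callback stands in for TRI->getMatchingSuperReg (returning 0 when no matching super-register exists).

    #include <functional>
    #include <optional>

    // Decide which physical register, if any, to record as a copy hint.
    // Returns std::nullopt when the copy cannot produce a usable hint.
    std::optional<unsigned> resolveHintedPhysReg(
        unsigned OtherPhysReg, // current assignment of the other operand
        unsigned OtherSubReg,  // subregister index on the other operand
        unsigned SubReg,       // subregister index on this operand
        bool OtherIsPhysical,  // whether the other operand is a physreg
        const std::function<unsigned(unsigned, unsigned)> &resolveSuperReg) {
      if (!OtherSubReg)
        return OtherPhysReg; // Full copy: hint the assignment directly.
      if (OtherIsPhysical) {
        // Copy of a physreg subregister: hint the covering tuple register
        // in the virtual register's class, if one exists.
        if (unsigned Tuple = resolveSuperReg(OtherPhysReg, OtherSubReg))
          return Tuple;
        return std::nullopt; // No matching tuple: drop the hint.
      }
      // Virtual-to-virtual subregister copy: usable as a hint only when
      // both sides use the same subregister index (the TODO in the patch
      // notes that a general subregister hinting mechanism is missing).
      if (SubReg != OtherSubReg)
        return std::nullopt;
      return OtherPhysReg;
    }

The test churn that follows the C++ hunk is machine-generated FileCheck output; the recurring pattern is that a register-to-register move (v_mov_b32_e32, vmv4r.v, vmv.v.v) disappears because, with the improved hint, the value is produced directly in the register where it is consumed.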