[RISCV] Move MachineCombiner to addILPOpts() #158071
Conversation
So that it runs before `MachineCSE` and other passes. Fixes llvm#158063.
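For context, `addILPOpts()` is the `TargetPassConfig` hook that targets override to insert ILP-oriented passes early in the machine-SSA optimization pipeline, ahead of `MachineCSE` and machine sinking. The sketch below approximates the generic ordering (pass identifiers abbreviated; treat it as an illustration of the hook placement, not a verbatim copy of the in-tree `TargetPassConfig::addMachineSSAOptimization`):

// Approximate shape of the generic machine-SSA pipeline. A target that
// adds MachineCombiner in its addILPOpts() override gets it scheduled
// here, before MachineCSE and the later cleanup passes.
void TargetPassConfig::addMachineSSAOptimization() {
  addPass(&DeadMachineInstructionElimID); // "Remove dead machine instructions"
  addILPOpts();                           // target hook: MachineCombiner now runs here
  addPass(&EarlyMachineLICMID);           // "Early Machine Loop Invariant Code Motion"
  addPass(&MachineCSEID);                 // "Machine Common Subexpression Elimination"
  addPass(&MachineSinkingID);             // "Machine code sinking"
  addPass(&PeepholeOptimizerID);          // "Peephole Optimizations"
}

The reordered pass listing in the O3-pipeline.ll hunk below reflects exactly this placement.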
@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

So that it runs before `MachineCSE` and other passes. Fixes #158063.

Full diff: https://github.com/llvm/llvm-project/pull/158071.diff

7 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 460bb33f2553a..d7e0be65fb6fa 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -427,6 +427,7 @@ class RISCVPassConfig : public TargetPassConfig {
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addFastRegAlloc() override;
+ bool addILPOpts() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -612,9 +613,6 @@ void RISCVPassConfig::addMachineSSAOptimization() {
TargetPassConfig::addMachineSSAOptimization();
- if (EnableMachineCombiner)
- addPass(&MachineCombinerID);
-
if (TM->getTargetTriple().isRISCV64()) {
addPass(createRISCVOptWInstrsPass());
}
@@ -649,6 +647,13 @@ void RISCVPassConfig::addPostRegAlloc() {
addPass(createRISCVRedundantCopyEliminationPass());
}
+bool RISCVPassConfig::addILPOpts() {
+ if (EnableMachineCombiner)
+ addPass(&MachineCombinerID);
+
+ return true;
+}
+
void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerLateLoopOptimizationsEPCallback([=](LoopPassManager &LPM,
OptimizationLevel Level) {
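For reference, the `EnableMachineCombiner` flag checked above is a target-local `cl::opt` declared earlier in RISCVTargetMachine.cpp and is untouched by this patch. A sketch of that declaration follows (the flag name and default are recalled assumptions, not part of this diff):

// Assumed shape of the existing toggle guarding the pass; the flag
// name and default are an assumption, not verified against the tree.
static cl::opt<bool>
    EnableMachineCombiner("riscv-enable-machine-combiner",
                          cl::desc("Enable the machine combiner pass"),
                          cl::init(true), cl::Hidden);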
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index c7f70a9d266c2..ea08061221fd4 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -107,6 +107,9 @@
; CHECK-NEXT: Remove dead machine instructions
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
+; CHECK-NEXT: Machine Trace Metrics
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
+; CHECK-NEXT: Machine InstCombiner
; CHECK-NEXT: Machine Block Frequency Analysis
; CHECK-NEXT: Early Machine Loop Invariant Code Motion
; CHECK-NEXT: MachineDominator Tree Construction
@@ -117,9 +120,6 @@
; CHECK-NEXT: Machine code sinking
; CHECK-NEXT: Peephole Optimizations
; CHECK-NEXT: Remove dead machine instructions
-; CHECK-NEXT: Machine Trace Metrics
-; CHECK-NEXT: Lazy Machine Block Frequency Analysis
-; CHECK-NEXT: Machine InstCombiner
; RV64-NEXT: RISC-V Optimize W Instructions
; CHECK-NEXT: RISC-V Pre-RA pseudo instruction expansion pass
; CHECK-NEXT: RISC-V Merge Base Offset
diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll
index 7a1c41c1839fa..69eca6dd7768a 100644
--- a/llvm/test/CodeGen/RISCV/machine-combiner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll
@@ -1094,33 +1094,19 @@ declare float @llvm.maxnum.f32(float, float)
declare double @llvm.maxnum.f64(double, double)
define double @test_fmadd_strategy(double %a0, double %a1, double %a2, double %a3, i64 %flag) {
-; CHECK_LOCAL-LABEL: test_fmadd_strategy:
-; CHECK_LOCAL: # %bb.0: # %entry
-; CHECK_LOCAL-NEXT: fsub.d fa4, fa0, fa1
-; CHECK_LOCAL-NEXT: andi a0, a0, 1
-; CHECK_LOCAL-NEXT: fmv.d fa5, fa0
-; CHECK_LOCAL-NEXT: fmul.d fa0, fa4, fa2
-; CHECK_LOCAL-NEXT: beqz a0, .LBB76_2
-; CHECK_LOCAL-NEXT: # %bb.1: # %entry
-; CHECK_LOCAL-NEXT: fmul.d fa4, fa5, fa1
-; CHECK_LOCAL-NEXT: fmadd.d fa5, fa5, fa1, fa0
-; CHECK_LOCAL-NEXT: fsub.d fa0, fa5, fa4
-; CHECK_LOCAL-NEXT: .LBB76_2: # %entry
-; CHECK_LOCAL-NEXT: ret
-;
-; CHECK_GLOBAL-LABEL: test_fmadd_strategy:
-; CHECK_GLOBAL: # %bb.0: # %entry
-; CHECK_GLOBAL-NEXT: fsub.d fa4, fa0, fa1
-; CHECK_GLOBAL-NEXT: andi a0, a0, 1
-; CHECK_GLOBAL-NEXT: fmv.d fa5, fa0
-; CHECK_GLOBAL-NEXT: fmul.d fa0, fa4, fa2
-; CHECK_GLOBAL-NEXT: beqz a0, .LBB76_2
-; CHECK_GLOBAL-NEXT: # %bb.1: # %entry
-; CHECK_GLOBAL-NEXT: fmul.d fa5, fa5, fa1
-; CHECK_GLOBAL-NEXT: fadd.d fa4, fa5, fa0
-; CHECK_GLOBAL-NEXT: fsub.d fa0, fa4, fa5
-; CHECK_GLOBAL-NEXT: .LBB76_2: # %entry
-; CHECK_GLOBAL-NEXT: ret
+; CHECK-LABEL: test_fmadd_strategy:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsub.d fa5, fa0, fa1
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: beqz a0, .LBB76_2
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: fmul.d fa4, fa0, fa1
+; CHECK-NEXT: fmadd.d fa5, fa5, fa2, fa4
+; CHECK-NEXT: fsub.d fa0, fa5, fa4
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB76_2:
+; CHECK-NEXT: fmul.d fa0, fa5, fa2
+; CHECK-NEXT: ret
entry:
%sub = fsub contract double %a0, %a1
%mul = fmul contract double %sub, %a2
@@ -1132,3 +1118,6 @@ entry:
%retval.0 = select i1 %tobool.not, double %mul, double %sub3
ret double %retval.0
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK_GLOBAL: {{.*}}
+; CHECK_LOCAL: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index da81fe5708814..f9ccf7637eee9 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -208,14 +208,14 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
; RV32I-NEXT: sub a1, a1, a3
; RV32I-NEXT: neg a0, a0
; RV32I-NEXT: .LBB5_2:
-; RV32I-NEXT: snez a3, a0
-; RV32I-NEXT: neg a4, a1
-; RV32I-NEXT: sub a3, a4, a3
-; RV32I-NEXT: neg a4, a0
+; RV32I-NEXT: snez a4, a0
+; RV32I-NEXT: neg a3, a0
+; RV32I-NEXT: add a4, a1, a4
+; RV32I-NEXT: neg a4, a4
; RV32I-NEXT: sw a0, 0(a2)
; RV32I-NEXT: sw a1, 4(a2)
-; RV32I-NEXT: mv a0, a4
-; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: mv a0, a3
+; RV32I-NEXT: mv a1, a4
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: neg_abs64_multiuse:
@@ -227,14 +227,14 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
; RV32ZBB-NEXT: sub a1, a1, a3
; RV32ZBB-NEXT: neg a0, a0
; RV32ZBB-NEXT: .LBB5_2:
-; RV32ZBB-NEXT: snez a3, a0
-; RV32ZBB-NEXT: neg a4, a1
-; RV32ZBB-NEXT: sub a3, a4, a3
-; RV32ZBB-NEXT: neg a4, a0
+; RV32ZBB-NEXT: snez a4, a0
+; RV32ZBB-NEXT: neg a3, a0
+; RV32ZBB-NEXT: add a4, a1, a4
+; RV32ZBB-NEXT: neg a4, a4
; RV32ZBB-NEXT: sw a0, 0(a2)
; RV32ZBB-NEXT: sw a1, 4(a2)
-; RV32ZBB-NEXT: mv a0, a4
-; RV32ZBB-NEXT: mv a1, a3
+; RV32ZBB-NEXT: mv a0, a3
+; RV32ZBB-NEXT: mv a1, a4
; RV32ZBB-NEXT: ret
;
; RV64I-LABEL: neg_abs64_multiuse:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 83b435ddff902..056f55260b854 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -934,7 +934,7 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
; CHECK-NEXT: add a1, a1, a5
; CHECK-NEXT: slli a3, a3, 32
; CHECK-NEXT: srli a3, a3, 32
-; CHECK-NEXT: add a0, a4, a0
+; CHECK-NEXT: add a0, a0, a4
; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: addi a0, a0, 1
; CHECK-NEXT: .LBB14_6: # %bb35
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index dddcd4f107e3b..ead79fcf53d8b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -18,13 +18,10 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV32-NEXT: # %bb.1: # %for.cond1.preheader.lr.ph
; RV32-NEXT: blez a6, .LBB0_17
; RV32-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader
-; RV32-NEXT: addi t0, a7, -1
+; RV32-NEXT: addi t3, a7, -1
; RV32-NEXT: csrr t2, vlenb
-; RV32-NEXT: mul t3, a1, t0
-; RV32-NEXT: mul t4, a3, t0
-; RV32-NEXT: mul t5, a5, t0
; RV32-NEXT: slli t1, t2, 1
-; RV32-NEXT: li t6, 32
+; RV32-NEXT: li t4, 32
; RV32-NEXT: mv t0, t1
; RV32-NEXT: # %bb.3: # %for.cond1.preheader.us.preheader
; RV32-NEXT: li t0, 32
@@ -34,27 +31,32 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 0(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: .cfi_offset s2, -12
+; RV32-NEXT: .cfi_offset s3, -16
; RV32-NEXT: .cfi_remember_state
-; RV32-NEXT: add t3, a0, t3
-; RV32-NEXT: add t4, a2, t4
-; RV32-NEXT: add s0, a4, t5
-; RV32-NEXT: bltu t6, t1, .LBB0_6
+; RV32-NEXT: mul t5, a1, t3
+; RV32-NEXT: add s0, a0, a6
+; RV32-NEXT: mul t6, a3, t3
+; RV32-NEXT: add s2, a2, a6
+; RV32-NEXT: mul s1, a5, t3
+; RV32-NEXT: add s3, a4, a6
+; RV32-NEXT: bltu t4, t1, .LBB0_6
; RV32-NEXT: # %bb.5: # %for.cond1.preheader.us.preheader
; RV32-NEXT: li t1, 32
; RV32-NEXT: .LBB0_6: # %for.cond1.preheader.us.preheader
-; RV32-NEXT: add t3, t3, a6
-; RV32-NEXT: add t5, t4, a6
-; RV32-NEXT: add t4, s0, a6
+; RV32-NEXT: add t3, s0, t5
+; RV32-NEXT: add t6, s2, t6
+; RV32-NEXT: add t4, s3, s1
; RV32-NEXT: j .LBB0_8
; RV32-NEXT: # %bb.7: # %for.cond1.preheader.us.preheader
; RV32-NEXT: mv t1, t0
; RV32-NEXT: .LBB0_8: # %for.cond1.preheader.us.preheader
; RV32-NEXT: .cfi_restore_state
; RV32-NEXT: li t0, 0
-; RV32-NEXT: sltu t5, a0, t5
+; RV32-NEXT: sltu t5, a0, t6
; RV32-NEXT: sltu t6, a2, t3
; RV32-NEXT: and t5, t5, t6
; RV32-NEXT: sltu t4, a0, t4
@@ -140,9 +142,11 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 0(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: .cfi_restore s2
+; RV32-NEXT: .cfi_restore s3
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: .LBB0_17: # %for.cond.cleanup
@@ -190,7 +194,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64P670-NEXT: or t6, s0, s1
; RV64P670-NEXT: sltu s1, a0, t5
; RV64P670-NEXT: sltu s0, a4, t4
-; RV64P670-NEXT: mv t5, a0
+; RV64P670-NEXT: add t4, a0, a6
; RV64P670-NEXT: and s0, s0, s1
; RV64P670-NEXT: or s1, a1, a5
; RV64P670-NEXT: srli s1, s1, 63
@@ -200,11 +204,11 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64P670-NEXT: or s0, t6, s0
; RV64P670-NEXT: sltu s1, a6, s1
; RV64P670-NEXT: or s0, s0, s1
-; RV64P670-NEXT: andi t4, s0, 1
+; RV64P670-NEXT: andi t5, s0, 1
; RV64P670-NEXT: j .LBB0_4
; RV64P670-NEXT: .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1
-; RV64P670-NEXT: add t5, t5, a1
+; RV64P670-NEXT: add a0, a0, a1
; RV64P670-NEXT: add a2, a2, a3
; RV64P670-NEXT: add a4, a4, a5
; RV64P670-NEXT: addiw t1, t1, 1
@@ -214,7 +218,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64P670-NEXT: # =>This Loop Header: Depth=1
; RV64P670-NEXT: # Child Loop BB0_7 Depth 2
; RV64P670-NEXT: # Child Loop BB0_10 Depth 2
-; RV64P670-NEXT: beqz t4, .LBB0_6
+; RV64P670-NEXT: beqz t5, .LBB0_6
; RV64P670-NEXT: # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT: li t6, 0
; RV64P670-NEXT: j .LBB0_9
@@ -223,7 +227,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64P670-NEXT: slli s1, t2, 28
; RV64P670-NEXT: mv s2, a2
; RV64P670-NEXT: mv s3, a4
-; RV64P670-NEXT: mv s4, t5
+; RV64P670-NEXT: mv s4, a0
; RV64P670-NEXT: sub s1, s1, t3
; RV64P670-NEXT: vsetvli s0, zero, e8, m2, ta, ma
; RV64P670-NEXT: and t6, s1, a6
@@ -246,11 +250,10 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64P670-NEXT: .LBB0_9: # %for.body4.us.preheader
; RV64P670-NEXT: # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT: mul s2, a1, t0
-; RV64P670-NEXT: add s0, a0, a6
-; RV64P670-NEXT: add s1, t5, t6
+; RV64P670-NEXT: add s1, a0, t6
; RV64P670-NEXT: add s4, a4, t6
; RV64P670-NEXT: add t6, t6, a2
-; RV64P670-NEXT: add s2, s2, s0
+; RV64P670-NEXT: add s2, s2, t4
; RV64P670-NEXT: .LBB0_10: # %for.body4.us
; RV64P670-NEXT: # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT: # => This Inner Loop Header: Depth=2
@@ -332,12 +335,12 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64X60-NEXT: or s0, t4, s0
; RV64X60-NEXT: sltu s1, a6, s1
; RV64X60-NEXT: or s0, s0, s1
-; RV64X60-NEXT: andi t4, s0, 1
-; RV64X60-NEXT: mv t5, a0
+; RV64X60-NEXT: add t4, a0, a6
+; RV64X60-NEXT: andi t5, s0, 1
; RV64X60-NEXT: j .LBB0_4
; RV64X60-NEXT: .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1
-; RV64X60-NEXT: add t5, t5, a1
+; RV64X60-NEXT: add a0, a0, a1
; RV64X60-NEXT: add a2, a2, a3
; RV64X60-NEXT: addiw t1, t1, 1
; RV64X60-NEXT: add a4, a4, a5
@@ -347,7 +350,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64X60-NEXT: # =>This Loop Header: Depth=1
; RV64X60-NEXT: # Child Loop BB0_7 Depth 2
; RV64X60-NEXT: # Child Loop BB0_10 Depth 2
-; RV64X60-NEXT: beqz t4, .LBB0_6
+; RV64X60-NEXT: beqz t5, .LBB0_6
; RV64X60-NEXT: # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT: li t6, 0
; RV64X60-NEXT: j .LBB0_9
@@ -358,7 +361,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64X60-NEXT: and t6, s1, a6
; RV64X60-NEXT: mv s2, a2
; RV64X60-NEXT: mv s3, a4
-; RV64X60-NEXT: mv s4, t5
+; RV64X60-NEXT: mv s4, a0
; RV64X60-NEXT: mv s1, t6
; RV64X60-NEXT: vsetvli s0, zero, e8, m2, ta, ma
; RV64X60-NEXT: .LBB0_7: # %vector.body
@@ -379,9 +382,8 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64X60-NEXT: .LBB0_9: # %for.body4.us.preheader
; RV64X60-NEXT: # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT: mul s2, a1, t0
-; RV64X60-NEXT: add s1, a0, a6
-; RV64X60-NEXT: add s0, t5, t6
-; RV64X60-NEXT: add s2, s2, s1
+; RV64X60-NEXT: add s0, a0, t6
+; RV64X60-NEXT: add s2, s2, t4
; RV64X60-NEXT: add s4, a4, t6
; RV64X60-NEXT: add t6, t6, a2
; RV64X60-NEXT: .LBB0_10: # %for.body4.us
@@ -466,16 +468,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64-NEXT: or s0, a1, a5
; RV64-NEXT: srli s0, s0, 63
; RV64-NEXT: or t5, t5, s0
+; RV64-NEXT: sltu s0, a6, t4
; RV64-NEXT: or t5, t6, t5
-; RV64-NEXT: sltu t4, a6, t4
-; RV64-NEXT: or t4, t4, t5
-; RV64-NEXT: andi t4, t4, 1
-; RV64-NEXT: mv t5, a0
+; RV64-NEXT: add t4, a0, a6
+; RV64-NEXT: or t5, s0, t5
+; RV64-NEXT: andi t5, t5, 1
; RV64-NEXT: csrwi vxrm, 0
; RV64-NEXT: j .LBB0_6
; RV64-NEXT: .LBB0_5: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
-; RV64-NEXT: add t5, t5, a1
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: add a4, a4, a5
; RV64-NEXT: addiw t3, t3, 1
@@ -485,7 +487,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64-NEXT: # =>This Loop Header: Depth=1
; RV64-NEXT: # Child Loop BB0_9 Depth 2
; RV64-NEXT: # Child Loop BB0_12 Depth 2
-; RV64-NEXT: beqz t4, .LBB0_8
+; RV64-NEXT: beqz t5, .LBB0_8
; RV64-NEXT: # %bb.7: # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT: li t6, 0
; RV64-NEXT: j .LBB0_11
@@ -496,7 +498,7 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64-NEXT: and t6, t6, a6
; RV64-NEXT: mv s0, a2
; RV64-NEXT: mv s1, a4
-; RV64-NEXT: mv s2, t5
+; RV64-NEXT: mv s2, a0
; RV64-NEXT: mv s3, t6
; RV64-NEXT: vsetvli s4, zero, e8, m2, ta, ma
; RV64-NEXT: .LBB0_9: # %vector.body
@@ -516,25 +518,24 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64-NEXT: beq t6, a6, .LBB0_5
; RV64-NEXT: .LBB0_11: # %for.body4.us.preheader
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
-; RV64-NEXT: mul s1, a1, t2
-; RV64-NEXT: add s2, a0, a6
-; RV64-NEXT: add s0, t5, t6
-; RV64-NEXT: add s1, s2, s1
-; RV64-NEXT: add s2, a4, t6
+; RV64-NEXT: mul s2, a1, t2
+; RV64-NEXT: add s0, a0, t6
+; RV64-NEXT: add s1, a4, t6
+; RV64-NEXT: add s2, t4, s2
; RV64-NEXT: add t6, a2, t6
; RV64-NEXT: .LBB0_12: # %for.body4.us
; RV64-NEXT: # Parent Loop BB0_6 Depth=1
; RV64-NEXT: # => This Inner Loop Header: Depth=2
; RV64-NEXT: lbu s3, 0(t6)
-; RV64-NEXT: lbu s4, 0(s2)
+; RV64-NEXT: lbu s4, 0(s1)
; RV64-NEXT: add s3, s3, s4
; RV64-NEXT: addi s3, s3, 1
; RV64-NEXT: srli s3, s3, 1
; RV64-NEXT: sb s3, 0(s0)
; RV64-NEXT: addi s0, s0, 1
-; RV64-NEXT: addi s2, s2, 1
+; RV64-NEXT: addi s1, s1, 1
; RV64-NEXT: addi t6, t6, 1
-; RV64-NEXT: bne s0, s1, .LBB0_12
+; RV64-NEXT: bne s0, s2, .LBB0_12
; RV64-NEXT: j .LBB0_5
; RV64-NEXT: .LBB0_13:
; RV64-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
index 59a702ab6b17f..1bfeeb92e06dd 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
@@ -2075,14 +2075,14 @@ define i64 @abs_i64(i64 %x) {
; RV32SFB-LABEL: abs_i64:
; RV32SFB: # %bb.0:
; RV32SFB-NEXT: snez a2, a0
-; RV32SFB-NEXT: add a2, a2, a1
+; RV32SFB-NEXT: neg a3, a1
; RV32SFB-NEXT: bgez a1, .LBB35_2
; RV32SFB-NEXT: # %bb.1:
; RV32SFB-NEXT: neg a0, a0
; RV32SFB-NEXT: .LBB35_2:
; RV32SFB-NEXT: bgez a1, .LBB35_4
; RV32SFB-NEXT: # %bb.3:
-; RV32SFB-NEXT: neg a1, a2
+; RV32SFB-NEXT: sub a1, a3, a2
; RV32SFB-NEXT: .LBB35_4:
; RV32SFB-NEXT: ret
%a = call i64 @llvm.abs.i64(i64 %x, i1 false)
lgtm. As mentioned in the issue, my measurements several years ago showed that the original pass placement gave better performance. But I have no bandwidth to re-measure the current state, so I'm happy with this change if it fixes a known perf issue and does not introduce new ones (at least we do not see any right now).
LGTM
LGTM
I did a quick evaluation on dynamic instruction count. The results show a slight decrease in total (considered noise); 505.mcf has the biggest gain (an almost 2.8% decrease).