diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 0a8838cbd45c7..c742b92416362 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -62,7 +62,7 @@ struct DemandedVL {
 };
 
 class RISCVVLOptimizer : public MachineFunctionPass {
-  const MachineRegisterInfo *MRI;
+  MachineRegisterInfo *MRI;
   const MachineDominatorTree *MDT;
   const TargetInstrInfo *TII;
 
@@ -1392,6 +1392,42 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
   return true;
 }
 
+/// Given a vslidedown.vx like:
+///
+///   %slideamt = ADDI %x, -1
+///   %v = PseudoVSLIDEDOWN_VX %passthru, %src, %slideamt, avl=1
+///
+/// %v will only read the first %slideamt + 1 lanes of %src, which = %x.
+/// This is a common case when lowering extractelement.
+///
+/// Note that if %x is 0, %slideamt will be all ones. In this case %src will be
+/// completely slid down and none of its lanes will be read (since %slideamt is
+/// greater than the largest VLMAX of 65536) so we can demand any minimum VL.
+static std::optional<MachineOperand>
+getMinimumVLForVSLIDEDOWN_VX(const MachineOperand &UserOp,
+                             const MachineRegisterInfo *MRI) {
+  const MachineInstr &MI = *UserOp.getParent();
+  if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VSLIDEDOWN_VX)
+    return std::nullopt;
+  // We're looking at what lanes are used from the src operand.
+  if (UserOp.getOperandNo() != 2)
+    return std::nullopt;
+  // For now, the AVL must be 1.
+  const MachineOperand &AVL = MI.getOperand(4);
+  if (!AVL.isImm() || AVL.getImm() != 1)
+    return std::nullopt;
+  // The slide amount must be %x - 1.
+  const MachineOperand &SlideAmt = MI.getOperand(3);
+  if (!SlideAmt.getReg().isVirtual())
+    return std::nullopt;
+  MachineInstr *SlideAmtDef = MRI->getUniqueVRegDef(SlideAmt.getReg());
+  if (SlideAmtDef->getOpcode() != RISCV::ADDI ||
+      SlideAmtDef->getOperand(2).getImm() != -AVL.getImm() ||
+      !SlideAmtDef->getOperand(1).getReg().isVirtual())
+    return std::nullopt;
+  return SlideAmtDef->getOperand(1);
+}
+
 DemandedVL
 RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
   const MachineInstr &UserMI = *UserOp.getParent();
@@ -1406,6 +1442,9 @@ RISCVVLOptimizer::getMinimumVLForUser(const MachineOperand &UserOp) const {
     return DemandedVL::vlmax();
   }
 
+  if (auto VL = getMinimumVLForVSLIDEDOWN_VX(UserOp, MRI))
+    return *VL;
+
   if (RISCVII::readsPastVL(
           TII->get(RISCV::getRVVMCOpcode(UserMI.getOpcode())).TSFlags)) {
     LLVM_DEBUG(dbgs() << "  Abort because used by unsafe instruction\n");
@@ -1624,6 +1663,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &MI) const {
 
   // All our checks passed. We can reduce VL.
   VLOp.ChangeToRegister(CommonVL->getReg(), false);
+  MRI->constrainRegClass(CommonVL->getReg(), &RISCV::GPRNoX0RegClass);
   return true;
 }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll
new file mode 100644
index 0000000000000..cf15fad5533b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-live-out.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @loop_live_out(ptr %p, i64 %n) {
+; CHECK-LABEL: loop_live_out:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vsetvli a3, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a2)
+; CHECK-NEXT:    sub a1, a1, a3
+; CHECK-NEXT:    vadd.vi v8, v8, 1
+; CHECK-NEXT:    vse32.v v8, (a2)
+; CHECK-NEXT:    slli a2, a3, 2
+; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    bnez a1, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a3
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %avl = phi i64 [%n, %entry], [%avl.next, %loop]
+  %gep = phi ptr [%p, %entry], [%gep.next, %loop]
+  %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+  %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %vl)
+  %y = add <vscale x 4 x i32> %x, splat (i32 1)
+  call void @llvm.vp.store(<vscale x 4 x i32> %y, ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %vl)
+  %vl.zext = zext i32 %vl to i64
+  %avl.next = sub i64 %avl, %vl.zext
+  %gep.next = getelementptr i32, ptr %p, i32 %vl
+  %ec = icmp eq i64 %avl.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  %lastidx = sub i64 %vl.zext, 1
+  %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx
+  ret i32 %lastelt
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
index 4d6d0e122b1cf..55d1c84d5f8d3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
@@ -778,3 +778,38 @@ body: |
     ; CHECK: DBG_VALUE %0:vr
     DBG_VALUE %0:vr
 ...
+---
+name: vslidedown_vx
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x8
+    ; CHECK-LABEL: name: vslidedown_vx
+    ; CHECK: liveins: $x8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %x:gprnox0 = COPY $x8
+    ; CHECK-NEXT: %y:gprnox0 = ADDI %x, -1
+    ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %x, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+    %x:gpr = COPY $x8
+    %y:gprnox0 = ADDI %x, -1
+    %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */
+    %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+...
+---
+# Make sure we ignore LIs (ADDI $x0, -1)
+name: vslidedown_vx_li
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x8
+    ; CHECK-LABEL: name: vslidedown_vx_li
+    ; CHECK: liveins: $x8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %y:gprnox0 = ADDI $x0, -1
+    ; CHECK-NEXT: %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+    %y:gprnox0 = ADDI $x0, -1
+    %v:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */
+    %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %v, %y, 1, 5 /* e32 */, 0 /* tu, mu */
+...
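
For context (not part of the patch): a minimal, hypothetical IR snippet of the extractelement lowering that produces the ADDI + vslidedown.vx (AVL = 1) pattern the new getMinimumVLForVSLIDEDOWN_VX helper matches. The function name and the <vscale x 2 x i32> type are illustrative assumptions, not taken from the tests above.

define i32 @extract_prev_lane(<vscale x 2 x i32> %v, i64 %n) {
  ; Extracting lane %n - 1 with a variable index lowers to an ADDI of -1,
  ; a vslidedown.vx by that amount with AVL = 1, and a vmv.x.s, so only the
  ; first %n lanes of %v are actually read.
  %idx = sub i64 %n, 1
  %elt = extractelement <vscale x 2 x i32> %v, i64 %idx
  ret i32 %elt
}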