-
Notifications
You must be signed in to change notification settings - Fork 15k
[VPlan] Replace ExtractLast(Element|LanePerPart) with ExtractLast(Lane|Part) #164124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-risc-v Author: Florian Hahn (fhahn) Changes: Replace ExtractLastElement and ExtractLastLanePerPart with the finer-grained ExtractLastLane and ExtractLastPart opcodes, which model the two distinct steps of extracting across parts and across lanes. ExtractLastElement == ExtractLastLane(ExtractLastPart), and ExtractLastLanePerPart == ExtractLastLane — the latter clarifying the opcode's name. A new m_ExtractLastElement matcher is provided for convenience. The patch should be NFC modulo printing changes. Patch is 39.33 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/164124.diff 14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b0427ae488..a36ff4aad93aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1008,12 +1008,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ComputeAnyOfResult,
ComputeFindIVResult,
ComputeReductionResult,
- // Extracts the last lane from its operand if it is a vector, or the last
- // part if scalar. In the latter case, the recipe will be removed during
- // unrolling.
- ExtractLastElement,
- // Extracts the last lane for each part from its operand.
- ExtractLastLanePerPart,
+ // Extracts the last part of its operand.
+ ExtractLastPart,
+ // Extracts the last lane of the current part of its operand.
+ ExtractLastLane,
// Extracts the second-to-last lane from its operand or the second-to-last
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63c6d14c..276f7e03ea0be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,14 +115,17 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(1));
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
return BaseTy;
}
+ case VPInstruction::ExtractLastPart: {
+ // ExtractLastPart returns the same type as its operand
+ return inferScalarType(R->getOperand(0));
+ }
case VPInstruction::LogicalAnd:
assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index d8203e251a5d1..a0b7fde957756 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -383,15 +383,23 @@ m_EVL(const Op0_t &Op0) {
}
template <typename Op0_t>
-inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t>
-m_ExtractLastElement(const Op0_t &Op0) {
- return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
+inline VPInstruction_match<VPInstruction::ExtractLastLane, Op0_t>
+m_ExtractLastLane(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExtractLastLane>(Op0);
}
template <typename Op0_t>
-inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
-m_ExtractLastLanePerPart(const Op0_t &Op0) {
- return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
+inline VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>
+m_ExtractLastPart(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExtractLastPart>(Op0);
+}
+
+template <typename Op0_t>
+inline VPInstruction_match<
+ VPInstruction::ExtractLastLane,
+ VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>>
+m_ExtractLastElement(const Op0_t &Op0) {
+ return m_ExtractLastLane(m_ExtractLastPart(Op0));
}
template <typename Op0_t, typename Op1_t, typename Op2_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c7595fe6a..331c1109a55ef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -510,8 +510,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
@@ -879,8 +879,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
return ReducedPartRdx;
}
- case VPInstruction::ExtractLastLanePerPart:
- case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset =
getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
@@ -1148,7 +1147,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
- case VPInstruction::ExtractLastElement: {
+ case VPInstruction::ExtractLastLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
@@ -1168,8 +1167,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
bool VPInstruction::isVectorToScalar() const {
- return getOpcode() == VPInstruction::ExtractLastElement ||
- getOpcode() == VPInstruction::ExtractLastLanePerPart ||
+ return getOpcode() == VPInstruction::ExtractLastLane ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
@@ -1232,8 +1230,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
@@ -1378,11 +1376,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLane:
O << "extract-lane";
break;
- case VPInstruction::ExtractLastElement:
- O << "extract-last-element";
+ case VPInstruction::ExtractLastLane:
+ O << "extract-last-lane";
break;
- case VPInstruction::ExtractLastLanePerPart:
- O << "extract-last-lane-per-part";
+ case VPInstruction::ExtractLastPart:
+ O << "extract-last-part";
break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
@@ -1542,7 +1540,8 @@ void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
if (Exiting->isLiveIn())
return;
- Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
+ Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting);
+ Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting);
setOperand(0, Exiting);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee82c9c3..27be581a6849c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1206,9 +1206,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- // Look through ExtractLastElement (BuildVector ....).
- if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
- m_ExtractLastLanePerPart(m_BuildVector())))) {
+ // Look through ExtractLastLane (BuildVector ....).
+ if (match(&R, m_ExtractLastLane(m_BuildVector()))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Def->replaceAllUsesWith(
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
@@ -1274,15 +1273,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- if (match(Def,
- m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
- m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
+ if (match(Def, m_ExtractLastLane(m_Broadcast(m_VPValue(A))))) {
Def->replaceAllUsesWith(A);
return;
}
- if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
- m_ExtractLastLanePerPart(m_VPValue(A)))) &&
+ if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
(isa<VPReplicateRecipe>(A) &&
cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
@@ -1291,11 +1287,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return Def->replaceAllUsesWith(A);
}
- if (Plan->getUF() == 1 &&
- match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
- return Def->replaceAllUsesWith(
- Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
- }
+ if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
+ return Def->replaceAllUsesWith(A);
}
void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
@@ -1333,13 +1326,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
Clone->insertBefore(RepOrWidenR);
- unsigned ExtractOpc =
- vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
- ? VPInstruction::ExtractLastElement
- : VPInstruction::ExtractLastLanePerPart;
- auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
- Ext->insertBefore(Clone);
- Clone->setOperand(0, Ext);
+ VPBuilder Builder(Clone);
+ VPValue *ExtractOp = Clone->getOperand(0);
+ if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
+ ExtractOp =
+ Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
+ ExtractOp =
+ Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
+ Clone->setOperand(0, ExtractOp);
RepR->eraseFromParent();
continue;
}
@@ -1350,9 +1344,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (!vputils::isSingleScalar(RepOrWidenR) ||
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
return U->usesScalars(RepOrWidenR) ||
- match(cast<VPRecipeBase>(U),
- m_CombineOr(m_ExtractLastElement(m_VPValue()),
- m_ExtractLastLanePerPart(m_VPValue())));
+ match(cast<VPRecipeBase>(U), m_ExtractLastPart(m_VPValue()));
}))
continue;
@@ -4316,10 +4308,13 @@ void VPlanTransforms::addScalarResumePhis(
auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
"Cannot handle loops with uncountable early exits");
- if (IsFOR)
- ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
- "vector.recur.extract");
+ if (IsFOR) {
+ auto *ExtractPart = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
+ ResumeFromVectorLoop =
+ MiddleBuilder.createNaryOp(VPInstruction::ExtractLastLane,
+ ExtractPart, {}, "vector.recur.extract");
+ }
StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
{ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
@@ -4417,10 +4412,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
// the VPIRInstruction modeling the phi.
- for (VPUser *U : FOR->users()) {
- using namespace llvm::VPlanPatternMatch;
- if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
+ for (VPRecipeBase &R : make_early_inc_range(
+ make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
+ if (!match(&R, m_ExtractLastElement(m_Specific(FOR))))
continue;
+
// For VF vscale x 1, if vscale = 1, we are unable to extract the
// penultimate value of the recurrence. Instead we rely on the existing
// extract of the last element from the result of
@@ -4430,9 +4426,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
Range))
return;
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
+ VPInstruction::ExtractPenultimateElement,
+ MiddleBuilder.createNaryOp(VPInstruction::ExtractLastPart,
+ FOR->getBackedgeValue()),
{}, "vector.recur.extract.for.phi");
- cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
+ cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
}
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5aeda3e11b138..96dc1d8d2525a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -372,22 +372,27 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
R.addOperand(getValueForPart(Op1, Part));
continue;
}
- if (match(&R, m_ExtractLastElement(m_VPValue(Op0))) ||
- match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
- m_VPValue(Op0)))) {
- addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
+
+ // Handle extraction from the last part. For scalar VF, directly replace
+ // with the appropriate scalar part. Otherwise, update operand to use the
+ // part.
+ if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+ m_ExtractLastPart(m_VPValue(Op0)))) ||
+ match(&R, m_ExtractLastElement(m_VPValue(Op0)))) {
+ auto *I = cast<VPInstruction>(&R);
+ bool IsPenultimate =
+ I->getOpcode() == VPInstruction::ExtractPenultimateElement;
+ unsigned PartIdx = IsPenultimate ? UF - 2 : UF - 1;
+
if (Plan.hasScalarVFOnly()) {
- auto *I = cast<VPInstruction>(&R);
- // Extracting from end with VF = 1 implies retrieving the last or
- // penultimate scalar part (UF-1 or UF-2).
- unsigned Offset =
- I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
- I->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
- R.eraseFromParent();
- } else {
- // Otherwise we extract from the last part.
- remapOperands(&R, UF - 1);
+ // For scalar VF, directly use the scalar part value.
+ addUniformForAllParts(I);
+ I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
+ continue;
}
+ // For vector VF, extract from the last part.
+ addUniformForAllParts(I);
+ R.setOperand(0, getValueForPart(Op0, UF - 1));
continue;
}
@@ -480,8 +485,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
continue;
}
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+ auto *ExtractPart =
+ Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op});
NewOps.push_back(
- Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
+ Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart}));
continue;
}
if (vputils::isSingleScalar(Op)) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d4e5dea3d4aab..77e6556535863 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -42,7 +42,9 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART2:%.+]]> = extract-last-part vp<[[RED_RESULT_PART]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = extract-last-lane vp<[[RED_RESULT_PART2]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -53,7 +55,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VEC_TC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RED_RESULT_PART]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index dff4971ffdfa1..7932adbe158b8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -58,7 +58,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: Successor(s): middle.block
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: middle.block:
-; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: ir-bb<for.end>:
@@ -97,7 +99,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: Successor(s): middle.block
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: middle.block:
-; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
@@ -131,7 +135,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: Successor(s): middle.block
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: middle.block:
-; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -142,7 +148,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-...
[truncated]
|
|
@llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) Changes: Replace ExtractLastElement and ExtractLastLanePerPart with the finer-grained ExtractLastLane and ExtractLastPart opcodes, which model the two distinct steps of extracting across parts and across lanes. ExtractLastElement == ExtractLastLane(ExtractLastPart), and ExtractLastLanePerPart == ExtractLastLane — the latter clarifying the opcode's name. A new m_ExtractLastElement matcher is provided for convenience. The patch should be NFC modulo printing changes. Patch is 39.33 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/164124.diff 14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b0427ae488..a36ff4aad93aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1008,12 +1008,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ComputeAnyOfResult,
ComputeFindIVResult,
ComputeReductionResult,
- // Extracts the last lane from its operand if it is a vector, or the last
- // part if scalar. In the latter case, the recipe will be removed during
- // unrolling.
- ExtractLastElement,
- // Extracts the last lane for each part from its operand.
- ExtractLastLanePerPart,
+ // Extracts the last part of its operand.
+ ExtractLastPart,
+ // Extracts the last lane of the current part of its operand.
+ ExtractLastLane,
// Extracts the second-to-last lane from its operand or the second-to-last
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63c6d14c..276f7e03ea0be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,14 +115,17 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(1));
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
return BaseTy;
}
+ case VPInstruction::ExtractLastPart: {
+ // ExtractLastPart returns the same type as its operand
+ return inferScalarType(R->getOperand(0));
+ }
case VPInstruction::LogicalAnd:
assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index d8203e251a5d1..a0b7fde957756 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -383,15 +383,23 @@ m_EVL(const Op0_t &Op0) {
}
template <typename Op0_t>
-inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t>
-m_ExtractLastElement(const Op0_t &Op0) {
- return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
+inline VPInstruction_match<VPInstruction::ExtractLastLane, Op0_t>
+m_ExtractLastLane(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExtractLastLane>(Op0);
}
template <typename Op0_t>
-inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
-m_ExtractLastLanePerPart(const Op0_t &Op0) {
- return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
+inline VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>
+m_ExtractLastPart(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExtractLastPart>(Op0);
+}
+
+template <typename Op0_t>
+inline VPInstruction_match<
+ VPInstruction::ExtractLastLane,
+ VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>>
+m_ExtractLastElement(const Op0_t &Op0) {
+ return m_ExtractLastLane(m_ExtractLastPart(Op0));
}
template <typename Op0_t, typename Op1_t, typename Op2_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c7595fe6a..331c1109a55ef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -510,8 +510,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
@@ -879,8 +879,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
return ReducedPartRdx;
}
- case VPInstruction::ExtractLastLanePerPart:
- case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset =
getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
@@ -1148,7 +1147,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
- case VPInstruction::ExtractLastElement: {
+ case VPInstruction::ExtractLastLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
@@ -1168,8 +1167,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
bool VPInstruction::isVectorToScalar() const {
- return getOpcode() == VPInstruction::ExtractLastElement ||
- getOpcode() == VPInstruction::ExtractLastLanePerPart ||
+ return getOpcode() == VPInstruction::ExtractLastLane ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
@@ -1232,8 +1230,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
@@ -1378,11 +1376,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLane:
O << "extract-lane";
break;
- case VPInstruction::ExtractLastElement:
- O << "extract-last-element";
+ case VPInstruction::ExtractLastLane:
+ O << "extract-last-lane";
break;
- case VPInstruction::ExtractLastLanePerPart:
- O << "extract-last-lane-per-part";
+ case VPInstruction::ExtractLastPart:
+ O << "extract-last-part";
break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
@@ -1542,7 +1540,8 @@ void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
if (Exiting->isLiveIn())
return;
- Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
+ Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting);
+ Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting);
setOperand(0, Exiting);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee82c9c3..27be581a6849c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1206,9 +1206,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- // Look through ExtractLastElement (BuildVector ....).
- if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
- m_ExtractLastLanePerPart(m_BuildVector())))) {
+ // Look through ExtractLastLane (BuildVector ....).
+ if (match(&R, m_ExtractLastLane(m_BuildVector()))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Def->replaceAllUsesWith(
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
@@ -1274,15 +1273,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- if (match(Def,
- m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
- m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
+ if (match(Def, m_ExtractLastLane(m_Broadcast(m_VPValue(A))))) {
Def->replaceAllUsesWith(A);
return;
}
- if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
- m_ExtractLastLanePerPart(m_VPValue(A)))) &&
+ if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
(isa<VPReplicateRecipe>(A) &&
cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
@@ -1291,11 +1287,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return Def->replaceAllUsesWith(A);
}
- if (Plan->getUF() == 1 &&
- match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
- return Def->replaceAllUsesWith(
- Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
- }
+ if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
+ return Def->replaceAllUsesWith(A);
}
void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
@@ -1333,13 +1326,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
Clone->insertBefore(RepOrWidenR);
- unsigned ExtractOpc =
- vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
- ? VPInstruction::ExtractLastElement
- : VPInstruction::ExtractLastLanePerPart;
- auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
- Ext->insertBefore(Clone);
- Clone->setOperand(0, Ext);
+ VPBuilder Builder(Clone);
+ VPValue *ExtractOp = Clone->getOperand(0);
+ if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
+ ExtractOp =
+ Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
+ ExtractOp =
+ Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
+ Clone->setOperand(0, ExtractOp);
RepR->eraseFromParent();
continue;
}
@@ -1350,9 +1344,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (!vputils::isSingleScalar(RepOrWidenR) ||
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
return U->usesScalars(RepOrWidenR) ||
- match(cast<VPRecipeBase>(U),
- m_CombineOr(m_ExtractLastElement(m_VPValue()),
- m_ExtractLastLanePerPart(m_VPValue())));
+ match(cast<VPRecipeBase>(U), m_ExtractLastPart(m_VPValue()));
}))
continue;
@@ -4316,10 +4308,13 @@ void VPlanTransforms::addScalarResumePhis(
auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
"Cannot handle loops with uncountable early exits");
- if (IsFOR)
- ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
- "vector.recur.extract");
+ if (IsFOR) {
+ auto *ExtractPart = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
+ ResumeFromVectorLoop =
+ MiddleBuilder.createNaryOp(VPInstruction::ExtractLastLane,
+ ExtractPart, {}, "vector.recur.extract");
+ }
StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
{ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
@@ -4417,10 +4412,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
// the VPIRInstruction modeling the phi.
- for (VPUser *U : FOR->users()) {
- using namespace llvm::VPlanPatternMatch;
- if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
+ for (VPRecipeBase &R : make_early_inc_range(
+ make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
+ if (!match(&R, m_ExtractLastElement(m_Specific(FOR))))
continue;
+
// For VF vscale x 1, if vscale = 1, we are unable to extract the
// penultimate value of the recurrence. Instead we rely on the existing
// extract of the last element from the result of
@@ -4430,9 +4426,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
Range))
return;
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
+ VPInstruction::ExtractPenultimateElement,
+ MiddleBuilder.createNaryOp(VPInstruction::ExtractLastPart,
+ FOR->getBackedgeValue()),
{}, "vector.recur.extract.for.phi");
- cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
+ cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
}
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5aeda3e11b138..96dc1d8d2525a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -372,22 +372,27 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
R.addOperand(getValueForPart(Op1, Part));
continue;
}
- if (match(&R, m_ExtractLastElement(m_VPValue(Op0))) ||
- match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
- m_VPValue(Op0)))) {
- addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
+
+ // Handle extraction from the last part. For scalar VF, directly replace
+ // with the appropriate scalar part. Otherwise, update operand to use the
+ // part.
+ if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+ m_ExtractLastPart(m_VPValue(Op0)))) ||
+ match(&R, m_ExtractLastElement(m_VPValue(Op0)))) {
+ auto *I = cast<VPInstruction>(&R);
+ bool IsPenultimate =
+ I->getOpcode() == VPInstruction::ExtractPenultimateElement;
+ unsigned PartIdx = IsPenultimate ? UF - 2 : UF - 1;
+
if (Plan.hasScalarVFOnly()) {
- auto *I = cast<VPInstruction>(&R);
- // Extracting from end with VF = 1 implies retrieving the last or
- // penultimate scalar part (UF-1 or UF-2).
- unsigned Offset =
- I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
- I->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
- R.eraseFromParent();
- } else {
- // Otherwise we extract from the last part.
- remapOperands(&R, UF - 1);
+ // For scalar VF, directly use the scalar part value.
+ addUniformForAllParts(I);
+ I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
+ continue;
}
+ // For vector VF, extract from the last part.
+ addUniformForAllParts(I);
+ R.setOperand(0, getValueForPart(Op0, UF - 1));
continue;
}
@@ -480,8 +485,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
continue;
}
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+ auto *ExtractPart =
+ Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op});
NewOps.push_back(
- Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
+ Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart}));
continue;
}
if (vputils::isSingleScalar(Op)) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d4e5dea3d4aab..77e6556535863 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -42,7 +42,9 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART2:%.+]]> = extract-last-part vp<[[RED_RESULT_PART]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = extract-last-lane vp<[[RED_RESULT_PART2]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -53,7 +55,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VEC_TC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RED_RESULT_PART]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index dff4971ffdfa1..7932adbe158b8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -58,7 +58,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: Successor(s): middle.block
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: middle.block:
-; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: ir-bb<for.end>:
@@ -97,7 +99,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: Successor(s): middle.block
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: middle.block:
-; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
@@ -131,7 +135,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: Successor(s): middle.block
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: middle.block:
-; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -142,7 +148,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-...
[truncated]
|
|
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) Changes… Replace ExtractLastElement and ExtractLastLanePerPart with more generic and specific ExtractLastLane and ExtractLastPart, which model the distinct steps of extracting across parts and lanes. ExtractLastElement == ExtractLastLane(ExtractLastPart) and ExtractLastLanePerPart == ExtractLastLane, the latter clarifying the name of the opcode. A new m_ExtractLastElement matcher is provided for convenience. The patch should be NFC modulo printing changes. Patch is 39.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164124.diff 14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0e0b0427ae488..a36ff4aad93aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1008,12 +1008,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ComputeAnyOfResult,
ComputeFindIVResult,
ComputeReductionResult,
- // Extracts the last lane from its operand if it is a vector, or the last
- // part if scalar. In the latter case, the recipe will be removed during
- // unrolling.
- ExtractLastElement,
- // Extracts the last lane for each part from its operand.
- ExtractLastLanePerPart,
+ // Extracts the last part of its operand.
+ ExtractLastPart,
+ // Extracts the last lane of the current part of its operand.
+ ExtractLastLane,
// Extracts the second-to-last lane from its operand or the second-to-last
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f413c63c6d14c..276f7e03ea0be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -115,14 +115,17 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(1));
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
return BaseTy;
}
+ case VPInstruction::ExtractLastPart: {
+ // ExtractLastPart returns the same type as its operand
+ return inferScalarType(R->getOperand(0));
+ }
case VPInstruction::LogicalAnd:
assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index d8203e251a5d1..a0b7fde957756 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -383,15 +383,23 @@ m_EVL(const Op0_t &Op0) {
}
template <typename Op0_t>
-inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t>
-m_ExtractLastElement(const Op0_t &Op0) {
- return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
+inline VPInstruction_match<VPInstruction::ExtractLastLane, Op0_t>
+m_ExtractLastLane(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExtractLastLane>(Op0);
}
template <typename Op0_t>
-inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
-m_ExtractLastLanePerPart(const Op0_t &Op0) {
- return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
+inline VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>
+m_ExtractLastPart(const Op0_t &Op0) {
+ return m_VPInstruction<VPInstruction::ExtractLastPart>(Op0);
+}
+
+template <typename Op0_t>
+inline VPInstruction_match<
+ VPInstruction::ExtractLastLane,
+ VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>>
+m_ExtractLastElement(const Op0_t &Op0) {
+ return m_ExtractLastLane(m_ExtractLastPart(Op0));
}
template <typename Op0_t, typename Op1_t, typename Op2_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a98c7595fe6a..331c1109a55ef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -510,8 +510,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
@@ -879,8 +879,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
return ReducedPartRdx;
}
- case VPInstruction::ExtractLastLanePerPart:
- case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset =
getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
@@ -1148,7 +1147,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
- case VPInstruction::ExtractLastElement: {
+ case VPInstruction::ExtractLastLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
@@ -1168,8 +1167,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
bool VPInstruction::isVectorToScalar() const {
- return getOpcode() == VPInstruction::ExtractLastElement ||
- getOpcode() == VPInstruction::ExtractLastLanePerPart ||
+ return getOpcode() == VPInstruction::ExtractLastLane ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
@@ -1232,8 +1230,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
- case VPInstruction::ExtractLastElement:
- case VPInstruction::ExtractLastLanePerPart:
+ case VPInstruction::ExtractLastLane:
+ case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
@@ -1378,11 +1376,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLane:
O << "extract-lane";
break;
- case VPInstruction::ExtractLastElement:
- O << "extract-last-element";
+ case VPInstruction::ExtractLastLane:
+ O << "extract-last-lane";
break;
- case VPInstruction::ExtractLastLanePerPart:
- O << "extract-last-lane-per-part";
+ case VPInstruction::ExtractLastPart:
+ O << "extract-last-part";
break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
@@ -1542,7 +1540,8 @@ void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
if (Exiting->isLiveIn())
return;
- Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
+ Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting);
+ Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting);
setOperand(0, Exiting);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cae9aee82c9c3..27be581a6849c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1206,9 +1206,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- // Look through ExtractLastElement (BuildVector ....).
- if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
- m_ExtractLastLanePerPart(m_BuildVector())))) {
+ // Look through ExtractLastLane (BuildVector ....).
+ if (match(&R, m_ExtractLastLane(m_BuildVector()))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Def->replaceAllUsesWith(
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
@@ -1274,15 +1273,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- if (match(Def,
- m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
- m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
+ if (match(Def, m_ExtractLastLane(m_Broadcast(m_VPValue(A))))) {
Def->replaceAllUsesWith(A);
return;
}
- if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
- m_ExtractLastLanePerPart(m_VPValue(A)))) &&
+ if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
(isa<VPReplicateRecipe>(A) &&
cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
@@ -1291,11 +1287,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return Def->replaceAllUsesWith(A);
}
- if (Plan->getUF() == 1 &&
- match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
- return Def->replaceAllUsesWith(
- Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
- }
+ if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
+ return Def->replaceAllUsesWith(A);
}
void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
@@ -1333,13 +1326,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
Clone->insertBefore(RepOrWidenR);
- unsigned ExtractOpc =
- vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
- ? VPInstruction::ExtractLastElement
- : VPInstruction::ExtractLastLanePerPart;
- auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
- Ext->insertBefore(Clone);
- Clone->setOperand(0, Ext);
+ VPBuilder Builder(Clone);
+ VPValue *ExtractOp = Clone->getOperand(0);
+ if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
+ ExtractOp =
+ Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
+ ExtractOp =
+ Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
+ Clone->setOperand(0, ExtractOp);
RepR->eraseFromParent();
continue;
}
@@ -1350,9 +1344,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (!vputils::isSingleScalar(RepOrWidenR) ||
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
return U->usesScalars(RepOrWidenR) ||
- match(cast<VPRecipeBase>(U),
- m_CombineOr(m_ExtractLastElement(m_VPValue()),
- m_ExtractLastLanePerPart(m_VPValue())));
+ match(cast<VPRecipeBase>(U), m_ExtractLastPart(m_VPValue()));
}))
continue;
@@ -4316,10 +4308,13 @@ void VPlanTransforms::addScalarResumePhis(
auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
"Cannot handle loops with uncountable early exits");
- if (IsFOR)
- ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
- "vector.recur.extract");
+ if (IsFOR) {
+ auto *ExtractPart = MiddleBuilder.createNaryOp(
+ VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
+ ResumeFromVectorLoop =
+ MiddleBuilder.createNaryOp(VPInstruction::ExtractLastLane,
+ ExtractPart, {}, "vector.recur.extract");
+ }
StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
{ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
@@ -4417,10 +4412,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
// the VPIRInstruction modeling the phi.
- for (VPUser *U : FOR->users()) {
- using namespace llvm::VPlanPatternMatch;
- if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
+ for (VPRecipeBase &R : make_early_inc_range(
+ make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
+ if (!match(&R, m_ExtractLastElement(m_Specific(FOR))))
continue;
+
// For VF vscale x 1, if vscale = 1, we are unable to extract the
// penultimate value of the recurrence. Instead we rely on the existing
// extract of the last element from the result of
@@ -4430,9 +4426,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
Range))
return;
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
- VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
+ VPInstruction::ExtractPenultimateElement,
+ MiddleBuilder.createNaryOp(VPInstruction::ExtractLastPart,
+ FOR->getBackedgeValue()),
{}, "vector.recur.extract.for.phi");
- cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
+ cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
}
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5aeda3e11b138..96dc1d8d2525a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -372,22 +372,27 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
R.addOperand(getValueForPart(Op1, Part));
continue;
}
- if (match(&R, m_ExtractLastElement(m_VPValue(Op0))) ||
- match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
- m_VPValue(Op0)))) {
- addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
+
+ // Handle extraction from the last part. For scalar VF, directly replace
+ // with the appropriate scalar part. Otherwise, update operand to use the
+ // part.
+ if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+ m_ExtractLastPart(m_VPValue(Op0)))) ||
+ match(&R, m_ExtractLastElement(m_VPValue(Op0)))) {
+ auto *I = cast<VPInstruction>(&R);
+ bool IsPenultimate =
+ I->getOpcode() == VPInstruction::ExtractPenultimateElement;
+ unsigned PartIdx = IsPenultimate ? UF - 2 : UF - 1;
+
if (Plan.hasScalarVFOnly()) {
- auto *I = cast<VPInstruction>(&R);
- // Extracting from end with VF = 1 implies retrieving the last or
- // penultimate scalar part (UF-1 or UF-2).
- unsigned Offset =
- I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
- I->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
- R.eraseFromParent();
- } else {
- // Otherwise we extract from the last part.
- remapOperands(&R, UF - 1);
+ // For scalar VF, directly use the scalar part value.
+ addUniformForAllParts(I);
+ I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
+ continue;
}
+ // For vector VF, extract from the last part.
+ addUniformForAllParts(I);
+ R.setOperand(0, getValueForPart(Op0, UF - 1));
continue;
}
@@ -480,8 +485,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
continue;
}
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+ auto *ExtractPart =
+ Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op});
NewOps.push_back(
- Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
+ Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart}));
continue;
}
if (vputils::isSingleScalar(Op)) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index d4e5dea3d4aab..77e6556535863 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -42,7 +42,9 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART2:%.+]]> = extract-last-part vp<[[RED_RESULT_PART]]>
+; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = extract-last-lane vp<[[RED_RESULT_PART2]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -53,7 +55,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VEC_TC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RED_RESULT_PART]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index dff4971ffdfa1..7932adbe158b8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -58,7 +58,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: Successor(s): middle.block
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: middle.block:
-; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-OUTLOOP-EMPTY:
; IF-EVL-OUTLOOP-NEXT: ir-bb<for.end>:
@@ -97,7 +99,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: Successor(s): middle.block
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: middle.block:
-; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
@@ -131,7 +135,9 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: Successor(s): middle.block
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: middle.block:
-; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_PART:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_PART2:%.+]]> = extract-last-part vp<[[RDX_PART]]>
+; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = extract-last-lane vp<[[RDX_PART2]]>
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]>
; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]>
; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
@@ -142,7 +148,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-...
[truncated]
|
…art) Replace ExtractLastElement and ExtractLastLanePerPart with more generic and specific ExtractLastLane and ExtractLastPart, which model distinct parts of extracting across parts and lanes. ExtractLastElement == ExtractLastLane(ExtractLastPart) and ExtractLastLanePerPart == ExtractLastLane, the latter clarifying the name of the opcode. A new m_ExtractLastElement matcher is provided for convenience. The patch should be NFC modulo printing changes.
544925c to
9b92c8d
Compare
|
ping |
| ExtractLastLanePerPart, | ||
| // Extracts the last part of its operand. | ||
| ExtractLastPart, | ||
| // Extracts the last lane of the current part of its operand. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // Extracts the last lane of the current part of its operand. | |
| // Extracts the last lane of its vector operand, per part. |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks
| ExtractLastElement, | ||
| // Extracts the last lane for each part from its operand. | ||
| ExtractLastLanePerPart, | ||
| // Extracts the last part of its operand. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // Extracts the last part of its operand. | |
| // Extracts the last part of its operand. Removed during unrolling. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks
| return BaseTy; | ||
| } | ||
| case VPInstruction::ExtractLastPart: { | ||
| // ExtractLastPart returns the same type as its operand |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // ExtractLastPart returns the same type as its operand | |
| // Element type of ExtractLastPart is equal to the element type of its operand. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
| } | ||
| case VPInstruction::ExtractLastElement: { | ||
| case VPInstruction::ExtractLastLane: { | ||
| // Add on the cost of extracting the element. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Independent nit): "Add on"?
| inline VPInstruction_match< | ||
| VPInstruction::ExtractLastLane, | ||
| VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>> | ||
| m_ExtractLastElement(const Op0_t &Op0) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| m_ExtractLastElement(const Op0_t &Op0) { | |
| m_ExtractFinalLane(const Op0_t &Op0) { |
?
The term Element is used to represent a lane when VF>1 and otherwise a part, as in ExtractPenultimateElement. Perhaps FinalLane would be clearer to represent the last lane of the last part.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, updated the name
| // with the appropriate scalar part. Otherwise, update operand to use the | ||
| // part. | ||
| if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( | ||
| m_ExtractLastPart(m_VPValue(Op0)))) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Penultimate element may be extracted from penultimate rather than last part (when VF=1)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same suggestion/confusion as above - ExtractPenultimateElement is responsible for extracting both part and lane?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, updated for now to do both, thanks
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: middle.block: | ||
| ; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]> | ||
| ; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extraction of final lane taken out of compute-reduction-result, to be represented (more) explicitly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yep, but this could be handled by a new fold as compute-reduction-result produces a single scalar across all UFs and VFs
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be good to avoid/eliminate redundant extracts (TODO?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated the code to properly handle this now, thanks
| ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 | ||
| ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i16 1, i16 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replace vector select in loop and extract last lane in middle, by an extract first lane and scalar select in loop?
(Both selects are essentially doing zext i1 -> i16)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, the select condition is uniform (compare of broadcasted single-scalar load), need to check separately why the select/compare are not narrowed yet
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(TODO?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The fix is available here: #165506
| ; CHECK-NEXT: EMIT vp<[[RDX_RES_PART:%.+]]> = compute-find-iv-result ir<%rdx>, ir<%start>, ir<-9223372036854775808>, ir<%cond> | ||
| ; CHECK-NEXT: EMIT vp<[[RDX_RES_PART2:%.+]]> = extract-last-part vp<[[RDX_RES_PART]]> | ||
| ; CHECK-NEXT: EMIT vp<[[RDX_RES:%.+]]> = extract-last-lane vp<[[RDX_RES_PART2]]> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same expansion as in compute-reduction-result.
| ; CHECK-NEXT: EMIT vp<[[RESUME_1_PART:%.+]]> = extract-last-part ir<%for.1.next> | ||
| ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane vp<[[RESUME_1_PART]]> | ||
| ; CHECK-NEXT: EMIT vp<[[RESUME_2_PART:%.+]]> = extract-last-part vp<[[FOR1_SPLICE]]> | ||
| ; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-last-lane vp<[[RESUME_2_PART]]> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
An abstract ExtractFinalLane could be introduced as shorthand for {extract-last-part + extract-last-lane}, if desired, perhaps temporarily and/or for gradual change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've left the changes as is for now
| U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult); | ||
| if (match(U, m_ExtractLastElement(m_VPValue()))) | ||
| if (match(U, m_ExtractFinalLane(m_VPValue()))) | ||
| cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be good to clarify somewhere our distinction between "Last" and "Final".
("Final" also appears here in FinalReductionResult)
| return true; | ||
| } | ||
|
|
||
| /// Update the recipes first operand to the last lane of the operand using \p |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// Update the recipes first operand to the last lane of the operand using \p | |
| /// Update the recipe's first operand to the final lane of the operand using \p |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks
| VPInstruction::ExtractLastLane, | ||
| VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>> | ||
| m_ExtractFinalLane(const Op0_t &Op0) { | ||
| return m_ExtractLastLane(m_ExtractLastPart(Op0)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would ExtractLastPart(ExtractLastLane()) also work? Perhaps worth a comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure how it would interact with interleaving, but extracting the last part works naturally, extracting the lane first would be more difficult I think
| return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); | ||
| } | ||
| case VPInstruction::ExtractLastElement: { | ||
| case VPInstruction::ExtractLastLane: { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed. Worth leaving behind a TODO?
| if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A)))) | ||
| return Def->replaceAllUsesWith(A); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TODO: this is probably where ExtractLastLane's should be bypassed if VF is scalar.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added the TODO to ::execute
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, it indeed affects ::execute, just seems natural to add the following here - analogous to bypassing redundant ExtractLastPart:
if (Plan->hasScalarVFOnly() && match(Def, m_ExtractLastLane(m_VPValue(A))))
return Def->replaceAllUsesWith(A);
or rather prevent its creation?
| // with the appropriate scalar part. Otherwise, update operand to use the | ||
| // part. | ||
| if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( | ||
| m_ExtractLastPart(m_VPValue(Op0)))) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same suggestion/confusion as above - ExtractPenultimateElement is responsible for extracting both part and lane?
| } else { | ||
| // Otherwise we extract from the last part. | ||
| remapOperands(&R, UF - 1); | ||
| bool IsPenultimate = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| bool IsPenultimate = | |
| bool IsPenultimatePart = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated thanks
| I->replaceAllUsesWith(getValueForPart(Op0, PartIdx)); | ||
| continue; | ||
| } | ||
| // For vector VF, extract from the last part. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // For vector VF, extract from the last part. | |
| // For vector VF, always extract from the last part. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated thanks
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: middle.block: | ||
| ; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]> | ||
| ; CHECK-NEXT: EMIT vp<[[RED_RESULT_PART:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be good to avoid/eliminate redundant extracts (TODO?).
| ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 | ||
| ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], i16 1, i16 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(TODO?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
| if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A)))) | ||
| return Def->replaceAllUsesWith(A); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, it indeed affects ::execute, just seems natural to add the following here - analogous to bypassing redundant ExtractLastPart:
if (Plan->hasScalarVFOnly() && match(Def, m_ExtractLastLane(m_VPValue(A))))
return Def->replaceAllUsesWith(A);
or rather prevent its creation?
| for (VPRecipeBase &R : make_early_inc_range( | ||
| make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
... as done above in adjustRecipesForReductions(). Matching this recipe-pair pattern going backwards from use to def is indeed simpler than going forward from def to use, hence the suggestion for (the abstract) ExtractFinalLane to represent both.
Replace ExtractLastElement and ExtractLastLanePerPart with more generic and specific ExtractLastLane and ExtractLastPart, which model distinct parts of extracting across parts and lanes. ExtractLastElement == ExtractLastLane(ExtractLastPart) and ExtractLastLanePerPart == ExtractLastLane, the latter clarifying the name of the opcode. A new m_ExtractLastElement matcher is provided for convenience.
The patch should be NFC modulo printing changes.