-
Notifications
You must be signed in to change notification settings - Fork 15k
[VPlan] Replace ExtractLast(Elem|LanePerPart) with ExtractLast(Lane/Part) #164124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9b92c8d
ddd2618
47861d7
69eba0f
f41528c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -524,8 +524,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { | |
| case VPInstruction::CalculateTripCountMinusVF: | ||
| case VPInstruction::CanonicalIVIncrementForPart: | ||
| case VPInstruction::ExplicitVectorLength: | ||
| case VPInstruction::ExtractLastElement: | ||
| case VPInstruction::ExtractLastLanePerPart: | ||
| case VPInstruction::ExtractLastLane: | ||
| case VPInstruction::ExtractLastPart: | ||
| case VPInstruction::ExtractPenultimateElement: | ||
| case VPInstruction::FirstActiveLane: | ||
| case VPInstruction::Not: | ||
|
|
@@ -894,8 +894,7 @@ Value *VPInstruction::generate(VPTransformState &State) { | |
|
|
||
| return ReducedPartRdx; | ||
| } | ||
| case VPInstruction::ExtractLastLanePerPart: | ||
| case VPInstruction::ExtractLastElement: | ||
| case VPInstruction::ExtractLastLane: | ||
| case VPInstruction::ExtractPenultimateElement: { | ||
| unsigned Offset = | ||
| getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1; | ||
|
|
@@ -906,6 +905,7 @@ Value *VPInstruction::generate(VPTransformState &State) { | |
| // Extract lane VF - Offset from the operand. | ||
| Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset)); | ||
| } else { | ||
| // TODO: Remove ExtractLastLane for scalar VFs. | ||
| assert(Offset <= 1 && "invalid offset to extract from"); | ||
| Res = State.get(getOperand(0)); | ||
| } | ||
|
|
@@ -1163,7 +1163,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, | |
| I32Ty, {Arg0Ty, I32Ty, I1Ty}); | ||
| return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); | ||
| } | ||
| case VPInstruction::ExtractLastElement: { | ||
| case VPInstruction::ExtractLastLane: { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that the current cost of ExtractLastElement seems inaccurate, when VF=1. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, can update as follow-up, thanks. I think we should remove the ExtractLastLane for plans with just the scalar VF There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. Worth leaving behind a TODO? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added to :;execute, where it handles scalar VFs |
||
| // Add on the cost of extracting the element. | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Independent nit): "Add on"? |
||
| auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); | ||
| return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement, | ||
|
|
@@ -1183,8 +1183,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, | |
| } | ||
|
|
||
| bool VPInstruction::isVectorToScalar() const { | ||
| return getOpcode() == VPInstruction::ExtractLastElement || | ||
| getOpcode() == VPInstruction::ExtractLastLanePerPart || | ||
| return getOpcode() == VPInstruction::ExtractLastLane || | ||
| getOpcode() == VPInstruction::ExtractPenultimateElement || | ||
| getOpcode() == Instruction::ExtractElement || | ||
| getOpcode() == VPInstruction::ExtractLane || | ||
|
|
@@ -1247,8 +1246,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { | |
| case VPInstruction::CalculateTripCountMinusVF: | ||
| case VPInstruction::CanonicalIVIncrementForPart: | ||
| case VPInstruction::ExtractLane: | ||
| case VPInstruction::ExtractLastElement: | ||
| case VPInstruction::ExtractLastLanePerPart: | ||
| case VPInstruction::ExtractLastLane: | ||
| case VPInstruction::ExtractLastPart: | ||
| case VPInstruction::ExtractPenultimateElement: | ||
| case VPInstruction::ActiveLaneMask: | ||
| case VPInstruction::FirstActiveLane: | ||
|
|
@@ -1395,11 +1394,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, | |
| case VPInstruction::ExtractLane: | ||
| O << "extract-lane"; | ||
| break; | ||
| case VPInstruction::ExtractLastElement: | ||
| O << "extract-last-element"; | ||
| case VPInstruction::ExtractLastLane: | ||
| O << "extract-last-lane"; | ||
| break; | ||
| case VPInstruction::ExtractLastLanePerPart: | ||
| O << "extract-last-lane-per-part"; | ||
| case VPInstruction::ExtractLastPart: | ||
| O << "extract-last-part"; | ||
| break; | ||
| case VPInstruction::ExtractPenultimateElement: | ||
| O << "extract-penultimate-element"; | ||
|
|
@@ -1554,15 +1553,16 @@ InstructionCost VPIRInstruction::computeCost(ElementCount VF, | |
| return 0; | ||
| } | ||
|
|
||
| void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { | ||
| void VPIRInstruction::extractFinalLaneOfFirstOperand(VPBuilder &Builder) { | ||
| assert(isa<PHINode>(getInstruction()) && | ||
| "can only update exiting operands to phi nodes"); | ||
| assert(getNumOperands() > 0 && "must have at least one operand"); | ||
| VPValue *Exiting = getOperand(0); | ||
| if (Exiting->isLiveIn()) | ||
| return; | ||
|
|
||
| Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting}); | ||
| Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting); | ||
| Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting); | ||
|
Comment on lines
+1564
to
+1565
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggests renaming the method to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated, thanks |
||
| setOperand(0, Exiting); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -858,7 +858,7 @@ static VPValue *optimizeLatchExitInductionUser( | |
| VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, | ||
| DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) { | ||
| VPValue *Incoming; | ||
| if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming)))) | ||
| if (!match(Op, m_ExtractFinalLane(m_VPValue(Incoming)))) | ||
| return nullptr; | ||
|
|
||
| auto *WideIV = getOptimizableIVOf(Incoming, SE); | ||
|
|
@@ -1257,9 +1257,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { | |
| return; | ||
| } | ||
|
|
||
| // Look through ExtractLastElement (BuildVector ....). | ||
| if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()), | ||
| m_ExtractLastLanePerPart(m_BuildVector())))) { | ||
| // Look through ExtractLastLane (BuildVector ....). | ||
| if (match(&R, m_ExtractLastLane(m_BuildVector()))) { | ||
| auto *BuildVector = cast<VPInstruction>(R.getOperand(0)); | ||
| Def->replaceAllUsesWith( | ||
| BuildVector->getOperand(BuildVector->getNumOperands() - 1)); | ||
|
|
@@ -1332,15 +1331,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { | |
| return; | ||
| } | ||
|
|
||
| if (match(Def, | ||
| m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))), | ||
| m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) { | ||
| if (match(Def, m_ExtractLastLane(m_Broadcast(m_VPValue(A))))) { | ||
| Def->replaceAllUsesWith(A); | ||
| return; | ||
| } | ||
|
|
||
| if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)), | ||
| m_ExtractLastLanePerPart(m_VPValue(A)))) && | ||
| if (match(Def, m_ExtractLastLane(m_VPValue(A))) && | ||
| ((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) || | ||
| (isa<VPReplicateRecipe>(A) && | ||
| cast<VPReplicateRecipe>(A)->isSingleScalar())) && | ||
|
|
@@ -1349,11 +1345,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { | |
| return Def->replaceAllUsesWith(A); | ||
| } | ||
|
|
||
| if (Plan->getUF() == 1 && | ||
| match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) { | ||
| return Def->replaceAllUsesWith( | ||
| Builder.createNaryOp(VPInstruction::ExtractLastElement, {A})); | ||
| } | ||
| if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A)))) | ||
| return Def->replaceAllUsesWith(A); | ||
|
Comment on lines
+1348
to
+1349
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO: this is probably where ExtractLastLane's should be bypassed if VF is scalar. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added the TODO to ::execute There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, it indeed affects ::execute, just seems natural to add the following here - analogous to bypassing redundant ExtractLastPart: or rather prevent its creation? |
||
| } | ||
|
|
||
| void VPlanTransforms::simplifyRecipes(VPlan &Plan) { | ||
|
|
@@ -1391,13 +1384,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { | |
| RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(), | ||
| true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/); | ||
| Clone->insertBefore(RepOrWidenR); | ||
| unsigned ExtractOpc = | ||
| vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)) | ||
| ? VPInstruction::ExtractLastElement | ||
| : VPInstruction::ExtractLastLanePerPart; | ||
| auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)}); | ||
| Ext->insertBefore(Clone); | ||
| Clone->setOperand(0, Ext); | ||
| VPBuilder Builder(Clone); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: would have been good to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, this requires adding support for creating VPReplicateRecipes in VPBuilder. Probably best done separately |
||
| VPValue *ExtractOp = Clone->getOperand(0); | ||
| if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))) | ||
| ExtractOp = | ||
| Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp); | ||
| ExtractOp = | ||
| Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp); | ||
| Clone->setOperand(0, ExtractOp); | ||
| RepR->eraseFromParent(); | ||
| continue; | ||
| } | ||
|
|
@@ -1409,8 +1403,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { | |
| !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) { | ||
| return U->usesScalars(RepOrWidenR) || | ||
| match(cast<VPRecipeBase>(U), | ||
| m_CombineOr(m_ExtractLastElement(m_VPValue()), | ||
| m_ExtractLastLanePerPart(m_VPValue()))); | ||
| m_CombineOr(m_ExtractLastPart(m_VPValue()), | ||
| m_ExtractLastLane(m_VPValue()))); | ||
| })) | ||
| continue; | ||
|
|
||
|
|
@@ -3481,8 +3475,8 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, | |
| unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1; | ||
| if (ExitIRI->getNumOperands() != 1) { | ||
| // The first of two operands corresponds to the latch exit, via MiddleVPBB | ||
| // predecessor. Extract its last lane. | ||
| ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder); | ||
| // predecessor. Extract its final lane. | ||
| ExitIRI->extractFinalLaneOfFirstOperand(MiddleBuilder); | ||
| } | ||
|
|
||
| VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx); | ||
|
|
@@ -4451,10 +4445,13 @@ void VPlanTransforms::addScalarResumePhis( | |
| auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); | ||
| assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && | ||
| "Cannot handle loops with uncountable early exits"); | ||
| if (IsFOR) | ||
| ResumeFromVectorLoop = MiddleBuilder.createNaryOp( | ||
| VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {}, | ||
| "vector.recur.extract"); | ||
| if (IsFOR) { | ||
| auto *ExtractPart = MiddleBuilder.createNaryOp( | ||
| VPInstruction::ExtractLastPart, ResumeFromVectorLoop); | ||
| ResumeFromVectorLoop = | ||
| MiddleBuilder.createNaryOp(VPInstruction::ExtractLastLane, | ||
| ExtractPart, {}, "vector.recur.extract"); | ||
| } | ||
| StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; | ||
| auto *ResumePhiR = ScalarPHBuilder.createScalarPhi( | ||
| {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); | ||
|
|
@@ -4552,10 +4549,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, | |
| // Now update VPIRInstructions modeling LCSSA phis in the exit block. | ||
| // Extract the penultimate value of the recurrence and use it as operand for | ||
| // the VPIRInstruction modeling the phi. | ||
| for (VPUser *U : FOR->users()) { | ||
| using namespace llvm::VPlanPatternMatch; | ||
| if (!match(U, m_ExtractLastElement(m_Specific(FOR)))) | ||
| for (VPRecipeBase &R : make_early_inc_range( | ||
| make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) { | ||
|
Comment on lines
+4552
to
+4553
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this change related and needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, now we have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ... as done above in adjustRecipesForReductions(). Matching this recipe-pair pattern going backwards from use to def is indeed simpler that going forward from def to use, hence the suggestion for (the abstract) ExtractFinalLane to represent both. |
||
| if (!match(&R, m_ExtractFinalLane(m_Specific(FOR)))) | ||
| continue; | ||
|
|
||
| // For VF vscale x 1, if vscale = 1, we are unable to extract the | ||
| // penultimate value of the recurrence. Instead we rely on the existing | ||
| // extract of the last element from the result of | ||
|
|
@@ -4565,9 +4563,9 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, | |
| Range)) | ||
| return; | ||
| VPValue *PenultimateElement = MiddleBuilder.createNaryOp( | ||
| VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()}, | ||
| {}, "vector.recur.extract.for.phi"); | ||
| cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement); | ||
| VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {}, | ||
| "vector.recur.extract.for.phi"); | ||
| cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would ExtractLastPart(ExtractLastLane()) also work? Perhaps worth a comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure how it would interact with interleaving, but extracting the last part works naturally, extracting the lane first would be more difficult I think