Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8816,14 +8816,24 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}

// Update all users outside the vector region. Also replace redundant
// ExtractLastElement.
// extracts.
for (auto *U : to_vector(OrigExitingVPV->users())) {
auto *Parent = cast<VPRecipeBase>(U)->getParent();
if (FinalReductionResult == U || Parent->getParent())
continue;
U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
if (match(U, m_ExtractLastElement(m_VPValue())))
cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);

// Check for redundant ExtractLastPart followed by ExtractLastLane.
if (!match(U, m_ExtractLastPart(m_VPValue())))
continue;

auto *ExtractPart = cast<VPInstruction>(U);
if (ExtractPart->getNumUsers() != 1)
continue;

VPUser *User = *ExtractPart->user_begin();
if (match(User, m_ExtractLastLane(m_VPValue())))
cast<VPInstruction>(User)->replaceAllUsesWith(FinalReductionResult);
}

// Adjust AnyOf reductions; replace the reduction phi for the selected value
Expand Down
18 changes: 8 additions & 10 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1017,12 +1017,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ComputeAnyOfResult,
ComputeFindIVResult,
ComputeReductionResult,
// Extracts the last lane from its operand if it is a vector, or the last
// part if scalar. In the latter case, the recipe will be removed during
// unrolling.
ExtractLastElement,
// Extracts the last lane for each part from its operand.
ExtractLastLanePerPart,
// Extracts the last part of its operand. Removed during unrolling.
ExtractLastPart,
// Extracts the last lane of its vector operand, per part.
ExtractLastLane,
// Extracts the second-to-last lane from its operand or the second-to-last
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
Expand Down Expand Up @@ -1402,10 +1400,10 @@ class VPIRInstruction : public VPRecipeBase {
return true;
}

/// Update the recipes first operand to the last lane of the operand using \p
/// Builder. Must only be used for VPIRInstructions with at least one operand
/// wrapping a PHINode.
void extractLastLaneOfFirstOperand(VPBuilder &Builder);
/// Update the recipe's first operand to the final lane of the operand using
/// \p Builder. Must only be used for VPIRInstructions with at least one
/// operand wrapping a PHINode.
void extractFinalLaneOfFirstOperand(VPBuilder &Builder);
};

/// An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,14 +116,18 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(1));
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
return BaseTy;
}
case VPInstruction::ExtractLastPart: {
// The element type of ExtractLastPart is the same as the element type of
// its operand.
return inferScalarType(R->getOperand(0));
}
case VPInstruction::LogicalAnd:
assert(inferScalarType(R->getOperand(0))->isIntegerTy(1) &&
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ static void createExtractsForLiveOuts(VPlan &Plan, VPBasicBlock *MiddleVPBB) {
ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
"exit values from early exits must be fixed when branch to "
"early-exit is added");
ExitIRI->extractLastLaneOfFirstOperand(B);
ExitIRI->extractFinalLaneOfFirstOperand(B);
}
}
}
Expand Down
20 changes: 14 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -383,9 +383,9 @@ m_EVL(const Op0_t &Op0) {
}

template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastElement, Op0_t>
m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
inline VPInstruction_match<VPInstruction::ExtractLastLane, Op0_t>
m_ExtractLastLane(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastLane>(Op0);
}

template <typename Op0_t, typename Op1_t>
Expand All @@ -395,9 +395,17 @@ m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) {
}

template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
m_ExtractLastLanePerPart(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
inline VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>
m_ExtractLastPart(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastPart>(Op0);
}

template <typename Op0_t>
inline VPInstruction_match<
VPInstruction::ExtractLastLane,
VPInstruction_match<VPInstruction::ExtractLastPart, Op0_t>>
m_ExtractFinalLane(const Op0_t &Op0) {
return m_ExtractLastLane(m_ExtractLastPart(Op0));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would ExtractLastPart(ExtractLastLane()) also work? Perhaps worth a comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure how it would interact with interleaving. Extracting the last part works naturally; extracting the lane first would be more difficult, I think.

}

template <typename Op0_t, typename Op1_t, typename Op2_t>
Expand Down
30 changes: 15 additions & 15 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -524,8 +524,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
Expand Down Expand Up @@ -894,8 +894,7 @@ Value *VPInstruction::generate(VPTransformState &State) {

return ReducedPartRdx;
}
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset =
getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
Expand All @@ -906,6 +905,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
// Extract lane VF - Offset from the operand.
Res = State.get(getOperand(0), VPLane::getLaneFromEnd(State.VF, Offset));
} else {
// TODO: Remove ExtractLastLane for scalar VFs.
assert(Offset <= 1 && "invalid offset to extract from");
Res = State.get(getOperand(0));
}
Expand Down Expand Up @@ -1163,7 +1163,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::ExtractLastElement: {
case VPInstruction::ExtractLastLane: {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that the current cost of ExtractLastElement seems inaccurate when VF=1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, can update as a follow-up, thanks. I think we should remove the ExtractLastLane for plans with just the scalar VF.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. Worth leaving behind a TODO?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added to ::execute, where it handles scalar VFs.

// Add on the cost of extracting the element.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Independent nit): "Add on"?

auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
Expand All @@ -1183,8 +1183,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}

bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractLastElement ||
getOpcode() == VPInstruction::ExtractLastLanePerPart ||
return getOpcode() == VPInstruction::ExtractLastLane ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
Expand Down Expand Up @@ -1247,8 +1246,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractLastLane:
case VPInstruction::ExtractLastPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
Expand Down Expand Up @@ -1395,11 +1394,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLane:
O << "extract-lane";
break;
case VPInstruction::ExtractLastElement:
O << "extract-last-element";
case VPInstruction::ExtractLastLane:
O << "extract-last-lane";
break;
case VPInstruction::ExtractLastLanePerPart:
O << "extract-last-lane-per-part";
case VPInstruction::ExtractLastPart:
O << "extract-last-part";
break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
Expand Down Expand Up @@ -1554,15 +1553,16 @@ InstructionCost VPIRInstruction::computeCost(ElementCount VF,
return 0;
}

void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
void VPIRInstruction::extractFinalLaneOfFirstOperand(VPBuilder &Builder) {
assert(isa<PHINode>(getInstruction()) &&
"can only update exiting operands to phi nodes");
assert(getNumOperands() > 0 && "must have at least one operand");
VPValue *Exiting = getOperand(0);
if (Exiting->isLiveIn())
return;

Exiting = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Exiting});
Exiting = Builder.createNaryOp(VPInstruction::ExtractLastPart, Exiting);
Exiting = Builder.createNaryOp(VPInstruction::ExtractLastLane, Exiting);
Comment on lines +1564 to +1565
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggests renaming the method to extractFinalLaneOfFirstOperand()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated, thanks

setOperand(0, Exiting);
}

Expand Down
68 changes: 33 additions & 35 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -858,7 +858,7 @@ static VPValue *optimizeLatchExitInductionUser(
VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
DenseMap<VPValue *, VPValue *> &EndValues, ScalarEvolution &SE) {
VPValue *Incoming;
if (!match(Op, m_ExtractLastElement(m_VPValue(Incoming))))
if (!match(Op, m_ExtractFinalLane(m_VPValue(Incoming))))
return nullptr;

auto *WideIV = getOptimizableIVOf(Incoming, SE);
Expand Down Expand Up @@ -1257,9 +1257,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}

// Look through ExtractLastElement (BuildVector ....).
if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
m_ExtractLastLanePerPart(m_BuildVector())))) {
// Look through ExtractLastLane (BuildVector ....).
if (match(&R, m_ExtractLastLane(m_BuildVector()))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Def->replaceAllUsesWith(
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
Expand Down Expand Up @@ -1332,15 +1331,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}

if (match(Def,
m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
if (match(Def, m_ExtractLastLane(m_Broadcast(m_VPValue(A))))) {
Def->replaceAllUsesWith(A);
return;
}

if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
m_ExtractLastLanePerPart(m_VPValue(A)))) &&
if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
(isa<VPReplicateRecipe>(A) &&
cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
Expand All @@ -1349,11 +1345,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return Def->replaceAllUsesWith(A);
}

if (Plan->getUF() == 1 &&
match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
return Def->replaceAllUsesWith(
Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
}
if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
return Def->replaceAllUsesWith(A);
Comment on lines +1348 to +1349
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: this is probably where ExtractLastLanes should be bypassed if the VF is scalar.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the TODO to ::execute

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, it indeed affects ::execute, just seems natural to add the following here - analogous to bypassing redundant ExtractLastPart:

  if (Plan->hasScalarVFOnly() && match(Def, m_ExtractLastLane(m_VPValue(A))))
    return Def->replaceAllUsesWith(A);

or rather prevent its creation?

}

void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
Expand Down Expand Up @@ -1391,13 +1384,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
Clone->insertBefore(RepOrWidenR);
unsigned ExtractOpc =
vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
? VPInstruction::ExtractLastElement
: VPInstruction::ExtractLastLanePerPart;
auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
Ext->insertBefore(Clone);
Clone->setOperand(0, Ext);
VPBuilder Builder(Clone);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: would have been good to use Builder also for generating Clone.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, this requires adding support for creating VPReplicateRecipes in VPBuilder. Probably best done separately

VPValue *ExtractOp = Clone->getOperand(0);
if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
ExtractOp =
Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
ExtractOp =
Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
Clone->setOperand(0, ExtractOp);
RepR->eraseFromParent();
continue;
}
Expand All @@ -1409,8 +1403,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
return U->usesScalars(RepOrWidenR) ||
match(cast<VPRecipeBase>(U),
m_CombineOr(m_ExtractLastElement(m_VPValue()),
m_ExtractLastLanePerPart(m_VPValue())));
m_CombineOr(m_ExtractLastPart(m_VPValue()),
m_ExtractLastLane(m_VPValue())));
}))
continue;

Expand Down Expand Up @@ -3481,8 +3475,8 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
if (ExitIRI->getNumOperands() != 1) {
// The first of two operands corresponds to the latch exit, via MiddleVPBB
// predecessor. Extract its last lane.
ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
// predecessor. Extract its final lane.
ExitIRI->extractFinalLaneOfFirstOperand(MiddleBuilder);
}

VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
Expand Down Expand Up @@ -4451,10 +4445,13 @@ void VPlanTransforms::addScalarResumePhis(
auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
"Cannot handle loops with uncountable early exits");
if (IsFOR)
ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
"vector.recur.extract");
if (IsFOR) {
auto *ExtractPart = MiddleBuilder.createNaryOp(
VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
ResumeFromVectorLoop =
MiddleBuilder.createNaryOp(VPInstruction::ExtractLastLane,
ExtractPart, {}, "vector.recur.extract");
}
StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
{ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
Expand Down Expand Up @@ -4552,10 +4549,11 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
// the VPIRInstruction modeling the phi.
for (VPUser *U : FOR->users()) {
using namespace llvm::VPlanPatternMatch;
if (!match(U, m_ExtractLastElement(m_Specific(FOR))))
for (VPRecipeBase &R : make_early_inc_range(
make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
Comment on lines +4552 to +4553
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change related and needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Now that we have ExtractLastLane(ExtractLastPart(FOR)), iterating and matching the two-recipe pattern seems simpler than finding ExtractLastPart users of FOR that are themselves only used by ExtractLastLane.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... as done above in adjustRecipesForReductions(). Matching this recipe-pair pattern going backwards from use to def is indeed simpler than going forward from def to use, hence the suggestion for (the abstract) ExtractFinalLane to represent both.

if (!match(&R, m_ExtractFinalLane(m_Specific(FOR))))
continue;

// For VF vscale x 1, if vscale = 1, we are unable to extract the
// penultimate value of the recurrence. Instead we rely on the existing
// extract of the last element from the result of
Expand All @@ -4565,9 +4563,9 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
Range))
return;
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()},
{}, "vector.recur.extract.for.phi");
cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement);
VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
"vector.recur.extract.for.phi");
cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
}
}
}
Loading