-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LoopVectorize][AArch64][SVE] Generate wide active lane masks #81140
base: main
Are you sure you want to change the base?
Conversation
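@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: Momchil Velikov (momchil-velikov) Changes: This patch makes the `LoopVectorize` pass able to generate lane masks longer than the VF to allow the target to better utilise the instruction set. The motivating example is a vectorised loop with unroll factor 2 that can use the SVE2.1 `whilelo` instruction with predicate pair result, or an SVE `whilelo` instruction with smaller element size plus `punpklo`/`punpkhi`. The width of the lane mask that the vectoriser emits is controlled by a `TargetTransformInfo` hook, `getMaxPredicateLength`. Patch is 497.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81140.diff 23 Files Affected: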
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 58577a6b6eb5c..67e1b45cce29c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1228,6 +1228,8 @@ class TargetTransformInfo {
/// and the number of execution units in the CPU.
unsigned getMaxInterleaveFactor(ElementCount VF) const;
+ ElementCount getMaxPredicateLength(ElementCount VF) const;
+
/// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
static OperandValueInfo getOperandInfo(const Value *V);
@@ -1981,6 +1983,9 @@ class TargetTransformInfo::Concept {
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
+
+ virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;
+
virtual InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
@@ -2601,6 +2606,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxInterleaveFactor(ElementCount VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
+
+ ElementCount getMaxPredicateLength(ElementCount VF) const override {
+ return Impl.getMaxPredicateLength(VF);
+ }
+
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
unsigned &JTSize,
ProfileSummaryInfo *PSI,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3d5db96e86b80..b6d01e0764ab1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -528,6 +528,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index bb17298daba03..2b0d0f3ed6f70 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -881,6 +881,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1f11f0d7dd620..daea8e48981ec 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -808,6 +808,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
+ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
+ return TTIImpl->getMaxPredicateLength(VF);
+}
+
TargetTransformInfo::OperandValueInfo
TargetTransformInfo::getOperandInfo(const Value *V) {
OperandValueKind OpInfo = OK_AnyValue;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8573939b04389..4405e8d3f91df 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1813,8 +1813,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
EVT OpVT) const {
- // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
- if (!Subtarget->hasSVE())
+ // Only SVE/SME has a 1:1 mapping from intrinsic -> instruction (whilelo).
+ if (!Subtarget->hasSVEorSME())
return true;
// We can only support legal predicate result types. We can use the SVE
@@ -20004,47 +20004,98 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
return SDValue();
}
-static SDValue performIntrinsicCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
+static SDValue tryCombineGetActiveLaneMask(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- unsigned IID = getIntrinsicID(N);
- switch (IID) {
- default:
- break;
- case Intrinsic::get_active_lane_mask: {
- SDValue Res = SDValue();
- EVT VT = N->getValueType(0);
- if (VT.isFixedLengthVector()) {
- // We can use the SVE whilelo instruction to lower this intrinsic by
- // creating the appropriate sequence of scalable vector operations and
- // then extracting a fixed-width subvector from the scalable vector.
+ EVT VT = N->getValueType(0);
+ if (VT.isFixedLengthVector()) {
+ // We can use the SVE whilelo instruction to lower this intrinsic by
+ // creating the appropriate sequence of scalable vector operations and
+ // then extracting a fixed-width subvector from the scalable vector.
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
- SDLoc DL(N);
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+ EVT WhileVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ ElementCount::getScalable(VT.getVectorNumElements()));
- EVT WhileVT = EVT::getVectorVT(
- *DAG.getContext(), MVT::i1,
- ElementCount::getScalable(VT.getVectorNumElements()));
+ // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
+ EVT PromVT = getPromotedVTForPredicate(WhileVT);
- // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
- EVT PromVT = getPromotedVTForPredicate(WhileVT);
+ // Get the fixed-width equivalent of PromVT for extraction.
+ EVT ExtVT =
+ EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
+ VT.getVectorElementCount());
- // Get the fixed-width equivalent of PromVT for extraction.
- EVT ExtVT =
- EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
- VT.getVectorElementCount());
+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
+ N->getOperand(1), N->getOperand(2));
+ Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
+ DAG.getConstant(0, DL, MVT::i64));
+ Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
- N->getOperand(1), N->getOperand(2));
- Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
- DAG.getConstant(0, DL, MVT::i64));
- Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- }
return Res;
}
+
+ if (!Subtarget->hasSVE2p1() && !Subtarget->hasSME2())
+ return SDValue();
+
+ if (!N->hasNUsesOfValue(2, 0))
+ return SDValue();
+
+ auto It = N->use_begin();
+ SDNode *Lo = *It++;
+ SDNode *Hi = *It;
+
+ const uint64_t HalfSize = VT.getVectorMinNumElements() / 2;
+ uint64_t OffLo, OffHi;
+ if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Lo->getOperand(1).getNode(), OffLo) ||
+ (OffLo != 0 && OffLo != HalfSize) ||
+ Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Hi->getOperand(1).getNode(), OffHi) ||
+ (OffHi != 0 && OffHi != HalfSize))
+ return SDValue();
+
+ if (OffLo > OffHi) {
+ std::swap(Lo, Hi);
+ std::swap(OffLo, OffHi);
+ }
+
+ if (OffLo != 0 || OffHi != HalfSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
+ SDValue Idx = N->getOperand(1);
+ SDValue TC = N->getOperand(2);
+ if (Idx.getValueType() != MVT::i64) {
+ Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
+ TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
+ }
+ auto R =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
+ {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
+
+ DCI.CombineTo(Lo, R.getValue(0));
+ DCI.CombineTo(Hi, R.getValue(1));
+
+ return SDValue(N, 0);
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned IID = getIntrinsicID(N);
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::get_active_lane_mask:
+ return tryCombineGetActiveLaneMask(N, DCI, Subtarget);
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2c..73aca77305df1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3285,6 +3285,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
return ST->getMaxInterleaveFactor();
}
+ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
+ // Do not create masks bigger than `<vscale x 16 x i1>`.
+ unsigned N = ST->hasSVE() ? 16 : 0;
+ // Do not create masks that are more than twice the VF.
+ N = std::min(N, 2 * VF.getKnownMinValue());
+ return VF.isScalable() ? ElementCount::getScalable(N)
+ : ElementCount::getFixed(N);
+}
+
// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources. We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e..6501cc4a85e8d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
unsigned getMaxInterleaveFactor(ElementCount VF);
+ ElementCount getMaxPredicateLength(ElementCount VF) const;
+
bool prefersVectorizedAddressing() const;
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7ebf78e54ceb..0e681c8080bfd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,6 +184,14 @@ class VPBuilder {
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
DebugLoc DL = {}, const Twine &Name = "");
+ VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
+ const Twine &Name = "") {
+ auto *ALM = new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
+ if (BB)
+ BB->insert(ALM, InsertPt);
+ return ALM;
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1a7b301c35f2b..bac66e633a6f3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -599,6 +599,10 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const {
+ return TTI->getMaxPredicateLength(VF);
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -7550,7 +7554,8 @@ LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
// Perform the actual loop transformation.
- VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+ VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
+ DT, ILV.Builder, &ILV, &BestVPlan,
OrigLoop->getHeader()->getContext());
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 162a3c4b195e5..6f20bc148e72e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -234,15 +234,16 @@ struct VPIteration {
/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
- VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
- DominatorTree *DT, IRBuilderBase &Builder,
+ VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred,
+ LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx)
- : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
- LVer(nullptr), TypeAnalysis(Ctx) {}
+ : VF(VF), UF(UF), MaxPred(MaxPred), LI(LI), DT(DT), Builder(Builder),
+ ILV(ILV), Plan(Plan), LVer(nullptr), TypeAnalysis(Ctx) {}
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
unsigned UF;
+ ElementCount MaxPred;
/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
@@ -1275,6 +1276,43 @@ class VPInstruction : public VPRecipeWithIRFlags {
}
};
+class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags {
+ const std::string Name;
+
+public:
+ VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {},
+ const Twine &Name = "")
+ : VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC,
+ std::initializer_list<VPValue *>{IV, TC}, DL),
+ Name(Name.str()) {}
+
+ VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC)
+
+ VPRecipeBase *clone() override {
+ SmallVector<VPValue *, 2> Operands(operands());
+ assert(Operands.size() == 2 && "by construction");
+ auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1],
+ getDebugLoc(), Name);
+ New->transferFlags(*this);
+ return New;
+ }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+
+ return getOperand(0) == Op;
+ }
+};
+
/// VPWidenRecipe is a recipe for producing a copy of vector type its
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ee0cb2bd6153..c0c75072f4023 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -307,18 +307,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Value *Op2 = State.get(getOperand(2), Part);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
- case VPInstruction::ActiveLaneMask: {
- // Get first lane of vector induction variable.
- Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
- // Get the original loop tripcount.
- Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
- auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = VectorType::get(Int1Ty, State.VF);
- return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
- {PredTy, ScalarTC->getType()},
- {VIVElem0, ScalarTC}, nullptr, Name);
- }
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
@@ -526,7 +515,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case Instruction::ICmp:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
- case VPInstruction::ActiveLaneMask:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
@@ -561,9 +549,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::SLPStore:
O << "combined store";
break;
- case VPInstruction::ActiveLaneMask:
- O << "active lane mask";
- break;
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
@@ -594,8 +579,78 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
DL.print(O);
}
}
+
+void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = active lane mask";
+ printFlags(O);
+ printOperands(O, SlotTracker);
+
+ if (auto DL = getDebugLoc()) {
+ O << ", !dbg ";
+ DL.print(O);
+ }
+}
+
#endif
+void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPInstruction executing an Instance");
+
+ IRBuilderBase &Builder = State.Builder;
+ Builder.SetCurrentDebugLocation(getDebugLoc());
+
+ auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+ auto *PredTy = VectorType::get(Int1Ty, State.VF);
+
+ unsigned MaxPred = std::min(State.MaxPred.getKnownMinValue(),
+ State.UF * State.VF.getKnownMinValue());
+ if (State.UF <= 1 || MaxPred <= State.VF.getKnownMinValue() ||
+ MaxPred % State.VF.getKnownMinValue() != 0) {
+ for (int Part = State.UF - 1; Part >= 0; --Part) {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+ // Get the original loop tripcount.
+ Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
+ Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+ {PredTy, ScalarTC->getType()},
+ {VIVElem0, ScalarTC}, nullptr, Name);
+ State.set(this, V, Part);
+ }
+ return;
+ }
+
+ // Generate long active lane masks covering all the unrolled iterations.
+ unsigned PartsPerMask = MaxPred / State.VF.getKnownMinValue();
+ auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable());
+ SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr);
+ for (int Part = State.UF - PartsPerMask; Part >= 0; Part -= PartsPerMask) {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+ // Get the original loop tripcount.
+ Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
+ Value *V = Builder.C...
[truncated]
|
39f1b89
to
b65dbdf
Compare
Perhaps this is a nonsensical suggestion, but why not just always emit a VF*UF-sized active lane mask when it is used for control flow, and then leave the code generator to split it when necessary? Perhaps we'll need to add wrap flag support to `get.active.lane.mask` to make that job easier, but this should not be all that different to what LoopVectorize would have to do anyway. |
Because the code generator can't split it without introducing inefficient control flow for wrap-around checks.
What would be the advantage compared to the current approach? |
Sure, hence my wrap flag comment. LoopVectorize must either already know there is no wrapping in order to emit an active lane mask per unroll factor, or perform saturating maths. The former case can be passed with the intrinsic to help the code generator and the latter case would match what the code generator should do today.
It would remove the need for yet another TTI function and hence divergence within LoopVectorize. So perhaps not great reasons, but what's the advantage of the current approach if neither hampers code generation? |
The advantage is that it does not require updating other backends to handle a different |
b65dbdf
to
6b93675
Compare
You can test this locally with the following command: git-clang-format --diff ae8627809076390dbab04e01f3bf9d384c9e124e ee468ec26ba3ae15182623687d26c888d35b6a3f -- llvm/include/llvm/Analysis/TargetTransformInfo.h llvm/include/llvm/Analysis/TargetTransformInfoImpl.h llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/lib/Analysis/TargetTransformInfo.cpp llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm/lib/Transforms/Vectorize/VPlan.cpp llvm/lib/Transforms/Vectorize/VPlan.h llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp llvm/lib/Transforms/Vectorize/VPlanValue.h View the diff from clang-format here. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 5cd18ea7ba..0ccd94f0cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -221,7 +221,6 @@ m_BranchOnCond(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::BranchOnCond>(Op0);
}
-
template <typename Op0_t, typename Op1_t>
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
@@ -292,8 +291,7 @@ m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1);
}
-template <typename Op0_t, typename Op1_t>
-struct VPActiveLaneMask_match {
+template <typename Op0_t, typename Op1_t> struct VPActiveLaneMask_match {
Op0_t Op0;
Op1_t Op1;
@@ -315,8 +313,8 @@ struct VPActiveLaneMask_match {
};
template <typename Op0_t, typename Op1_t>
-inline VPActiveLaneMask_match<Op0_t, Op1_t>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
+inline VPActiveLaneMask_match<Op0_t, Op1_t> m_ActiveLaneMask(const Op0_t &Op0,
+ const Op1_t &Op1) {
return {Op0, Op1};
}
|
6b93675
to
c04687f
Compare
0070e29
to
91f14d3
Compare
80a9f39
to
a731021
Compare
Rebased. |
This patch makes the LoopVectorize pass generate lane masks longer than the VF to allow the target to better utilise the instruction set. The vectoriser emits one or more wide `llvm.get.active.lane.mask.*` calls plus several `llvm.vector.extract.*` calls to yield the required number of VF-wide masks. The motivating example is a vectorised loop with unroll factor 2 that can use the SVE2.1 `whilelo` instruction with predicate pair result, or an SVE `whilelo` instruction with smaller element size plus `punpklo`/`punpkhi`. The width of the lane mask that the vectoriser emits is controlled by a TargetTransformInfo hook, `getMaxPredicateLength`. The default implementation (returning the same length as the VF) keeps the change non-functional for targets that can't or are not prepared to handle wider lane masks.
a731021
to
ee468ec
Compare
Ping? |
auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); | ||
auto *PredTy = VectorType::get(Int1Ty, State.VF); | ||
|
||
unsigned MaxPred = std::min(State.MaxPred.getKnownMinValue(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Conceptually, the decision whether to widen the active lane mask or not shouldn't be taken at codegen (::execute), but should instead be performed as a transform (or possibly on construction, if it is simple to determine). This makes both codegen and cost-modeling based on the VPlan easier, and also makes things more explicit in the representation itself.
If this depends on the concrete chosen VF/UF, it can be transformed late in the pipeline (like optimizeForVFAndUF).
@@ -1329,6 +1329,50 @@ class VPInstruction : public VPRecipeWithIRFlags { | |||
} | |||
}; | |||
|
|||
class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At the moment, it's not clear why this needs to be moved out of VPInstruction, as it still only uses an opcode + operands and no extra data. Depending on what information exactly is used for widened codegen, there may be a need for a separate class, but in general using VPInstruction is preferred when the information can be encoded easily via opcode + VPValue operands only.
unsigned N = ST->hasSVE() ? 16 : 0; | ||
// Do not create masks that are more than twice the VF. | ||
N = std::min(N, 2 * VF.getKnownMinValue()); | ||
return VF.isScalable() ? ElementCount::getScalable(N) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can the profitability be determined in terms of cost of active.lane.mask + different predicate vectors?
This patch makes the `LoopVectorize` pass able to generate lane masks longer than the VF to allow the target to better utilise the instruction set. The vectoriser can emit one or more wide `llvm.get.active.lane.mask.*` calls plus several `llvm.vector.extract.*` calls to yield the required number of VF-wide masks.
The motivating example is a vectorised loop with unroll factor 2 that can use the SVE2.1 `whilelo` instruction with predicate pair result, or an SVE `whilelo` instruction with smaller element size plus `punpklo`/`punpkhi`.
The width of the lane mask that the vectoriser emits is controlled by a `TargetTransformInfo` hook, `getMaxPredicateLength`. The default implementation (return the same length as the VF) keeps the change non-functional for targets that can't or are not prepared to handle wider lane masks.