diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 6ea6d2361eba7..102b069ac722d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1440,6 +1440,10 @@ class TargetTransformInfo { /// to a stack reload. unsigned getGISelRematGlobalCost() const; + /// \returns the lower bound of a trip count to decide on vectorization + /// while tail-folding. + unsigned getMinTripCountTailFoldingThreshold() const; + /// \returns True if the target supports scalable vectors. bool supportsScalableVectors() const; @@ -1830,6 +1834,7 @@ class TargetTransformInfo::Concept { ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; + virtual unsigned getMinTripCountTailFoldingThreshold() const = 0; virtual bool enableScalableVectorization() const = 0; virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType, @@ -2453,6 +2458,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getGISelRematGlobalCost(); } + unsigned getMinTripCountTailFoldingThreshold() const override { + return Impl.getMinTripCountTailFoldingThreshold(); + } + bool supportsScalableVectors() const override { return Impl.supportsScalableVectors(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1a75cb35549e4..da1f53aa33cb0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -803,6 +803,8 @@ class TargetTransformInfoImplBase { unsigned getGISelRematGlobalCost() const { return 1; } + unsigned getMinTripCountTailFoldingThreshold() const { return 0; } + bool supportsScalableVectors() const { return false; } bool enableScalableVectorization() const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index cfa6e3a976264..143f03ccac391 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1108,6 +1108,10 @@ unsigned TargetTransformInfo::getGISelRematGlobalCost() const { return TTIImpl->getGISelRematGlobalCost(); } +unsigned TargetTransformInfo::getMinTripCountTailFoldingThreshold() const { + return TTIImpl->getMinTripCountTailFoldingThreshold(); +} + bool TargetTransformInfo::supportsScalableVectors() const { return TTIImpl->supportsScalableVectors(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 2231f8705998d..0c5eadeffcdbd 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -334,6 +334,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { return 2; } + unsigned getMinTripCountTailFoldingThreshold() const { + return ST->hasSVE() ? 5 : 0; + } + PredicationStyle emitGetActiveLaneMask() const { if (ST->hasSVE()) return PredicationStyle::DataAndControlFlow; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 238b074089aa1..91bc7dbad1d04 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -10109,8 +10109,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { - LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { + LLVM_DEBUG(dbgs() << "\n"); + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + } else { + LLVM_DEBUG(dbgs() << " But the target considers the trip count too " + "small to consider vectorizing.\n"); + reportVectorizationFailure( + "The trip count is below the minial threshold value.", + "loop trip count is too low, avoiding vectorization", + "LowTripCount", ORE, L); + Hints.emitRemarkWithHints(); + return false; + } } } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll index a97782aac2845..bc3e1c2513a11 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -40,18 +40,22 @@ for.end: ; preds = %for.body define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip5_i8( -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] -; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8( {{%.*}}, * {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 5) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) -; CHECK-NEXT: br i1 true, label %middle.block, label %vector.body +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]] +; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: br label %for.body