llvm · artagnon · Sep 22, 2025
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -64,7 +64,8 @@ class LoopVectorizeHints {
     HK_FORCE,
     HK_ISVECTORIZED,
     HK_PREDICATE,
-    HK_SCALABLE
+    HK_SCALABLE,
+    HK_SCALAREPILOGUE
   };
 
   /// Hint - associates name and validation with the hint value.
@@ -97,6 +98,9 @@ class LoopVectorizeHints {
   /// Says whether we should use fixed width or scalable vectorization.
   Hint Scalable;
 
+  /// Hint specifying how we should lower the scalar epilogue.
+  Hint ScalarEpilogue;
+
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
 
@@ -121,6 +125,33 @@ class LoopVectorizeHints {
     SK_PreferScalable = 1
   };
 
+  /// Whether it is allowed to have the original scalar loop execute at least
+  /// once. This may be needed as a fallback loop in case runtime
+  /// aliasing/dependence checks fail, or to handle the tail/remainder
+  /// iterations when the trip count is unknown or doesn't divide by the VF, or
+  /// as a peel-loop to handle gaps in interleave-groups. Under optsize and when
+  /// the trip count is very small we don't allow anyiterations to execute in
+  /// the scalar loop.
+  enum ScalarEpilogueKind {
+    // The default: allowing scalar epilogues.
+    SEK_Allowed,
+
+    // Vectorization with OptForSize: don't allow epilogues.
+    SEK_NotAllowedOptSize,
+
+    // A special case of vectorisation with OptForSize: loops with a very small
+    // trip count are considered for vectorization under OptForSize, thereby
+    // making sure the cost of their loop body is dominant, free of runtime
+    // guards and scalar iteration overheads.
+    SEK_NotAllowedLowTripLoop,
+
+    // Loop hint predicate indicating an epilogue is undesired.
+    SEK_NotNeededUsePredicate,
+
+    // Directive indicating we must either tail fold or not vectorize
+    SEK_NotAllowedUsePredicate
+  };
+
   LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced,
                      OptimizationRemarkEmitter &ORE,
                      const TargetTransformInfo *TTI = nullptr);
@@ -156,6 +187,21 @@ class LoopVectorizeHints {
       return FK_Disabled;
     return (ForceKind)Force.Value;
   }
+  ScalarEpilogueKind getScalarEpilogue() const {
+    return static_cast<ScalarEpilogueKind>(ScalarEpilogue.Value);
+  }
+  bool isScalarEpilogueAllowed() const {
+    return ScalarEpilogue.Value == SEK_Allowed;
+  }
+  void setScalarEpilogue(ScalarEpilogueKind SEK) { ScalarEpilogue.Value = SEK; }
+
+  // Determine how to lower the scalar epilogue and set it. Depends on 1)
+  // optimising for minimum code-size, 2) predicate compiler options, 3) loop
+  // hints forcing predication, and 4) a TTI hook that analyses whether the loop
+  // is suitable for predication.
+  void setScalarEpilogue(ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                         TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL,
+                         InterleavedAccessInfo *IAI);
 
   /// \return true if scalable vectorization has been explicitly disabled.
   bool isScalableVectorizationDisabled() const {
@@ -196,6 +242,8 @@ class LoopVectorizeHints {
 
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter &ORE;
+
+  const TargetTransformInfo *TTI;
 };
 
 /// This holds vectorization requirements that must be verified late in

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -83,6 +83,34 @@ static cl::opt<bool> EnableHistogramVectorization(
     "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
     cl::desc("Enables autovectorization of some loops containing histograms"));
 
+// Option prefer-predicate-over-epilogue indicates that an epilogue is
+// undesired, that predication is preferred, and this lists all options. I.e.,
+// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+enum class PreferPredicateTy {
+  ScalarEpilogue = 0,
+  PredicateElseScalarEpilogue,
+  PredicateOrDontVectorize
+};
+
+static cl::opt<PreferPredicateTy> PreferPredicateOverEpilogue(
+    "prefer-predicate-over-epilogue",
+    cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
+    cl::desc("Tail-folding and predication preferences over creating a scalar "
+             "epilogue loop."),
+    cl::values(
+        clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+                   "Don't tail-predicate loops, create scalar epilogue"),
+        clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+                   "predicate-else-scalar-epilogue",
+                   "prefer tail-folding, create scalar epilogue if tail "
+                   "folding fails."),
+        clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+                   "predicate-dont-vectorize",
+                   "prefers tail-folding, don't attempt vectorization if "
+                   "tail-folding fails.")));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -100,6 +128,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
   case HK_PREDICATE:
   case HK_SCALABLE:
     return (Val == 0 || Val == 1);
+  case HK_SCALAREPILOGUE:
+    return Val <= SEK_NotAllowedUsePredicate;
   }
   return false;
 }
@@ -114,7 +144,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
       Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
       Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
-      TheLoop(L), ORE(ORE) {
+      ScalarEpilogue("scalarepilogue", SEK_Allowed, HK_SCALAREPILOGUE),
+      TheLoop(L), ORE(ORE), TTI(TTI) {
   // Populate values with existing loop metadata.
   getHintsFromMetadata();
 
@@ -302,8 +333,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
     return;
   unsigned Val = C->getZExtValue();
 
-  Hint *Hints[] = {&Width,        &Interleave, &Force,
-                   &IsVectorized, &Predicate,  &Scalable};
+  Hint *Hints[] = {&Width,     &Interleave, &Force,         &IsVectorized,
+                   &Predicate, &Scalable,   &ScalarEpilogue};
   for (auto *H : Hints) {
     if (Name == H->Name) {
       if (H->validate(Val))
@@ -315,6 +346,63 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
   }
 }
 
+void LoopVectorizeHints::setScalarEpilogue(ProfileSummaryInfo *PSI,
+                                           BlockFrequencyInfo *BFI,
+                                           TargetLibraryInfo *TLI,
+                                           LoopVectorizationLegality &LVL,
+                                           InterleavedAccessInfo *IAI) {
+  // 1) OptSize takes precedence over all other options, i.e. if this is set,
+  // don't look at hints or options, and don't request a scalar epilogue.
+  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
+  // LoopAccessInfo (due to code dependency and not being able to reliably get
+  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
+  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
+  // versioning when the vectorization is forced, unlike hasOptSize. So revert
+  // back to the old way and vectorize with versioning when forced. See D81345.)
+  Function *F = TheLoop->getHeader()->getParent();
+  if (F->hasOptSize() ||
+      (llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+                                   PGSOQueryType::IRPass) &&
+       getForce() != LoopVectorizeHints::FK_Enabled)) {
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedOptSize;
+    return;
+  }
+
+  // 2) If set, obey the directives
+  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+    switch (PreferPredicateOverEpilogue) {
+    case PreferPredicateTy::ScalarEpilogue:
+      ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+      return;
+    case PreferPredicateTy::PredicateElseScalarEpilogue:
+      ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+      return;
+    case PreferPredicateTy::PredicateOrDontVectorize:
+      ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedUsePredicate;
+      return;
+    };
+  }
+
+  // 3) If set, obey the hints
+  switch (getPredicate()) {
+  case LoopVectorizeHints::FK_Enabled:
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+    return;
+  case LoopVectorizeHints::FK_Disabled:
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+    return;
+  };
+
+  // 4) if the TTI hook indicates this is profitable, request predication.
+  TailFoldingInfo TFI(TLI, &LVL, IAI);
+  if (TTI->preferPredicateOverEpilogue(&TFI)) {
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+    return;
+  }
+
+  ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+}
+
 // Return true if the inner loop \p Lp is uniform with regard to the outer loop
 // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
 // executing the inner loop will execute the same iterations). This check is