[LV] Update interleaving count computation when scalar epilogue loop …

…needs to run at least once (#79651) Update loop interleaving count computation to address loops that require at least one scalar iteration in the epilogue loop. For this case, the available trip count for interleaving the loop is one less.
llvm · Jan 29, 2024 · c492eb6 · c492eb6
1 parent 9a1ca24
commit c492eb6
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 18 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
 
   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (KnownTC) {
+  if (KnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned AvailableTC =
+        requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
+
     // If trip count is known we select between two prospective ICs, where
     // 1) the aggressive IC is capped by the trip count divided by VF
     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     // we run the vector loop at least twice.
 
     unsigned InterleaveCountUB = bit_floor(
-        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+        std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
     unsigned InterleaveCountLB = bit_floor(std::max(
-        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
     MaxInterleaveCount = InterleaveCountLB;
 
     if (InterleaveCountUB != InterleaveCountLB) {
-      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
-      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+      unsigned TailTripCountUB =
+          (AvailableTC % (EstimatedVF * InterleaveCountUB));
+      unsigned TailTripCountLB =
+          (AvailableTC % (EstimatedVF * InterleaveCountLB));
       // If both produce same scalar tail, maximize the IC to do the same work
       // in fewer vector loop iterations
       if (TailTripCountUB == TailTripCountLB)
         MaxInterleaveCount = InterleaveCountUB;
     }
-  } else if (BestKnownTC) {
+  } else if (BestKnownTC && *BestKnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
+                               ? (*BestKnownTC) - 1
+                               : *BestKnownTC;
+
     // If trip count is an estimated compile time constant, limit the
     // IC to be capped by the trip count divided by VF * 2, such that the vector
     // loop runs at least twice to make interleaving seem profitable when there
     // is an epilogue loop present. Since exact Trip count is not known we
     // choose to be conservative in our IC estimate.
     MaxInterleaveCount = bit_floor(std::max(
-        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
   }
 
   assert(MaxInterleaveCount > 0 &&

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -129,9 +129,9 @@ for.end:
 ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the 
 ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar 
 ; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar 
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar 
 ; remainder than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
   br label %for.body
@@ -211,17 +211,17 @@ for.end:
 ; the resulting interleaved group in this case may access memory out-of-bounds, it requires 
 ; a scalar epilogue iteration for correctness, making at most 127 iterations available for 
 ; interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
 ; remainder than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 ; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
 ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
 ; CHECK-IR-NEXT:  iter.check:
 ; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
 ; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
 ; CHECK-IR:       vector.main.loop.iter.check:
-; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
 ; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
 ;
 entry:

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -33,9 +33,9 @@ for.end:
 ; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 31 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
 ; than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 entry:
   br label %for.body
@@ -229,15 +229,15 @@ for.end:
 ; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 127 iterations available for interleaving.
-; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
+; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
 ; remainder than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 ; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
 ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
 ; CHECK-IR-NEXT:  entry:
-; CHECK-IR-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-IR-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 entry:
   br label %for.body