diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3483e2c968e6b..17a0d01f18072 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (KnownTC) { + if (KnownTC > 0) { + // When a scalar epilogue is required, at least one iteration must be + // scalar, so one fewer iteration is available for interleaving. + unsigned AvailableTC = + requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; + // If trip count is known we select between two prospective ICs, where // 1) the aggressive IC is capped by the trip count divided by VF // 2) the conservative IC is capped by the trip count divided by (VF * 2) @@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // we run the vector loop at least twice. 
unsigned InterleaveCountUB = bit_floor( - std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); + std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount))); unsigned InterleaveCountLB = bit_floor(std::max( - 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); + 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); MaxInterleaveCount = InterleaveCountLB; if (InterleaveCountUB != InterleaveCountLB) { - unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); - unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); + unsigned TailTripCountUB = + (AvailableTC % (EstimatedVF * InterleaveCountUB)); + unsigned TailTripCountLB = + (AvailableTC % (EstimatedVF * InterleaveCountLB)); // If both produce same scalar tail, maximize the IC to do the same work // in fewer vector loop iterations if (TailTripCountUB == TailTripCountLB) MaxInterleaveCount = InterleaveCountUB; } - } else if (BestKnownTC) { + } else if (BestKnownTC && *BestKnownTC > 0) { + // When a scalar epilogue is required, at least one iteration must be + // scalar, so one fewer iteration is available for interleaving. + unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) + ? (*BestKnownTC) - 1 + : *BestKnownTC; + // If trip count is an estimated compile time constant, limit the // IC to be capped by the trip count divided by VF * 2, such that the vector // loop runs at least twice to make interleaving seem profitable when there // is an epilogue loop present. Since exact Trip count is not known we // choose to be conservative in our IC estimate. 
MaxInterleaveCount = bit_floor(std::max( - 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); + 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); } assert(MaxInterleaveCount > 0 && diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll index 97c6d2a819615..691c0fc8facc4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll @@ -129,9 +129,9 @@ for.end: ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar ; epilogue iteration for correctness, making at most 63 iterations available for interleaving. -; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar +; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar ; remainder than IC 2 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) { entry: br label %for.body @@ -211,9 +211,9 @@ for.end: ; the resulting interleaved group in this case may access memory out-of-bounds, it requires ; a scalar epilogue iteration for correctness, making at most 127 iterations available for ; interleaving. 
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar +; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar ; remainder than IC 4 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 4) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) { ; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd( ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) { @@ -221,7 +221,7 @@ define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr n ; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8 ; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]] ; CHECK-IR: vector.main.loop.iter.check: -; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64 +; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32 ; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll index 526fe0dc0910d..6ea0229ab8ea0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll @@ -33,9 +33,9 @@ for.end: ; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for ; correctness, making at most 31 iterations available for interleaving. 
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder +; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder ; than IC 2 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 1) define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) { entry: br label %for.body @@ -229,15 +229,15 @@ for.end: ; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for ; correctness, making at most 127 iterations available for interleaving. -; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue. -; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar +; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue. +; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar ; remainder than IC 4 -; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 8) +; CHECK: remark: :0:0: vectorized loop (vectorization width: 16, interleaved count: 2) define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) { ; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd( ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) { ; CHECK-IR-NEXT: entry: -; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] entry: br label %for.body