Skip to content

Commit

Permalink
[LV] Update interleaving count computation when scalar epilogue loop …
Browse files Browse the repository at this point in the history
…needs to run at least once (#79651)

Update the loop interleave count computation to handle loops that require at least one scalar iteration in the epilogue loop. In that case, the trip count available for interleaving is one less than the loop's trip count, because one iteration must be left to run in the scalar epilogue.
  • Loading branch information
nilanjana87 committed Jan 29, 2024
1 parent 9a1ca24 commit c492eb6
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 18 deletions.
27 changes: 20 additions & 7 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");

unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (KnownTC) {
if (KnownTC > 0) {
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC =
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;

// If trip count is known we select between two prospective ICs, where
// 1) the aggressive IC is capped by the trip count divided by VF
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
Expand All @@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// we run the vector loop at least twice.

unsigned InterleaveCountUB = bit_floor(
std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
MaxInterleaveCount = InterleaveCountLB;

if (InterleaveCountUB != InterleaveCountLB) {
unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
unsigned TailTripCountUB =
(AvailableTC % (EstimatedVF * InterleaveCountUB));
unsigned TailTripCountLB =
(AvailableTC % (EstimatedVF * InterleaveCountLB));
// If both produce same scalar tail, maximize the IC to do the same work
// in fewer vector loop iterations
if (TailTripCountUB == TailTripCountLB)
MaxInterleaveCount = InterleaveCountUB;
}
} else if (BestKnownTC) {
} else if (BestKnownTC && *BestKnownTC > 0) {
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
? (*BestKnownTC) - 1
: *BestKnownTC;

// If trip count is an estimated compile time constant, limit the
// IC to be capped by the trip count divided by VF * 2, such that the vector
// loop runs at least twice to make interleaving seem profitable when there
// is an epilogue loop present. Since exact Trip count is not known we
// choose to be conservative in our IC estimate.
MaxInterleaveCount = bit_floor(std::max(
1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
}

assert(MaxInterleaveCount > 0 &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ for.end:
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
; remainder than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
br label %for.body
Expand Down Expand Up @@ -211,17 +211,17 @@ for.end:
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
; interleaving.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
; CHECK-IR-NEXT: iter.check:
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
; CHECK-IR: vector.main.loop.iter.check:
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
;
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ for.end:
; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 31 iterations available for interleaving.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
; than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
entry:
br label %for.body
Expand Down Expand Up @@ -229,15 +229,15 @@ for.end:
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 127 iterations available for interleaving.
; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
; CHECK-IR-NEXT: entry:
; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
entry:
br label %for.body

Expand Down

0 comments on commit c492eb6

Please sign in to comment.