[LV] Pre-committing tests for changing loop interleaving count computation #70272

nilanjana87 · 2023-10-26T00:23:51Z

Added tests for evaluating changes to loop interleaving count computation and for removing loop interleaving threshold.

llvmbot · 2023-10-26T00:24:58Z

@llvm/pr-subscribers-llvm-transforms

Author: Nilanjana Basu (nilanjana87)

Changes

Added tests for evaluating changes to loop interleaving count computation and for removing loop interleaving threshold.

Full diff: https://github.com/llvm/llvm-project/pull/70272.diff

2 Files Affected:

(added) llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll (+106)
(modified) llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll (+190-4)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
new file mode 100644
index 000000000000000..74fd2f44fce0109
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -force-vector-width=64 -O3 -S -pass-remarks=loop-vectorize 2>&1 | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For a loop with known trip count of 128, when we force VF 64, it should use
+; IC 2, since there is no remainder loop needed when the vector loop runs.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_tc_128(ptr %p, ptr %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with known trip count of 129, when we force VF 64, it should use
+; IC 1, since there may be a remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_tc_129(ptr %p, ptr %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 129
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 128, 
+; when we force VF 64, it should use IC 2, since chances are high that the remainder loop
+; won't need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_profile_tc_128(ptr %p, ptr %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 129, 
+; when we force VF 64, it should use IC 1, since chances are high that the remainder loop
+; will need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_profile_tc_129(ptr %p, ptr %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !1
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}
+!1 = !{!"branch_weights", i32 1, i32 128}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
index 290be569bc12524..6b71cfef06be149 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -8,20 +8,20 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ; We don't unroll this loop because it has a small constant trip count.
 ;
-; CHECK-VECTOR-LABEL: @foo(
+; CHECK-VECTOR-LABEL: @foo_trip_count_8(
 ; CHECK-VECTOR: load <4 x i32>
 ; CHECK-VECTOR-NOT: load <4 x i32>
 ; CHECK-VECTOR: store <4 x i32>
 ; CHECK-VECTOR-NOT: store <4 x i32>
 ; CHECK-VECTOR: ret
 ;
-; CHECK-SCALAR-LABEL: @foo(
+; CHECK-SCALAR-LABEL: @foo_trip_count_8(
 ; CHECK-SCALAR: load i32, ptr
 ; CHECK-SCALAR-NOT: load i32, ptr
 ; CHECK-SCALAR: store i32
 ; CHECK-SCALAR-NOT: store i32
 ; CHECK-SCALAR: ret
-define i32 @foo(ptr nocapture %A) nounwind uwtable ssp {
+define i32 @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp {
   br label %1
 
 ; <label>:1                                       ; preds = %1, %0
@@ -32,7 +32,193 @@ define i32 @foo(ptr nocapture %A) nounwind uwtable ssp {
   store i32 %4, ptr %2, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 100
+  %exitcond = icmp eq i32 %lftr.wideiv, 8
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+; CHECK-VECTOR-LABEL: @foo_trip_count_16(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo_trip_count_16(
+; CHECK-SCALAR: load i32, ptr
+; CHECK-SCALAR-NOT: load i32, ptr
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, ptr %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 16
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+; CHECK-VECTOR-LABEL: @foo_trip_count_17(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo_trip_count_17(
+; CHECK-SCALAR: load i32, ptr
+; CHECK-SCALAR-NOT: load i32, ptr
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, ptr %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 17
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+; CHECK-VECTOR-LABEL: @foo_trip_count_24(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo_trip_count_24(
+; CHECK-SCALAR: load i32, ptr
+; CHECK-SCALAR-NOT: load i32, ptr
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, ptr %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 24
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+; CHECK-VECTOR-LABEL: @foo_trip_count_25(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo_trip_count_25(
+; CHECK-SCALAR: load i32, ptr
+; CHECK-SCALAR-NOT: load i32, ptr
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, ptr %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 25
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+; CHECK-VECTOR-LABEL: @foo_trip_count_33(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo_trip_count_33(
+; CHECK-SCALAR: load i32, ptr
+; CHECK-SCALAR-NOT: load i32, ptr
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, ptr %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 33
+  br i1 %exitcond, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 undef
+}
+
+; CHECK-VECTOR-LABEL: @foo_trip_count_101(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo_trip_count_101(
+; CHECK-SCALAR: load i32, ptr
+; CHECK-SCALAR-NOT: load i32, ptr
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, ptr %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 101
   br i1 %exitcond, label %5, label %1
 
 ; <label>:5                                       ; preds = %1

Since this patch is the first part of a two-part patch for changing loop interleaving count computation, followed by removing loop interleaving threshold, these tests remain unchanged for the current (first) patch but will be modified for the second patch. The original PR for these tests is in llvm#70272.

fhahn

Thanks for the additional test coverage. Some comments inline

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll

llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll

- replaced -O3 in tests with -p loop-vectorize - added "TODO" in comments for tests that are meant to change after the subsequent patch - used named labels for basic blocks

…ments

llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll

…ments

…n change in subsequent patches without the need to force VF

…ments

fhahn

LGTM, thanks!

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll

fhahn

LGTM, thanks!

…ments

…ation (llvm#70272) Added tests for evaluating changes to loop interleaving count computation and for removing loop interleaving threshold in subsequent patches.

[LV] Change loops' interleave count computation A set of microbenchmarks in llvm-test-suite (llvm/llvm-test-suite#56), when tested on a AArch64 platform, demonstrates that loop interleaving is beneficial when the vector loop runs at least twice or when the epilogue loop trip count (TC) is minimal. Therefore, we choose interleaving count (IC) between TC/VF & TC/2*VF (VF = vectorization factor), such that remainder TC for the epilogue loop is minimum while the IC is maximum in case the remainder TC is same for both. The initial tests for this change were submitted in PRs: #70272 and #74689.

…ation (llvm#70272) Added tests for evaluating changes to loop interleaving count computation and for removing loop interleaving threshold in subsequent patches.

[LV] Change loops' interleave count computation A set of microbenchmarks in llvm-test-suite (llvm/llvm-test-suite#56), when tested on a AArch64 platform, demonstrates that loop interleaving is beneficial when the vector loop runs at least twice or when the epilogue loop trip count (TC) is minimal. Therefore, we choose interleaving count (IC) between TC/VF & TC/2*VF (VF = vectorization factor), such that remainder TC for the epilogue loop is minimum while the IC is maximum in case the remainder TC is same for both. The initial tests for this change were submitted in PRs: llvm#70272 and llvm#74689.

[LV] Pre-committing tests for changing interleaving count computation

8ee31fc

nilanjana87 requested a review from fhahn October 26, 2023 00:23

llvmbot added the llvm:transforms label Oct 26, 2023

nilanjana87 mentioned this pull request Oct 26, 2023

[LV] Change loops' interleave count computation #70141

Closed

fhahn reviewed Oct 26, 2023

View reviewed changes

nilanjana87 mentioned this pull request Oct 27, 2023

[LV] Relax high loop trip count threshold for deciding to interleave a loop #67725

Merged

Addressed reviewer comments

55d3431

- replaced -O3 in tests with -p loop-vectorize - added "TODO" in comments for tests that are meant to change after the subsequent patch - used named labels for basic blocks

nilanjana87 added a commit to nilanjana87/llvm-project that referenced this pull request Oct 30, 2023

Rebasing tests that changed in llvm#70272 for addressing reviewer com…

0376af1

…ments

fhahn reviewed Nov 9, 2023

View reviewed changes

llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll Outdated Show resolved Hide resolved

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll Outdated Show resolved Hide resolved

Changed test VFs to a more reasonable value, fixed comments

205335a

nilanjana87 added a commit to nilanjana87/llvm-project that referenced this pull request Nov 11, 2023

Rebasing tests that changed in llvm#70272 for addressing reviewer com…

269012b

…ments

Changed trip count for the added test cases to test the IC computatio…

7c3606a

…n change in subsequent patches without the need to force VF

nilanjana87 added a commit to nilanjana87/llvm-project that referenced this pull request Nov 16, 2023

Rebasing tests that changed in llvm#70272 for addressing reviewer com…

e9e0c86

…ments

fhahn approved these changes Nov 17, 2023

View reviewed changes

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll Outdated Show resolved Hide resolved

Disabled redundant output in tests

a389673

fhahn approved these changes Nov 17, 2023

View reviewed changes

nilanjana87 added a commit to nilanjana87/llvm-project that referenced this pull request Nov 17, 2023

Rebasing tests that changed in llvm#70272 for addressing reviewer com…

3104a42

…ments

nilanjana87 merged commit e2210ce into llvm:main Nov 18, 2023
3 checks passed

nilanjana87 deleted the loop-interleaving-precommit-tests branch November 18, 2023 04:38

nilanjana87 mentioned this pull request Nov 29, 2023

[LV] Change loops' interleave count computation #73766

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[LV] Pre-committing tests for changing loop interleaving count computation #70272

[LV] Pre-committing tests for changing loop interleaving count computation #70272

nilanjana87 commented Oct 26, 2023

llvmbot commented Oct 26, 2023

fhahn left a comment

fhahn left a comment

fhahn left a comment

[LV] Pre-committing tests for changing loop interleaving count computation #70272

[LV] Pre-committing tests for changing loop interleaving count computation #70272

Conversation

nilanjana87 commented Oct 26, 2023

llvmbot commented Oct 26, 2023

fhahn left a comment

Choose a reason for hiding this comment

fhahn left a comment

Choose a reason for hiding this comment

fhahn left a comment

Choose a reason for hiding this comment