-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
LoopVectorize/test: clean up reduction.ll; generate using UTC (NFC) #68890
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Ramkumar Ramachandra (artagnon) ChangesThe test reduction.ll was introduced before utils/update_test_checks.py, and hence contains hand-written CHECK lines. Revisit the test today, and modernize it by:
Finally, regenerate the positive CHECK lines using update_test_checks.py, eliminating hand-written error-prone CHECK lines. Patch is 56.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68890.diff 1 Files Affected:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
index 278917b0069c854..b468c4a7570906c 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -1,14 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-
-;CHECK-LABEL: @reduction_sum(
-;CHECK: phi <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
-;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
-;CHECK: ret i32
-define i32 @reduction_sum(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+define i32 @reduction_sum(i32 %n, ptr %A, ptr %B) {
+; CHECK-LABEL: define i32 @reduction_sum(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
+; CHECK: .lr.ph.preheader:
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 4294967292
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP7]] = add <4 x i32> [[TMP6]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[TMP17:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SUM_02]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], [[TMP11]]
+; CHECK-NEXT: [[TMP17]] = add i32 [[TMP16]], [[TMP13]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: ._crit_edge.loopexit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -33,13 +82,63 @@ define i32 @reduction_sum(i32 %n, ptr noalias nocapture %A, ptr noalias nocaptur
ret i32 %sum.0.lcssa
}
-;CHECK-LABEL: @reduction_prod(
-;CHECK: phi <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: mul <4 x i32>
-;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
-;CHECK: ret i32
-define i32 @reduction_prod(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+define i32 @reduction_prod(i32 %n, ptr %A, ptr %B) {
+; CHECK-LABEL: define i32 @reduction_prod(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
+; CHECK: .lr.ph.preheader:
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 4294967292
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[VEC_PHI]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP7]] = mul <4 x i32> [[TMP6]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[PROD_02:%.*]] = phi i32 [ [[TMP17:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[PROD_02]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP11]]
+; CHECK-NEXT: [[TMP17]] = mul i32 [[TMP16]], [[TMP13]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: ._crit_edge.loopexit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ 1, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]]
+;
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -64,13 +163,63 @@ define i32 @reduction_prod(i32 %n, ptr noalias nocapture %A, ptr noalias nocaptu
ret i32 %prod.0.lcssa
}
-;CHECK-LABEL: @reduction_mix(
-;CHECK: phi <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: mul nsw <4 x i32>
-;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
-;CHECK: ret i32
-define i32 @reduction_mix(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+define i32 @reduction_mix(i32 %n, ptr %A, ptr %B) {
+; CHECK-LABEL: define i32 @reduction_mix(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
+; CHECK: .lr.ph.preheader:
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 4294967292
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP7]] = add <4 x i32> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[TMP17:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = mul nsw i32 [[TMP13]], [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[SUM_02]], [[TMP15]]
+; CHECK-NEXT: [[TMP17]] = add i32 [[TMP16]], [[TMP14]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: ._crit_edge.loopexit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -95,11 +244,63 @@ define i32 @reduction_mix(i32 %n, ptr noalias nocapture %A, ptr noalias nocaptur
ret i32 %sum.0.lcssa
}
-;CHECK-LABEL: @reduction_mul(
-;CHECK: mul <4 x i32>
-;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
-;CHECK: ret i32
-define i32 @reduction_mul(i32 %n, ptr noalias nocapture %A, ptr noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+define i32 @reduction_mul(i32 %n, ptr %A, ptr %B) {
+; CHECK-LABEL: define i32 @reduction_mul(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
+; CHECK: .lr.ph.preheader:
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 4294967292
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 19, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP7]] = mul <4 x i32> [[TMP6]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 19, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[TMP17:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], [[TMP13]]
+; CHECK-NEXT: [[TMP17]] = mul i32 [[TMP16]], [[SUM_02]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: ._crit_edge.loopexit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -124,12 +325,59 @@ define i32 @reduction_mul(i32 %n, ptr noalias nocapture %A, ptr noalias nocaptur
ret i32 %sum.0.lcssa
}
-;CHECK-LABEL: @start_at_non_zero(
-;CHECK: phi <4 x i32>
-;CHECK: <i32 120, i32 0, i32 0, i32 0>
-;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
-;CHECK: ret i32
-define i32 @start_at_non_zero(ptr nocapture %in, ptr nocapture %coeff, ptr nocapture %out, i32 %n) nounwind uwtable readonly ssp {
+define i32 @start_at_non_zero(ptr %in, ptr %coeff, ptr %out, i32 %n) {
+; CHECK-LABEL: define i32 @start_at_non_zero(
+; CHECK-SAME: ptr [[IN:%.*]], ptr [[COEFF:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967292
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 120, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
+; ...
[truncated]
|
If you're using UTC, there should not be any CHECK-NOT lines. |
I'm conforming to the practice used in the (recent) iv-select-cmp.ll. I regenerate using UTC, and stage all hunks except the CHECK-NOT checks. I think it's clearer this way: we convey the information that the example is not vectorized, without being overly verbose. |
We really don't care about verbosity -- only about easy to regenerate tests and clear diffs. Using CHECK-NOT means it's not possible to regenerate the test, which means you lose the primary benefit of using UTC in the first place. If you have to manually fix up the results, you may as well stay with hand-written check lines. |
fdea26a
to
546274d
Compare
Okay, that's a good argument. Fixed now. |
The test reduction.ll was introduced before utils/update_test_checks.py, and hence contains hand-written CHECK lines. Revisit the test today, and modernize it by: - Removing extranous attributes on functions and their arguments, as LoopVectorize doesn't even look at these attributes. - Removing the target datalayout, as it is not essential for LoopVectorize. Finally, regenerate the CHECK lines using update_test_checks.py, eliminating hand-written error-prone CHECK lines.
546274d
to
df1e771
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
The test reduction.ll was introduced before utils/update_test_checks.py, and hence contains hand-written CHECK lines. Revisit the test today, and modernize it by:
Finally, regenerate the CHECK lines using update_test_checks.py, eliminating hand-written error-prone CHECK lines.