-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LV]: Ensure fairness when selecting epilogue VF. #155547
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-vectorizers Author: Hassnaa Hamdi (hassnaaHamdi) Changes: Consider the interleave count (IC) when deciding whether epilogue vectorization is profitable for scalable vectors. Patch is 45.49 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/155547.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7fc87a0b49f70..442858a01b50a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4398,16 +4398,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
- // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
- // VFs when deciding profitability.
- // See related "TODO: extend to support scalable VFs." in
- // selectEpilogueVectorizationFactor.
- unsigned Multiplier = VF.isFixed() ? IC : 1;
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
- return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
- MinVFThreshold;
+ return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold;
}
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 137e07336fd50..80130305aaf04 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -8,24 +8,29 @@ target triple = "arm64-apple-macosx14.0.0"
define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-LABEL: define void @iv_casts(
; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[ITER_CHECK:.*]]:
; DEFAULT-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; DEFAULT-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
+; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; DEFAULT: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; DEFAULT-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP9]], 4
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], [[TMP8]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP11]], 16
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
@@ -63,25 +68,59 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]:
+; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP48:%.*]] = shl nuw i64 [[TMP47]], 2
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP48]]
+; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; DEFAULT: [[VEC_EPILOG_PH]]:
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP50:%.*]] = mul nuw i64 [[TMP49]], 4
+; DEFAULT-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[TMP0]], [[TMP50]]
+; DEFAULT-NEXT: [[N_VEC6:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF5]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP51:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT8]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; DEFAULT-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT: [[TMP39:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP40:%.*]] = mul <vscale x 4 x i16> [[TMP39]], [[TMP51]]
+; DEFAULT-NEXT: [[TMP52:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP53:%.*]] = or <vscale x 4 x i16> [[TMP40]], [[TMP52]]
+; DEFAULT-NEXT: [[TMP54:%.*]] = lshr <vscale x 4 x i16> [[TMP53]], splat (i16 1)
+; DEFAULT-NEXT: [[TMP55:%.*]] = trunc <vscale x 4 x i16> [[TMP54]] to <vscale x 4 x i8>
+; DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: store <vscale x 4 x i8> [[TMP55]], ptr [[TMP45]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[IV]], [[TMP50]]
+; DEFAULT-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[TMP46]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT: [[VEC_EPILOG_SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ITER_CHECK]] ]
; DEFAULT-NEXT: br label %[[LOOP:.*]]
; DEFAULT: [[LOOP]]:
-; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
-; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; DEFAULT-NEXT: [[GEP_SRC1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV1]]
+; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC1]], align 1
; DEFAULT-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
; DEFAULT-NEXT: [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]]
-; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
; DEFAULT-NEXT: [[CONV25_US:%.*]] = zext i8 [[L]] to i32
; DEFAULT-NEXT: [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]]
; DEFAULT-NEXT: [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1
; DEFAULT-NEXT: [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8
-; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV1]]
; DEFAULT-NEXT: store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1
-; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -218,7 +257,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 1, ptr [[TMP21]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -234,7 +273,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 1, ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -382,7 +421,7 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[TMP24]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -401,7 +440,7 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
; DEFAULT-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -574,7 +613,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[TMP23]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -593,7 +632,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -750,7 +789,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
; DEFAULT-NEXT: store i32 0, ptr [[TMP10]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -766,7 +805,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
; DEFAULT-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
; DEFAULT-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
-; DEFAULT-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP12:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index 7ff4609f8ec4b..4058679dd10fe 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -24,30 +24,30 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE-4: vector.body:
; INTERLEAVE-4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
-; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
-; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1
-; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]]
-; INTERLEAVE-4-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]]
-; INTERLEAVE-4-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]]
+; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
+; INTERLEAVE-4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
+; INTERLEAVE-4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
+; INTERLEAVE-4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; INTERLEAVE-4-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; INTERLEAVE-4-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]]
+; INTERLEAVE-4-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]]
+; INTERLEAVE-4-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]]
; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; INTERLEAVE-4: middle.block:
-; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]]
-; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]]
-; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX8]]
-; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]])
+; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
+; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
+; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX8]]
+; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]])
; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; INTERLEAVE-4: vec.epilog.iter.check:
@@ -56,27 +56,27 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; INTERLEAVE-4: vec.epilog.ph:
; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; INTERLEAVE-4-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 4
; INTERLEAVE-4-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]]
-; INTERLEAVE-4-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; INTERLEAVE-4-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; INTERLEAVE-4: vec.epilog.vector.body:
; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], ...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Hassnaa Hamdi (hassnaaHamdi) Changes: Consider the interleave count (IC) when deciding whether epilogue vectorization is profitable for scalable vectors. Patch is 45.49 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/155547.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7fc87a0b49f70..442858a01b50a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4398,16 +4398,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
- // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
- // VFs when deciding profitability.
- // See related "TODO: extend to support scalable VFs." in
- // selectEpilogueVectorizationFactor.
- unsigned Multiplier = VF.isFixed() ? IC : 1;
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
- return estimateElementCount(VF * Multiplier, VScaleForTuning) >=
- MinVFThreshold;
+ return estimateElementCount(VF * IC, VScaleForTuning) >= MinVFThreshold;
}
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index 137e07336fd50..80130305aaf04 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -8,24 +8,29 @@ target triple = "arm64-apple-macosx14.0.0"
define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-LABEL: define void @iv_casts(
; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[ITER_CHECK:.*]]:
; DEFAULT-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; DEFAULT-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 4
+; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
-; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; DEFAULT: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 16
+; DEFAULT-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP9]], 4
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], [[TMP8]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP11]], 16
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
@@ -63,25 +68,59 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: br i1 [[TMP44]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]:
+; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP48:%.*]] = shl nuw i64 [[TMP47]], 2
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP48]]
+; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; DEFAULT: [[VEC_EPILOG_PH]]:
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP50:%.*]] = mul nuw i64 [[TMP49]], 4
+; DEFAULT-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[TMP0]], [[TMP50]]
+; DEFAULT-NEXT: [[N_VEC6:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF5]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP51:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT8]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; DEFAULT-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT: [[TMP39:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP40:%.*]] = mul <vscale x 4 x i16> [[TMP39]], [[TMP51]]
+; DEFAULT-NEXT: [[TMP52:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP53:%.*]] = or <vscale x 4 x i16> [[TMP40]], [[TMP52]]
+; DEFAULT-NEXT: [[TMP54:%.*]] = lshr <vscale x 4 x i16> [[TMP53]], splat (i16 1)
+; DEFAULT-NEXT: [[TMP55:%.*]] = trunc <vscale x 4 x i16> [[TMP54]] to <vscale x 4 x i8>
+; DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: store <vscale x 4 x i8> [[TMP55]], ptr [[TMP45]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[IV]], [[TMP50]]
+; DEFAULT-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[TMP46]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT: [[VEC_EPILOG_SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ITER_CHECK]] ]
; DEFAULT-NEXT: br label %[[LOOP:.*]]
; DEFAULT: [[LOOP]]:
-; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
-; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; DEFAULT-NEXT: [[GEP_SRC1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV1]]
+; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC1]], align 1
; DEFAULT-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
; DEFAULT-NEXT: [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]]
-; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
; DEFAULT-NEXT: [[CONV25_US:%.*]] = zext i8 [[L]] to i32
; DEFAULT-NEXT: [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]]
; DEFAULT-NEXT: [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1
; DEFAULT-NEXT: [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8
-; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV1]]
; DEFAULT-NEXT: store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1
-; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -218,7 +257,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 1, ptr [[TMP21]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -234,7 +273,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 1, ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -382,7 +421,7 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[TMP24]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -401,7 +440,7 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
; DEFAULT-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -574,7 +613,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[TMP23]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -593,7 +632,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -750,7 +789,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
; DEFAULT-NEXT: store i32 0, ptr [[TMP10]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -766,7 +805,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
; DEFAULT-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
; DEFAULT-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
; DEFAULT-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
-; DEFAULT-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP12:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index 7ff4609f8ec4b..4058679dd10fe 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -24,30 +24,30 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE-4: vector.body:
; INTERLEAVE-4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
-; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
-; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1
-; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]]
-; INTERLEAVE-4-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]]
-; INTERLEAVE-4-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]]
+; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
+; INTERLEAVE-4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
+; INTERLEAVE-4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
+; INTERLEAVE-4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; INTERLEAVE-4-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; INTERLEAVE-4-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]]
+; INTERLEAVE-4-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]]
+; INTERLEAVE-4-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]]
; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; INTERLEAVE-4: middle.block:
-; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP13]], [[TMP12]]
-; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP14]], [[BIN_RDX]]
-; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX8]]
-; INTERLEAVE-4-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]])
+; INTERLEAVE-4-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
+; INTERLEAVE-4-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
+; INTERLEAVE-4-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX8]]
+; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX9]])
; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; INTERLEAVE-4: vec.epilog.iter.check:
@@ -56,27 +56,27 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; INTERLEAVE-4: vec.epilog.ph:
; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; INTERLEAVE-4-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 4
; INTERLEAVE-4-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]]
-; INTERLEAVE-4-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; INTERLEAVE-4-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; INTERLEAVE-4: vec.epilog.vector.body:
; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], ...
[truncated]
|
Consider the interleave count (IC) when deciding whether epilogue vectorization is profitable for scalable vectors, the same as for fixed-width vectors.
Gentle ping @david-arm @fhahn |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM!
; INTERLEAVE-4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] | ||
; INTERLEAVE-4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] | ||
; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] | ||
; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] | ||
; INTERLEAVE-4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] | ||
; INTERLEAVE-4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4 | ||
; INTERLEAVE-4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8 | ||
; INTERLEAVE-4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12 | ||
; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 1 | ||
; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1 | ||
; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 | ||
; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1 | ||
; INTERLEAVE-4-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] | ||
; INTERLEAVE-4-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]] | ||
; INTERLEAVE-4-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]] | ||
; INTERLEAVE-4-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] | ||
; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 | ||
; INTERLEAVE-4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] | ||
; INTERLEAVE-4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] | ||
; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The changes here and a few below look to be just renaming; could you strip those from the patch so it's easier to see what the impact of the change is?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like LLVM.Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
is failing in pre-commit, probably needs a rebase + test update?
It fails as expected: I stripped out the renaming to show you the difference in the logic. Once you have seen the changes, I will update the file again. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks
I guess it's probably a bit easier for reviewers to pre-commit an NFC patch that regenerates the CHECK lines first, either in a separate PR or as an initial commit to a PR. |
Yeah, I will do that for all future patches. |
Consider IC when deciding if epilogue profitable for scalable vectors, same as fixed-width vectors.
Consider IC when deciding if epilogue profitable for scalable vectors, same as fixed-width vectors.
Consider IC when deciding if epilogue profitable for scalable vectors, same as fixed-width vectors.
Consider IC when deciding if epilogue profitable for scalable vectors,
same as fixed-width vectors.