-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV] Stability fix for outerloop vectorization #68118
[LV] Stability fix for outerloop vectorization #68118
Conversation
@llvm/pr-subscribers-llvm-transforms Changes: The HCFG builder doesn't correctly handle cases when a non-outermost loop is requested to be vectorized. [Original] Differential Revision: https://reviews.llvm.org/D150700 Patch is 27.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68118.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index a49332f39ceebef..94a51a383ea8867 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -156,6 +156,19 @@ static bool isHeaderVPBB(VPBasicBlock *VPBB) {
return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB;
}
+/// Return true if loop \p L is contained within \p OuterLoop.
+static bool isNestedLoop(const Loop *L, const Loop *OuterLoop) {
+ if (L->getLoopDepth() < OuterLoop->getLoopDepth())
+ return false;
+ const Loop *P = L;
+ while (P) {
+ if (P == OuterLoop)
+ return true;
+ P = P->getParentLoop();
+ }
+ return false;
+}
+
// Create a new empty VPBasicBlock for an incoming BasicBlock in the region
// corresponding to the containing loop or retrieve an existing one if it was
// already created. If no region exists yet for the loop containing \p BB, a new
@@ -173,7 +186,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
// Get or create a region for the loop containing BB.
Loop *LoopOfBB = LI->getLoopFor(BB);
- if (!LoopOfBB)
+ if (!LoopOfBB || !isNestedLoop(LoopOfBB, TheLoop))
return VPBB;
VPRegionBlock *RegionOfBB = Loop2Region.lookup(LoopOfBB);
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
new file mode 100644
index 000000000000000..895a129d231ba96
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
@@ -0,0 +1,461 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path < %s -S | FileCheck %s
+; void test(int n, int **a)
+; {
+; for (int k = 0; k < n; ++k) {
+; a[k][0] = 0;
+; #pragma clang loop vectorize_width(4)
+; for (int i = 0; i < n; ++i) {
+; for (int j = 0; j < n; ++j) {
+; a[i][j] = 2 + k;
+; }
+; }
+; }
+; }
+
+define void @test(i64 %n, ptr %a) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP34:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP34]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.us.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_US:%.*]]
+; CHECK: for.body.us:
+; CHECK-NEXT: [[IV42:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[IV_NEXT43:%.*]], [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV42]]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV42]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[A]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP3]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: br label [[FOR_BODY9_US_US1:%.*]]
+; CHECK: for.body9.us.us1:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR_BODY9_US_US1]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4]], label [[FOR_BODY9_US_US1]]
+; CHECK: for.cond6.for.cond.cleanup8_crit_edge.us.us4:
+; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT: br label [[FOR_COND6_PREHEADER_US_US:%.*]]
+; CHECK: for.cond6.preheader.us.us:
+; CHECK-NEXT: [[IV37:%.*]] = phi i64 [ [[IV_NEXT38:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX11_US_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV37]]
+; CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[ARRAYIDX11_US_US]], align 8
+; CHECK-NEXT: br label [[FOR_BODY9_US_US:%.*]]
+; CHECK: for.body9.us.us:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY9_US_US]] ], [ 0, [[FOR_COND6_PREHEADER_US_US]] ]
+; CHECK-NEXT: [[ARRAYIDX13_US_US:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX13_US_US]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US]], label [[FOR_BODY9_US_US]]
+; CHECK: for.cond6.for.cond.cleanup8_crit_edge.us.us:
+; CHECK-NEXT: [[IV_NEXT38]] = add nuw nsw i64 [[IV37]], 1
+; CHECK-NEXT: [[EXITCOND41_NOT:%.*]] = icmp eq i64 [[IV_NEXT38]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND41_NOT]], label [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US]], label [[FOR_COND6_PREHEADER_US_US]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.cond2.for.cond.cleanup4_crit_edge.split.us.us:
+; CHECK-NEXT: [[IV_NEXT43]] = add nuw nsw i64 [[IV42]], 1
+; CHECK-NEXT: [[EXITCOND47_NOT:%.*]] = icmp eq i64 [[IV_NEXT43]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND47_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_US]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp34 = icmp sgt i64 %n, 0
+ br i1 %cmp34, label %for.body.us.preheader, label %for.cond.cleanup
+
+for.body.us.preheader:
+ br label %for.body.us
+
+for.body.us:
+ %iv42 = phi i64 [ 0, %for.body.us.preheader ], [ %iv.next43, %for.cond2.for.cond.cleanup4_crit_edge.split.us.us ]
+ %arrayidx.us = getelementptr inbounds ptr, ptr %a, i64 %iv42
+ %0 = load ptr, ptr %arrayidx.us, align 8
+ store i32 0, ptr %0, align 4
+ %1 = trunc i64 %iv42 to i32
+ %2 = add i32 %1, 2
+ br label %for.cond6.preheader.us.us
+
+for.cond6.preheader.us.us:
+ %iv37 = phi i64 [ %iv.next38, %for.cond6.for.cond.cleanup8_crit_edge.us.us ], [ 0, %for.body.us ]
+ %arrayidx11.us.us = getelementptr inbounds ptr, ptr %a, i64 %iv37
+ %3 = load ptr, ptr %arrayidx11.us.us, align 8
+ br label %for.body9.us.us
+
+for.body9.us.us:
+ %iv = phi i64 [ %iv.next, %for.body9.us.us ], [ 0, %for.cond6.preheader.us.us ]
+ %arrayidx13.us.us = getelementptr inbounds i32, ptr %3, i64 %iv
+ store i32 %2, ptr %arrayidx13.us.us, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.cond6.for.cond.cleanup8_crit_edge.us.us, label %for.body9.us.us
+
+for.cond6.for.cond.cleanup8_crit_edge.us.us:
+ %iv.next38 = add nuw nsw i64 %iv37, 1
+ %exitcond41.not = icmp eq i64 %iv.next38, %n
+ br i1 %exitcond41.not, label %for.cond2.for.cond.cleanup4_crit_edge.split.us.us, label %for.cond6.preheader.us.us, !llvm.loop !3
+
+for.cond2.for.cond.cleanup4_crit_edge.split.us.us:
+ %iv.next43 = add nuw nsw i64 %iv42, 1
+ %exitcond47.not = icmp eq i64 %iv.next43, %n
+ br i1 %exitcond47.not, label %for.cond.cleanup.loopexit, label %for.body.us
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+; void test1(int n, int **a)
+; {
+; for (int k = 0; k < n; ++k) {
+; a[k][0] = 0;
+; for (int i = 0; i < n; ++i) {
+; #pragma clang loop vectorize_width(4)
+; for (int j = 0; j < n; ++j) {
+; for (int x = 0; x < n; ++x) {
+; a[i][j] = 2 + k+x;
+; }
+; }
+;
+; for (int j = 0; j < n; ++j) {
+; for (int x = 0; x < n; ++x) {
+; a[i][j] += 2 + k+x;
+; }
+; }
+; }
+; }
+; }
+define void @test1(i32 %n, ptr %a) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP84:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP84]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT104:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_COND6_PREHEADER_LR_PH:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.cond6.preheader.lr.ph:
+; CHECK-NEXT: [[IV99:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT100:%.*]], [[FOR_COND_CLEANUP4:%.*]] ]
+; CHECK-NEXT: [[IV87:%.*]] = phi i32 [ [[N]], [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT88:%.*]], [[FOR_COND_CLEANUP4]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV99]]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV99]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[N]]
+; CHECK-NEXT: br label [[FOR_COND6_PREHEADER:%.*]]
+; CHECK: for.cond6.preheader:
+; CHECK-NEXT: [[IV94:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER_LR_PH]] ], [ [[IV_NEXT95:%.*]], [[FOR_COND_CLEANUP25:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV94]]
+; CHECK-NEXT: [[DOTPRE:%.*]] = load ptr, ptr [[ARRAYIDX16]], align 8
+; CHECK-NEXT: br label [[FOR_COND10_PREHEADER:%.*]]
+; CHECK: for.cond.cleanup4:
+; CHECK-NEXT: [[IV_NEXT100]] = add nuw nsw i64 [[IV99]], 1
+; CHECK-NEXT: [[IV_NEXT88]] = add i32 [[IV87]], [[N]]
+; CHECK-NEXT: [[EXITCOND105_NOT:%.*]] = icmp eq i64 [[IV_NEXT100]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT: br i1 [[EXITCOND105_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND6_PREHEADER_LR_PH]]
+; CHECK: for.cond10.preheader:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_COND10_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND28_PREHEADER_PREHEADER:%.*]], label [[FOR_COND10_PREHEADER]]
+; CHECK: for.cond28.preheader.preheader:
+; CHECK-NEXT: br label [[FOR_COND28_PREHEADER:%.*]]
+; CHECK: for.cond28.preheader:
+; CHECK-NEXT: [[IV89:%.*]] = phi i64 [ [[IV_NEXT90:%.*]], [[FOR_COND28_PREHEADER]] ], [ 0, [[FOR_COND28_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV89]]
+; CHECK-NEXT: [[ARRAYIDX37_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[IV87]], [[ARRAYIDX37_PROMOTED]]
+; CHECK-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT: [[IV_NEXT90]] = add nuw nsw i64 [[IV89]], 1
+; CHECK-NEXT: [[EXITCOND93_NOT:%.*]] = icmp eq i64 [[IV_NEXT90]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT: br i1 [[EXITCOND93_NOT]], label [[FOR_COND_CLEANUP25]], label [[FOR_COND28_PREHEADER]]
+; CHECK: for.cond.cleanup25:
+; CHECK-NEXT: [[IV_NEXT95]] = add nuw nsw i64 [[IV94]], 1
+; CHECK-NEXT: [[EXITCOND98_NOT:%.*]] = icmp eq i64 [[IV_NEXT95]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT: br i1 [[EXITCOND98_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_COND6_PREHEADER]]
+;
+entry:
+ %cmp84 = icmp sgt i32 %n, 0
+ br i1 %cmp84, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:
+ %wide.trip.count104 = zext i32 %n to i64
+ br label %for.cond6.preheader.lr.ph
+
+for.cond.cleanup:
+ ret void
+
+for.cond6.preheader.lr.ph:
+ %iv99 = phi i64 [ 0, %for.body.lr.ph ], [ %iv.next100, %for.cond.cleanup4 ]
+ %iv87 = phi i32 [ %n, %for.body.lr.ph ], [ %iv.next88, %for.cond.cleanup4 ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %iv99
+ %0 = load ptr, ptr %arrayidx, align 8
+ store i32 0, ptr %0, align 4
+ %1 = trunc i64 %iv99 to i32
+ %2 = add i32 %1, 2
+ %3 = add i32 %2, %n
+ br label %for.cond6.preheader
+
+for.cond6.preheader:
+ %iv94 = phi i64 [ 0, %for.cond6.preheader.lr.ph ], [ %iv.next95, %for.cond.cleanup25 ]
+ %arrayidx16 = getelementptr inbounds ptr, ptr %a, i64 %iv94
+ %.pre = load ptr, ptr %arrayidx16, align 8
+ br label %for.cond10.preheader
+
+for.cond.cleanup4:
+ %iv.next100 = add nuw nsw i64 %iv99, 1
+ %iv.next88 = add i32 %iv87, %n
+ %exitcond105.not = icmp eq i64 %iv.next100, %wide.trip.count104
+ br i1 %exitcond105.not, label %for.cond.cleanup, label %for.cond6.preheader.lr.ph
+
+for.cond10.preheader:
+ %iv = phi i64 [ 0, %for.cond6.preheader ], [ %iv.next, %for.cond10.preheader ]
+ %arrayidx18 = getelementptr inbounds i32, ptr %.pre, i64 %iv
+ store i32 %3, ptr %arrayidx18, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count104
+ br i1 %exitcond.not, label %for.cond28.preheader, label %for.cond10.preheader
+
+for.cond28.preheader:
+ %iv89 = phi i64 [ %iv.next90, %for.cond28.preheader ], [ 0, %for.cond10.preheader ]
+ %arrayidx37 = getelementptr inbounds i32, ptr %.pre, i64 %iv89
+ %arrayidx37.promoted = load i32, ptr %arrayidx37, align 4
+ %4 = add i32 %iv87, %arrayidx37.promoted
+ store i32 %4, ptr %arrayidx37, align 4
+ %iv.next90 = add nuw nsw i64 %iv89, 1
+ %exitcond93.not = icmp eq i64 %iv.next90, %wide.trip.count104
+ br i1 %exitcond93.not, label %for.cond.cleanup25, label %for.cond28.preheader
+
+for.cond.cleanup25:
+ %iv.next95 = add nuw nsw i64 %iv94, 1
+ %exitcond98.not = icmp eq i64 %iv.next95, %wide.trip.count104
+ br i1 %exitcond98.not, label %for.cond.cleanup4, label %for.cond6.preheader
+}
+
+; void test2(int n, int **a)
+; {
+; for (int k = 0; k < n; ++k) {
+; a[k][0] = 0;
+; #pragma clang loop vectorize_width(4)
+; for (int i = 0; i < n; ++i) {
+; for (int j = 0; j < n; ++j) {
+; for (int x = 0; x < n; ++x) {
+; a[i][j] = 2 + k+x;
+; }
+; }
+;
+; for (int j = 0; j < n; ++j) {
+; for (int x = 0; x < n; ++x) {
+; a[i][j] += 2 + k+x;
+; }
+; }
+; }
+; }
+; }
+define void @test2(i32 %n, ptr %a) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP84:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP84]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT104:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_COND6_PREHEADER_LR_PH:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.cond6.preheader.lr.ph:
+; CHECK-NEXT: [[IV99:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT100:%.*]], [[FOR_COND_CLEANUP4:%.*]] ]
+; CHECK-NEXT: [[IV87:%.*]] = phi i32 [ [[N]], [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT88:%.*]], [[FOR_COND_CLEANUP4]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV99]]
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IV99]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[N]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT104]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT104]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT104]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[WIDE_TRIP_COUNT104]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[IV87]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND_CLEANUP2510:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND_CLEANUP2510]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[A]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT: br label [[FOR_COND10_PREHEADER1:%.*]]
+; CHECK: for.cond10.preheader1:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_COND10_PREHEADER1]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for moving this from Phabricator. Looks like there are a few outstanding comments from the original https://reviews.llvm.org/D150700.
llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
Outdated
Show resolved
Hide resolved
05d0b31
to
3f7f9fc
Compare
My bad. Somehow I missed your comments. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM thanks! 2 small remaining suggestions inline
The HCFG builder doesn't correctly handle cases when a non-outermost loop is requested to be vectorized. [Original] Differential Revision: https://reviews.llvm.org/D150700
3f7f9fc
to
3eb8bf8
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still LGTM, thanks!
The HCFG builder doesn't correctly handle cases when a non-outermost loop is requested to be vectorized.
[Original] Differential Revision: https://reviews.llvm.org/D150700