-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[VPlan] Permit more users in narrowToSingleScalars #166559
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-llvm-transforms Author: Ramkumar Ramachandra (artagnon) ChangesnarrowToSingleScalarRecipes can permit users that are WidenStore, or a VPInstruction that is VectorToScalar or SingleScalar. This is an generalization and extension of the existing code. Patch is 23.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166559.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2588c878d8472..bf258818725b2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1412,10 +1412,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
// broadcasts.
if (!vputils::isSingleScalar(RepOrWidenR) ||
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
- return U->usesScalars(RepOrWidenR) ||
- match(cast<VPRecipeBase>(U),
- m_CombineOr(m_ExtractLastElement(m_VPValue()),
- m_ExtractLastLanePerPart(m_VPValue())));
+ if (isa<VPWidenStoreRecipe>(U))
+ return true;
+
+ if (auto *VPI = dyn_cast<VPInstruction>(U))
+ if (VPI->isVectorToScalar() || VPI->isSingleScalar())
+ return true;
+
+ return U->usesScalars(RepOrWidenR);
}))
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index f16351720b20f..8380b41c0e9a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -429,48 +429,36 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ]
-; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP19:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
-; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP6]]
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ]
+; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]]
+; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
-; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; DEFAULT: [[PRED_STORE_IF]]:
-; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP11]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]]
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]]
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
-; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
+; DEFAULT: [[PRED_STORE_IF28]]:
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]]
+; DEFAULT: [[PRED_STORE_CONTINUE29]]:
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
+; DEFAULT: [[PRED_STORE_IF30]]:
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]]
+; DEFAULT: [[PRED_STORE_CONTINUE31]]:
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33]]
; DEFAULT: [[PRED_STORE_IF32]]:
-; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP13]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]]
; DEFAULT: [[PRED_STORE_CONTINUE33]]:
-; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]]
-; DEFAULT: [[PRED_STORE_IF34]]:
-; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]]
-; DEFAULT: [[PRED_STORE_IF36]]:
-; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]]
-; DEFAULT: [[PRED_STORE_CONTINUE37]]:
-; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
+; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
@@ -660,16 +648,16 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
+; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[EXIT]]
+; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; COMMON: [[PRED_STORE_CONTINUE14]]:
+; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; COMMON: [[MIDDLE_BLOCK]]:
+; COMMON-NEXT: br label %[[EXIT:.*]]
; COMMON: [[EXIT]]:
-; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
-; COMMON: [[SCALAR_PH]]:
-; COMMON-NEXT: br label %[[EXIT1:.*]]
-; COMMON: [[EXIT1]]:
; COMMON-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 5970608794b55..bea34e29e3530 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -16,7 +16,7 @@
; CM: vector.ph:
; CM: CLONE ir<%a> = extractvalue ir<%sv>
; CM: CLONE ir<%b> = extractvalue ir<%sv>
-; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM: CLONE ir<%add> = add ir<%a>, ir<%b>
; CM: Successor(s): vector loop
; CM: LV: Scalar loop costs: 5.
@@ -30,23 +30,22 @@ define void @test1(ptr %dst, {i64, i64} %sv) {
; FORCED-NEXT: br label %[[VECTOR_PH:.*]]
; FORCED: [[VECTOR_PH]]:
; FORCED-NEXT: [[TMP0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
-; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; FORCED-NEXT: [[TMP4:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[TMP0]], [[TMP4]]
+; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
; FORCED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; FORCED-NEXT: [[TMP1:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
; FORCED-NEXT: br label %[[VECTOR_BODY:.*]]
; FORCED: [[VECTOR_BODY]]:
; FORCED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
-; FORCED-NEXT: store <2 x i64> [[TMP1]], ptr [[TMP2]], align 4
+; FORCED-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 4
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FORCED: [[MIDDLE_BLOCK]]:
-; FORCED-NEXT: br [[EXIT:label %.*]]
-; FORCED: [[SCALAR_PH:.*:]]
+; FORCED-NEXT: br label %[[EXIT:.*]]
+; FORCED: [[EXIT]]:
+; FORCED-NEXT: ret void
;
entry:
br label %loop.body
@@ -99,10 +98,11 @@ define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) {
; FORCED-NEXT: store <2 x float> [[TMP2]], ptr [[TMP1]], align 4
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; FORCED: [[MIDDLE_BLOCK]]:
-; FORCED-NEXT: br [[EXIT:label %.*]]
-; FORCED: [[SCALAR_PH:.*:]]
+; FORCED-NEXT: br label %[[EXIT:.*]]
+; FORCED: [[EXIT]]:
+; FORCED-NEXT: ret void
;
entry:
br label %loop.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
index 0c6a490ddf4ba..eceda0897b174 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
@@ -17,17 +17,15 @@ define void @widen_extractvalue(ptr %dst, {i64, i64} %sv) #0 {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]]
; CHECK-NEXT: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i64 0
-; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[EXTRACT0]], [[TMP10]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT2]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index f25b86d3b20c2..b81637f50989d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -293,9 +293,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[A]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 9)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
@@ -309,7 +309,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
index 198a30af814ba..372876c5faac6 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
@@ -134,21 +134,18 @@ define i16 @for_phi_removed(ptr %src) {
; UNROLL-NO-IC: [[VECTOR_BODY]]:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104
; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; UNROLL-NO-IC: [[MIDDLE_BLOCK]]:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH:.*]]
; UNROLL-NO-IC: [[SCALAR_PH]]:
; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]]
; UNROLL-NO-IC: [[LOOP]]:
; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 104, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4
; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0
; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0
@@ -199,21 +196,18 @@ define i16 @for_phi_removed(ptr %src) {
; SINK-AFTER: [[VECTOR_BODY]]:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
-; SINK-AFTER-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
+; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; SINK-AFTER-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108
; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; SINK-AFTER: [[MIDDLE_BLOCK]]:
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
; SINK-AFTER-NEXT: br label %[[SCALAR_PH:.*]]
; SINK-AFTER: [[SCALAR_PH]]:
; SINK-AFTER-NEXT: br label %[[LOOP:.*]]
; SINK-AFTER: [[LOOP]]:
; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 108, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; SINK-AFTER-NE...
[truncated]
|
|
@llvm/pr-subscribers-backend-risc-v Author: Ramkumar Ramachandra (artagnon) ChangesnarrowToSingleScalarRecipes can permit users that are WidenStore, or a VPInstruction that is VectorToScalar or SingleScalar. This is an generalization and extension of the existing code. Patch is 23.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166559.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2588c878d8472..bf258818725b2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1412,10 +1412,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
// broadcasts.
if (!vputils::isSingleScalar(RepOrWidenR) ||
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
- return U->usesScalars(RepOrWidenR) ||
- match(cast<VPRecipeBase>(U),
- m_CombineOr(m_ExtractLastElement(m_VPValue()),
- m_ExtractLastLanePerPart(m_VPValue())));
+ if (isa<VPWidenStoreRecipe>(U))
+ return true;
+
+ if (auto *VPI = dyn_cast<VPInstruction>(U))
+ if (VPI->isVectorToScalar() || VPI->isSingleScalar())
+ return true;
+
+ return U->usesScalars(RepOrWidenR);
}))
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index f16351720b20f..8380b41c0e9a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -429,48 +429,36 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ]
-; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP19:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
-; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP6]]
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ]
+; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]]
+; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
-; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; DEFAULT: [[PRED_STORE_IF]]:
-; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP11]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]]
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]]
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
-; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
+; DEFAULT: [[PRED_STORE_IF28]]:
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]]
+; DEFAULT: [[PRED_STORE_CONTINUE29]]:
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
+; DEFAULT: [[PRED_STORE_IF30]]:
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]]
+; DEFAULT: [[PRED_STORE_CONTINUE31]]:
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33]]
; DEFAULT: [[PRED_STORE_IF32]]:
-; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP13]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]]
; DEFAULT: [[PRED_STORE_CONTINUE33]]:
-; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]]
-; DEFAULT: [[PRED_STORE_IF34]]:
-; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]]
-; DEFAULT: [[PRED_STORE_IF36]]:
-; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]]
-; DEFAULT: [[PRED_STORE_CONTINUE37]]:
-; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
+; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
@@ -660,16 +648,16 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
+; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[EXIT]]
+; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; COMMON: [[PRED_STORE_CONTINUE14]]:
+; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; COMMON: [[MIDDLE_BLOCK]]:
+; COMMON-NEXT: br label %[[EXIT:.*]]
; COMMON: [[EXIT]]:
-; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
-; COMMON: [[SCALAR_PH]]:
-; COMMON-NEXT: br label %[[EXIT1:.*]]
-; COMMON: [[EXIT1]]:
; COMMON-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 5970608794b55..bea34e29e3530 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -16,7 +16,7 @@
; CM: vector.ph:
; CM: CLONE ir<%a> = extractvalue ir<%sv>
; CM: CLONE ir<%b> = extractvalue ir<%sv>
-; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM: CLONE ir<%add> = add ir<%a>, ir<%b>
; CM: Successor(s): vector loop
; CM: LV: Scalar loop costs: 5.
@@ -30,23 +30,22 @@ define void @test1(ptr %dst, {i64, i64} %sv) {
; FORCED-NEXT: br label %[[VECTOR_PH:.*]]
; FORCED: [[VECTOR_PH]]:
; FORCED-NEXT: [[TMP0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
-; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; FORCED-NEXT: [[TMP4:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[TMP0]], [[TMP4]]
+; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
; FORCED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; FORCED-NEXT: [[TMP1:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
; FORCED-NEXT: br label %[[VECTOR_BODY:.*]]
; FORCED: [[VECTOR_BODY]]:
; FORCED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
-; FORCED-NEXT: store <2 x i64> [[TMP1]], ptr [[TMP2]], align 4
+; FORCED-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 4
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FORCED: [[MIDDLE_BLOCK]]:
-; FORCED-NEXT: br [[EXIT:label %.*]]
-; FORCED: [[SCALAR_PH:.*:]]
+; FORCED-NEXT: br label %[[EXIT:.*]]
+; FORCED: [[EXIT]]:
+; FORCED-NEXT: ret void
;
entry:
br label %loop.body
@@ -99,10 +98,11 @@ define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) {
; FORCED-NEXT: store <2 x float> [[TMP2]], ptr [[TMP1]], align 4
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; FORCED: [[MIDDLE_BLOCK]]:
-; FORCED-NEXT: br [[EXIT:label %.*]]
-; FORCED: [[SCALAR_PH:.*:]]
+; FORCED-NEXT: br label %[[EXIT:.*]]
+; FORCED: [[EXIT]]:
+; FORCED-NEXT: ret void
;
entry:
br label %loop.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
index 0c6a490ddf4ba..eceda0897b174 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
@@ -17,17 +17,15 @@ define void @widen_extractvalue(ptr %dst, {i64, i64} %sv) #0 {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]]
; CHECK-NEXT: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i64 0
-; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[EXTRACT0]], [[TMP10]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT2]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index f25b86d3b20c2..b81637f50989d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -293,9 +293,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[A]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 9)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
@@ -309,7 +309,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
index 198a30af814ba..372876c5faac6 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
@@ -134,21 +134,18 @@ define i16 @for_phi_removed(ptr %src) {
; UNROLL-NO-IC: [[VECTOR_BODY]]:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104
; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; UNROLL-NO-IC: [[MIDDLE_BLOCK]]:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH:.*]]
; UNROLL-NO-IC: [[SCALAR_PH]]:
; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]]
; UNROLL-NO-IC: [[LOOP]]:
; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 104, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4
; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0
; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0
@@ -199,21 +196,18 @@ define i16 @for_phi_removed(ptr %src) {
; SINK-AFTER: [[VECTOR_BODY]]:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
-; SINK-AFTER-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
+; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; SINK-AFTER-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108
; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; SINK-AFTER: [[MIDDLE_BLOCK]]:
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
; SINK-AFTER-NEXT: br label %[[SCALAR_PH:.*]]
; SINK-AFTER: [[SCALAR_PH]]:
; SINK-AFTER-NEXT: br label %[[LOOP:.*]]
; SINK-AFTER: [[LOOP]]:
; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 108, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; SINK-AFTER-NE...
[truncated]
|
|
@llvm/pr-subscribers-vectorizers Author: Ramkumar Ramachandra (artagnon) ChangesnarrowToSingleScalarRecipes can permit users that are WidenStore, or a VPInstruction that is VectorToScalar or SingleScalar. This is an generalization and extension of the existing code. Patch is 23.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166559.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2588c878d8472..bf258818725b2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1412,10 +1412,14 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
// broadcasts.
if (!vputils::isSingleScalar(RepOrWidenR) ||
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
- return U->usesScalars(RepOrWidenR) ||
- match(cast<VPRecipeBase>(U),
- m_CombineOr(m_ExtractLastElement(m_VPValue()),
- m_ExtractLastLanePerPart(m_VPValue())));
+ if (isa<VPWidenStoreRecipe>(U))
+ return true;
+
+ if (auto *VPI = dyn_cast<VPInstruction>(U))
+ if (VPI->isVectorToScalar() || VPI->isSingleScalar())
+ return true;
+
+ return U->usesScalars(RepOrWidenR);
}))
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index f16351720b20f..8380b41c0e9a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -429,48 +429,36 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE37:.*]] ]
-; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP19:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP19]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP6:%.*]] = or <4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
-; DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT30]], <4 x i32> poison, <4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i32> [[BROADCAST_SPLAT31]], [[TMP6]]
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE33:.*]] ]
+; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META8:![0-9]+]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META11:![0-9]+]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META13:![0-9]+]]
+; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP5]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
-; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; DEFAULT: [[PRED_STORE_IF]]:
-; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP11]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]]
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META17:![0-9]+]]
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
-; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
+; DEFAULT: [[PRED_STORE_IF28]]:
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]]
+; DEFAULT: [[PRED_STORE_CONTINUE29]]:
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
+; DEFAULT: [[PRED_STORE_IF30]]:
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]]
+; DEFAULT: [[PRED_STORE_CONTINUE31]]:
+; DEFAULT-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33]]
; DEFAULT: [[PRED_STORE_IF32]]:
-; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP13]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
+; DEFAULT-NEXT: store i32 [[TMP5]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]]
; DEFAULT: [[PRED_STORE_CONTINUE33]]:
-; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]]
-; DEFAULT: [[PRED_STORE_IF34]]:
-; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT: [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]]
-; DEFAULT: [[PRED_STORE_IF36]]:
-; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]]
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]]
-; DEFAULT: [[PRED_STORE_CONTINUE37]]:
-; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
+; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
@@ -660,16 +648,16 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
+; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT: br label %[[EXIT]]
+; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; COMMON: [[PRED_STORE_CONTINUE14]]:
+; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; COMMON: [[MIDDLE_BLOCK]]:
+; COMMON-NEXT: br label %[[EXIT:.*]]
; COMMON: [[EXIT]]:
-; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
-; COMMON: [[SCALAR_PH]]:
-; COMMON-NEXT: br label %[[EXIT1:.*]]
-; COMMON: [[EXIT1]]:
; COMMON-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 5970608794b55..bea34e29e3530 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -16,7 +16,7 @@
; CM: vector.ph:
; CM: CLONE ir<%a> = extractvalue ir<%sv>
; CM: CLONE ir<%b> = extractvalue ir<%sv>
-; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM: CLONE ir<%add> = add ir<%a>, ir<%b>
; CM: Successor(s): vector loop
; CM: LV: Scalar loop costs: 5.
@@ -30,23 +30,22 @@ define void @test1(ptr %dst, {i64, i64} %sv) {
; FORCED-NEXT: br label %[[VECTOR_PH:.*]]
; FORCED: [[VECTOR_PH]]:
; FORCED-NEXT: [[TMP0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
-; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; FORCED-NEXT: [[TMP4:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[TMP0]], [[TMP4]]
+; FORCED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
; FORCED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; FORCED-NEXT: [[TMP1:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
; FORCED-NEXT: br label %[[VECTOR_BODY:.*]]
; FORCED: [[VECTOR_BODY]]:
; FORCED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
-; FORCED-NEXT: store <2 x i64> [[TMP1]], ptr [[TMP2]], align 4
+; FORCED-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 4
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FORCED: [[MIDDLE_BLOCK]]:
-; FORCED-NEXT: br [[EXIT:label %.*]]
-; FORCED: [[SCALAR_PH:.*:]]
+; FORCED-NEXT: br label %[[EXIT:.*]]
+; FORCED: [[EXIT]]:
+; FORCED-NEXT: ret void
;
entry:
br label %loop.body
@@ -99,10 +98,11 @@ define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) {
; FORCED-NEXT: store <2 x float> [[TMP2]], ptr [[TMP1]], align 4
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; FORCED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FORCED-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; FORCED: [[MIDDLE_BLOCK]]:
-; FORCED-NEXT: br [[EXIT:label %.*]]
-; FORCED: [[SCALAR_PH:.*:]]
+; FORCED-NEXT: br label %[[EXIT:.*]]
+; FORCED: [[EXIT]]:
+; FORCED-NEXT: ret void
;
entry:
br label %loop.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
index 0c6a490ddf4ba..eceda0897b174 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
@@ -17,17 +17,15 @@ define void @widen_extractvalue(ptr %dst, {i64, i64} %sv) #0 {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1000, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1000, [[N_MOD_VF]]
; CHECK-NEXT: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i64 0
-; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[EXTRACT0]], [[TMP10]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT2]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i32 [[INDEX]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index f25b86d3b20c2..b81637f50989d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -293,9 +293,9 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[A]], -1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = xor <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 -1)
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 9)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
@@ -309,7 +309,7 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP8]])
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
index 198a30af814ba..372876c5faac6 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
@@ -134,21 +134,18 @@ define i16 @for_phi_removed(ptr %src) {
; UNROLL-NO-IC: [[VECTOR_BODY]]:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 104
; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; UNROLL-NO-IC: [[MIDDLE_BLOCK]]:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
; UNROLL-NO-IC-NEXT: br label %[[SCALAR_PH:.*]]
; UNROLL-NO-IC: [[SCALAR_PH]]:
; UNROLL-NO-IC-NEXT: br label %[[LOOP:.*]]
; UNROLL-NO-IC: [[LOOP]]:
; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 104, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; UNROLL-NO-IC-NEXT: [[P:%.*]] = phi i16 [ [[TMP2]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; UNROLL-NO-IC-NEXT: [[L:%.*]] = load i32, ptr [[SRC]], align 4
; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0
; UNROLL-NO-IC-NEXT: [[SEL]] = select i1 [[C]], i16 1, i16 0
@@ -199,21 +196,18 @@ define i16 @for_phi_removed(ptr %src) {
; SINK-AFTER: [[VECTOR_BODY]]:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
-; SINK-AFTER-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> splat (i16 1), <4 x i16> zeroinitializer
+; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; SINK-AFTER-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i16 1, i16 0
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 108
; SINK-AFTER-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; SINK-AFTER: [[MIDDLE_BLOCK]]:
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
; SINK-AFTER-NEXT: br label %[[SCALAR_PH:.*]]
; SINK-AFTER: [[SCALAR_PH]]:
; SINK-AFTER-NEXT: br label %[[LOOP:.*]]
; SINK-AFTER: [[LOOP]]:
; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 108, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; SINK-AFTER-NE...
[truncated]
|
707afab to
bc795cc
Compare
ac83d60 to
a8ea09b
Compare
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks
narrowToSingleScalarRecipes can permit users that are WidenStore, or a VPInstruction that is VectorToScalar or SingleScalar. This is an generalization and extension of the existing code.
a8ea09b to
55c5519
Compare
|
This change causes crashes: opt --passes=loop-vectorize ./reduced.ll -S lib/Transforms/Vectorize/VPlanTransforms.cpp:1429: narrowToSingleScalarRecipes(llvm::VPlan&)::<lambda(const llvm::VPUser*)>: Assertion `RepOrWidenR == Store->getAddr() || vputils::isSingleScalar(Store->getStoredValue())' failed. |
This is baffling! I expected that the assert would always hold by construction. Will investigate in the morning, thanks. |
Oh shit, the store could also be masked and could have a mask operand. I definitely need to forbid that. |
narrowToSingleScalarRecipes can permit users that are WidenStore, or a VPInstruction that has a suitable opcode. This is a generalization and extension of the existing code.