diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9bd671fa2b3de..6d86dbb4e3805 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1448,15 +1448,14 @@ class LoopVectorizationCostModel {
   // through scalar predication or masked load/store or masked gather/scatter.
   // \p VF is the vectorization factor that will be used to vectorize \p I.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I, ElementCount VF,
-                        bool IsKnownUniform = false) {
-    // When we know the load is uniform and the original scalar loop was not
-    // predicated we don't need to mark it as a predicated instruction. Any
-    // vectorised blocks created when tail-folding are something artificial we
-    // have introduced and we know there is always at least one active lane.
-    // That's why we call Legal->blockNeedsPredication here because it doesn't
-    // query tail-folding.
-    if (IsKnownUniform && isa<LoadInst>(I) &&
+  bool isPredicatedInst(Instruction *I, ElementCount VF) {
+    // When we know the load's address is loop invariant and the instruction
+    // in the original scalar loop was unconditionally executed then we
+    // don't need to mark it as a predicated instruction. Tail folding may
+    // introduce additional predication, but we're guaranteed to always have
+    // at least one active lane. We call Legal->blockNeedsPredication here
+    // because it doesn't query tail-folding.
+    if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
         !Legal->blockNeedsPredication(I->getParent()))
       return false;
     if (!blockNeedsPredicationForAnyReason(I->getParent()))
@@ -6062,7 +6061,8 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
+  assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+         "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) &&
           NumPredStores > NumberOfStoresToPredicate);
@@ -8363,7 +8363,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
       Range);
 
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
      Range);
 
   // Even if the instruction is not marked as uniform, there are certain
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 55551711bf96f..7b1a2b6c6f5b6 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -447,31 +447,14 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, i32* %src) {
 ; CHECK-NEXT: vector loop: {
 ; CHECK-NEXT: vector.body:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%.pn> = phi ir<0>, vp<[[P_L:%.+]]>
+; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%.pn> = phi ir<0>, ir<%l>
 ; CHECK-NEXT: EMIT vp<[[WIDE_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule vp<[[WIDE_IV]]> vp<[[BTC]]>
-; CHECK-NEXT: Successor(s): loop.0
-; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
-; CHECK-NEXT: Successor(s): pred.load
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load: {
-; CHECK-NEXT: pred.load.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[CMP]]>
-; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.if:
-; CHECK-NEXT: REPLICATE ir<%l> = load ir<%src> (S->V)
-; CHECK-NEXT: Successor(s): pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[P_L]]> = ir<%l>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): pred.load.succ
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.succ:
-; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn> vp<[[P_L]]>
+; CHECK-NEXT: Successor(s): loop.0
+; CHECK-EMPTY:
+; CHECK-NEXT: loop.0:
+; CHECK-NEXT: REPLICATE ir<%l> = load ir<%src>
+; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn> ir<%l>
 ; CHECK-NEXT: Successor(s): pred.sdiv
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.sdiv: {
@@ -487,9 +470,9 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, i32* %src) {
 ; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[P_VAL:%.+]]> = ir<%val>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.1
+; CHECK-NEXT: Successor(s): loop.0.split
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.1:
+; CHECK-NEXT: loop.0.split:
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index f9a9cce0dda69..8e71a00161faf 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -373,33 +373,37 @@ define void @load_variant(i64* noalias %a, i64* noalias %b) {
 ; VF1UF4-NEXT: store i64 [[TMP5]], i64* [[B:%.*]], align 8
 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]]
 ; VF1UF4: pred.store.continue:
+; VF1UF4-NEXT: [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_STORE_IF]] ]
 ; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
 ; VF1UF4: pred.store.if7:
 ; VF1UF4-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; VF1UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION1]]
-; VF1UF4-NEXT: [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
-; VF1UF4-NEXT: store i64 [[TMP7]], i64* [[B]], align 8
+; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION1]]
+; VF1UF4-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
+; VF1UF4-NEXT: store i64 [[TMP8]], i64* [[B]], align 8
 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE8]]
 ; VF1UF4: pred.store.continue8:
+; VF1UF4-NEXT: [[TMP9:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP8]], [[PRED_STORE_IF7]] ]
 ; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
 ; VF1UF4: pred.store.if9:
 ; VF1UF4-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION2]]
-; VF1UF4-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
-; VF1UF4-NEXT: store i64 [[TMP9]], i64* [[B]], align 8
+; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION2]]
+; VF1UF4-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+; VF1UF4-NEXT: store i64 [[TMP11]], i64* [[B]], align 8
 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE10]]
 ; VF1UF4: pred.store.continue10:
+; VF1UF4-NEXT: [[TMP12:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE8]] ], [ [[TMP11]], [[PRED_STORE_IF9]] ]
 ; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
 ; VF1UF4: pred.store.if11:
 ; VF1UF4-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION3]]
-; VF1UF4-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
-; VF1UF4-NEXT: store i64 [[TMP11]], i64* [[B]], align 8
+; VF1UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION3]]
+; VF1UF4-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP13]], align 8
+; VF1UF4-NEXT: store i64 [[TMP14]], i64* [[B]], align 8
 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE12]]
 ; VF1UF4: pred.store.continue12:
+; VF1UF4-NEXT: [[TMP15:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE10]] ], [ [[TMP14]], [[PRED_STORE_IF11]] ]
 ; VF1UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; VF1UF4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF1UF4-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF1UF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF1UF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF1UF4: middle.block:
 ; VF1UF4-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; VF1UF4: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 1b5ca0520ebd0..f846d9e8b8fba 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -253,24 +253,7 @@ define void @uniform_gep(i64 %k, i16* noalias %A, i16* noalias %B) {
 ; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]> vp<[[BTC]]>
 ; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0>
-; CHECK-NEXT: Successor(s): pred.load
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load: {
-; CHECK-NEXT: pred.load.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
-; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.if:
-; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep.A.uniform>
-; CHECK-NEXT: Successor(s): pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.0
-; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
+; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep.A.uniform>
 ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%iv>, ir<%k>
 ; CHECK-NEXT: Successor(s): loop.then
 ; CHECK-EMPTY:
@@ -286,7 +269,7 @@ define void @uniform_gep(i64 %k, i16* noalias %A, i16* noalias %B) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.if:
 ; CHECK-NEXT: REPLICATE ir<%gep.B> = getelementptr ir<%B>, vp<[[STEPS]]>
-; CHECK-NEXT: REPLICATE store vp<[[PRED]]>, ir<%gep.B>
+; CHECK-NEXT: REPLICATE store ir<%lv>, ir<%gep.B>
 ; CHECK-NEXT: Successor(s): pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.continue:
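
For context, a minimal IR sketch (not taken from the patch; function and value names are illustrative) of the pattern the change targets: a load from a loop-invariant address that executes unconditionally in the scalar loop. Legal->isUniformMemOp recognises such a load, so when the vectorizer folds the tail by masking it no longer has to be treated as predicated, mirroring the %gep.A.uniform/%lv checks in uniform_gep above.

define void @uniform_load_sketch(i32* noalias %src, i32* noalias %dst) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; The address %src is loop invariant, so this is a uniform memory op
  ; executed on every scalar iteration (no control flow in the body).
  %l = load i32, i32* %src, align 4
  %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv
  store i32 %l, i32* %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  ; Trip count chosen so that tail folding (when requested) leaves a remainder.
  %ec = icmp eq i64 %iv.next, 17
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

With tail folding by masking enabled, a load like %l previously ended up inside a pred.load replicate region; the VPlan checks above show it now remaining an unpredicated REPLICATE.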