diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index d04b5edcfc212..8097699ff58ff 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3397,7 +3397,7 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) { return false; } -/// Return true if \p R is a predicated load/store with a loop-invariant address +/// Return true if \p R is a predicated store with a loop-invariant address /// only masked by the header mask. static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R, const SCEV *PtrSCEV, @@ -3544,20 +3544,15 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UsedByLoadStoreAddress ? UI : nullptr); - // Check if this is a predicated load/store with a loop-invariant address - // only masked by the header mask. If so, return the uniform mem op cost. - if (isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) { + // Check if this is a predicated store with a loop-invariant address only + // masked by the header mask. If so, return the uniform mem op cost. 
+ if (!IsLoad && + isPredicatedUniformMemOpAfterTailFolding(*this, PtrSCEV, Ctx)) { InstructionCost UniformCost = ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(ScalarPtrTy, /*SE=*/nullptr, /*Ptr=*/nullptr, Ctx.CostKind); auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF)); - if (IsLoad) { - return UniformCost + - Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, - VectorTy, VectorTy, {}, Ctx.CostKind); - } - VPValue *StoredVal = getOperand(0); if (!StoredVal->isDefinedOutsideLoopRegions()) UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd( diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 32d89a34105a4..e898f10cf3b70 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1697,6 +1697,24 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { match(Def, m_ComputeReductionResult(m_VPIRValue(IRV)))) return Def->replaceAllUsesWith(IRV); + // Narrow a single-scalar predicated load with a header mask to an + // unpredicated load; the header mask only restricts the active lanes, but a + // single-scalar load reads from a uniform address and produces the same value + // for every lane. + if (auto *RepR = dyn_cast<VPReplicateRecipe>(Def)) { + if (RepR->isPredicated() && RepR->isSingleScalar() && + RepR->getOpcode() == Instruction::Load && + vputils::isHeaderMask(RepR->getMask(), *Plan)) { + auto *NewR = new VPReplicateRecipe( + RepR->getUnderlyingInstr(), drop_end(RepR->operands()), + /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR, + RepR->getDebugLoc()); + NewR->insertBefore(RepR); + RepR->replaceAllUsesWith(NewR); + return; + } + } + // Some simplifications can only be applied after unrolling. Perform them // below. 
if (!Plan->isUnrolled()) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll index 3e61a933d0ecb..1573bae53d52c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll @@ -16,29 +16,19 @@ define i32 @pr70988(ptr %src, i32 %n) { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[UMAX]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE5:%.*]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_LOAD_CONTINUE5]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[PRED_LOAD_CONTINUE5]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[PRED_LOAD_CONTINUE5]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE5]] ] -; CHECK-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.if4: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ 
[[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP15:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP8]], i32 [[VEC_PHI]]) +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP12]], i32 [[VEC_PHI]]) ; CHECK-NEXT: [[TMP16:%.*]] = tail call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[VEC_PHI3]]) ; CHECK-NEXT: [[TMP17]] = select i1 [[ACTIVE_LANE_MASK]], i32 [[TMP15]], i32 [[VEC_PHI]] ; CHECK-NEXT: [[TMP18]] = select i1 [[ACTIVE_LANE_MASK2]], i32 [[TMP16]], i32 [[VEC_PHI3]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll index 89c5553d3bc03..4a6d5cb3142cb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll @@ -199,22 +199,22 @@ define void @test_uniform_not_invariant(ptr 
noalias %dst, ptr readonly %src, i64 ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ true, [[PRED_STORE_CONTINUE4]] ] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[PRED_STORE_CONTINUE4]] ] +; INTERLEAVE-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[TMP0]] +; INTERLEAVE-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 8 ; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; INTERLEAVE: pred.store.if: -; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = call double @foo(double [[TMP1]], i64 [[INDEX]]) #[[ATTR4:[0-9]+]] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]] -; INTERLEAVE-NEXT: store double [[TMP2]], ptr [[TMP3]], align 8 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[TMP3]], align 8 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[INDEX]]) #[[ATTR4:[0-9]+]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: store double [[TMP5]], ptr [[TMP6]], align 8 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]] ; INTERLEAVE: pred.store.continue: ; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] ; INTERLEAVE: pred.store.if3: -; INTERLEAVE-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr [8 x i8], ptr 
[[SRC]], i64 [[TMP4]] -; INTERLEAVE-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP5]], align 8 -; INTERLEAVE-NEXT: [[TMP7:%.*]] = call double @foo(double [[TMP6]], i64 [[TMP4]]) #[[ATTR4]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[TMP4]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call double @foo(double [[TMP2]], i64 [[TMP0]]) #[[ATTR4]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[TMP0]] ; INTERLEAVE-NEXT: store double [[TMP7]], ptr [[TMP8]], align 8 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]] ; INTERLEAVE: pred.store.continue4: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll index 0a57bef879728..937fc5b1489de 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll @@ -111,20 +111,10 @@ define i32 @FOR_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 ; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] ; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] -; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] -; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] -; VF1IC2: [[PRED_LOAD_CONTINUE]]: -; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] -; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] -; VF1IC2: [[PRED_LOAD_IF2]]: ; VF1IC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV1]] -; VF1IC2-NEXT: [[TMP32:%.*]] = load 
i32, ptr [[TMP31]], align 4 -; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] -; VF1IC2: [[PRED_LOAD_CONTINUE3]]: -; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: [[TMP8]] = load i32, ptr [[TMP31]], align 4 ; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF1IC2: [[PRED_STORE_IF]]: ; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] @@ -288,20 +278,10 @@ define i32 @FOR_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 ; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] ; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] -; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] -; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] -; VF1IC2: [[PRED_LOAD_CONTINUE]]: -; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] -; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] -; VF1IC2: [[PRED_LOAD_IF2]]: ; VF1IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV1]] -; VF1IC2-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 -; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] -; VF1IC2: [[PRED_LOAD_CONTINUE3]]: -; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP29]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: [[TMP8]] = load i32, ptr [[TMP28]], align 4 ; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label 
%[[PRED_STORE_CONTINUE:.*]] ; VF1IC2: [[PRED_STORE_IF]]: ; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] @@ -473,20 +453,10 @@ define i32 @FOR_and_next_used_outside(ptr noalias %A, ptr noalias %B, i64 %n) { ; VF1IC2-NEXT: [[VEC_IV:%.*]] = add i64 [[TMP0]], 0 ; VF1IC2-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] ; VF1IC2-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV1]], [[TRIP_COUNT_MINUS_1]] -; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] -; VF1IC2: [[PRED_LOAD_IF]]: ; VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP0]] -; VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 -; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE]] -; VF1IC2: [[PRED_LOAD_CONTINUE]]: -; VF1IC2-NEXT: [[TMP5:%.*]] = phi i32 [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] -; VF1IC2-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF2:.*]], label %[[PRED_LOAD_CONTINUE3:.*]] -; VF1IC2: [[PRED_LOAD_IF2]]: ; VF1IC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[VEC_IV1]] -; VF1IC2-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 -; VF1IC2-NEXT: br label %[[PRED_LOAD_CONTINUE3]] -; VF1IC2: [[PRED_LOAD_CONTINUE3]]: -; VF1IC2-NEXT: [[TMP8]] = phi i32 [ poison, %[[PRED_LOAD_CONTINUE]] ], [ [[TMP35]], %[[PRED_LOAD_IF2]] ] +; VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP6]], align 4 +; VF1IC2-NEXT: [[TMP8]] = load i32, ptr [[TMP34]], align 4 ; VF1IC2-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VF1IC2: [[PRED_STORE_IF]]: ; VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP0]] diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll index 106712c3d9a65..ccb671fe15f07 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ 
b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -295,31 +295,31 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; VF1UF4-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP1]], 13 ; VF1UF4-NEXT: [[TMP6:%.*]] = icmp ule i64 [[TMP2]], 13 ; VF1UF4-NEXT: [[TMP7:%.*]] = icmp ule i64 [[TMP3]], 13 -; VF1UF4-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; VF1UF4: pred.store.if: ; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; VF1UF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF1UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] ; VF1UF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF1UF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF1UF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF1UF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF1UF4-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VF1UF4: pred.store.if: ; VF1UF4-NEXT: store i64 [[TMP9]], ptr [[B:%.*]], align 8 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]] ; VF1UF4: pred.store.continue: ; VF1UF4-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] ; VF1UF4: pred.store.if1: -; VF1UF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF1UF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF1UF4-NEXT: store i64 [[TMP12]], ptr [[B]], align 8 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE2]] ; VF1UF4: pred.store.continue2: ; VF1UF4-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; VF1UF4: pred.store.if3: -; VF1UF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] -; VF1UF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF1UF4-NEXT: store i64 
[[TMP15]], ptr [[B]], align 8 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE4]] ; VF1UF4: pred.store.continue4: ; VF1UF4-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] ; VF1UF4: pred.store.if5: -; VF1UF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF1UF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 ; VF1UF4-NEXT: store i64 [[TMP18]], ptr [[B]], align 8 ; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE6]] ; VF1UF4: pred.store.continue6: diff --git a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll index 636eb6ddfaece..122cd78905a84 100644 --- a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll +++ b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll @@ -74,26 +74,16 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE3:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_LOAD_CONTINUE3]] ] -; CHECK-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IV1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i32 [[VEC_IV]], 14 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i32 [[VEC_IV1]], 14 -; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[SRC]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 -; 
CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3]] -; CHECK: pred.load.if1: ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[SRC]], i32 [[VEC_IV1]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE3]] -; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP9:%.*]] = phi float [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP8]], [[PRED_LOAD_IF2]] ] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP0]], float [[TMP5]], float -0.000000e+00 ; CHECK-NEXT: [[TMP11:%.*]] = fadd contract float [[VEC_PHI]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP1]], float [[TMP9]], float -0.000000e+00 @@ -113,25 +103,15 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) { ; CHECK-ALM: vector.ph: ; CHECK-ALM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ALM: vector.body: -; CHECK-ALM-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE3:%.*]] ] -; CHECK-ALM-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[PRED_LOAD_CONTINUE3]] ] +; CHECK-ALM-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ALM-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-ALM-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 ; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult i32 [[INDEX]], 15 ; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = icmp ult i32 [[TMP1]], 15 -; CHECK-ALM-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK-ALM: pred.load.if: ; 
CHECK-ALM-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[SRC]], i32 [[INDEX]] -; CHECK-ALM-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4 -; CHECK-ALM-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK-ALM: pred.load.continue: -; CHECK-ALM-NEXT: [[TMP4:%.*]] = phi float [ poison, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ] -; CHECK-ALM-NEXT: br i1 [[ACTIVE_LANE_MASK1]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3]] -; CHECK-ALM: pred.load.if2: ; CHECK-ALM-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[SRC]], i32 [[TMP1]] -; CHECK-ALM-NEXT: [[TMP6:%.*]] = load float, ptr [[TMP5]], align 4 -; CHECK-ALM-NEXT: br label [[PRED_LOAD_CONTINUE3]] -; CHECK-ALM: pred.load.continue3: -; CHECK-ALM-NEXT: [[TMP7:%.*]] = phi float [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP6]], [[PRED_LOAD_IF2]] ] +; CHECK-ALM-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP2]], align 4 +; CHECK-ALM-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP5]], align 4 ; CHECK-ALM-NEXT: [[TMP8:%.*]] = select contract i1 [[ACTIVE_LANE_MASK]], float [[TMP4]], float -0.000000e+00 ; CHECK-ALM-NEXT: [[TMP9:%.*]] = fadd contract float [[VEC_PHI]], [[TMP8]] ; CHECK-ALM-NEXT: [[TMP10:%.*]] = select contract i1 [[ACTIVE_LANE_MASK1]], float [[TMP7]], float -0.000000e+00