diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 511162511f8e2..b36cb5ffc78f3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -311,14 +311,16 @@ class LoopVectorizationPlanner { /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue /// vectorization re-using plans for both the main and epilogue vector loops. /// It should be removed once the re-use issue has been fixed. - /// Returns a mapping of SCEVs to their expanded IR values. Note that this is - /// a temporary workaround needed due to the current epilogue - /// handling workaround needed due to the current epilogue handling. - DenseMap executePlan(ElementCount VF, unsigned UF, - VPlan &BestPlan, - InnerLoopVectorizer &LB, - DominatorTree *DT, - bool IsEpilogueVectorization); + /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop + /// to re-use expansion results generated during main plan execution. Returns + /// a mapping of SCEVs to their expanded IR values. Note that this is a + /// temporary workaround needed due to the current epilogue handling + /// workaround needed due to the current epilogue handling. + DenseMap + executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, + InnerLoopVectorizer &LB, DominatorTree *DT, + bool IsEpilogueVectorization, + DenseMap *ExpandedSCEVs = nullptr); #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printPlans(raw_ostream &O); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4727788ee090f..2f44f1c73bb93 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7684,11 +7684,15 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { SCEV2ValueTy LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, - InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization) { + InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, + DenseMap *ExpandedSCEVs) { assert(BestVPlan.hasVF(BestVF) && "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); + assert( + (IsEpilogueVectorization || !ExpandedSCEVs) && + "expanded SCEVs to reuse can only be used during epilogue vectorization"); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); @@ -7716,7 +7720,8 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan( // middle block. The vector loop is created during VPlan execution. Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(State.ExpandedSCEVs); + ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs + : State.ExpandedSCEVs); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. @@ -10415,14 +10420,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); - // Re-use the trip count expanded for the main loop, as skeleton - // creation needs it as a value that dominates both the scalar and - // vector epilogue loops + // Re-use the trip count and steps expanded for the main loop, as + // skeleton creation needs it as a value that dominates both the scalar + // and vector epilogue loops + // TODO: This is a workaround needed for epilogue vectorization and it + // should be removed once induction resume value creation is done + // directly in VPlan. EpilogILV.setTripCount(MainILV.getTripCount()); - if (auto *R = BestEpiPlan.getTripCount()->getDefiningRecipe()) { - assert(BestEpiPlan.getTripCount()->getNumUsers() == 0 && - "trip count VPValue cannot be used in epilogue plan"); - R->eraseFromParent(); + for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { + auto *ExpandR = cast(&R); + auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( + ExpandedSCEVs.find(ExpandR->getSCEV())->second); + ExpandR->replaceAllUsesWith(ExpandedVal); + ExpandR->eraseFromParent(); } // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, @@ -10462,7 +10472,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { } LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, - DT, true); + DT, true, &ExpandedSCEVs); ++LoopsEpilogueVectorized; if (!MainILV.areSafetyChecksAdded()) diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index d7c4733a909c2..f5c031a09a16d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -87,7 +87,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX9]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT14]] = add <8 x i32> [[VEC_IND13]], ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -131,3 +131,138 @@ exit: ret void } +define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, i16 %off) { +; CHECK-LABEL: @test_induction_step_needs_expansion( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = sub i16 0, [[OFF:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[L:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[L]], 64 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[L]], 64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[L]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i16> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i16 [[TMP0]], 16 +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP2]], i64 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT7]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT9]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT11]], <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 +; CHECK-NEXT: [[IND_END:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i16> [[VEC_IND]], [[DOTSPLAT3]] +; CHECK-NEXT: [[STEP_ADD4:%.*]] = add <16 x i16> [[STEP_ADD]], [[DOTSPLAT3]] +; CHECK-NEXT: [[STEP_ADD5:%.*]] = add <16 x i16> [[STEP_ADD4]], [[DOTSPLAT3]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 48 +; CHECK-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[STEP_ADD4]], [[BROADCAST_SPLAT10]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i16> [[STEP_ADD5]], [[BROADCAST_SPLAT12]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 +; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP15]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 16 +; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP16]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 32 +; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP17]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 48 +; CHECK-NEXT: store <16 x i16> [[TMP10]], ptr [[TMP18]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD5]], [[DOTSPLAT3]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[L]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[DOTCAST18:%.*]] = trunc i64 [[N_VEC]] to i16 +; CHECK-NEXT: [[IND_END19:%.*]] = mul i16 [[DOTCAST18]], [[TMP0]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[L]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF13:%.*]] = urem i64 [[L]], 8 +; CHECK-NEXT: [[N_VEC14:%.*]] = sub i64 [[L]], [[N_MOD_VF13]] +; CHECK-NEXT: [[DOTCAST16:%.*]] = trunc i64 [[N_VEC14]] to i16 +; CHECK-NEXT: [[IND_END17:%.*]] = mul i16 [[DOTCAST16]], [[TMP0]] +; CHECK-NEXT: [[DOTSPLATINSERT23:%.*]] = insertelement <8 x i16> poison, i16 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT24:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT23]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT25:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT26:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT25]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i16> , [[DOTSPLAT26]] +; CHECK-NEXT: [[INDUCTION27:%.*]] = add <8 x i16> [[DOTSPLAT24]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = mul i16 [[TMP0]], 8 +; CHECK-NEXT: [[DOTSPLATINSERT28:%.*]] = insertelement <8 x i16> poison, i16 [[TMP21]], i64 0 +; CHECK-NEXT: [[DOTSPLAT29:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT28]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <8 x i16> poison, i16 [[OFF]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT33]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX22:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND30:%.*]] = phi <8 x i16> [ [[INDUCTION27]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT32:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX22]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = sub <8 x i16> [[VEC_IND30]], [[BROADCAST_SPLAT34]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 0 +; CHECK-NEXT: store <8 x i16> [[TMP23]], ptr [[TMP25]], align 2 +; CHECK-NEXT: [[INDEX_NEXT35]] = add nuw i64 [[INDEX22]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT32]] = add <8 x i16> [[VEC_IND30]], [[DOTSPLAT29]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC14]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N21:%.*]] = icmp eq i64 [[L]], [[N_VEC14]] +; CHECK-NEXT: br i1 [[CMP_N21]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i16 [ [[IND_END17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[P_09:%.*]] = phi i16 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ADD]] = sub i16 [[P_09]], [[OFF]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[IV]] +; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[L]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %p.09 = phi i16 [ 0, %entry ], [ %add, %loop ] + %add = sub i16 %p.09, %off + %arrayidx3 = getelementptr inbounds i16, ptr %k, i64 %iv + store i16 %add, ptr %arrayidx3, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %l + br i1 %ec , label %exit, label %loop + +exit: + ret void +}