diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 97dc05d23d4b5..b31f608e10e6d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8283,6 +8283,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan, Range, RecipeBuilder, CostCtx); + RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeScalarizationDecisions, *Plan, + Range); + // Now process all other blocks and instructions. for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(RPOT)) { // Convert input VPInstructions to widened recipes. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 98c026cadb50e..ea0aded2c0716 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -6354,3 +6354,58 @@ void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, return ReplaceWith(VPI, Recipe); }); } + +void VPlanTransforms::makeScalarizationDecisions(VPlan &Plan, VFRange &Range) { + if (LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { return VF.isScalar(); }, Range)) + return; + + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>( + post_order<VPBlockShallowTraversalWrapper<VPBlockBase *>>( + HeaderVPBB))) { + for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { + auto *VPI = dyn_cast<VPInstruction>(&R); + if (!VPI) + continue; + + auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue()); + if (!I) + // Wouldn't be able to create a `VPReplicateRecipe` anyway. 
continue; + + bool CanTransformToFirstLaneOnly = [&]() { + if (VPI->mayHaveSideEffects()) + return false; + + if (is_contained({Instruction::SDiv, Instruction::UDiv, + Instruction::SRem, Instruction::URem}, + VPI->getOpcode()) && + VPI->getMask()) + return false; + + // Avoid rewriting IV increment as that interferes with + // `removeRedundantCanonicalIVs`. + if (VPI->getOpcode() == Instruction::Add && + any_of(VPI->operands(), IsaPred<VPCanonicalIVPHIRecipe>)) + return false; + + if (!vputils::onlyFirstLaneUsed(VPI)) + return false; + + return true; + }(); + + if (CanTransformToFirstLaneOnly) { + auto *Recipe = + new VPReplicateRecipe(I, VPI->operandsWithoutMask(), true, nullptr, + *VPI, *VPI, VPI->getDebugLoc()); + Recipe->insertBefore(VPI); + VPI->replaceAllUsesWith(Recipe); + VPI->eraseFromParent(); + continue; + } + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index f6dfd8043dcbf..d28effdc65ac6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -510,6 +510,11 @@ struct VPlanTransforms { static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder, VPCostContext &CostCtx); + + /// Make VPlan-based scalarization decision prior to delegating to the ones + /// made by the legacy CM. Only transforms `usesFirstLaneOnly` def-use chains + /// enabled by prior widening of consecutive memory operations for now. + static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range); }; /// A helper function that returns true if the given type is irregular. 
The diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll index 3832d281e93fd..8e7fe28568952 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll @@ -10,6 +10,7 @@ ; CHECK: VPlan for loop in 'foo' after scalarizeMemOpsWithIrregularTypes ; CHECK: VPlan for loop in 'foo' after delegateMemOpWideningToLegacyCM ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeMemOpWideningDecisions +; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeScalarizationDecisions ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::clearReductionWrapFlags ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimizeFindIVReductions ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::handleMultiUseReductions diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index e667711889961..93885d972e127 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -425,9 +425,9 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[VEC_IND]], zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]] ; 
CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP0]], <4 x double> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll index 98da110a44e8c..bfc62fb208466 100644 --- a/llvm/test/Transforms/LoopVectorize/pr37248.ll +++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll @@ -42,7 +42,6 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[START]], [[INDEX]] -; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: @@ -55,6 +54,7 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) { ; CHECK-NEXT: store i32 10, ptr [[B]], align 1 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE3]] ; CHECK: [[PRED_STORE_CONTINUE3]]: +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[OFFSET_IDX]] to i16 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP12]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i64 -1 ; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP17]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll index dd7a8a87a921b..7548a783fb4dd 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll @@ -13,12 +13,12 @@ define void @test(ptr %A, i32 %x) { ; CHECK-NEXT: br label 
%[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 ; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], ptr [[TMP9]], align 4