diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b1d5d3b880357..1ed3f200a9244 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3715,6 +3715,18 @@ class slpvectorizer::BoUpSLP { }); } + /// Checks if it is legal and profitable to build SplitVectorize node for the + /// given \p VL. + /// \param Op1 first homogeneous scalars. + /// \param Op2 second homogeneous scalars. + /// \param ReorderIndices indices to reorder the scalars. + /// \returns true if the node was successfully built. + bool canBuildSplitNode(ArrayRef VL, + const InstructionsState &LocalState, + SmallVectorImpl &Op1, + SmallVectorImpl &Op2, + OrdersType &ReorderIndices) const; + ~BoUpSLP(); private: @@ -3788,18 +3800,6 @@ class slpvectorizer::BoUpSLP { ArrayRef VectorizedVals, SmallPtrSetImpl &CheckedExtracts); - /// Checks if it is legal and profitable to build SplitVectorize node for the - /// given \p VL. - /// \param Op1 first homogeneous scalars. - /// \param Op2 second homogeneous scalars. - /// \param ReorderIndices indices to reorder the scalars. - /// \returns true if the node was successfully built. - bool canBuildSplitNode(ArrayRef VL, - const InstructionsState &LocalState, - SmallVectorImpl &Op1, - SmallVectorImpl &Op2, - OrdersType &ReorderIndices) const; - /// This is the recursive part of buildTree. void buildTreeRec(ArrayRef Roots, unsigned Depth, const EdgeInfo &EI, unsigned InterleaveFactor = 0); @@ -26253,6 +26253,11 @@ class HorizontalReduction { SmallVector> LocalReducedVals; // Try merge consecutive reduced values into a single vectorizable group and // check, if they can be vectorized as copyables. + const bool TwoGroupsOnly = ReducedVals.size() == 2; + const bool TwoGroupsOfSameSmallSize = + TwoGroupsOnly && + ReducedVals.front().size() == ReducedVals.back().size() && + ReducedVals.front().size() < ReductionLimit; for (ArrayRef RV : ReducedVals) { // Loads are not very compatible with undefs. if (isa(RV.front()) && @@ -26269,22 +26274,47 @@ class HorizontalReduction { States.push_back(getSameOpcode(RV, TLI)); continue; } - SmallVector Ops; - if (!LocalReducedVals.empty()) - Ops = LocalReducedVals.back(); - Ops.append(RV.begin(), RV.end()); - InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI); - InstructionsState OpS = - Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements); - if (LocalReducedVals.empty()) { - LocalReducedVals.push_back(Ops); - States.push_back(OpS); - continue; - } - if (OpS) { - LocalReducedVals.back().swap(Ops); - States.back() = OpS; - continue; + // Do some copyables analysis only if more than 2 groups exist or they + // are large enough. + if (!TwoGroupsOfSameSmallSize) { + SmallVector Ops; + if (!LocalReducedVals.empty()) + Ops = LocalReducedVals.back(); + Ops.append(RV.begin(), RV.end()); + InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI); + InstructionsState OpS = Analysis.buildInstructionsState( + Ops, V, /*TryCopyableElementsVectorization=*/true, + /*WithProfitabilityCheck=*/true, /*SkipSameCodeCheck=*/true); + if (OpS && OpS.areInstructionsWithCopyableElements()) { + if (LocalReducedVals.empty()) { + LocalReducedVals.push_back(Ops); + States.push_back(OpS); + continue; + } + LocalReducedVals.back().swap(Ops); + States.back() = OpS; + continue; + } + // For safety, allow split vectorization only if 2 groups are available + // overall. + if (TwoGroupsOnly) { + auto [MainOp, AltOp] = getMainAltOpsNoStateVL(Ops); + OpS = InstructionsState(MainOp, AltOp); + // Last chance to try to vectorize alternate node. + SmallVector Op1, Op2; + BoUpSLP::OrdersType ReorderIndices; + if (MainOp && AltOp && + V.canBuildSplitNode(Ops, OpS, Op1, Op2, ReorderIndices)) { + if (LocalReducedVals.empty()) { + LocalReducedVals.push_back(Ops); + States.push_back(OpS); + continue; + } + LocalReducedVals.back().swap(Ops); + States.back() = OpS; + continue; + } + } } LocalReducedVals.emplace_back().append(RV.begin(), RV.end()); States.push_back(getSameOpcode(RV, TLI)); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll index a2ccbb96b6003..d36da8d028c60 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll @@ -18,15 +18,14 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, ; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53 ; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820 ; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2 -; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1 -; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]] -; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]] -; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]] -; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]] -; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]] -; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]] -; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]] -; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425 +; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426 +; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]] +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]] +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]] +; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]] +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll index eb9b249b9a898..c3131a41c2b2e 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll @@ -18,15 +18,14 @@ define <4 x i32> @square(<4 x i32> %num, i32 %y, i32 %x, i32 %h, i32 %k, i32 %w, ; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53 ; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820 ; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2 -; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1 -; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]] -; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]] -; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]] -; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]] -; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]] -; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]] -; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]] -; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[DOTSCALAR7]], 317425 +; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426 +; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]] +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]] +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]] +; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]] +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll index 78ec0be59e1b4..b8a3ad9e63fda 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-based-reduction.ll @@ -7,146 +7,86 @@ define i32 @test(ptr %pix1, i32 %i_pix1, ptr %pix2, i32 %i_pix2) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64 ; CHECK-NEXT: [[IDX_EXT31:%.*]] = sext i32 [[I_PIX2]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i8> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <2 x i32> [[TMP2]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = zext <2 x i8> [[TMP7]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <2 x i32> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i32> [[TMP11]], [[TMP6]] -; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <2 x i32> [[TMP6]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = shl nsw <2 x i32> [[TMP13]], splat (i32 16) -; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i32> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PIX1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR32:%.*]] = getelementptr inbounds i8, ptr [[PIX2]], i64 [[IDX_EXT31]] -; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR32]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <2 x i32> [[TMP18]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = sub nsw <2 x i32> [[TMP24]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = add nsw <2 x i32> [[TMP27]], [[TMP22]] -; CHECK-NEXT: [[TMP29:%.*]] = sub nsw <2 x i32> [[TMP22]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = shl nsw <2 x i32> [[TMP29]], splat (i32 16) -; CHECK-NEXT: [[TMP31:%.*]] = add nsw <2 x i32> [[TMP28]], [[TMP30]] -; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR32_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR32]], i64 [[IDX_EXT31]] -; CHECK-NEXT: [[TMP32:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP32]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = load <4 x i8>, ptr [[ADD_PTR32_1]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP35]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = sub nsw <2 x i32> [[TMP34]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i8> [[TMP32]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP39]] to <2 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP35]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = sub nsw <2 x i32> [[TMP40]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = add nsw <2 x i32> [[TMP43]], [[TMP38]] -; CHECK-NEXT: [[TMP45:%.*]] = sub nsw <2 x i32> [[TMP38]], [[TMP43]] -; CHECK-NEXT: [[TMP46:%.*]] = shl nsw <2 x i32> [[TMP45]], splat (i32 16) -; CHECK-NEXT: [[TMP47:%.*]] = add nsw <2 x i32> [[TMP44]], [[TMP46]] -; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR32_1]], i64 [[IDX_EXT31]] -; CHECK-NEXT: [[TMP48:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP48]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = load <4 x i8>, ptr [[ADD_PTR32_2]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP51]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <2 x i32> [[TMP50]], [[TMP53]] -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <4 x i8> [[TMP48]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <4 x i8> [[TMP51]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP57]] to <2 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = sub nsw <2 x i32> [[TMP56]], [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = add nsw <2 x i32> [[TMP59]], [[TMP54]] -; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <2 x i32> [[TMP54]], [[TMP59]] -; CHECK-NEXT: [[TMP62:%.*]] = shl nsw <2 x i32> [[TMP61]], splat (i32 16) -; CHECK-NEXT: [[TMP63:%.*]] = add nsw <2 x i32> [[TMP60]], [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 -; CHECK-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[TMP65]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <2 x i32> [[TMP31]], i32 0 -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <2 x i32> [[TMP31]], i32 1 -; CHECK-NEXT: [[SUB27_1:%.*]] = sub nsw i32 [[TMP66]], [[TMP67]] -; CHECK-NEXT: [[ADD24_1:%.*]] = add nsw i32 [[TMP67]], [[TMP66]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1 -; CHECK-NEXT: [[SUB27_2:%.*]] = sub nsw i32 [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[ADD24_2:%.*]] = add nsw i32 [[TMP69]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <2 x i32> [[TMP63]], i32 0 -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP63]], i32 1 -; CHECK-NEXT: [[SUB27_3:%.*]] = sub nsw i32 [[TMP70]], [[TMP71]] -; CHECK-NEXT: [[ADD24_3:%.*]] = add nsw i32 [[TMP71]], [[TMP70]] -; CHECK-NEXT: [[ADD45_1:%.*]] = add nsw i32 [[SUB27_1]], [[SUB27]] -; CHECK-NEXT: [[ADD45:%.*]] = add nsw i32 [[ADD24_1]], [[ADD24]] -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24]], i32 0 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i32> [[TMP72]], i32 [[SUB27]], i32 1 -; CHECK-NEXT: [[TMP74:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24_1]], i32 0 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB27_1]], i32 1 -; CHECK-NEXT: [[TMP76:%.*]] = sub nsw <2 x i32> [[TMP73]], [[TMP75]] -; CHECK-NEXT: [[ADD59_1:%.*]] = add nsw i32 [[SUB27_3]], [[SUB27_2]] -; CHECK-NEXT: [[ADD59:%.*]] = add nsw i32 [[ADD24_3]], [[ADD24_2]] -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24_2]], i32 0 -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[SUB27_2]], i32 1 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <2 x i32> poison, i32 [[ADD24_3]], i32 0 -; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[SUB27_3]], i32 1 -; CHECK-NEXT: [[TMP81:%.*]] = sub nsw <2 x i32> [[TMP78]], [[TMP80]] -; CHECK-NEXT: [[ADD67_1:%.*]] = add nsw i32 [[ADD59_1]], [[ADD45_1]] -; CHECK-NEXT: [[ADD67:%.*]] = add nsw i32 [[ADD59]], [[ADD45]] -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> poison, i32 [[ADD45]], i32 0 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <2 x i32> [[TMP82]], i32 [[ADD45_1]], i32 1 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> poison, i32 [[ADD59]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <2 x i32> [[TMP84]], i32 [[ADD59_1]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = sub nsw <2 x i32> [[TMP83]], [[TMP85]] -; CHECK-NEXT: [[TMP87:%.*]] = add nsw <2 x i32> [[TMP81]], [[TMP76]] -; CHECK-NEXT: [[TMP88:%.*]] = sub nsw <2 x i32> [[TMP76]], [[TMP81]] -; CHECK-NEXT: [[TMP89:%.*]] = insertelement <2 x i32> poison, i32 [[ADD67]], i32 0 -; CHECK-NEXT: [[TMP90:%.*]] = insertelement <2 x i32> [[TMP89]], i32 [[ADD67_1]], i32 1 -; CHECK-NEXT: [[TMP91:%.*]] = lshr <2 x i32> [[TMP90]], splat (i32 15) -; CHECK-NEXT: [[TMP92:%.*]] = and <2 x i32> [[TMP91]], splat (i32 65537) -; CHECK-NEXT: [[TMP93:%.*]] = mul nuw <2 x i32> [[TMP92]], splat (i32 65535) -; CHECK-NEXT: [[TMP94:%.*]] = add <2 x i32> [[TMP93]], [[TMP90]] -; CHECK-NEXT: [[TMP95:%.*]] = xor <2 x i32> [[TMP94]], [[TMP93]] -; CHECK-NEXT: [[TMP96:%.*]] = lshr <2 x i32> [[TMP87]], splat (i32 15) -; CHECK-NEXT: [[TMP97:%.*]] = and <2 x i32> [[TMP96]], splat (i32 65537) -; CHECK-NEXT: [[TMP98:%.*]] = mul nuw <2 x i32> [[TMP97]], splat (i32 65535) -; CHECK-NEXT: [[TMP99:%.*]] = add <2 x i32> [[TMP98]], [[TMP87]] -; CHECK-NEXT: [[TMP100:%.*]] = xor <2 x i32> [[TMP99]], [[TMP98]] -; CHECK-NEXT: [[TMP101:%.*]] = add <2 x i32> [[TMP95]], [[TMP100]] -; CHECK-NEXT: [[TMP102:%.*]] = lshr <2 x i32> [[TMP86]], splat (i32 15) -; CHECK-NEXT: [[TMP103:%.*]] = and <2 x i32> [[TMP102]], splat (i32 65537) -; CHECK-NEXT: [[TMP104:%.*]] = mul nuw <2 x i32> [[TMP103]], splat (i32 65535) -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP104]], [[TMP86]] -; CHECK-NEXT: [[TMP106:%.*]] = xor <2 x i32> [[TMP105]], [[TMP104]] -; CHECK-NEXT: [[TMP107:%.*]] = add <2 x i32> [[TMP101]], [[TMP106]] -; CHECK-NEXT: [[TMP108:%.*]] = lshr <2 x i32> [[TMP88]], splat (i32 15) -; CHECK-NEXT: [[TMP109:%.*]] = and <2 x i32> [[TMP108]], splat (i32 65537) -; CHECK-NEXT: [[TMP110:%.*]] = mul nuw <2 x i32> [[TMP109]], splat (i32 65535) -; CHECK-NEXT: [[TMP111:%.*]] = add <2 x i32> [[TMP110]], [[TMP88]] -; CHECK-NEXT: [[TMP112:%.*]] = xor <2 x i32> [[TMP111]], [[TMP110]] -; CHECK-NEXT: [[TMP113:%.*]] = add <2 x i32> [[TMP107]], [[TMP112]] -; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP115:%.*]] = and <4 x i32> [[TMP114]], -; CHECK-NEXT: [[TMP116:%.*]] = lshr <4 x i32> [[TMP114]], -; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <4 x i32> [[TMP115]], <4 x i32> [[TMP116]], <4 x i32> +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 1 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 2 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[PIX1]], i64 [[TMP0]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[IDX_EXT31]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[PIX2]], i64 [[TMP3]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 [[TMP7]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[IDX_EXT31]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 [[TMP10]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <4 x i32> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = sub nsw <4 x i32> [[TMP6]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = shl nsw <4 x i32> [[TMP15]], splat (i32 16) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[ARRAYIDX10]], i64 [[TMP18]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[IDX_EXT31]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[ARRAYIDX12]], i64 [[TMP21]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw <4 x i32> [[TMP20]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[ARRAYIDX15]], i64 [[TMP25]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP27:%.*]] = zext <4 x i8> [[TMP26]] to <4 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[IDX_EXT31]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr align 1 [[ARRAYIDX17]], i64 [[TMP28]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = sub nsw <4 x i32> [[TMP27]], [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP24]] +; CHECK-NEXT: [[TMP33:%.*]] = sub nsw <4 x i32> [[TMP24]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = shl nsw <4 x i32> [[TMP33]], splat (i32 16) +; CHECK-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[TMP32]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = sub nsw <4 x i32> [[TMP17]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP36]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP17]] +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <4 x i32> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x i32> [[TMP40]], <4 x i32> [[TMP41]], <4 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[TMP42]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[TMP42]], [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = sub nsw <4 x i32> [[TMP42]], [[TMP43]] +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i32> [[TMP44]], <4 x i32> [[TMP45]], <4 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = lshr <4 x i32> [[TMP46]], splat (i32 15) +; CHECK-NEXT: [[TMP48:%.*]] = and <4 x i32> [[TMP47]], splat (i32 65537) +; CHECK-NEXT: [[TMP49:%.*]] = mul nuw <4 x i32> [[TMP48]], splat (i32 65535) +; CHECK-NEXT: [[TMP50:%.*]] = add <4 x i32> [[TMP49]], [[TMP46]] +; CHECK-NEXT: [[TMP117:%.*]] = xor <4 x i32> [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP118:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP117]]) -; CHECK-NEXT: [[SHR84:%.*]] = lshr i32 [[TMP118]], 1 +; CHECK-NEXT: [[CONV78:%.*]] = and i32 [[TMP118]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP118]], 16 +; CHECK-NEXT: [[ADD80:%.*]] = add nuw nsw i32 [[SHR]], [[CONV78]] +; CHECK-NEXT: [[TMP53:%.*]] = add nsw <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> [[TMP54]], <4 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = add nsw <4 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <4 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP58]], <4 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = lshr <4 x i32> [[TMP59]], splat (i32 15) +; CHECK-NEXT: [[TMP61:%.*]] = and <4 x i32> [[TMP60]], splat (i32 65537) +; CHECK-NEXT: [[TMP62:%.*]] = mul nuw <4 x i32> [[TMP61]], splat (i32 65535) +; CHECK-NEXT: [[TMP63:%.*]] = add <4 x i32> [[TMP62]], [[TMP59]] +; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i32> [[TMP63]], [[TMP62]] +; CHECK-NEXT: [[TMP65:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP64]]) +; CHECK-NEXT: [[CONV78_1:%.*]] = and i32 [[TMP65]], 65535 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[TMP65]], 16 +; CHECK-NEXT: [[ADD79_1:%.*]] = add nuw nsw i32 [[SHR_1]], [[ADD80]] +; CHECK-NEXT: [[ADD80_1:%.*]] = add nuw nsw i32 [[ADD79_1]], [[CONV78_1]] +; CHECK-NEXT: [[SHR84:%.*]] = lshr i32 [[ADD80_1]], 1 ; CHECK-NEXT: ret i32 [[SHR84]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index e3279290c3074..dbceefdb2e986 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -5,38 +5,34 @@ define void @Test(i32) { ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], -; CHECK-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685 +; CHECK-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]] -; CHECK-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]] +; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]] ; CHECK-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 +; CHECK-NEXT: [[TMP10]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_44]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2 -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], -; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685 +; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]] -; FORCE_REDUCTION-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]] +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] +; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]] ; FORCE_REDUCTION-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_43]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_44]], i32 1 ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll b/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll index 5d0898d0d1a97..29f7aa4beb0d9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll @@ -9,43 +9,43 @@ define void @test(i32 %arg, i32 %arg1, i64 %arg2) { ; CHECK: [[BB3]]: ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 0, %[[BB3]] ], [ 0, %[[BB]] ] ; CHECK-NEXT: [[PHI4:%.*]] = phi i64 [ [[OP_RDX7:%.*]], %[[BB3]] ], [ 0, %[[BB]] ] -; CHECK-NEXT: [[SHL:%.*]] = shl i32 0, 1 -; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[SHL]] to i64 -; CHECK-NEXT: [[SHL17:%.*]] = shl i32 [[SHL]], 0 +; CHECK-NEXT: [[SHL34:%.*]] = shl i32 0, 1 +; CHECK-NEXT: [[OP_RDX5:%.*]] = sext i32 [[SHL34]] to i64 +; CHECK-NEXT: [[SHL18:%.*]] = shl i32 [[SHL34]], 0 ; CHECK-NEXT: [[ADD18:%.*]] = add i64 1, 0 ; CHECK-NEXT: [[TRUNC19:%.*]] = trunc i64 [[ADD18]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> , i64 [[PHI]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i64> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; CHECK-NEXT: [[TRUNC10:%.*]] = trunc i64 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> , i64 [[PHI]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i64> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; CHECK-NEXT: [[TRUNC10:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[OR11:%.*]] = or i32 [[TRUNC10]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i64> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = mul <4 x i64> [[TMP7]], [[TMP8]] ; CHECK-NEXT: [[XOR:%.*]] = xor i32 0, [[TRUNC]] ; CHECK-NEXT: [[SEXT15:%.*]] = sext i32 [[XOR]] to i64 -; CHECK-NEXT: [[SEXT22:%.*]] = sext i32 [[SHL17]] to i64 -; CHECK-NEXT: [[XOR24:%.*]] = xor i32 [[ARG]], [[TRUNC10]] -; CHECK-NEXT: [[SEXT25:%.*]] = sext i32 [[XOR24]] to i64 +; CHECK-NEXT: [[SEXT23:%.*]] = sext i32 [[SHL18]] to i64 +; CHECK-NEXT: [[SHL:%.*]] = xor i32 [[ARG]], [[TRUNC10]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = sext i32 [[SHL]] to i64 ; CHECK-NEXT: [[TRUNC27:%.*]] = trunc i64 [[ARG2]] to i32 -; CHECK-NEXT: [[SEXT29:%.*]] = sext i32 [[SHL]] to i64 -; CHECK-NEXT: [[XOR31:%.*]] = xor i32 [[ARG1]], [[TRUNC19]] -; CHECK-NEXT: [[SEXT32:%.*]] = sext i32 [[XOR31]] to i64 -; CHECK-NEXT: [[SHL34:%.*]] = shl i32 [[ARG1]], 0 ; CHECK-NEXT: [[SEXT35:%.*]] = sext i32 [[SHL34]] to i64 +; CHECK-NEXT: [[SHL17:%.*]] = xor i32 [[ARG1]], [[TRUNC19]] +; CHECK-NEXT: [[SEXT22:%.*]] = sext i32 [[SHL17]] to i64 +; CHECK-NEXT: [[SHL35:%.*]] = shl i32 [[ARG1]], 0 +; CHECK-NEXT: [[SEXT36:%.*]] = sext i32 [[SHL35]] to i64 ; CHECK-NEXT: [[XOR37:%.*]] = xor i32 [[ARG]], [[TRUNC27]] ; CHECK-NEXT: [[SEXT38:%.*]] = sext i32 [[XOR37]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP9]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP7]], [[SEXT]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i64 [[SEXT15]], [[SEXT22]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = add i64 [[SEXT25]], [[SEXT29]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = add i64 [[SEXT32]], [[SEXT35]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = add i64 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX5:%.*]] = add i64 [[OP_RDX2]], [[OP_RDX3]] +; CHECK-NEXT: [[OP_RDX4:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP13]]) ; CHECK-NEXT: [[OP_RDX6:%.*]] = add i64 [[OP_RDX4]], [[OP_RDX5]] -; CHECK-NEXT: [[OP_RDX7]] = add i64 [[OP_RDX6]], [[SEXT38]] +; CHECK-NEXT: [[OP_RDX8:%.*]] = add i64 [[SEXT15]], [[SEXT23]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i64 [[OP_RDX1]], [[SEXT35]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = add i64 [[SEXT22]], [[SEXT36]] +; CHECK-NEXT: [[OP_RDX9:%.*]] = add i64 [[OP_RDX6]], [[OP_RDX8]] +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i64 [[OP_RDX2]], [[OP_RDX3]] +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i64 [[OP_RDX9]], [[OP_RDX10]] +; CHECK-NEXT: [[OP_RDX7]] = add i64 [[OP_RDX11]], [[SEXT38]] ; CHECK-NEXT: br i1 false, label %[[BB40:.*]], label %[[BB3]] ; CHECK: [[BB40]]: ; CHECK-NEXT: [[PHI41:%.*]] = phi i64 [ [[OP_RDX7]], %[[BB3]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll index 56919ae0ffc90..1cf837df719ec 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll @@ -10,9 +10,9 @@ define i32 @foo() { ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[RDX_OP]]) -; CHECK-NEXT: [[OP_RDX7:%.*]] = mul i32 0, [[TMP2]] ; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 0, [[TMP5]] -; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX7]], [[OP_RDX]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0 +; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX1]], [[TMP2]] ; CHECK-NEXT: ret i32 [[OP_RDX6]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll index 2774d5f3b64e4..f7811aba5ab5f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll @@ -7,17 +7,16 @@ define <4 x i16> @test() { ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[RDX_OP1:%.*]] = or <16 x i16> [[RDX_OP]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]]) ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]]) ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]]) ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2 -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]]) ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3 ; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer