diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index cf460907bc7246..c206a5c982a2fb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -104,7 +104,7 @@ class VectorCombine {
   bool scalarizeLoadExtract(Instruction &I);
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
-  bool foldSelectShuffle(Instruction &I);
+  bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
 
   void replaceValue(Value &Old, Value &New) {
     Old.replaceAllUsesWith(&New);
@@ -1223,7 +1223,9 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
     replaceValue(*Shuffle, *NewShuffle);
   }
 
-  return false;
+  // See if we can re-use foldSelectShuffle, getting it to reduce the size of
+  // the shuffle into a nicer order, as it can ignore the order of the shuffles.
+  return foldSelectShuffle(*Shuffle, true);
 }
 
 /// This method looks for groups of shuffles acting on binops, of the form:
@@ -1236,7 +1238,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
 /// the shuffle to a form where only parts of a and b need to be computed. On
 /// architectures with no obvious "select" shuffle, this can reduce the total
 /// number of operations if the target reports them as cheaper.
-bool VectorCombine::foldSelectShuffle(Instruction &I) {
+bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
   auto *VT = dyn_cast<FixedVectorType>(I.getType());
   if (!SVI || !VT)
     return false;
@@ -1275,6 +1277,10 @@ bool VectorCombine::foldSelectShuffle(Instruction &I) {
   };
   if (!collectShuffles(Op0) || !collectShuffles(Op1))
     return false;
+  // From a reduction, we need to be processing a single shuffle, otherwise the
+  // other uses will not be lane-invariant.
+  if (FromReduction && Shuffles.size() > 1)
+    return false;
 
   // For each of the output shuffles, we try to sort all the first vector
   // elements to the beginning, followed by the second array elements at the
@@ -1328,6 +1334,10 @@ bool VectorCombine::foldSelectShuffle(Instruction &I) {
       }
     }
 
+    // For reductions, we know that the lane ordering out doesn't alter the
+    // result. In-order can help simplify the shuffle away.
+    if (FromReduction)
+      sort(ReconstructMask);
     ReconstructMasks.push_back(ReconstructMask);
   }
 
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
index 3d6507a6e1f981..d6eab5a3c9555d 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
@@ -22,11 +22,13 @@ define <16 x i32> @test1(<16 x i32> %x, <16 x i32> %y) {
 
 define i32 @test1_reduce(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: @test1_reduce(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
-; CHECK-NEXT:    [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[S3]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -512,23 +514,27 @@ define dso_local i32 @full(i8* nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32>
 ; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP62]], [[TMP64]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP65]]
-; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
-; CHECK-NEXT:    [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = add nsw <16 x i32> [[TMP68]], [[REORDER2]]
-; CHECK-NEXT:    [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP68]], [[REORDER2]]
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32>
-; CHECK-NEXT:    [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32>
-; CHECK-NEXT:    [[TMP72:%.*]] = add nsw <16 x i32> [[TMP71]], [[REORDER3]]
-; CHECK-NEXT:    [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP71]], [[REORDER3]]
-; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32>
-; CHECK-NEXT:    [[TMP75:%.*]] = lshr <16 x i32> [[TMP74]],
-; CHECK-NEXT:    [[TMP76:%.*]] = and <16 x i32> [[TMP75]],
-; CHECK-NEXT:    [[TMP77:%.*]] = mul nuw <16 x i32> [[TMP76]],
-; CHECK-NEXT:    [[TMP78:%.*]] = add <16 x i32> [[TMP77]], [[TMP74]]
-; CHECK-NEXT:    [[TMP79:%.*]] = xor <16 x i32> [[TMP78]], [[TMP77]]
-; CHECK-NEXT:    [[TMP80:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP79]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP80]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP80]], 16
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT:    [[TMP72:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP70]]
+; CHECK-NEXT:    [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP69]], [[TMP71]]
+; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32>
+; CHECK-NEXT:    [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32>
+; CHECK-NEXT:    [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32>
+; CHECK-NEXT:    [[TMP77:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32>
+; CHECK-NEXT:    [[TMP78:%.*]] = add nsw <16 x i32> [[TMP74]], [[TMP76]]
+; CHECK-NEXT:    [[TMP79:%.*]] = sub nsw <16 x i32> [[TMP75]], [[TMP77]]
+; CHECK-NEXT:    [[TMP80:%.*]] = shufflevector <16 x i32> [[TMP78]], <16 x i32> [[TMP79]], <16 x i32>
+; CHECK-NEXT:    [[TMP81:%.*]] = lshr <16 x i32> [[TMP80]],
+; CHECK-NEXT:    [[TMP82:%.*]] = and <16 x i32> [[TMP81]],
+; CHECK-NEXT:    [[TMP83:%.*]] = mul nuw <16 x i32> [[TMP82]],
+; CHECK-NEXT:    [[TMP84:%.*]] = add <16 x i32> [[TMP83]], [[TMP80]]
+; CHECK-NEXT:    [[TMP85:%.*]] = xor <16 x i32> [[TMP84]], [[TMP83]]
+; CHECK-NEXT:    [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP85]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP86]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP86]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
@@ -719,23 +725,27 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
 ; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32>
 ; CHECK-NEXT:    [[TMP58:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP57]]
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
-; CHECK-NEXT:    [[REORDER192:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP60]], [[REORDER192]]
-; CHECK-NEXT:    [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP60]], [[REORDER192]]
-; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32>
-; CHECK-NEXT:    [[REORDER193:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32>
-; CHECK-NEXT:    [[TMP64:%.*]] = add nsw <16 x i32> [[TMP63]], [[REORDER193]]
-; CHECK-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP63]], [[REORDER193]]
-; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32>
-; CHECK-NEXT:    [[TMP67:%.*]] = lshr <16 x i32> [[TMP66]],
-; CHECK-NEXT:    [[TMP68:%.*]] = and <16 x i32> [[TMP67]],
-; CHECK-NEXT:    [[TMP69:%.*]] = mul nuw <16 x i32> [[TMP68]],
-; CHECK-NEXT:    [[TMP70:%.*]] = add <16 x i32> [[TMP69]], [[TMP66]]
-; CHECK-NEXT:    [[TMP71:%.*]] = xor <16 x i32> [[TMP70]], [[TMP69]]
-; CHECK-NEXT:    [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP71]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP72]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP72]], 16
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
+; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32>
+; CHECK-NEXT:    [[TMP64:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT:    [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP61]], [[TMP63]]
+; CHECK-NEXT:    [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32>
+; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32>
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32>
+; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32>
+; CHECK-NEXT:    [[TMP70:%.*]] = add nsw <16 x i32> [[TMP66]], [[TMP68]]
+; CHECK-NEXT:    [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP69]]
+; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32>
+; CHECK-NEXT:    [[TMP73:%.*]] = lshr <16 x i32> [[TMP72]],
+; CHECK-NEXT:    [[TMP74:%.*]] = and <16 x i32> [[TMP73]],
+; CHECK-NEXT:    [[TMP75:%.*]] = mul nuw <16 x i32> [[TMP74]],
+; CHECK-NEXT:    [[TMP76:%.*]] = add <16 x i32> [[TMP75]], [[TMP72]]
+; CHECK-NEXT:    [[TMP77:%.*]] = xor <16 x i32> [[TMP76]], [[TMP75]]
+; CHECK-NEXT:    [[TMP78:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP77]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP78]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP78]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
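
Note (not part of the patch): a minimal standalone C++ sketch of the reasoning behind the new sort(ReconstructMask) call. Feeding an add reduction, any permutation of the lanes produces the same result, so the reconstruction mask can be put in order, and an in-order mask gives later folds a chance to remove the shuffle entirely. Vec4 and applyMask below are toy stand-ins for illustration only, not LLVM APIs.

#include <array>
#include <cassert>
#include <numeric>

// Toy 4-lane "vector" and shuffle mask; the real pass works on
// llvm::ShuffleVectorInst masks, not on this type.
using Vec4 = std::array<int, 4>;

static Vec4 applyMask(const Vec4 &V, const std::array<int, 4> &Mask) {
  Vec4 R{};
  for (int I = 0; I < 4; ++I)
    R[I] = V[Mask[I]]; // lane I takes element Mask[I] of the source
  return R;
}

static int reduceAdd(const Vec4 &V) {
  return std::accumulate(V.begin(), V.end(), 0);
}

int main() {
  Vec4 V = {1, 2, 3, 4};
  std::array<int, 4> Shuffled = {2, 0, 3, 1}; // some out-of-order mask
  std::array<int, 4> Sorted = {0, 1, 2, 3};   // the same lanes, sorted

  // The add reduction is insensitive to lane order, so the sorted
  // (identity) mask gives the same sum and the shuffle can be dropped.
  assert(reduceAdd(applyMask(V, Shuffled)) == reduceAdd(applyMask(V, Sorted)));
  return 0;
}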