diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 900df48c80d8b2..5a88b5c223d799 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4032,10 +4032,15 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
   if (VectorizableTree.size() != 2)
     return false;
 
-  // Handle splat and all-constants stores.
+  // Handle splat and all-constants stores. Also try to vectorize tiny trees
+  // where the second node is a gather with fewer scalar operands than the
+  // initial tree node (shuffling the smaller gather may be profitable).
   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
       (allConstant(VectorizableTree[1]->Scalars) ||
-       isSplat(VectorizableTree[1]->Scalars)))
+       isSplat(VectorizableTree[1]->Scalars) ||
+       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+        VectorizableTree[1]->Scalars.size() <
+            VectorizableTree[0]->Scalars.size())))
     return true;
 
   // Gathering cost would be too much for tiny trees.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
index 65f02770ab853b..5548a828b778aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -272,21 +272,18 @@ define void @tiny_vector_gather(i32 *%a, i32 *%v1, i32 *%v2) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[V1:%.*]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[V2:%.*]], align 4
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR0]], align 16
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR1]], align 4
 ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR2]], align 8
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR3]], align 4
 ; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR4]], align 16
 ; CHECK-NEXT:    [[PTR5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 5
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR5]], align 4
 ; CHECK-NEXT:    [[PTR6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 6
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR6]], align 8
 ; CHECK-NEXT:    [[PTR7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 7
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR7]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[PTR0]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
  %1 = load i32, i32* %v1, align 4