Patch summary (free text ahead of the first diff header; not part of any hunk).

Files touched:
  * llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (modified)
  * llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll (new file)

What the hunks do:
  * getInsertIndex: the InsertInst parameter becomes `const Value *` and the
    locals produced by the casts (IE, CI, IV, ST, AT) become `const auto *`.
    No other statement in the shown hunks changes.
  * areTwoInsertFromSameBuildVector: Idx1/Idx2 (the insert indices of VU and V)
    are computed before the walk. While following operand 0 of each
    insertelement, the walk along IE1 now also stops (IE1 = nullptr) when
    getInsertIndex(IE1).getValueOr(Idx2) == Idx2, i.e. when IE1 writes the lane
    Idx2 or its index is unavailable; the IE2 side is symmetric with Idx1.
  * isFirstInsertElement: the same restriction is applied; I1 (resp. I2) only
    advances to its operand 0 when its insert index differs from Idx2 (resp.
    Idx1), with an unavailable index treated as equal.
  * BoUpSLP::getTreeCost: the loop over Base now breaks for an IEBase other
    than EU.User when it has more than one use OR when
    getInsertIndex(IEBase).getValueOr(*InsertIdx) == *InsertIdx.
  * New test: runs `opt -S -slp-vectorizer` with an x86_64 android triple on a
    function in which %11 and %13 both insert into lane 0 (`i64 0`), %13 taking
    %11 as its vector operand. CHECK lines are autogenerated by
    utils/update_test_checks.py.

NOTE(review): `unsigned Idx1 = *getInsertIndex(IE1);` and the matching Idx2
lines (in both areTwoInsertFromSameBuildVector and isFirstInsertElement)
dereference the Optional without a check, while the first hunk shows
getInsertIndex returning None (e.g. when the constant index is uge the vector's
element count). Presumably callers only pass inserts with a valid constant
index - TODO confirm, or return early when either index is None.

NOTE(review): in this copy the angle-bracketed template arguments look stripped
(e.g. `Optional getInsertIndex`, `dyn_cast(InsertInst)`, `ArrayRef Indices`)
and the original line breaks are collapsed, so the text below will presumably
not apply as-is - confirm against the original commit before using it.

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7f7e9e3d0c4632..b9f86cd10498be 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -737,11 +737,11 @@ static void inversePermutation(ArrayRef Indices, /// \returns inserting index of InsertElement or InsertValue instruction, /// using Offset as base offset for index. -static Optional getInsertIndex(Value *InsertInst, +static Optional getInsertIndex(const Value *InsertInst, unsigned Offset = 0) { int Index = Offset; - if (auto *IE = dyn_cast(InsertInst)) { - if (auto *CI = dyn_cast(IE->getOperand(2))) { + if (const auto *IE = dyn_cast(InsertInst)) { + if (const auto *CI = dyn_cast(IE->getOperand(2))) { auto *VT = cast(IE->getType()); if (CI->getValue().uge(VT->getNumElements())) return None; @@ -752,13 +752,13 @@ static Optional getInsertIndex(Value *InsertInst, return None; } - auto *IV = cast(InsertInst); + const auto *IV = cast(InsertInst); Type *CurrentType = IV->getType(); for (unsigned I : IV->indices()) { - if (auto *ST = dyn_cast(CurrentType)) { + if (const auto *ST = dyn_cast(CurrentType)) { Index *= ST->getNumElements(); CurrentType = ST->getElementType(I); - } else if (auto *AT = dyn_cast(CurrentType)) { + } else if (const auto *AT = dyn_cast(CurrentType)) { Index *= AT->getNumElements(); CurrentType = AT->getElementType(); } else { @@ -6556,6 +6556,8 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, return false; auto *IE1 = VU; auto *IE2 = V; + unsigned Idx1 = *getInsertIndex(IE1); + unsigned Idx2 = *getInsertIndex(IE2); // Go through the vector operand of insertelement instructions trying to find // either VU as the original vector for IE2 or V as the original vector for // IE1. 
@@ -6563,13 +6565,15 @@ static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, if (IE2 == VU || IE1 == V) return true; if (IE1) { - if (IE1 != VU && !IE1->hasOneUse()) + if ((IE1 != VU && !IE1->hasOneUse()) || + getInsertIndex(IE1).getValueOr(Idx2) == Idx2) IE1 = nullptr; else IE1 = dyn_cast(IE1->getOperand(0)); } if (IE2) { - if (IE2 != V && !IE2->hasOneUse()) + if ((IE2 != V && !IE2->hasOneUse()) || + getInsertIndex(IE2).getValueOr(Idx1) == Idx1) IE2 = nullptr; else IE2 = dyn_cast(IE2->getOperand(0)); @@ -6586,6 +6590,8 @@ static bool isFirstInsertElement(const InsertElementInst *IE1, const auto *I2 = IE2; const InsertElementInst *PrevI1; const InsertElementInst *PrevI2; + unsigned Idx1 = *getInsertIndex(IE1); + unsigned Idx2 = *getInsertIndex(IE2); do { if (I2 == IE1) return true; @@ -6593,9 +6599,11 @@ static bool isFirstInsertElement(const InsertElementInst *IE1, return false; PrevI1 = I1; PrevI2 = I2; - if (I1 && (I1 == IE1 || I1->hasOneUse())) + if (I1 && (I1 == IE1 || I1->hasOneUse()) && + getInsertIndex(I1).getValueOr(Idx2) != Idx2) I1 = dyn_cast(I1->getOperand(0)); - if (I2 && (I2 == IE2 || I2->hasOneUse())) + if (I2 && ((I2 == IE2 || I2->hasOneUse())) && + getInsertIndex(I2).getValueOr(Idx1) != Idx1) I2 = dyn_cast(I2->getOperand(0)); } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2)); llvm_unreachable("Two different buildvectors not expected."); @@ -6764,7 +6772,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { // Find the insertvector, vectorized in tree, if any. Value *Base = VU; while (auto *IEBase = dyn_cast(Base)) { - if (IEBase != EU.User && !IEBase->hasOneUse()) + if (IEBase != EU.User && + (!IEBase->hasOneUse() || + getInsertIndex(IEBase).getValueOr(*InsertIdx) == *InsertIdx)) break; // Build the mask for the vectorized insertelement instructions. 
if (const TreeEntry *E = getTreeEntry(IEBase)) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll new file mode 100644 index 00000000000000..0d3c7809e868e3 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +;RUN: opt -S -slp-vectorizer -mtriple=x86_64-unknown-linux-android23 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr undef, i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr undef, align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0 +; CHECK-NEXT: store <2 x float> zeroinitializer, ptr null, align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP11]], i64 0 +; CHECK-NEXT: store <2 x float> zeroinitializer, ptr null, align 4 +; CHECK-NEXT: ret void +; + %1 = getelementptr inbounds float, ptr undef, i32 2 + %2 = load float, ptr %1, align 4 + %3 = load float, ptr undef, align 4 + %4 = fsub float %2, %3 + %5 = getelementptr inbounds float, ptr undef, i32 3 + %6 = load float, ptr %5, align 4 + %7 = getelementptr inbounds float, ptr undef, i32 1 + %8 = load float, ptr %7, align 4 + %9 = 
fsub float %6, %8 + %10 = fcmp olt float %9, %4 + %11 = insertelement <2 x float> undef, float %3, i64 0 + %12 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0 + store <2 x float> zeroinitializer, ptr null, align 4 + %13 = insertelement <2 x float> %11, float %6, i64 0 + store <2 x float> zeroinitializer, ptr null, align 4 + ret void +}