[SLP]Outline and fix code for finding common insertelement vectors.

Outline the code for finding common vectors in insertelement instructions into a separate function, in preparation for future patches. The patch also improves the process by adding extra checks for early exit, and fixes a bug where a match was always found because a value was erroneously compared with itself. Differential Revision: https://reviews.llvm.org/D114909
llvm · Dec 2, 2021 · 8ceccbd · 8ceccbd
1 parent b8f1ccb
commit 8ceccbd
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 65 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5375,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const {
   return Cost;
 }
 
+/// Check if two insertelement instructions are from the same buildvector.
+static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
+                                            InsertElementInst *V) {
+  // Instructions must be from the same basic block.
+  if (VU->getParent() != V->getParent())
+    return false;
+  // Both insertelements must produce the same vector type.
+  if (VU->getType() != V->getType())
+    return false;
+  // If neither insert has exactly one use, treat them as separate nodes.
+  if (!VU->hasOneUse() && !V->hasOneUse())
+    return false;
+  auto *IE1 = VU;
+  auto *IE2 = V;
+  // Walk up the vector-operand (operand 0) chains of both insertelements in
+  // lockstep, trying to find either VU on the chain starting at V or V on the
+  // chain starting at VU.
+  do {
+    if (IE2 == VU || IE1 == V)
+      return true;
+    if (IE1) {
+      if (IE1 != VU && !IE1->hasOneUse())
+        IE1 = nullptr;
+      else
+        IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+    }
+    if (IE2) {
+      if (IE2 != V && !IE2->hasOneUse())
+        IE2 = nullptr;
+      else
+        IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+    }
+  } while (IE1 || IE2);
+  return false;
+}
+
 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   InstructionCost Cost = 0;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
@@ -5422,29 +5458,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
 
     // If found user is an insertelement, do not calculate extract cost but try
     // to detect it as a final shuffled/identity match.
-    if (isa_and_nonnull<InsertElementInst>(EU.User)) {
-      if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
-        Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
+      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+        Optional<int> InsertIdx = getInsertIndex(VU, 0);
         if (!InsertIdx || *InsertIdx == UndefMaskElem)
           continue;
-        Value *VU = EU.User;
         auto *It = find_if(FirstUsers, [VU](Value *V) {
-          // Checks if 2 insertelements are from the same buildvector.
-          if (VU->getType() != V->getType())
-            return false;
-          auto *IE1 = cast<InsertElementInst>(VU);
-          auto *IE2 = cast<InsertElementInst>(V);
-          // Go through of insertelement instructions trying to find either VU
-          // as the original vector for IE2 or V as the original vector for IE1.
-          do {
-            if (IE1 == VU || IE2 == V)
-              return true;
-            if (IE1)
-              IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
-            if (IE2)
-              IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
-          } while (IE1 || IE2);
-          return false;
+          return areTwoInsertFromSameBuildVector(VU,
+                                                 cast<InsertElementInst>(V));
         });
         int VecId = -1;
         if (It == FirstUsers.end()) {
@@ -5455,7 +5476,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           while (isa<InsertElementInst>(Base)) {
             // Build the mask for the vectorized insertelement instructions.
             if (const TreeEntry *E = getTreeEntry(Base)) {
-              VU = Base;
+              VU = cast<InsertElementInst>(Base);
               do {
                 int Idx = E->findLaneForValue(Base);
                 ShuffleMask.back()[Idx] = Idx;

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX
 
 ;
 ; Check that we can commute operands based on the predicate.
@@ -235,26 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
 }
 
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; CHECK-LABEL: @fcmp_ord_uno_v4i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; CHECK-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; CHECK-NEXT:    [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE-LABEL: @fcmp_ord_uno_v4i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; SSE-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; SSE-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; SSE-NEXT:    [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX-LABEL: @fcmp_ord_uno_v4i32(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; AVX-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; AVX-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
+; AVX-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
+; AVX-NEXT:    [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; AVX-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
+; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; AVX-NEXT:    ret <4 x i32> [[R]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX
 
 ;
 ; Check that we can commute operands based on the predicate.
@@ -235,26 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
 }
 
 define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
-; CHECK-LABEL: @fcmp_ord_uno_v4i32(
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; CHECK-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; CHECK-NEXT:    [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE-LABEL: @fcmp_ord_uno_v4i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; SSE-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; SSE-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; SSE-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; SSE-NEXT:    [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX-LABEL: @fcmp_ord_uno_v4i32(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; AVX-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; AVX-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
+; AVX-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
+; AVX-NEXT:    [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+; AVX-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
+; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; AVX-NEXT:    ret <4 x i32> [[R]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1