Skip to content

Commit

Permalink
[SLP]Outline and fix code for finding common insertelement vectors.
Browse files Browse the repository at this point in the history
Outline the code that finds common vectors in insertelement instructions
into a separate function so that future patches can reuse it. The change
also improves the process by adding some extra checks for early exit and
fixes a bug where a match was always found because a value was erroneously
compared with itself.

Differential Revision: https://reviews.llvm.org/D114909
  • Loading branch information
alexey-bataev committed Dec 2, 2021
1 parent b8f1ccb commit 8ceccbd
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 65 deletions.
63 changes: 42 additions & 21 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -5375,6 +5375,42 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}

/// Returns true if the insertelement instructions \p VU and \p V are parts of
/// the same buildvector sequence, i.e. one of them is reached from the other
/// by repeatedly following the vector operand (operand 0) of insertelement
/// instructions.
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU,
                                            InsertElementInst *V) {
  // Both inserts must live in the same basic block and produce the same
  // vector type to be candidates for a common buildvector.
  if (VU->getParent() != V->getParent() || VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes: bail out when neither of the
  // two inserts has exactly one use.
  if (!(VU->hasOneUse() || V->hasOneUse()))
    return false;

  // Moves one step up a chain of inserts. The walk stops (yields nullptr) once
  // the chain is already exhausted, when an intermediate insert (anything but
  // the chain's starting point) does not have exactly one use, or when the
  // vector operand is not an insertelement instruction.
  auto StepToVectorOperand =
      [](InsertElementInst *Cur,
         InsertElementInst *Start) -> InsertElementInst * {
    if (!Cur)
      return nullptr;
    if (Cur != Start && !Cur->hasOneUse())
      return nullptr;
    return dyn_cast<InsertElementInst>(Cur->getOperand(0));
  };

  InsertElementInst *ChainFromVU = VU;
  InsertElementInst *ChainFromV = V;
  // Walk both chains in lockstep, looking for VU among the vectors feeding V
  // or for V among the vectors feeding VU.
  for (;;) {
    if (ChainFromV == VU || ChainFromVU == V)
      return true;
    ChainFromVU = StepToVectorOperand(ChainFromVU, VU);
    ChainFromV = StepToVectorOperand(ChainFromV, V);
    // Both chains ended without meeting the other insert.
    if (!ChainFromVU && !ChainFromV)
      return false;
  }
}

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
Expand Down Expand Up @@ -5422,29 +5458,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {

// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
if (isa_and_nonnull<InsertElementInst>(EU.User)) {
if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
Optional<int> InsertIdx = getInsertIndex(VU, 0);
if (!InsertIdx || *InsertIdx == UndefMaskElem)
continue;
Value *VU = EU.User;
auto *It = find_if(FirstUsers, [VU](Value *V) {
// Checks if 2 insertelements are from the same buildvector.
if (VU->getType() != V->getType())
return false;
auto *IE1 = cast<InsertElementInst>(VU);
auto *IE2 = cast<InsertElementInst>(V);
// Go through of insertelement instructions trying to find either VU
// as the original vector for IE2 or V as the original vector for IE1.
do {
if (IE1 == VU || IE2 == V)
return true;
if (IE1)
IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
if (IE2)
IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
} while (IE1 || IE2);
return false;
return areTwoInsertFromSameBuildVector(VU,
cast<InsertElementInst>(V));
});
int VecId = -1;
if (It == FirstUsers.end()) {
Expand All @@ -5455,7 +5476,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
while (isa<InsertElementInst>(Base)) {
// Build the mask for the vectorized insertelement instructions.
if (const TreeEntry *E = getTreeEntry(Base)) {
VU = Base;
VU = cast<InsertElementInst>(Base);
do {
int Idx = E->findLaneForValue(Base);
ShuffleMask.back()[Idx] = Idx;
Expand Down
64 changes: 42 additions & 22 deletions llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX

;
; Check that we can commute operands based on the predicate.
Expand Down Expand Up @@ -235,26 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
}

define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
; CHECK-LABEL: @fcmp_ord_uno_v4i32(
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[R]]
; SSE-LABEL: @fcmp_ord_uno_v4i32(
; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; SSE-NEXT: ret <4 x i32> [[R]]
;
; AVX-LABEL: @fcmp_ord_uno_v4i32(
; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; AVX-NEXT: ret <4 x i32> [[R]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
Expand Down
64 changes: 42 additions & 22 deletions llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX

;
; Check that we can commute operands based on the predicate.
Expand Down Expand Up @@ -235,26 +235,46 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
}

define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
; CHECK-LABEL: @fcmp_ord_uno_v4i32(
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[R]]
; SSE-LABEL: @fcmp_ord_uno_v4i32(
; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; SSE-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; SSE-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
; SSE-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; SSE-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; SSE-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; SSE-NEXT: ret <4 x i32> [[R]]
;
; AVX-LABEL: @fcmp_ord_uno_v4i32(
; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
; AVX-NEXT: ret <4 x i32> [[R]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
Expand Down

0 comments on commit 8ceccbd

Please sign in to comment.