[SLP]Improve analysis/emission of vector operands for alternate nodes.
The compiler has an analysis for perfect diamond matching, but it does not
support nodes with main/alternate opcodes. The problem is that the scalars
themselves differ and might not match other nodes directly, yet the operands
and the main/alternate opcodes might still match, allowing the compiler to
reuse previously emitted vector instructions. This analysis needs to be
included in the cost model and in the actual emission of vector instructions.

Differential Revision: https://reviews.llvm.org/D114101
alexey-bataev committed Nov 24, 2021
1 parent 1ad7de9 commit 496254c
Showing 4 changed files with 116 additions and 79 deletions.
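
The test updates below show the effect directly; here is a minimal before/after
sketch in LLVM IR, condensed from the build_vec_v2i64 test in this commit
(value numbering simplified):

    ; Before: the alternate node re-emitted an identical add/sub pair.
    %1 = add <2 x i64> %v0, %v1
    %2 = sub <2 x i64> %v0, %v1
    %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 2>
    %4 = add <2 x i64> %v0, %v1   ; duplicate of %1
    %5 = sub <2 x i64> %v0, %v1   ; duplicate of %2
    %6 = shufflevector <2 x i64> %4, <2 x i64> %5, <2 x i32> <i32 0, i32 3>
    %7 = add <2 x i64> %6, %3

    ; After: the diamond match reuses %1 and %2; only the shuffle differs.
    %1 = add <2 x i64> %v0, %v1
    %2 = sub <2 x i64> %v0, %v1
    %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 2>
    %4 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 3>
    %5 = add <2 x i64> %4, %3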
77 changes: 68 additions & 9 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1680,6 +1680,28 @@ class BoUpSLP {
return IsSame(Scalars, ReuseShuffleIndices);
}

/// \returns true if current entry has same operands as \p TE.
bool hasEqualOperands(const TreeEntry &TE) const {
if (TE.getNumOperands() != getNumOperands())
return false;
SmallBitVector Used(getNumOperands());
for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
unsigned PrevCount = Used.count();
for (unsigned K = 0; K < E; ++K) {
if (Used.test(K))
continue;
if (getOperand(K) == TE.getOperand(I)) {
Used.set(K);
break;
}
}
// Check if we actually found the matching operand.
if (PrevCount == Used.count())
return false;
}
return true;
}
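
// Illustrative usage sketch, not part of this change (E and Candidate are
// hypothetical TreeEntry pointers): the matching above is unordered, so an
// entry with operands {A, B} also matches a candidate with operands {B, A}:
//
//   if (Candidate->isAltShuffle() && Candidate->hasEqualOperands(*E))
//     /* Candidate's vectorized operands can be reused when emitting E */;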

/// \return Final vectorization factor for the node. Defined by the total
/// number of vectorized scalars, including those, used several times in the
/// entry and counted in the \a ReuseShuffleIndices, if any.
@@ -1773,6 +1795,12 @@ class BoUpSLP {
return Operands[OpIdx];
}

/// \returns the \p OpIdx operand of this TreeEntry.
ArrayRef<Value *> getOperand(unsigned OpIdx) const {
assert(OpIdx < Operands.size() && "Off bounds");
return Operands[OpIdx];
}

/// \returns the number of operands.
unsigned getNumOperands() const { return Operands.size(); }

@@ -2078,7 +2106,7 @@ class BoUpSLP {
SmallPtrSet<const Value *, 32> EphValues;

/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
SetVector<Instruction *> GatherShuffleSeq;

/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
@@ -5016,7 +5044,30 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
InstructionCost VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
// Try to find the previous shuffle node with the same operands and same
// main/alternate ops.
auto &&TryFindNodeWithEqualOperands = [this, E]() {
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
if (TE.get() == E)
break;
if (TE->isAltShuffle() &&
((TE->getOpcode() == E->getOpcode() &&
TE->getAltOpcode() == E->getAltOpcode()) ||
(TE->getOpcode() == E->getAltOpcode() &&
TE->getAltOpcode() == E->getOpcode())) &&
TE->hasEqualOperands(*E))
return true;
}
return false;
};
if (TryFindNodeWithEqualOperands()) {
LLVM_DEBUG({
dbgs() << "SLP: diamond match for alternate node found.\n";
E->dump();
});
// No need to add new vector costs here since we're going to reuse
// same main/alternate vector ops, just do different shuffling.
} else if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
@@ -5728,7 +5779,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
if (!InsElt)
return Vec;
GatherSeq.insert(InsElt);
GatherShuffleSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
if (TreeEntry *Entry = getTreeEntry(V)) {
@@ -5915,7 +5966,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
ShuffleBuilder.addMask(ReuseShuffleIndicies);
Vec = ShuffleBuilder.finalize(Vec);
if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
GatherShuffleSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
@@ -5953,7 +6004,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ShuffleBuilder.addMask(E->ReuseShuffleIndices);
Vec = ShuffleBuilder.finalize(Vec);
if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
GatherShuffleSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
@@ -6444,6 +6495,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
V1 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
}
// Add V0 and V1 to later analysis to try to find and remove matching
// instruction, if any.
for (Value *V : {V0, V1}) {
if (auto *I = dyn_cast<Instruction>(V)) {
GatherShuffleSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}

// Create shuffle to take alternate operations from the vector.
// Also, gather up main and alt scalar ops to propagate IR flags to
@@ -6657,10 +6716,10 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}

void BoUpSLP::optimizeGatherSequence() {
LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
for (Instruction *I : GatherSeq) {
for (Instruction *I : GatherShuffleSeq) {
if (isDeleted(I))
continue;

Expand Down Expand Up @@ -6719,7 +6778,7 @@ void BoUpSLP::optimizeGatherSequence() {
if (isDeleted(&In))
continue;
if (!isa<InsertElementInst>(&In) && !isa<ExtractElementInst>(&In) &&
!isa<ShuffleVectorInst>(&In))
!isa<ShuffleVectorInst>(&In) && !GatherShuffleSeq.contains(&In))
continue;

// Check if we can replace this instruction with any of the
@@ -6741,7 +6800,7 @@
}
}
CSEBlocks.clear();
GatherSeq.clear();
GatherShuffleSeq.clear();
}

// Groups the instructions to a bundle (which is then a single scheduling entity)
54 changes: 22 additions & 32 deletions llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -9,11 +9,9 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
; CHECK-NEXT: ret <2 x i64> [[TMP7]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <2 x i64> [[TMP5]]
;
%v0.0 = extractelement <2 x i64> %v0, i32 0
%v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -39,12 +37,10 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
; CHECK-NEXT: ret void
;
%a.0 = getelementptr i64, i64* %a, i64 0
@@ -73,11 +69,9 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: ret <4 x i32> [[TMP7]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -111,11 +105,9 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]]
;
%v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -224,17 +216,15 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
; CHECK-NEXT: ret i32 [[TMP13]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
54 changes: 22 additions & 32 deletions llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -9,11 +9,9 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
; CHECK-NEXT: ret <2 x i64> [[TMP7]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <2 x i64> [[TMP5]]
;
%v0.0 = extractelement <2 x i64> %v0, i32 0
%v0.1 = extractelement <2 x i64> %v0, i32 1
@@ -39,12 +37,10 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
; CHECK-NEXT: ret void
;
%a.0 = getelementptr i64, i64* %a, i64 0
@@ -73,11 +69,9 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: ret <4 x i32> [[TMP7]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
@@ -111,11 +105,9 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]]
;
%v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -224,17 +216,15 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
; CHECK-NEXT: ret i32 [[TMP13]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT: ret i32 [[TMP11]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
10 changes: 4 additions & 6 deletions llvm/test/Transforms/SLPVectorizer/X86/remark_alternate.ll
@@ -8,7 +8,7 @@
; YAML-NEXT: Function: build_vec_v2i64
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '-8'
; YAML-NEXT: - Cost: '-10'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '8'

@@ -17,11 +17,9 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]]
; CHECK-NEXT: ret <2 x i64> [[TMP7]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <2 x i64> [[TMP5]]
;
%v0.0 = extractelement <2 x i64> %v0, i32 0
%v0.1 = extractelement <2 x i64> %v0, i32 1
