[SLP]Improve reduction cost model for scalars.
Instead of an abstract cost for the scalar reduction ops, try to use the
cost of the actual reduction operation instructions, where possible. Also,
remove the estimation of the vectorized GEP pointers for reduced loads,
since that is already handled in the tree.

Differential Revision: https://reviews.llvm.org/D148036
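
For readers skimming the diff, here is a minimal standalone model of the scalar-cost accounting introduced by the EvaluateScalarCost lambda below. It is a sketch, not LLVM code, and every name in it (ReducedValue, scalarReductionCost, GenericOpCost) is hypothetical. The idea: rather than charging one abstract opcode cost repeated ReduxWidth - 1 times, each reduced value is charged the cost of the instructions that actually consume it, falling back to the generic opcode cost when its uses cannot all be attributed to the reduction (a cmp+select min/max consumes each value twice, which is presumably why the use threshold there is 3 versus 2 for plain binary ops).

#include <cstdio>
#include <functional>
#include <vector>

// One scalar value feeding the horizontal reduction, reduced to the two
// properties this sketch of the cost model cares about.
struct ReducedValue {
  int NumUses;                 // total number of uses of the scalar value
  std::vector<int> UserCosts;  // cost of each user instruction; -1 = not a
                               // recognizable part of the reduction
};

// Sum the scalar cost of an N-wide reduction: the last value needs no scalar
// reduction op, so only N-1 values contribute.
int scalarReductionCost(const std::vector<ReducedValue> &Vals,
                        bool IsCmpSelMinMax,
                        const std::function<int()> &GenericOpCost) {
  int Cost = 0;
  int Cnt = static_cast<int>(Vals.size());
  for (const ReducedValue &V : Vals) {
    if (Cnt == 1)
      break;  // final value: nothing left to combine it with
    --Cnt;
    // Too many uses to attribute them all to the reduction chain: keep the
    // conservative generic opcode cost for this value.
    if (V.NumUses >= (IsCmpSelMinMax ? 3 : 2)) {
      Cost += GenericOpCost();
      continue;
    }
    int PerValue = 0;
    bool Valid = true;
    for (int UserCost : V.UserCosts) {
      if (UserCost < 0) {  // user does not look like a reduction op
        Valid = false;
        break;
      }
      PerValue += UserCost;  // cost of the actual consuming instruction
    }
    Cost += Valid ? PerValue : GenericOpCost();
  }
  return Cost;
}

int main() {
  // Four values combined with integer adds, each add costing 1: only the
  // first three values carry a scalar op, so the expected cost is 3.
  std::vector<ReducedValue> Vals(4, ReducedValue{1, {1}});
  std::printf("scalar add-reduction cost: %d\n",
              scalarReductionCost(Vals, /*IsCmpSelMinMax=*/false,
                                  [] { return 1; }));
  return 0;
}
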
alexey-bataev committed Apr 12, 2023
1 parent caea93c commit b28f407
Showing 3 changed files with 84 additions and 92 deletions.
87 changes: 47 additions & 40 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1148,18 +1148,6 @@ class BoUpSLP {
/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);

/// Checks if the very first tree node is going to be vectorized.
bool isVectorizedFirstNode() const {
return !VectorizableTree.empty() &&
VectorizableTree.front()->State == TreeEntry::Vectorize;
}

/// Returns the main instruction for the very first node.
Instruction *getFirstNodeMainOp() const {
assert(!VectorizableTree.empty() && "No tree to get the first node from");
return VectorizableTree.front()->getMainOp();
}

/// Returns whether the root node has in-tree uses.
bool doesRootHaveInTreeUses() const {
return !VectorizableTree.empty() &&
@@ -13340,22 +13328,7 @@ class HorizontalReduction {
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
InstructionCost ReductionCost =
getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
Instruction *MainOp = V.getFirstNodeMainOp();
for (Value *V : VL) {
auto *VI = dyn_cast<LoadInst>(V);
// Add the costs of scalar GEP pointers, to be removed from the
// code.
if (!VI || VI == MainOp)
continue;
auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
continue;
TreeCost -= TTI->getArithmeticInstrCost(
Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
}
}
getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
if (!Cost.isValid())
@@ -13591,7 +13564,8 @@ class HorizontalReduction {
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
unsigned ReduxWidth, FastMathFlags FMF) {
bool IsCmpSelMinMax, unsigned ReduxWidth,
FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();
@@ -13600,6 +13574,35 @@
// If all of the reduced values are constant, the vector cost is 0, since
// the reduction value can be calculated at the compile time.
bool AllConsts = allConstant(ReducedVals);
auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
InstructionCost Cost = 0;
// Scalar cost is repeated for N-1 elements.
int Cnt = ReducedVals.size();
for (Value *RdxVal : ReducedVals) {
if (Cnt == 1)
break;
--Cnt;
if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
Cost += GenCostFn();
continue;
}
InstructionCost ScalarCost = 0;
for (User *U : RdxVal->users()) {
auto *RdxOp = cast<Instruction>(U);
if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
continue;
}
ScalarCost = InstructionCost::getInvalid();
break;
}
if (ScalarCost.isValid())
Cost += ScalarCost;
else
Cost += GenCostFn();
}
return Cost;
};
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -13612,7 +13615,9 @@
if (!AllConsts)
VectorCost =
TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
ScalarCost = EvaluateScalarCost([&]() {
return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
});
break;
}
case RecurKind::FMax:
@@ -13626,10 +13631,12 @@
/*IsUnsigned=*/false, CostKind);
}
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
SclCondTy, RdxPred, CostKind);
ScalarCost = EvaluateScalarCost([&]() {
return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
RdxPred, CostKind) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
RdxPred, CostKind);
});
break;
}
case RecurKind::SMax:
@@ -13646,18 +13653,18 @@
IsUnsigned, CostKind);
}
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
SclCondTy, RdxPred, CostKind);
ScalarCost = EvaluateScalarCost([&]() {
return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
RdxPred, CostKind) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
RdxPred, CostKind);
});
break;
}
default:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}

// Scalar cost is repeated for N-1 elements.
ScalarCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a splitting reduction)\n");
57 changes: 16 additions & 41 deletions llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH

@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@@ -1113,41 +1113,18 @@ define i16 @smin_intrinsic_rdx_v8i16(ptr %p0) {
}

define i64 @umax_intrinsic_rdx_v4i64(ptr %p0) {
; SSE2-LABEL: @umax_intrinsic_rdx_v4i64(
; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
; SSE2-NEXT: [[T0:%.*]] = load i64, ptr [[P0]], align 4
; SSE2-NEXT: [[T1:%.*]] = load i64, ptr [[P1]], align 4
; SSE2-NEXT: [[T2:%.*]] = load i64, ptr [[P2]], align 4
; SSE2-NEXT: [[T3:%.*]] = load i64, ptr [[P3]], align 4
; SSE2-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
; SSE2-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
; SSE2-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
; SSE2-NEXT: ret i64 [[M]]
;
; SSE4-LABEL: @umax_intrinsic_rdx_v4i64(
; SSE4-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
; SSE4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
; SSE4-NEXT: ret i64 [[TMP2]]
;
; AVX-LABEL: @umax_intrinsic_rdx_v4i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
; AVX-NEXT: [[T0:%.*]] = load i64, ptr [[P0]], align 4
; AVX-NEXT: [[T1:%.*]] = load i64, ptr [[P1]], align 4
; AVX-NEXT: [[T2:%.*]] = load i64, ptr [[P2]], align 4
; AVX-NEXT: [[T3:%.*]] = load i64, ptr [[P3]], align 4
; AVX-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
; AVX-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
; AVX-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
; AVX-NEXT: ret i64 [[M]]
;
; AVX2-LABEL: @umax_intrinsic_rdx_v4i64(
; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
; AVX2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
; AVX2-NEXT: ret i64 [[TMP2]]
; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
; DEFAULT-NEXT: [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
; DEFAULT-NEXT: [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
; DEFAULT-NEXT: [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
; DEFAULT-NEXT: [[T0:%.*]] = load i64, ptr [[P0]], align 4
; DEFAULT-NEXT: [[T1:%.*]] = load i64, ptr [[P1]], align 4
; DEFAULT-NEXT: [[T2:%.*]] = load i64, ptr [[P2]], align 4
; DEFAULT-NEXT: [[T3:%.*]] = load i64, ptr [[P3]], align 4
; DEFAULT-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
; DEFAULT-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
; DEFAULT-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
; DEFAULT-NEXT: ret i64 [[M]]
;
; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
; THRESH-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
@@ -1252,5 +1229,3 @@ define void @PR49730() {
%t14 = call i32 @llvm.umin.i32(i32 %t13, i32 93)
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SSE: {{.*}}
32 changes: 21 additions & 11 deletions llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX

@@ -22,10 +22,25 @@ define i32 @smax_v2i32(i32) {
}

define i32 @smax_v4i32(i32) {
; CHECK-LABEL: @smax_v4i32(
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT: ret i32 [[TMP3]]
; SSE2-LABEL: @smax_v4i32(
; SSE2-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16
; SSE2-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
; SSE2-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
; SSE2-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
; SSE2-NEXT: [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
; SSE2-NEXT: [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
; SSE2-NEXT: [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
; SSE2-NEXT: ret i32 [[TMP8]]
;
; SSE4-LABEL: @smax_v4i32(
; SSE4-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
; SSE4-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; SSE4-NEXT: ret i32 [[TMP3]]
;
; AVX-LABEL: @smax_v4i32(
; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; AVX-NEXT: ret i32 [[TMP3]]
;
%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
@@ -100,8 +115,3 @@ define i32 @smax_v16i32(i32) {
%32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
ret i32 %32
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX: {{.*}}
; SSE: {{.*}}
; SSE2: {{.*}}
; SSE4: {{.*}}
