Skip to content

Commit

Permalink
[LV] Exclude loop-invariant inputs from scalar cost computation.
Browse files Browse the repository at this point in the history
Loop-invariant operands do not need to be scalarized, because their
values are computed outside the loop and can be used directly. We should
ignore them when computing the scalarization overhead.

Fixes PR41294

Reviewers: hsaito, rengolin, dcaballe, Ayal

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D59995

llvm-svn: 366030
  • Loading branch information
fhahn committed Jul 14, 2019
1 parent 8111807 commit 9428d95
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 22 deletions.
64 changes: 42 additions & 22 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -1179,7 +1179,7 @@ class LoopVectorizationCostModel {
/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -
// i.e. either vector version isn't available, or is too expensive.
/// i.e. either vector version isn't available, or is too expensive.
unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
Expand Down Expand Up @@ -1332,6 +1332,30 @@ class LoopVectorizationCostModel {

DecisionList WideningDecisions;

/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted.
///
/// \param V  The value (typically an operand) to inspect.
/// \param VF The vectorization factor being costed.
/// \returns false if \p VF is 1, if \p V is not an instruction, or if \p V is
/// not contained in / is invariant in TheLoop. Otherwise returns true unless
/// the scalars for \p VF have been collected and \p V is scalar after
/// vectorization.
bool needsExtract(Value *V, unsigned VF) const {
  Instruction *I = dyn_cast<Instruction>(V);
  // Nothing to extract for a scalar loop, for non-instruction values, or for
  // values that are not computed inside the loop.
  if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
    return false;

  // Assume we can vectorize V (and hence we need extraction) if the
  // scalars are not computed yet. This can happen, because it is called
  // via getScalarizationOverhead from setCostBasedWideningDecision, before
  // the scalars are collected. That should be a safe assumption in most
  // cases, because we check if the operands have vectorizable types
  // beforehand in LoopVectorizationLegality.
  return Scalars.find(VF) == Scalars.end() ||
         !isScalarAfterVectorization(I, VF);
}

/// Returns a vector containing only the operands of \p Ops that need to be
/// extracted for vectorization factor \p VF, as decided by needsExtract.
///
/// \param Ops The operand range to filter.
/// \param VF  The vectorization factor being costed.
/// \returns A SmallVector holding the operands for which needsExtract
/// returned true, in their original order.
SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                 unsigned VF) const {
  // The filtered range is materialized into a SmallVector before returning.
  return SmallVector<Value *, 4>(make_filter_range(
      Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}

public:
/// The loop that we evaluate.
Loop *TheLoop;
Expand Down Expand Up @@ -3125,8 +3149,11 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();

SmallVector<Value *, 4> Operands(CI->arg_operands());
return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
return TTI.getIntrinsicInstrCost(
ID, CI->getType(), filterExtractingOperands(CI->arg_operands(), VF), FMF,
VF);
}

static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
Expand Down Expand Up @@ -5346,15 +5373,6 @@ int LoopVectorizationCostModel::computePredInstDiscount(
return true;
};

// Returns true if an operand that cannot be scalarized must be extracted
// from a vector. We will account for this scalarization overhead below. Note
// that the non-void predicated instructions are placed in their own blocks,
// and their return values are inserted into vectors. Thus, an extract would
// still be required.
auto needsExtract = [&](Instruction *I) -> bool {
return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
};

// Compute the expected cost discount from scalarizing the entire expression
// feeding the predicated instruction. We currently only consider expressions
// that are single-use instruction chains.
Expand Down Expand Up @@ -5394,7 +5412,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
"Instruction has non-scalar type");
if (canBeScalarized(J))
Worklist.push_back(J);
else if (needsExtract(J))
else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(
ToVectorTy(J->getType(),VF), false, true);
}
Expand Down Expand Up @@ -5684,16 +5702,18 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
return Cost;

if (CallInst *CI = dyn_cast<CallInst>(I)) {
SmallVector<const Value *, 4> Operands(CI->arg_operands());
Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
} else if (!isa<StoreInst>(I) ||
!TTI.supportsEfficientVectorElementLoadStore()) {
SmallVector<const Value *, 4> Operands(I->operand_values());
Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
}
// Some targets support efficient element stores.
if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
return Cost;

return Cost;
// Collect operands to consider.
CallInst *CI = dyn_cast<CallInst>(I);
Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
return Cost + TTI.getOperandsScalarizationOverhead(
filterExtractingOperands(Ops, VF), VF);
}

void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
Expand Down
@@ -0,0 +1,109 @@
; REQUIRES: asserts

; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s
; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s

; Test case from PR41294.

; Check scalar cost for extractvalue. The constant and loop invariant operands are free,
; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.

; CM: LV: Scalar loop costs: 7.
; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1

; Check that the extractvalue operands are actually free in vector code.

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
; FORCED-NEXT: %0 = add i32 %index, 0
; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0
; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 0
; FORCED-NEXT: %3 = insertelement <2 x i64> undef, i64 %1, i32 0
; FORCED-NEXT: %4 = insertelement <2 x i64> %3, i64 %2, i32 1
; FORCED-NEXT: %5 = extractvalue { i64, i64 } %sv, 1
; FORCED-NEXT: %6 = extractvalue { i64, i64 } %sv, 1
; FORCED-NEXT: %7 = insertelement <2 x i64> undef, i64 %5, i32 0
; FORCED-NEXT: %8 = insertelement <2 x i64> %7, i64 %6, i32 1
; FORCED-NEXT: %9 = getelementptr i64, i64* %dst, i32 %0
; FORCED-NEXT: %10 = add <2 x i64> %4, %8
; FORCED-NEXT: %11 = getelementptr i64, i64* %9, i32 0
; FORCED-NEXT: %12 = bitcast i64* %11 to <2 x i64>*
; FORCED-NEXT: store <2 x i64> %10, <2 x i64>* %12, align 4
; FORCED-NEXT: %index.next = add i32 %index, 2
; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0
; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0

; Single-block loop that extracts both fields of the aggregate argument %sv,
; adds them, and stores the sum to %dst indexed by the induction variable %iv.
define void @test1(i64* %dst, {i64, i64} %sv) {
entry:
br label %loop.body

loop.body:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
; Both extractvalue instructions only read the function argument %sv and a
; constant index; neither operand is defined inside the loop.
%a = extractvalue { i64, i64 } %sv, 0
%b = extractvalue { i64, i64 } %sv, 1
%addr = getelementptr i64, i64* %dst, i32 %iv
%add = add i64 %a, %b
store i64 %add, i64* %addr
%iv.next = add nsw i32 %iv, 1
; Keep looping while the incremented induction variable is non-zero.
%cond = icmp ne i32 %iv.next, 0
br i1 %cond, label %loop.body, label %exit

exit:
ret void
}


; Similar to the test case above, but checks getVectorCallCost as well.
declare float @pow(float, float) readnone nounwind

; CM: LV: Scalar loop costs: 16.
; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1

; FORCED-LABEL: define void @test_getVectorCallCost

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
; FORCED-NEXT: %0 = add i32 %index, 0
; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0
; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 0
; FORCED-NEXT: %3 = insertelement <2 x float> undef, float %1, i32 0
; FORCED-NEXT: %4 = insertelement <2 x float> %3, float %2, i32 1
; FORCED-NEXT: %5 = extractvalue { float, float } %sv, 1
; FORCED-NEXT: %6 = extractvalue { float, float } %sv, 1
; FORCED-NEXT: %7 = insertelement <2 x float> undef, float %5, i32 0
; FORCED-NEXT: %8 = insertelement <2 x float> %7, float %6, i32 1
; FORCED-NEXT: %9 = getelementptr float, float* %dst, i32 %0
; FORCED-NEXT: %10 = call <2 x float> @llvm.pow.v2f32(<2 x float> %4, <2 x float> %8)
; FORCED-NEXT: %11 = getelementptr float, float* %9, i32 0
; FORCED-NEXT: %12 = bitcast float* %11 to <2 x float>*
; FORCED-NEXT: store <2 x float> %10, <2 x float>* %12, align 4
; FORCED-NEXT: %index.next = add i32 %index, 2
; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0
; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4

; Single-block loop that extracts both fields of the aggregate argument %sv,
; passes them to a call to @pow, and stores the call result to %dst indexed by
; the induction variable %iv.
define void @test_getVectorCallCost(float* %dst, {float, float} %sv) {
entry:
br label %loop.body

loop.body:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
; Both extractvalue instructions only read the function argument %sv and a
; constant index; neither operand is defined inside the loop.
%a = extractvalue { float, float } %sv, 0
%b = extractvalue { float, float } %sv, 1
%addr = getelementptr float, float* %dst, i32 %iv
; The extracted fields are the only arguments of the call.
%p = call float @pow(float %a, float %b)
store float %p, float* %addr
%iv.next = add nsw i32 %iv, 1
; Keep looping while the incremented induction variable is non-zero.
%cond = icmp ne i32 %iv.next, 0
br i1 %cond, label %loop.body, label %exit

exit:
ret void
}

0 comments on commit 9428d95

Please sign in to comment.