[LoopVectorize] Simplify scalar cost calculation in getInstructionCost
This patch simplifies the calculation of certain costs in
getInstructionCost when isScalarAfterVectorization() returns true.
There are a few places where we multiply a cost by a number N, i.e.

  unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
  return N * TTI.getArithmeticInstrCost(...

After some investigation it seems that only the following cases occur in
practice:

1. VF is a scalar, in which case N = 1.
2. VF is a vector. We can only get here if: a) the instruction is a
GEP/bitcast/PHI with scalar uses, or b) this is an update to an induction
variable that remains scalar, as sketched below.
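
As an illustration of case 2 (this loop and the function name
@scalar_iv_update are hypothetical, not taken from the patch), consider:

  ; Hypothetical loop: the store is widened, while the GEP and the
  ; induction update %i.next remain scalar after vectorisation.
  define void @scalar_iv_update(i32* %a, i64 %n) {
  entry:
    br label %for.body

  for.body:
    %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
    %gep = getelementptr inbounds i32, i32* %a, i64 %i
    store i32 0, i32* %gep, align 4
    %i.next = add nuw nsw i64 %i, 1
    %cond = icmp slt i64 %i.next, %n
    br i1 %cond, label %for.body, label %for.end

  for.end:
    ret void
  }

Assuming the store is widened to a vector store, the GEP is only needed to
compute the scalar base address and %i.next only advances the scalar
induction variable, so a single copy of each remains in the vector loop and
neither cost should be multiplied by VF.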

I have changed the code so that N is assumed to always be 1. For GEPs
the cost is always 0, since this is calculated later on as part of the
load/store cost. PHI nodes are costed separately and were never previously
multiplied by VF. For all other cases I have added an assert that none of
the users needs scalarising, which didn't fire in any unit tests.

Only one test required fixing, and I believe the original cost for the scalar
add instruction was wrong, since only one copy of it remains after
vectorisation.

I have also added a new test for the case where a pointer PHI feeds directly
into a store that will be scalarised, since this case was not previously
tested.

Differential Revision: https://reviews.llvm.org/D99718
david-arm committed Apr 27, 2021
1 parent c20e4fb commit 4afeda9
Showing 3 changed files with 74 additions and 27 deletions.
64 changes: 38 additions & 26 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7316,10 +7316,37 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

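// Returns true if we expect only a single copy of instruction I after
// vectorization: either the VF is scalar, or neither I nor any of its
// users will be scalarized for this VF.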
auto hasSingleCopyAfterVectorization = [this](Instruction *I,
ElementCount VF) -> bool {
if (VF.isScalar())
return true;

auto Scalarized = InstsToScalarize.find(VF);
assert(Scalarized != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
return !Scalarized->second.count(I) &&
llvm::all_of(I->users(), [&](User *U) {
auto *UI = cast<Instruction>(U);
return !Scalarized->second.count(UI);
});
};

if (isScalarAfterVectorization(I, VF)) {
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
// because the VF is either 1, or any instructions that need scalarizing
have already been dealt with by the time we get here. As a result,
// it means we don't have to multiply the instruction cost by VF.
assert(I->getOpcode() == Instruction::GetElementPtr ||
I->getOpcode() == Instruction::PHI ||
hasSingleCopyAfterVectorization(I, VF));
VectorTy = RetTy;
} else
VectorTy = ToVectorTy(RetTy, VF);

// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
@@ -7447,21 +7474,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Op2VK = TargetTransformInfo::OK_UniformValue;

SmallVector<const Value *, 4> Operands(I->operand_values());
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
return TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind,
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
I->getOperand(0), I);
return TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None, I->getOperand(0), I);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -7605,14 +7627,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
}
}

unsigned N;
if (isScalarAfterVectorization(I, VF)) {
assert(!VF.isScalable() && "VF is assumed to be non scalable");
N = VF.getKnownMinValue();
} else
N = 1;
return N *
TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -7627,11 +7642,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
Instruction::Mul, VectorTy, CostKind) +
getScalarizationOverhead(I, VF);
// This opcode is unknown. Assume that it is the same as 'mul'.
return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
} // end of switch.
}

@@ -6,7 +6,7 @@ target triple = "aarch64--linux-gnu"

; CHECK-LABEL: all_scalar
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
;
define void @all_scalar(i64* %a, i64 %n) {
35 changes: 35 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -86,6 +86,41 @@ for.end:
ret void
}

; CHECK-LABEL: predicated_store_phi
;
; Same as predicated_store except we use a pointer PHI to maintain the address.
;
; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found new scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4
;
define void @predicated_store_phi(i32* %a, i1 %c, i32 %x, i64 %n) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
%addr = phi i32 * [ %a, %entry ], [ %addr.next, %for.inc ]
%tmp1 = load i32, i32* %addr, align 4
%tmp2 = add nsw i32 %tmp1, %x
br i1 %c, label %if.then, label %for.inc

if.then:
store i32 %tmp2, i32* %addr, align 4
br label %for.inc

for.inc:
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
%addr.next = getelementptr inbounds i32, i32* %addr, i64 1
br i1 %cond, label %for.body, label %for.end

for.end:
ret void
}

; CHECK-LABEL: predicated_udiv_scalarized_operand
;
; This test checks that we correctly compute the cost of the predicated udiv
