Skip to content

Commit

Permalink
[LV] Scalarize operands of predicated instructions
Browse files Browse the repository at this point in the history
This patch attempts to scalarize the operand expressions of predicated
instructions if they were conditionally executed in the original loop. After
scalarization, the expressions will be sunk inside the blocks created for the
predicated instructions. The transformation essentially performs
un-if-conversion on the operands.

The cost model has been updated to determine if scalarization is profitable. It
compares the cost of a vectorized instruction, assuming it will be
if-converted, to the cost of the scalarized instruction, assuming that the
instructions corresponding to each vector lane will be sunk inside a predicated
block, possibly avoiding execution. If it's more profitable to scalarize the
entire expression tree feeding the predicated instruction, the expression will
be scalarized; otherwise, it will be vectorized. We only consider the cost of
the entire expression to accurately estimate the cost of the required
insertelement and extractelement instructions.

Differential Revision: https://reviews.llvm.org/D26083

llvm-svn: 288909
  • Loading branch information
mssimpso committed Dec 7, 2016
1 parent 41552d6 commit 364da7e
Show file tree
Hide file tree
Showing 6 changed files with 539 additions and 14 deletions.
217 changes: 210 additions & 7 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -518,6 +518,10 @@ class InnerLoopVectorizer {
/// induction variable will first be truncated to the corresponding type.
void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);

/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const;

/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;

Expand Down Expand Up @@ -1907,6 +1911,15 @@ class LoopVectorizationCostModel {
return MinBWs;
}

/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
return Scalars->second.count(I);
}

private:
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
Expand Down Expand Up @@ -1949,6 +1962,29 @@ class LoopVectorizationCostModel {
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;

/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;

/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
unsigned VF);

/// Collects the instructions to scalarize for each predicated instruction in
/// the loop.
void collectInstsToScalarize(unsigned VF);

public:
/// The loop that we evaluate.
Loop *TheLoop;
Expand Down Expand Up @@ -2183,12 +2219,17 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
return Legal->isScalarAfterVectorization(I) ||
Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
if (Legal->isScalarAfterVectorization(IV))
if (shouldScalarizeInstruction(IV))
return true;
auto isScalarInst = [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I));
return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
};
return any_of(IV->users(), isScalarInst);
}
Expand Down Expand Up @@ -2229,7 +2270,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (VF > 1 && IV->getType() == Induction->getType() && Step &&
!Legal->isScalarAfterVectorization(EntryVal)) {
!shouldScalarizeInstruction(EntryVal)) {
createVectorIntInductionPHI(ID, EntryVal);
VectorizedIV = true;
}
Expand Down Expand Up @@ -4648,10 +4689,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
continue;

// Scalarize instructions that should remain scalar after vectorization.
if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
if (VF > 1 &&
!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
isa<DbgInfoIntrinsic>(&I)) &&
Legal->isScalarAfterVectorization(&I)) {
scalarizeInstruction(&I);
shouldScalarizeInstruction(&I)) {
scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
continue;
}

Expand Down Expand Up @@ -6124,6 +6166,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

Factor.Width = UserVF;
collectInstsToScalarize(UserVF);
return Factor;
}

Expand Down Expand Up @@ -6530,10 +6573,160 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
return RUs;
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {

// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
if (VF < 2 || InstsToScalarize.count(VF))
return;

// Initialize a mapping for VF in InstsToScalalarize. If we find that it's
// not profitable to scalarize any instructions, the presence of VF in the
// map will indicate that we've analyzed it already.
ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

// Find all the instructions that are scalar with predication in the loop and
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
for (BasicBlock *BB : TheLoop->blocks()) {
if (!Legal->blockNeedsPredication(BB))
continue;
for (Instruction &I : *BB)
if (Legal->isScalarWithPredication(&I)) {
ScalarCostsTy ScalarCosts;
if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
}
}
}

int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
unsigned VF) {

assert(!Legal->isUniformAfterVectorization(PredInst) &&
"Instruction marked uniform-after-vectorization will be predicated");

// Initialize the discount to zero, meaning that the scalar version and the
// vector version cost the same.
int Discount = 0;

// Holds instructions to analyze. The instructions we visit are mapped in
// ScalarCosts. Those instructions are the ones that would be scalarized if
// we find that the scalar version costs less.
SmallVector<Instruction *, 8> Worklist;

// Returns true if the given instruction can be scalarized.
auto canBeScalarized = [&](Instruction *I) -> bool {

// We only attempt to scalarize instructions forming a single-use chain
// from the original predicated block that would otherwise be vectorized.
// Although not strictly necessary, we give up on instructions we know will
// already be scalar to avoid traversing chains that are unlikely to be
// beneficial.
if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
Legal->isScalarAfterVectorization(I))
return false;

// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
if (Legal->isScalarWithPredication(I))
return false;

// If any of the instruction's operands are uniform after vectorization,
// the instruction cannot be scalarized. This prevents, for example, a
// masked load from being scalarized.
//
// We assume we will only emit a value for lane zero of an instruction
// marked uniform after vectorization, rather than VF identical values.
// Thus, if we scalarize an instruction that uses a uniform, we would
// create uses of values corresponding to the lanes we aren't emitting code
// for. This behavior can be changed by allowing getScalarValue to clone
// the lane zero values for uniforms rather than asserting.
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
if (Legal->isUniformAfterVectorization(J))
return false;

// Otherwise, we can scalarize the instruction.
return true;
};

// Returns true if an operand that cannot be scalarized must be extracted
// from a vector. We will account for this scalarization overhead below. Note
// that the non-void predicated instructions are placed in their own blocks,
// and their return values are inserted into vectors. Thus, an extract would
// still be required.
auto needsExtract = [&](Instruction *I) -> bool {
return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
};

// Compute the expected cost discount from scalarizing the entire expression
// feeding the predicated instruction. We currently only consider expressions
// that are single-use instruction chains.
Worklist.push_back(PredInst);
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();

// If we've already analyzed the instruction, there's nothing to do.
if (ScalarCosts.count(I))
continue;

// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
unsigned VectorCost = getInstructionCost(I, VF).first;

// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
false, TTI);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}

// Compute the scalarization overhead of needed extractelement
// instructions. For each of the instruction's operands, if the operand can
// be scalarized, add it to the worklist; otherwise, account for the
// overhead.
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get())) {
assert(VectorType::isValidElementType(J->getType()) &&
"Instruction has non-scalar type");
if (canBeScalarized(J))
Worklist.push_back(J);
else if (needsExtract(J))
ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
false, true, TTI);
}

// Scale the total scalar cost by block probability.
ScalarCost /= getReciprocalPredBlockProb();

// Compute the discount. A non-negative discount means the vector version
// of the instruction costs more, and scalarizing would be beneficial.
Discount += VectorCost - ScalarCost;
ScalarCosts[I] = ScalarCost;
}

return Discount;
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;

// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
collectInstsToScalarize(VF);

// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
VectorizationCostTy BlockCost;
Expand Down Expand Up @@ -6641,6 +6834,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (Legal->isUniformAfterVectorization(I))
VF = 1;

if (VF > 1 && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);

Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);

Expand Down Expand Up @@ -7007,7 +7203,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}

// Insert values known to be scalar into VecValuesToIgnore.
// Insert values known to be scalar into VecValuesToIgnore. This is a
// conservative estimation of the values that will later be scalarized.
//
// FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
// still be scalarized. For example, we may find an instruction to be
// more profitable for a given vectorization factor if it were to be
// scalarized. But at this point, we haven't yet computed the
// vectorization factor.
for (auto *BB : TheLoop->getBlocks())
for (auto &I : *BB)
if (Legal->isScalarAfterVectorization(&I))
Expand Down
63 changes: 63 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -0,0 +1,63 @@
; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; CHECK-LABEL: predicated_udiv_scalarized_operand
;
; This test checks that we correctly compute the scalarized operands for a
; user-specified vectorization factor when interleaving is disabled. We use the
; "optsize" attribute to disable all interleaving calculations.
;
; CHECK: vector.body:
; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
; CHECK: [[IF0]]:
; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x
; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
; CHECK: br label %[[CONT0]]
; CHECK: [[CONT0]]:
; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
; CHECK: [[IF1]]:
; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x
; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
; CHECK: br label %[[CONT1]]
; CHECK: [[CONT1]]:
; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body

define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
%r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
%tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
%tmp2 = load i64, i64* %tmp0, align 4
br i1 %c, label %if.then, label %for.inc

if.then:
%tmp3 = add nsw i64 %tmp2, %x
%tmp4 = udiv i64 %tmp2, %tmp3
br label %for.inc

for.inc:
%tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
%tmp6 = add i64 %r, %tmp5
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, 100
br i1 %cond, label %for.body, label %for.end

for.end:
%tmp7 = phi i64 [ %tmp6, %for.inc ]
ret i64 %tmp7
}

0 comments on commit 364da7e

Please sign in to comment.