Skip to content

Commit

Permalink
[LoopVectorizer] Refine loop vectorizer's register usage calculator b…
Browse files Browse the repository at this point in the history
…y ignoring specific instructions.

LoopVectorizationCostModel::calculateRegisterUsage() is used to estimate the
register usage for specific VFs. However, it takes into account many
instructions that won't be vectorized, such as induction variables,
GetElementPtr instruction, etc.. This makes the loop vectorizer too conservative
when choosing VF. In this patch, the induction variables that won't be
vectorized plus GetElementPtr instruction will be added to ValuesToIgnore set
so that their register usage won't be considered any more.


Differential revision: http://reviews.llvm.org/D15177

llvm-svn: 255454
  • Loading branch information
Cong Hou committed Dec 13, 2015
1 parent e436185 commit 7f8b43d
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 32 deletions.
137 changes: 106 additions & 31 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -1409,15 +1409,14 @@ class LoopVectorizationLegality {
/// different operations.
class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC, const Function *F,
const LoopVectorizeHints *Hints,
SmallPtrSetImpl<const Value *> &ValuesToIgnore)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}
const LoopVectorizeHints *Hints)
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
AC(AC), TheFunction(F), Hints(Hints) {}

/// Information about vectorization costs
struct VectorizationFactor {
Expand Down Expand Up @@ -1465,6 +1464,9 @@ class LoopVectorizationCostModel {
SmallVector<RegisterUsage, 8>
calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);

/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();

private:
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
Expand Down Expand Up @@ -1496,8 +1498,8 @@ class LoopVectorizationCostModel {

/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
ScalarEvolution *SE;
/// Predicated scalar evolution analysis.
PredicatedScalarEvolution &PSE;
/// Loop Info analysis.
LoopInfo *LI;
/// Vectorization legality.
Expand All @@ -1506,13 +1508,17 @@ class LoopVectorizationCostModel {
const TargetTransformInfo &TTI;
/// Target Library Info.
const TargetLibraryInfo *TLI;
/// Demanded bits analysis
/// Demanded bits analysis.
DemandedBits *DB;
/// Assumption cache.
AssumptionCache *AC;
const Function *TheFunction;
// Loop Vectorize Hint.
/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
// Values to ignore in the cost model.
const SmallPtrSetImpl<const Value *> &ValuesToIgnore;
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

/// \brief This holds vectorization requirements that must be verified late in
Expand Down Expand Up @@ -1757,19 +1763,10 @@ struct LoopVectorize : public FunctionPass {
return false;
}

// Collect values we want to ignore in the cost model. This includes
// type-promoting instructions we identified during reduction detection.
SmallPtrSet<const Value *, 32> ValuesToIgnore;
CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
for (auto &Reduction : *LVL.getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
ValuesToIgnore.insert(Casts.begin(), Casts.end());
}

// Use the cost model.
LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC,
F, &Hints, ValuesToIgnore);
LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
&Hints);
CM.collectValuesToIgnore();

// Check the function attributes to find out if this function should be
// optimized for size.
Expand Down Expand Up @@ -4737,7 +4734,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
}

// Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
Expand Down Expand Up @@ -4939,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
return 1;

// Do not interleave loops with a relatively small trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;

Expand Down Expand Up @@ -5167,15 +5164,15 @@ LoopVectorizationCostModel::calculateRegisterUsage(
// Ignore instructions that are never used within the loop.
if (!Ends.count(I)) continue;

// Skip ignored values.
if (ValuesToIgnore.count(I))
continue;

// Remove all of the instructions that end at this location.
InstrList &List = TransposeEnds[i];
for (unsigned int j = 0, e = List.size(); j < e; ++j)
OpenIntervals.erase(List[j]);

// Skip ignored values.
if (ValuesToIgnore.count(I))
continue;

// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
if (VFs[j] == 1) {
Expand All @@ -5185,8 +5182,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(

// Count the number of live intervals.
unsigned RegUsage = 0;
for (auto Inst : OpenIntervals)
for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
if (VecValuesToIgnore.count(Inst))
continue;
RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
}
MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}

Expand Down Expand Up @@ -5330,6 +5331,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (VF > 1 && MinBWs.count(I))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
Type *VectorTy = ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();

// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
Expand Down Expand Up @@ -5631,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
return false;
}

void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : *Legal->getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}

// Ignore induction phis that are only used in either GetElementPtr or ICmp
// instruction to exit loop. Induction variables usually have large types and
// can have big impact when estimating register usage.
// This is for when VF > 1.
for (auto &Induction : *Legal->getInductionVars()) {
auto *PN = Induction.first;
auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());

// Check that the PHI is only used by the induction increment (UpdateV) or
// by GEPs. Then check that UpdateV is only used by a compare instruction or
// the loop header PHI.
// FIXME: Need precise def-use analysis to determine if this instruction
// variable will be vectorized.
if (std::all_of(PN->user_begin(), PN->user_end(),
[&](const User *U) -> bool {
return U == UpdateV || isa<GetElementPtrInst>(U);
}) &&
std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
[&](const User *U) -> bool {
return U == PN || isa<ICmpInst>(U);
})) {
VecValuesToIgnore.insert(PN);
VecValuesToIgnore.insert(UpdateV);
}
}

// Ignore instructions that will not be vectorized.
// This is for when VF > 1.
for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
++bb) {
for (auto &Inst : **bb) {
switch (Inst.getOpcode()) {
case Instruction::GetElementPtr: {
// Ignore GEP if its last operand is an induction variable so that it is
// a consecutive load/store and won't be vectorized as scatter/gather
// pattern.

GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
unsigned NumOperands = Gep->getNumOperands();
unsigned InductionOperand = getGEPInductionOperand(Gep);
bool GepToIgnore = true;

// Check that all of the gep indices are uniform except for the
// induction operand.
for (unsigned i = 0; i != NumOperands; ++i) {
if (i != InductionOperand &&
!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
TheLoop)) {
GepToIgnore = false;
break;
}
}

if (GepToIgnore)
VecValuesToIgnore.insert(&Inst);
break;
}
}
}
}
}

void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore) {
Expand Down
Expand Up @@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gnu"
; -vectorizer-maximize-bandwidth is indicated.
;
; CHECK-label: foo
; CHECK: LV: Selecting VF: 16.
; CHECK: LV: Selecting VF: 32.
define void @foo() {
entry:
br label %for.body
Expand Down
68 changes: 68 additions & 0 deletions llvm/test/Transforms/LoopVectorize/reg-usage.ll
@@ -0,0 +1,68 @@
; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth 2>&1 | FileCheck %s


@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16

define i32 @foo() {
; This function has a loop of SAD pattern. Here we check when VF = 16 the
; register usage doesn't exceed 16.
;
; CHECK-LABEL: foo
; CHECK: LV(REG): VF = 4
; CHECK-NEXT: LV(REG): Found max usage: 4
; CHECK: LV(REG): VF = 8
; CHECK-NEXT: LV(REG): Found max usage: 7
; CHECK: LV(REG): VF = 16
; CHECK-NEXT: LV(REG): Found max usage: 13

entry:
br label %for.body

for.cond.cleanup:
%add.lcssa = phi i32 [ %add, %for.body ]
ret i32 %add.lcssa

for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
%0 = load i8, i8* %arrayidx, align 1
%conv = zext i8 %0 to i32
%arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
%1 = load i8, i8* %arrayidx2, align 1
%conv3 = zext i8 %1 to i32
%sub = sub nsw i32 %conv, %conv3
%ispos = icmp sgt i32 %sub, -1
%neg = sub nsw i32 0, %sub
%2 = select i1 %ispos, i32 %sub, i32 %neg
%add = add nsw i32 %2, %s.015
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define i64 @bar(i64* nocapture %a) {
; CHECK-LABEL: bar
; CHECK: LV(REG): VF = 2
; CHECK: LV(REG): Found max usage: 4
;
entry:
br label %for.body

for.cond.cleanup:
%add2.lcssa = phi i64 [ %add2, %for.body ]
ret i64 %add2.lcssa

for.body:
%i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
%arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
%0 = load i64, i64* %arrayidx, align 8
%add = add nsw i64 %0, %i.012
store i64 %add, i64* %arrayidx, align 8
%add2 = add nsw i64 %add, %s.011
%inc = add nuw nsw i64 %i.012, 1
%exitcond = icmp eq i64 %inc, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

0 comments on commit 7f8b43d

Please sign in to comment.