Fix PR23384 (under "-lsr-insns-cost" option)
Summary:
The patch adds the number of instructions generated by a solution
to the LSR cost under the "-lsr-insns-cost" option.
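
In effect, when the flag is set, the estimated instruction count becomes
the leading term of the otherwise register-centric lexicographic cost
comparison. A minimal sketch of that idea (simplified from the diff below;
the real Cost class carries several more metrics):

  struct CostSketch {
    unsigned Insns = 0;    // new metric: instructions a solution would emit
    unsigned NumRegs = 0;  // existing leading metric

    // With -lsr-insns-cost, fewer instructions wins outright; otherwise
    // fall through to the pre-existing register-based ordering.
    bool isLessThan(const CostSketch &Other, bool InsnsCost) const {
      if (InsnsCost && Insns != Other.Insns)
        return Insns < Other.Insns;
      return NumRegs < Other.NumRegs; // stand-in for the full std::tie chain
    }
  };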

Reviewers: qcolombet, hfinkel

Differential Revision: http://reviews.llvm.org/D28307

From: Evgeny Stupachenko <evstupac@gmail.com>
llvm-svn: 294821
evstupac committed Feb 11, 2017
1 parent a05bdf7 commit fe6f548
Showing 3 changed files with 167 additions and 4 deletions.
61 changes: 57 additions & 4 deletions llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,11 @@ static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));

// The flag adds the instruction count to the solution cost comparison.
static cl::opt<bool> InsnsCost(
"lsr-insns-cost", cl::Hidden, cl::init(false),
cl::desc("Add instruction count to a LSR cost model"));

#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -325,6 +330,8 @@ struct Formula {

bool unscale();

bool hasZeroEnd() const;

size_t getNumRegs() const;
Type *getType() const;

@@ -459,6 +466,14 @@ bool Formula::unscale() {
return true;
}

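/// Check whether the formula is just a single base register with no offsets,
/// i.e. (for an ICmpZero use) the compared recurrence really ends at zero.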
bool Formula::hasZeroEnd() const {
if (UnfoldedOffset || BaseOffset)
return false;
if (BaseRegs.size() != 1 || ScaledReg)
return false;
return true;
}

/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -895,6 +910,7 @@ namespace {
class Cost {
/// TODO: Some of these could be merged. Also, a lexical ordering
/// isn't always optimal.
unsigned Insns;
unsigned NumRegs;
unsigned AddRecCost;
unsigned NumIVMuls;
@@ -905,8 +921,8 @@

public:
Cost()
: NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
SetupCost(0), ScaleCost(0) {}
: Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
ImmCost(0), SetupCost(0), ScaleCost(0) {}

bool operator<(const Cost &Other) const;

@@ -915,9 +931,9 @@
#ifndef NDEBUG
// Once any of the metrics loses, they must all remain losers.
bool isValid() {
return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
| ImmCost | SetupCost | ScaleCost) != ~0u)
|| ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
|| ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
& ImmCost & SetupCost & ScaleCost) == ~0u);
}
#endif
@@ -1163,6 +1179,9 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
assert(F.isCanonical() && "Cost is accurate only for canonical formula");
// Tally up the registers.
unsigned PrevAddRecCost = AddRecCost;
unsigned PrevNumRegs = NumRegs;
unsigned PrevNumBaseAdds = NumBaseAdds;
if (const SCEV *ScaledReg = F.ScaledReg) {
if (VisitedRegs.count(ScaledReg)) {
Lose();
@@ -1182,6 +1201,18 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
return;
}

// Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as an
// additional instruction (at least a fill).
unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
if (NumRegs > TTIRegNum) {
// If the cost already exceeded TTIRegNum, then only the newly added
// registers can add new instructions.
if (PrevNumRegs > TTIRegNum)
Insns += (NumRegs - PrevNumRegs);
else
Insns += (NumRegs - TTIRegNum);
}
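// Illustrative walk-through (numbers are hypothetical): with
// TTI.getNumberOfRegisters(false) == 16, TTIRegNum is 15. A formula raising
// NumRegs from 14 to 17 is charged for the two registers beyond the budget
// (Insns += 17 - 15); one raising it further from 17 to 18 is charged only
// for the newly added register (Insns += 18 - 17).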

// Determine how many (unfolded) adds we'll need inside the loop.
size_t NumBaseParts = F.getNumRegs();
if (NumBaseParts > 1)
@@ -1210,11 +1241,30 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
NumBaseAdds++;
}

// If the ICmpZero formula does not end with 0, it cannot be replaced by
// just an add or sub. We'll need to compare the final result of the AddRec,
// which means we'll need an additional instruction.
// For -10 + {0, +, 1}:
// i = i + 1;
// cmp i, 10
//
// For {-10, +, 1}:
// i = i + 1;
if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
Insns++;
// Each new AddRec adds 1 instruction to the calculation.
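// (Each strength-reduced IV needs its own per-iteration update, e.g. an
// AddRec {0,+,4} materializes as an `add 4` in the loop body.)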
Insns += (AddRecCost - PrevAddRecCost);

// BaseAdds account for the instructions needed for unfolded registers.
if (LU.Kind != LSRUse::ICmpZero)
Insns += NumBaseAdds - PrevNumBaseAdds;
assert(isValid() && "invalid cost");
}

/// Set this cost to a losing value.
void Cost::Lose() {
Insns = ~0u;
NumRegs = ~0u;
AddRecCost = ~0u;
NumIVMuls = ~0u;
@@ -1226,6 +1276,8 @@ void Cost::Lose() {

/// Choose the lower cost.
bool Cost::operator<(const Cost &Other) const {
if (InsnsCost && Insns != Other.Insns)
return Insns < Other.Insns;
return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
ImmCost, SetupCost) <
std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1234,6 +1286,7 @@ bool Cost::operator<(const Cost &Other) const {
}

void Cost::print(raw_ostream &OS) const {
OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
if (AddRecCost != 0)
OS << ", with addrec cost " << AddRecCost;
52 changes: 52 additions & 0 deletions llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
@@ -0,0 +1,52 @@
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s

; OPT test checks that LSR optimizes the compare of a static counter into a compare with 0.

; BOTH: for.body:
; INSN: icmp eq i64 %lsr.iv.next, 0
; REGS: icmp eq i64 %indvars.iv.next, 1024

; LLC test checks that LSR optimizes the compare of a static counter.
; That means that instead of creating the following:
; movl %ecx, (%rdx,%rax,4)
; incq %rax
; cmpq $1024, %rax
; LSR should optimize out the cmp:
; movl %ecx, 4096(%rdx,%rax)
; addq $4, %rax
; or
; movl %ecx, 4096(%rdx,%rax,4)
; incq %rax

; CHECK: LBB0_1:
; CHECK-NEXT: movl 4096(%{{...}},[[REG:%...]]
; CHECK-NEXT: addl 4096(%{{...}},[[REG]]
; CHECK-NEXT: movl %{{...}}, 4096(%{{...}},[[REG]]
; CHECK-NOT: cmp
; CHECK: jne

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Function Attrs: norecurse nounwind uwtable
define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) {
entry:
br label %for.body

for.cond.cleanup: ; preds = %for.body
ret void

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
%tmp1 = load i32, i32* %arrayidx2, align 4
%add = add nsw i32 %tmp1, %tmp
%arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
store i32 %add, i32* %arrayidx4, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
58 changes: 58 additions & 0 deletions llvm/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@@ -0,0 +1,58 @@
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s

; OPT checks that LSR prefers fewer instructions over fewer registers.
; For x86, LSR should prefer a complicated address to new LSR induction
; variables.

; BOTH: for.body:
; INSN: getelementptr i32, i32* %x, i64 %indvars.iv
; INSN: getelementptr i32, i32* %y, i64 %indvars.iv
; INSN: getelementptr i32, i32* %q, i64 %indvars.iv
; REGS: %lsr.iv4 = phi
; REGS: %lsr.iv2 = phi
; REGS: %lsr.iv1 = phi
; REGS: getelementptr i32, i32* %lsr.iv1, i64 1
; REGS: getelementptr i32, i32* %lsr.iv2, i64 1
; REGS: getelementptr i32, i32* %lsr.iv4, i64 1

; LLC checks that LSR prefers fewer instructions over fewer registers.
; LSR should prefer a complicated address to additional add instructions.

; CHECK: LBB0_2:
; CHECK-NEXT: movl (%r{{[a-z][a-z]}},
; CHECK-NEXT: addl (%r{{[a-z][a-z]}},
; CHECK-NEXT: movl %e{{[a-z][a-z]}}, (%r{{[a-z][a-z]}},

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Function Attrs: norecurse nounwind uwtable
define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) {
entry:
%cmp10 = icmp sgt i32 %n, 0
br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %n to i64
br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void

for.body: ; preds = %for.body, %for.body.preheader
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
%tmp1 = load i32, i32* %arrayidx2, align 4
%add = add nsw i32 %tmp1, %tmp
%arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
store i32 %add, i32* %arrayidx4, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
