Skip to content

Commit

Permalink
The patch adds CTLZ idiom recognition.
Browse files Browse the repository at this point in the history
Summary:

The following loops should be recognized:
i = 0;
while (n) {
  n = n >> 1;
  i++;
  body();
}
use(i);

And replaced with builtin_ctlz(n) if body() is empty or
for CPUs that have CTLZ instruction converted to countable:

for (j = 0; j < builtin_ctlz(n); j++) {
  n = n >> 1;
  i++;
  body();
}
use(builtin_ctlz(n));

Reviewers: rengolin, joerg

Differential Revision: http://reviews.llvm.org/D32605

From: Evgeny Stupachenko <evstupac@gmail.com>
llvm-svn: 303102
  • Loading branch information
evstupac committed May 15, 2017
1 parent 5868499 commit 2fecd38
Show file tree
Hide file tree
Showing 3 changed files with 664 additions and 1 deletion.
295 changes: 294 additions & 1 deletion llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Expand Up @@ -144,6 +144,10 @@ class LoopIdiomRecognize {
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
bool recognizeAndInsertCTLZ();
void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var, const DebugLoc DL,
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);

/// @}
};
Expand Down Expand Up @@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
return recognizePopcount();
return recognizePopcount() || recognizeAndInsertCTLZ();
}

/// Check if the given conditional branch is based on the comparison between
Expand Down Expand Up @@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
return true;
}

/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
/// or nullptr if there is no such.
/// 2) \p CntPhi is set to the corresponding phi node
/// or nullptr if there is no such.
/// 3) \p Var is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the instruction calculating Loop exit condition.
///
/// The core idiom we are trying to detect is:
/// \code
/// if (x0 == 0)
/// goto loop-exit // the precondition of the loop
/// cnt0 = init-val;
/// do {
/// x = phi (x0, x.next); //PhiX
/// cnt = phi(cnt0, cnt.next);
///
/// cnt.next = cnt + 1;
/// ...
/// x.next = x >> 1; // DefX
/// ...
/// } while(x.next != 0);
///
/// loop-exit:
/// \endcode
static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
Instruction *&CntInst, PHINode *&CntPhi,
Instruction *&DefX) {
BasicBlock *LoopEntry;
Value *VarX = nullptr;

DefX = nullptr;
PhiX = nullptr;
CntInst = nullptr;
CntPhi = nullptr;
LoopEntry = *(CurLoop->block_begin());

// step 1: Check if the loop-back branch is in desirable form.
if (Value *T = matchCondition(
dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
DefX = dyn_cast<Instruction>(T);
else
return false;

// step 2: detect instructions corresponding to "x.next = x >> 1"
if (!DefX || DefX->getOpcode() != Instruction::AShr)
return false;
if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
if (!Shft || !Shft->isOne())
return false;
VarX = DefX->getOperand(0);

// step 3: Check the recurrence of variable X
PhiX = dyn_cast<PHINode>(VarX);
if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
return false;

// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
// TODO: We can skip the step. If loop trip count is known (CTLZ),
// then all uses of "cnt.next" could be optimized to the trip count
// plus "cnt0". Currently it is not optimized.
// This step could be used to detect POPCNT instruction:
// cnt.next = cnt + (x.next & 1)
for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
IterE = LoopEntry->end();
Iter != IterE; Iter++) {
Instruction *Inst = &*Iter;
if (Inst->getOpcode() != Instruction::Add)
continue;

ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
if (!Inc || !Inc->isOne())
continue;

PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
if (!Phi || Phi->getParent() != LoopEntry)
continue;

CntInst = Inst;
CntPhi = Phi;
break;
}
if (!CntInst)
return false;

return true;
}

/// Recognize CTLZ idiom in a non-countable loop and convert the loop
/// to countable (with CTLZ trip count).
/// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
// Give up if the loop has multiple blocks or multiple backedges.
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
return false;

Instruction *CntInst, *DefX;
PHINode *CntPhi, *PhiX;
if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
return false;

bool IsCntPhiUsedOutsideLoop = false;
for (User *U : CntPhi->users())
if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
IsCntPhiUsedOutsideLoop = true;
break;
}
bool IsCntInstUsedOutsideLoop = false;
for (User *U : CntInst->users())
if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
IsCntInstUsedOutsideLoop = true;
break;
}
// If both CntInst and CntPhi are used outside the loop the profitability
// is questionable.
if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
return false;

// For some CPUs result of CTLZ(X) intrinsic is undefined
// when X is 0. If we can not guarantee X != 0, we need to check this
// when expand.
bool ZeroCheck = false;
// It is safe to assume Preheader exist as it was checked in
// parent function RunOnLoop.
BasicBlock *PH = CurLoop->getLoopPreheader();
Value *InitX = PhiX->getIncomingValueForBlock(PH);
// If we check X != 0 before entering the loop we don't need a zero
// check in CTLZ intrinsic.
if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
if (BranchInst *PreCondBr =
dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
if (matchCondition(PreCondBr, PH) == InitX)
ZeroCheck = true;
}

// Check if CTLZ intrinsic is profitable. Assume it is always profitable
// if we delete the loop (the loop has only 6 instructions):
// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
// %shr = ashr %n.addr.0, 1
// %tobool = icmp eq %shr, 0
// %inc = add nsw %i.0, 1
// br i1 %tobool

IRBuilder<> Builder(PH->getTerminator());
SmallVector<const Value *, 2> Ops =
{InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
ArrayRef<const Value *> Args(Ops);
if (CurLoop->getHeader()->size() != 6 &&
TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
TargetTransformInfo::TCC_Basic)
return false;

const DebugLoc DL = DefX->getDebugLoc();
transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
IsCntPhiUsedOutsideLoop);
return true;
}

/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
Expand Down Expand Up @@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
return CI;
}

static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
const DebugLoc &DL, bool ZeroCheck) {
Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
Type *Tys[] = {Val->getType()};

Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
CallInst *CI = IRBuilder.CreateCall(Func, Ops);
CI->setDebugLoc(DL);

return CI;
}

/// Transform the following loop:
/// loop:
/// CntPhi = PHI [Cnt0, CntInst]
/// PhiX = PHI [InitX, DefX]
/// CntInst = CntPhi + 1
/// DefX = PhiX >> 1
// LOOP_BODY
/// Br: loop if (DefX != 0)
/// Use(CntPhi) or Use(CntInst)
///
/// Into:
/// If CntPhi used outside the loop:
/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
/// Count = CountPrev + 1
/// else
/// Count = BitWidth(InitX) - CTLZ(InitX)
/// loop:
/// CntPhi = PHI [Cnt0, CntInst]
/// PhiX = PHI [InitX, DefX]
/// PhiCount = PHI [Count, Dec]
/// CntInst = CntPhi + 1
/// DefX = PhiX >> 1
/// Dec = PhiCount - 1
/// LOOP_BODY
/// Br: loop if (Dec != 0)
/// Use(CountPrev + Cnt0) // Use(CntPhi)
/// or
/// Use(Count + Cnt0) // Use(CntInst)
///
/// If LOOP_BODY is empty the loop will be deleted.
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
void LoopIdiomRecognize::transformLoopToCountable(
BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());

// Step 1: Insert the CTLZ instruction at the end of the preheader block
// Count = BitWidth - CTLZ(InitX);
// If there are uses of CntPhi create:
// CountPrev = BitWidth - CTLZ(InitX >> 1);
IRBuilder<> Builder(PreheaderBr);
Builder.SetCurrentDebugLocation(DL);
Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;

if (IsCntPhiUsedOutsideLoop)
InitXNext = Builder.CreateAShr(InitX,
ConstantInt::get(InitX->getType(), 1));
else
InitXNext = InitX;
CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
Count = Builder.CreateSub(
ConstantInt::get(CTLZ->getType(),
CTLZ->getType()->getIntegerBitWidth()),
CTLZ);
if (IsCntPhiUsedOutsideLoop) {
CountPrev = Count;
Count = Builder.CreateAdd(
CountPrev,
ConstantInt::get(CountPrev->getType(), 1));
}
if (IsCntPhiUsedOutsideLoop)
NewCount = Builder.CreateZExtOrTrunc(CountPrev,
cast<IntegerType>(CntInst->getType()));
else
NewCount = Builder.CreateZExtOrTrunc(Count,
cast<IntegerType>(CntInst->getType()));

// If the CTLZ counter's initial value is not zero, insert Add Inst.
Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
if (!InitConst || !InitConst->isZero())
NewCount = Builder.CreateAdd(NewCount, CntInitVal);

// Step 2: Insert new IV and loop condition:
// loop:
// ...
// PhiCount = PHI [Count, Dec]
// ...
// Dec = PhiCount - 1
// ...
// Br: loop if (Dec != 0)
BasicBlock *Body = *(CurLoop->block_begin());
auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = Count->getType();

PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());

Builder.SetInsertPoint(LbCond);
Instruction *TcDec = cast<Instruction>(
Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
"tcdec", false, true));

TcPhi->addIncoming(Count, Preheader);
TcPhi->addIncoming(TcDec, Body);

CmpInst::Predicate Pred =
(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
LbCond->setPredicate(Pred);
LbCond->setOperand(0, TcDec);
LbCond->setOperand(1, ConstantInt::get(Ty, 0));

// Step 3: All the references to the original counter outside
// the loop are replaced with the NewCount -- the value returned from
// __builtin_ctlz(x).
if (IsCntPhiUsedOutsideLoop)
CntPhi->replaceUsesOutsideBlock(NewCount, Body);
else
CntInst->replaceUsesOutsideBlock(NewCount, Body);

// step 4: Forget the "non-computable" trip-count SCEV associated with the
// loop. The loop would otherwise not be deleted even if it becomes empty.
SE->forgetLoop(CurLoop);
}

void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
Expand Down

0 comments on commit 2fecd38

Please sign in to comment.