Skip to content

Commit c5da190

Browse files
authored
[LoopIdiom] Use HashRecognize to optimize CRC (#143208)
Optimize CRC loops into a Sarwate table lookup in LoopIdiomRecognize, driven by the results of HashRecognize. The optimization is checked for correctness by the SingleSource/UnitTests/HashRecognize tests in llvm-test-suite.
1 parent adacab3 commit c5da190

File tree

3 files changed

+720
-1
lines changed

3 files changed

+720
-1
lines changed

llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ struct DisableLIRP {
4040

4141
/// When true, Wcslen is disabled.
4242
static bool Wcslen;
43+
44+
/// When true, HashRecognize is disabled.
45+
static bool HashRecognize;
4346
};
4447

4548
/// Performs Loop Idiom Recognize Pass.

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Lines changed: 175 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "llvm/ADT/StringRef.h"
4040
#include "llvm/Analysis/AliasAnalysis.h"
4141
#include "llvm/Analysis/CmpInstAnalysis.h"
42+
#include "llvm/Analysis/HashRecognize.h"
4243
#include "llvm/Analysis/LoopInfo.h"
4344
#include "llvm/Analysis/LoopPass.h"
4445
#include "llvm/Analysis/MemoryLocation.h"
@@ -143,6 +144,14 @@ static cl::opt<bool, true>
143144
cl::location(DisableLIRP::Wcslen), cl::init(false),
144145
cl::ReallyHidden);
145146

147+
// Out-of-line definition of the DisableLIRP::HashRecognize flag. It is the
// external storage written by the command-line option registered below and is
// read by the pass before attempting the CRC-loop optimization.
bool DisableLIRP::HashRecognize;
148+
// Hidden boolean option named "disable-" DEBUG_TYPE "-hashrecognize". It stores
// its value directly into DisableLIRP::HashRecognize (via cl::location) and
// defaults to false, i.e. the CRC optimization is enabled unless requested
// otherwise. cl::location is listed before cl::init so that the storage is
// bound before the default value is written into it.
static cl::opt<bool, true>
149+
DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
150+
cl::desc("Proceed with loop idiom recognize pass, "
151+
"but do not optimize CRC loops."),
152+
cl::location(DisableLIRP::HashRecognize),
153+
cl::init(false), cl::ReallyHidden);
154+
146155
static cl::opt<bool> UseLIRCodeSizeHeurs(
147156
"use-lir-code-size-heurs",
148157
cl::desc("Use loop idiom recognition code size heuristics when compiling "
@@ -242,6 +251,7 @@ class LoopIdiomRecognize {
242251
const SCEV *BECount);
243252
bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
244253
bool IsLoopMemset = false);
254+
bool optimizeCRCLoop(const PolynomialInfo &Info);
245255

246256
/// @}
247257
/// \name Noncountable Loop Idiom Handling
@@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
287297
// but ORE cannot be preserved (see comment before the pass definition).
288298
OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
289299

300+
std::optional<PolynomialInfo> HR;
301+
290302
LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
291303
AR.MSSA, DL, ORE);
292304
if (!LIR.runOnLoop(&L))
@@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
335347
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
336348
HasMemcpy = TLI->has(LibFunc_memcpy);
337349

338-
if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy)
350+
if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
351+
HasMemcpy || !DisableLIRP::HashRecognize)
339352
if (SE->hasLoopInvariantBackedgeTakenCount(L))
340353
return runOnCountableLoop();
341354

@@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
378391

379392
MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
380393
}
394+
395+
// Optimize a CRC loop if HashRecognize found one, provided we're not
396+
// optimizing for size.
397+
if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
398+
if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
399+
optimizeCRCLoop(*Res);
400+
381401
return MadeChange;
382402
}
383403

@@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
15141534
return false;
15151535
}
15161536

1537+
bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
1538+
// FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using
1539+
// carry-less multiplication instructions, which is more efficient than our
1540+
// Sarwate table-lookup optimization. Hence, until we're able to emit
1541+
// target-specific instructions for Hexagon, subsuming HexagonLoopIdiom,
1542+
// disable the optimization for Hexagon.
1543+
Module &M = *CurLoop->getHeader()->getModule();
1544+
Triple TT(M.getTargetTriple());
1545+
if (TT.getArch() == Triple::hexagon)
1546+
return false;
1547+
1548+
// First, create a new GlobalVariable corresponding to the
1549+
// Sarwate-lookup-table.
1550+
Type *CRCTy = Info.LHS->getType();
1551+
unsigned CRCBW = CRCTy->getIntegerBitWidth();
1552+
std::array<Constant *, 256> CRCConstants;
1553+
transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
1554+
CRCConstants.begin(),
1555+
[CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
1556+
Constant *ConstArray =
1557+
ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
1558+
GlobalVariable *GV =
1559+
new GlobalVariable(M, ConstArray->getType(), true,
1560+
GlobalValue::PrivateLinkage, ConstArray, ".crctable");
1561+
1562+
PHINode *IV = CurLoop->getCanonicalInductionVariable();
1563+
SmallVector<PHINode *, 2> Cleanup;
1564+
1565+
// Next, mark all PHIs for removal except IV.
1566+
{
1567+
for (PHINode &PN : CurLoop->getHeader()->phis()) {
1568+
if (&PN == IV)
1569+
continue;
1570+
PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
1571+
Cleanup.push_back(&PN);
1572+
}
1573+
}
1574+
1575+
// Next, fix up the trip count.
1576+
{
1577+
unsigned NewBTC = (Info.TripCount / 8) - 1;
1578+
BasicBlock *LoopBlk = CurLoop->getLoopLatch();
1579+
BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator());
1580+
CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
1581+
? ICmpInst::Predicate::ICMP_NE
1582+
: ICmpInst::Predicate::ICMP_EQ;
1583+
Instruction *ExitCond = CurLoop->getLatchCmpInst();
1584+
Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
1585+
IRBuilder<> Builder(ExitCond);
1586+
Value *NewExitCond =
1587+
Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
1588+
ExitCond->replaceAllUsesWith(NewExitCond);
1589+
deleteDeadInstruction(ExitCond);
1590+
}
1591+
1592+
// Finally, fill the loop with the Sarwate-table-lookup logic, and replace all
1593+
// uses of ComputedValue.
1594+
//
1595+
// Little-endian:
1596+
// crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
1597+
// Big-Endian:
1598+
// crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
1599+
{
1600+
auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
1601+
Type *OpTy = Op->getType();
1602+
unsigned OpBW = OpTy->getIntegerBitWidth();
1603+
return OpBW > 8
1604+
? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name)
1605+
: Op;
1606+
};
1607+
auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
1608+
const Twine &Name) {
1609+
Type *OpTy = Op->getType();
1610+
1611+
// When the bitwidth of the CRC mismatches the Op's bitwidth, we need to
1612+
// use the CRC's bitwidth as the reference for shifting right.
1613+
return LoByte(Builder,
1614+
CRCBW > 8 ? Builder.CreateLShr(
1615+
Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
1616+
: Op,
1617+
Name + ".lo.byte");
1618+
};
1619+
1620+
IRBuilder<> Builder(CurLoop->getHeader(),
1621+
CurLoop->getHeader()->getFirstNonPHIIt());
1622+
1623+
// Create the CRC PHI, and initialize its incoming value to the initial
1624+
// value of CRC.
1625+
PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
1626+
CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
1627+
1628+
// CRC is now an evolving variable, initialized to the PHI.
1629+
Value *CRC = CRCPhi;
1630+
1631+
// TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
1632+
// of LHSAux), if LHSAux is non-nullptr.
1633+
Value *Indexer = CRC;
1634+
if (Value *Data = Info.LHSAux) {
1635+
Type *DataTy = Data->getType();
1636+
1637+
// To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
1638+
// shift right by that amount, and take the lo-byte (in the little-endian
1639+
// case), or shift left by that amount, and take the hi-idx (in the
1640+
// big-endian case).
1641+
Value *IVBits = Builder.CreateZExtOrTrunc(
1642+
Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
1643+
Value *DataIndexer =
1644+
Info.ByteOrderSwapped
1645+
? Builder.CreateShl(Data, IVBits, "data.indexer")
1646+
: Builder.CreateLShr(Data, IVBits, "data.indexer");
1647+
Indexer = Builder.CreateXor(
1648+
DataIndexer,
1649+
Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
1650+
"crc.data.indexer");
1651+
}
1652+
1653+
Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
1654+
: LoByte(Builder, Indexer, "indexer.lo");
1655+
1656+
// Always index into a GEP using the index type.
1657+
Indexer = Builder.CreateZExt(
1658+
Indexer, SE->getDataLayout().getIndexType(GV->getType()),
1659+
"indexer.ext");
1660+
1661+
// CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
1662+
Value *CRCTableGEP =
1663+
Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
1664+
Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
1665+
1666+
// CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
1667+
// CRC-8.
1668+
Value *CRCNext = CRCTableLd;
1669+
if (CRCBW > 8) {
1670+
Value *CRCShift = Info.ByteOrderSwapped
1671+
? Builder.CreateShl(CRC, 8, "crc.be.shift")
1672+
: Builder.CreateLShr(CRC, 8, "crc.le.shift");
1673+
CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
1674+
}
1675+
1676+
// Connect the back-edge for the loop, and RAUW the ComputedValue.
1677+
CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
1678+
Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
1679+
CurLoop->getLoopLatch());
1680+
}
1681+
1682+
// Cleanup.
1683+
{
1684+
for (PHINode *PN : Cleanup)
1685+
RecursivelyDeleteDeadPHINode(PN);
1686+
SE->forgetLoop(CurLoop);
1687+
}
1688+
return true;
1689+
}
1690+
15171691
bool LoopIdiomRecognize::runOnNoncountableLoop() {
15181692
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
15191693
<< CurLoop->getHeader()->getParent()->getName()

0 commit comments

Comments
 (0)