[memcpyopt] Restructure store(load src, dest) form of callslotopt for compile time

The search for the clobbering call is fairly expensive if uses are not optimized at construction.  Defer the clobber walk to the point in the implementation where we actually need it; there are a number of bailouts before that point (e.g., if the source pointer is not an alloca, we can't do callslotopt at all).

On a test case that involves a bunch of copies from argument pointers, this switches memcpyopt from > 1/2 second to < 10 ms.
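To make the pattern concrete, here is a minimal standalone sketch of the deferral technique the commit applies; Call, findClobberingCall, and performOptzn are hypothetical stand-ins for CallInst, the MemorySSA clobber walk, and performCallSlotOptzn, and are not LLVM code. The expensive lookup moves behind a std::function callback, so it only runs once the cheap bailouts have passed:

#include <functional>
#include <iostream>

// Hypothetical stand-in for llvm::CallInst.
struct Call { const char *Name; };

// Stand-in for the expensive MemorySSA clobber walk
// (MSSA->getWalker()->getClobberingMemoryAccess(LI)).
static Call *findClobberingCall() {
  std::cout << "expensive clobber walk\n";
  static Call TheCall{"func"};
  return &TheCall;
}

// Stand-in for performCallSlotOptzn: the callee now receives a callback
// and invokes it only after the cheap structural checks have passed.
static bool performOptzn(bool SrcIsAlloca, std::function<Call *()> GetC) {
  if (!SrcIsAlloca) // cheap bailout; the expensive walk never runs
    return false;
  Call *C = GetC(); // pay for the analysis only when it can matter
  return C != nullptr;
}

int main() {
  auto GetCall = []() -> Call * { return findClobberingCall(); };
  performOptzn(/*SrcIsAlloca=*/false, GetCall); // bails out, no walk
  performOptzn(/*SrcIsAlloca=*/true, GetCall);  // walk happens once
}

A caller that already has the call in hand can still use the shared signature by passing a trivial lambda, which is exactly what the processMemCpy path does in the diff below.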
preames committed Apr 4, 2022
1 parent c0f90c8 commit 7c51669
Showing 2 changed files with 33 additions and 29 deletions.
llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h (1 addition, 1 deletion)

@@ -61,7 +61,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   bool processMemMove(MemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
                             Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
-                            Align cpyAlign, CallInst *C);
+                            Align cpyAlign, std::function<CallInst *()> GetC);
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet);
llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp (32 additions, 28 deletions)
@@ -761,27 +761,25 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
       // Detect cases where we're performing call slot forwarding, but
       // happen to be using a load-store pair to implement it, rather than
       // a memcpy.
-      CallInst *C = nullptr;
-      if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
-              MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
-        // The load most post-dom the call. Limit to the same block for now.
-        // TODO: Support non-local call-slot optimization?
-        if (LoadClobber->getBlock() == SI->getParent())
-          C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
-      }
-
-      if (C) {
-        bool changed = performCallSlotOptzn(
-            LI, SI, SI->getPointerOperand()->stripPointerCasts(),
-            LI->getPointerOperand()->stripPointerCasts(),
-            DL.getTypeStoreSize(SI->getOperand(0)->getType()),
-            commonAlignment(SI->getAlign(), LI->getAlign()), C);
-        if (changed) {
-          eraseInstruction(SI);
-          eraseInstruction(LI);
-          ++NumMemCpyInstr;
-          return true;
-        }
+      auto GetCall = [&]() -> CallInst * {
+        // We defer this expensive clobber walk until the cheap checks
+        // have been done on the source inside performCallSlotOptzn.
+        if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
+                MSSA->getWalker()->getClobberingMemoryAccess(LI)))
+          return dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
+        return nullptr;
+      };
+
+      bool changed = performCallSlotOptzn(
+          LI, SI, SI->getPointerOperand()->stripPointerCasts(),
+          LI->getPointerOperand()->stripPointerCasts(),
+          DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+          commonAlignment(SI->getAlign(), LI->getAlign()), GetCall);
+      if (changed) {
+        eraseInstruction(SI);
+        eraseInstruction(LI);
+        ++NumMemCpyInstr;
+        return true;
       }
     }
   }
@@ -856,7 +854,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
                                          Instruction *cpyStore, Value *cpyDest,
                                          Value *cpySrc, TypeSize cpySize,
-                                         Align cpyAlign, CallInst *C) {
+                                         Align cpyAlign,
+                                         std::function<CallInst *()> GetC) {
   // The general transformation to keep in mind is
   //
   //   call @func(..., src, ...)
@@ -875,11 +874,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
   if (cpySize.isScalable())
     return false;

-  // Lifetime marks shouldn't be operated on.
-  if (Function *F = C->getCalledFunction())
-    if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
-      return false;
-
   // Require that src be an alloca.  This simplifies the reasoning considerably.
   auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
   if (!srcAlloca)
@@ -896,6 +890,16 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
   if (cpySize < srcSize)
     return false;

+  CallInst *C = GetC();
+  if (!C)
+    return false;
+
+  // Lifetime marks shouldn't be operated on.
+  if (Function *F = C->getCalledFunction())
+    if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
+      return false;
+
+
   if (C->getParent() != cpyStore->getParent()) {
     LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
     return false;
@@ -1459,7 +1463,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
             if (performCallSlotOptzn(
                     M, M, M->getDest(), M->getSource(),
                     TypeSize::getFixed(CopySize->getZExtValue()), Alignment,
-                    C)) {
+                    [C]() -> CallInst * { return C; })) {
               LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
                                 << "    call: " << *C << "\n"
                                 << "    memcpy: " << *M << "\n");
