Commit 244588b

[BOLT][AArch64] Inlining of Memcpy (#154929)
The pass for inlining memcpy in BOLT was previously X86-specific and relied on the `rep movsb` instruction. This patch implements a static size analysis for AArch64 memcpy inlining: it extracts the copy size from instructions preceding the call and uses it to generate width-optimal load/store sequences. For example, a call site whose size argument is materialized by `MOVZ X2, #32` is replaced with two 16-byte load/store pairs through the SIMD Q registers.
1 parent 872d2c9 commit 244588b

6 files changed: +540, -4 lines changed

bolt/docs/CommandLineArgumentReference.md (1 addition, 1 deletion)

@@ -637,7 +637,7 @@
 
 - `--inline-memcpy`
 
-  Inline memcpy using 'rep movsb' instruction (X86-only)
+  Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
 
 - `--inline-small-functions`

bolt/include/bolt/Core/MCPlusBuilder.h (26 additions, 0 deletions)

@@ -14,6 +14,7 @@
 #ifndef BOLT_CORE_MCPLUSBUILDER_H
 #define BOLT_CORE_MCPLUSBUILDER_H
 
+#include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/MCPlus.h"
 #include "bolt/Core/Relocation.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -1902,13 +1903,38 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Find memcpy size in bytes by using preceding instructions.
+  /// Returns std::nullopt if size cannot be determined (no-op for most
+  /// targets).
+  virtual std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const {
+    return std::nullopt;
+  }
+
   /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
   /// (dest + n) instead of dest.
   virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
     llvm_unreachable("not implemented");
     return {};
   }
 
+  /// Creates size-aware inline memcpy instruction. If \p KnownSize is
+  /// provided, generates optimized code for that specific size. Falls back
+  /// to regular createInlineMemcpy if size is unknown or not needed (e.g.
+  /// with X86).
+  virtual InstructionListType
+  createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
+    return createInlineMemcpy(ReturnEnd);
+  }
+
+  /// Extract immediate value from move instruction that sets the given
+  /// register. Returns the immediate value if the instruction is a
+  /// move-immediate to TargetReg.
+  virtual std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
+    return std::nullopt;
+  }
+
   /// Create a target-specific relocation out of the \p Fixup.
   /// Note that not every fixup could be converted into a relocation.
   virtual std::optional<Relocation>

bolt/lib/Passes/BinaryPasses.cpp (10 additions, 2 deletions)

@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
 }
 
 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
-  if (!BC.isX86())
+  if (!BC.isX86() && !BC.isAArch64())
     return Error::success();
 
   uint64_t NumInlined = 0;
@@ -1866,8 +1866,16 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
         const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
+        // Extract size from preceding instructions (AArch64 only).
+        // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2.
+        std::optional<uint64_t> KnownSize =
+            BC.MIB->findMemcpySizeInBytes(BB, II);
+
+        if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
+          continue;
+
         const InstructionListType NewCode =
-            BC.MIB->createInlineMemcpy(IsMemcpy8);
+            BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
         II = BB.replaceInstruction(II, NewCode);
         std::advance(II, NewCode.size() - 1);
         if (IsTailCall) {
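
The gate above relies on two facts established by the target hooks: the size register's value must be provably constant at the call site, and copies larger than 64 bytes stay as library calls. Below is a minimal standalone C++ model of that contract (an illustration only; the toy Inst type stands in for MCInst, and none of these names are BOLT's API):

#include <cstdint>
#include <optional>
#include <vector>

// Toy stand-in for MCInst: records at most one written register and
// whether the write was a plain move-immediate.
struct Inst {
  int DefReg = -1;       // register written by this instruction, -1 if none
  bool IsMovImm = false; // true if a move-immediate (MOVZ with no shift)
  uint64_t Imm = 0;      // immediate value, meaningful only if IsMovImm
};

// Mirrors findMemcpySizeInBytes: scan the instructions preceding the call;
// the first write to the size register decides the outcome. A plain
// move-immediate yields a known size; any other write (or none) gives up.
std::optional<uint64_t> findMemcpySize(const std::vector<Inst> &Preceding,
                                       int SizeReg) {
  for (const Inst &I : Preceding)
    if (I.DefReg == SizeReg)
      return I.IsMovImm ? std::optional<uint64_t>(I.Imm) : std::nullopt;
  return std::nullopt;
}

// Mirrors the pass's AArch64 gate: inline only known sizes of at most 64.
bool shouldInline(std::optional<uint64_t> Size) {
  return Size.has_value() && *Size <= 64;
}

int main() {
  const int X2 = 2; // third integer argument register carries the size
  // MOVZ X2, #32 before the call: size is known, so inlining proceeds.
  std::vector<Inst> Preceding = {{X2, true, 32}};
  return shouldInline(findMemcpySize(Preceding, X2)) ? 0 : 1;
}

Note that the scan gives up at the first write to the size register rather than searching further, matching the conservative behavior of the hook shown in the AArch64 diff below.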

bolt/lib/Rewrite/BinaryPassManager.cpp (3 additions, 1 deletion)

@@ -248,7 +248,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
 
 static cl::opt<bool> StringOps(
     "inline-memcpy",
-    cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+    cl::desc(
+        "inline memcpy using size-specific optimized instructions "
+        "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
     cl::cat(BoltOptCategory));
 
 static cl::opt<bool> StripRepRet(

bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp (116 additions, 0 deletions)

@@ -2620,6 +2620,122 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   getInstructionSize(const MCInst &Inst) const override {
     return 4;
   }
+
+  std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+    // Match MOVZ instructions (both X and W register variants) with no shift.
+    if ((Inst.getOpcode() == AArch64::MOVZXi ||
+         Inst.getOpcode() == AArch64::MOVZWi) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        getAliases(TargetReg)[Inst.getOperand(0).getReg()])
+      return Inst.getOperand(1).getImm();
+    return std::nullopt;
+  }
+
+  std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const override {
+    MCPhysReg SizeReg = getIntArgRegister(2);
+    if (SizeReg == getNoRegister())
+      return std::nullopt;
+
+    BitVector WrittenRegs(RegInfo->getNumRegs());
+    const BitVector &SizeRegAliases = getAliases(SizeReg);
+
+    for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+      const MCInst &Inst = *InstIt;
+      WrittenRegs.reset();
+      getWrittenRegs(Inst, WrittenRegs);
+
+      if (WrittenRegs.anyCommon(SizeRegAliases))
+        return extractMoveImmediate(Inst, SizeReg);
+    }
+    return std::nullopt;
+  }
+
+  InstructionListType
+  createInlineMemcpy(bool ReturnEnd,
+                     std::optional<uint64_t> KnownSize) const override {
+    assert(KnownSize.has_value() &&
+           "AArch64 memcpy inlining requires known size");
+    InstructionListType Code;
+    uint64_t Size = *KnownSize;
+
+    generateSizeSpecificMemcpy(Code, Size);
+
+    // If _memcpy8, adjust X0 to return dest+size instead of dest.
+    if (ReturnEnd)
+      Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                            .addReg(AArch64::X0)
+                            .addReg(AArch64::X0)
+                            .addImm(Size)
+                            .addImm(0));
+    return Code;
+  }
+
+  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+                                                 uint64_t Size) const {
+    auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+                                unsigned Reg, unsigned Offset = 0) {
+      Code.emplace_back(MCInstBuilder(LoadOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X1)
+                            .addImm(Offset));
+      Code.emplace_back(MCInstBuilder(StoreOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X0)
+                            .addImm(Offset));
+    };
+
+    // Generate optimal instruction sequences based on exact size.
+    switch (Size) {
+    case 1:
+      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
+      break;
+    case 2:
+      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
+      break;
+    case 4:
+      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
+      break;
+    case 8:
+      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
+      break;
+    case 16:
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
+      break;
+    case 32:
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
+      break;
+
+    default:
+      // For sizes up to 64 bytes, greedily use the largest possible loads.
+      // Caller should have already filtered out sizes > 64 bytes.
+      assert(Size <= 64 &&
+             "Size should be <= 64 bytes for AArch64 memcpy inlining");
+
+      uint64_t Remaining = Size;
+      uint64_t Offset = 0;
+
+      const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+          LoadStoreOps = {
+              {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+               {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+               {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+               {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+               {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
+
+      for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+        while (Remaining >= OpSize) {
+          AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+          Remaining -= OpSize;
+          Offset += OpSize;
+        }
+      break;
+    }
+    return Code;
+  }
 };
 
 } // end anonymous namespace
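
For sizes without a dedicated case above, the default path decomposes the copy greedily into the widest available accesses. Here is a standalone C++ sketch of that decomposition (an illustration only, not BOLT code), printing the pairs emitted for a 13-byte copy; the printed immediates are divided by the access width because the ui-form loads and stores take scaled unsigned offsets:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirrors the default case of generateSizeSpecificMemcpy: repeatedly take
// the widest load/store width (16, 8, 4, 2, 1 bytes) that still fits.
void decompose(uint64_t Size) {
  assert(Size <= 64 && "the pass filters out larger sizes before this point");
  const uint64_t Widths[] = {16, 8, 4, 2, 1};
  const char *Pairs[] = {"LDRQ/STRQ", "LDRX/STRX", "LDRW/STRW",
                         "LDRH/STRH", "LDRB/STRB"};
  uint64_t Remaining = Size;
  uint64_t Offset = 0;
  for (int I = 0; I < 5; ++I)
    while (Remaining >= Widths[I]) {
      // Offset / Widths[I] reproduces the scaled immediate that the real
      // pass hands to the ui-form instructions.
      std::printf("%s  byte offset %llu  (scaled imm %llu)\n", Pairs[I],
                  (unsigned long long)Offset,
                  (unsigned long long)(Offset / Widths[I]));
      Remaining -= Widths[I];
      Offset += Widths[I];
    }
}

int main() {
  decompose(13); // emits one 8-byte, one 4-byte, and one 1-byte pair
}

For power-of-two sizes up to 32, this greedy path produces the same sequences as the dedicated switch cases; the explicit cases simply avoid the loop.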
