From 87f96e70a86c2210ae49598b9597897bf894b7b1 Mon Sep 17 00:00:00 2001 From: YongKang Zhu Date: Fri, 29 Aug 2025 18:02:15 -0700 Subject: [PATCH] [BOLT][AArch64][instr] Consider targeting ARM64 CPUs without LSE support `stadd` is only available in recent arm64 CPUs that have LSE support (like Cortex-A73 and Cortex-A75) and is not available on old arm64 CPUs (like Cortex-A53 and Cortex-A55). Devices could have a mixture of those two kinds of CPUs, so we need to provide an option for BOLT to generate instrumentation sequence that emulates what `stadd` would do. The implementation puts counter increment into an injected helper function since we don't want to rebuild CFG in the function that is being instrumented. --- bolt/include/bolt/Core/MCPlusBuilder.h | 12 +- bolt/lib/Passes/Instrumentation.cpp | 2 + .../Target/AArch64/AArch64MCPlusBuilder.cpp | 141 ++++++++++++++++-- bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 6 +- bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 6 +- bolt/test/AArch64/instrumentation_sequence.s | 50 +++++++ 6 files changed, 197 insertions(+), 20 deletions(-) create mode 100644 bolt/test/AArch64/instrumentation_sequence.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 90129d475d870..5b711b0e27bab 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -51,6 +51,7 @@ class raw_ostream; namespace bolt { class BinaryBasicBlock; +class BinaryContext; class BinaryFunction; /// Different types of indirect branches encountered during disassembly. @@ -530,10 +531,15 @@ class MCPlusBuilder { return 0; } + /// Create a helper function to increment counter for Instrumentation + virtual void createInstrCounterIncrFunc(BinaryContext &BC) { + llvm_unreachable("not implemented"); + } + /// Create increment contents of target by 1 for Instrumentation - virtual InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, - unsigned CodePointerSize) const { + virtual InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) { llvm_unreachable("not implemented"); return InstructionListType(); } diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index c2f876f0dff9e..150461b020f06 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -753,6 +753,8 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) { createSimpleFunction("__bolt_fini_trampoline", BC.MIB->createReturnInstructionList(BC.Ctx.get())); } + if (BC.isAArch64()) + BC.MIB->createInstrCounterIncrFunc(BC); } } diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index f972646aa12ea..a6589f8f9ee42 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -35,6 +36,15 @@ using namespace llvm; using namespace bolt; +namespace opts { +extern cl::OptionCategory BoltInstrCategory; +static cl::opt NoLSEAtomics( + "no-lse-atomics", + cl::desc("generate instrumentation code sequence without using LSE atomic " + "instruction"), + cl::init(false), cl::Optional, cl::cat(BoltInstrCategory)); +} // namespace opts + namespace { static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { @@ -106,7 +116,7 @@ static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) { } static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) { - // NOTE: Supports only ARM with LSE extension + assert(!opts::NoLSEAtomics && "Supports only ARM with LSE extension"); Inst.setOpcode(AArch64::LDADDX); Inst.clear(); Inst.addOperand(MCOperand::createReg(AArch64::XZR)); @@ -135,6 +145,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; + BinaryFunction *InstrCounterIncrFunc{nullptr}; + std::unique_ptr createTargetSymbolizer(BinaryFunction &Function, bool CreateNewSymbols) const override { @@ -2513,22 +2525,129 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Insts; } - InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, - unsigned CodePointerSize) const override { + // Instrumentation code sequence using LSE atomic instruction has a total of + // 6 instructions: + // + // stp x0, x1, [sp, #-0x10]! + // adrp x0, page_address(counter) + // add x0, x0, page_offset(counter) + // mov x1, #0x1 + // stadd x1, [x0] + // ldp x0, x1, [sp], #0x10 + // + // Instrumentation code sequence without using LSE atomic instruction has + // 8 instructions at instrumentation place, with 6 instructions in the helper: + // + // stp x0, x30, [sp, #-0x10]! + // stp x1, x2, [sp, #-0x10]! + // adrp x0, page_address(counter) + // add x0, x0, page_offset(counter) + // adrp x1, page_address(helper) + // add x1, x1, page_offset(helper) + // blr x1 + // ldp x0, x30, [sp], #0x10 + // + // : + // ldaxr x1, [x0] + // add x1, x1, #0x1 + // stlxr w2, x1, [x0] + // cbnz w2, + // ldp x1, x2, [sp], #0x10 + // ret + + void createInstrCounterIncrFunc(BinaryContext &BC) override { + assert(InstrCounterIncrFunc == nullptr && + "helper function of counter increment for instrumentation " + "has already been created"); + + if (!opts::NoLSEAtomics) + return; + + MCContext *Ctx = BC.Ctx.get(); + InstrCounterIncrFunc = BC.createInjectedBinaryFunction( + "__bolt_instr_counter_incr", /*IsSimple*/ false); + std::vector> BBs; + + BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock()); + InstructionListType Instrs(4); + Instrs[0].setOpcode(AArch64::LDAXRX); + Instrs[0].clear(); + Instrs[0].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[0].addOperand(MCOperand::createReg(AArch64::X0)); + Instrs[1].setOpcode(AArch64::ADDXri); + Instrs[1].clear(); + Instrs[1].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[1].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[1].addOperand(MCOperand::createImm(1)); + Instrs[1].addOperand(MCOperand::createImm(0)); + Instrs[2].setOpcode(AArch64::STLXRX); + Instrs[2].clear(); + Instrs[2].addOperand(MCOperand::createReg(AArch64::W2)); + Instrs[2].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[2].addOperand(MCOperand::createReg(AArch64::X0)); + Instrs[3].setOpcode(AArch64::CBNZW); + Instrs[3].clear(); + Instrs[3].addOperand(MCOperand::createReg(AArch64::W2)); + Instrs[3].addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(BBs.back()->getLabel(), *Ctx))); + BBs.back()->addInstructions(Instrs.begin(), Instrs.end()); + BBs.back()->setCFIState(0); + + BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock()); + InstructionListType InstrsEpilog(2); + createPopRegisters(InstrsEpilog[0], AArch64::X1, AArch64::X2); + createReturn(InstrsEpilog[1]); + BBs.back()->addInstructions(InstrsEpilog.begin(), InstrsEpilog.end()); + BBs.back()->setCFIState(0); + + BBs[0]->addSuccessor(BBs[0].get()); + BBs[0]->addSuccessor(BBs[1].get()); + + InstrCounterIncrFunc->insertBasicBlocks(nullptr, std::move(BBs), + /*UpdateLayout*/ true, + /*UpdateCFIState*/ false); + InstrCounterIncrFunc->updateState(BinaryFunction::State::CFG_Finalized); + + LLVM_DEBUG({ + dbgs() << "BOLT-DEBUG: instrumentation counter increment helper:\n"; + InstrCounterIncrFunc->dump(); + }); + } + + InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) override { unsigned int I = 0; - InstructionListType Instrs(6); + InstructionListType Instrs(opts::NoLSEAtomics ? 8 : 6); + + if (opts::NoLSEAtomics) { + createPushRegisters(Instrs[I++], AArch64::X0, AArch64::LR); + createPushRegisters(Instrs[I++], AArch64::X1, AArch64::X2); + } else { + createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1); + } - createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1); InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0); assert(Addr.size() == 2 && "Invalid Addr size"); std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I); I += Addr.size(); - InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1); - assert(Insts.size() == 2 && "Invalid Insts size"); - std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I); - I += Insts.size(); - createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1); + + if (opts::NoLSEAtomics) { + const MCSymbol *Helper = InstrCounterIncrFunc->getSymbol(); + InstructionListType HelperAddr = + materializeAddress(Helper, Ctx, AArch64::X1); + assert(HelperAddr.size() == 2 && "Invalid HelperAddr size"); + std::copy(HelperAddr.begin(), HelperAddr.end(), Instrs.begin() + I); + I += HelperAddr.size(); + createIndirectCallInst(Instrs[I++], /*IsTailCall*/ false, AArch64::X1); + } else { + InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1); + assert(Insts.size() == 2 && "Invalid Insts size"); + std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I); + I += Insts.size(); + } + createPopRegisters(Instrs[I++], AArch64::X0, + opts::NoLSEAtomics ? AArch64::LR : AArch64::X1); return Instrs; } diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp index 10b4913b6ab7f..7c4a8781fd57d 100644 --- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp +++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp @@ -626,9 +626,9 @@ class RISCVMCPlusBuilder : public MCPlusBuilder { return Insts; } - InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, - unsigned CodePointerSize) const override { + InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) override { // We need 2 scratch registers: one for the target address (x10), and one // for the increment value (x11). // addi sp, sp, -16 diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 1842509dcc5e0..9026a9df7b5c2 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -3053,9 +3053,9 @@ class X86MCPlusBuilder : public MCPlusBuilder { Inst.clear(); } - InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, - unsigned CodePointerSize) const override { + InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) override { InstructionListType Instrs(IsLeaf ? 13 : 11); unsigned int I = 0; diff --git a/bolt/test/AArch64/instrumentation_sequence.s b/bolt/test/AArch64/instrumentation_sequence.s new file mode 100644 index 0000000000000..371851fe9a8e3 --- /dev/null +++ b/bolt/test/AArch64/instrumentation_sequence.s @@ -0,0 +1,50 @@ +# This test is to validate instrumentation code sequence generated with +# and without `--no-lse-atomics`. + +# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} + +# RUN: %clang %cflags -pie %s -o %t.so -Wl,-q -Wl,--init=_foo -Wl,--fini=_foo + + .text + .global _foo + .type _foo, %function +_foo: + ret + + .global _start + .type _start, %function +_start: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE + +# RUN: llvm-bolt %t.so -o %t.instr.so --instrument +# RUN: llvm-objdump -d %t.instr.so | FileCheck %s --check-prefix=INLINE +# INLINE: {{.*}} <_foo>: +# INLINE-NEXT: {{.*}} stp x0, x1, [sp, #-0x10]! +# INLINE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}} +# INLINE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}} +# INLINE-NEXT: {{.*}} mov x1, #0x1 +# INLINE-NEXT: {{.*}} stadd x1, [x0] +# INLINE-NEXT: {{.*}} ldp x0, x1, [sp], #0x10 + +# RUN: llvm-bolt %t.so -o %t.instr.no_lse.so --instrument \ +# RUN: --no-lse-atomics +# RUN: llvm-objdump -d %t.instr.no_lse.so | FileCheck %s --check-prefix=NOLSE +# NOLSE: {{.*}} <_foo>: +# NOLSE-NEXT: {{.*}} stp x0, x30, [sp, #-0x10]! +# NOLSE-NEXT: {{.*}} stp x1, x2, [sp, #-0x10]! +# NOLSE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}} +# NOLSE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}} +# NOLSE-NEXT: {{.*}} adrp x1, 0x[[PAGEBASE:[0-9a-f]*]]000 {{.*}} +# NOLSE-NEXT: {{.*}} add x1, x1, #0x[[PAGEOFF:[0-9a-f]*]] +# NOLSE-NEXT: {{.*}} blr x1 +# NOLSE-NEXT: {{.*}} ldp x0, x30, [sp], #0x10 +# NOLSE: {{[0]*}}[[PAGEBASE]][[PAGEOFF]] <__bolt_instr_counter_incr>: +# NOLSE-NEXT: {{.*}} ldaxr x1, [x0] +# NOLSE-NEXT: {{.*}} add x1, x1, #0x1 +# NOLSE-NEXT: {{.*}} stlxr w2, x1, [x0] +# NOLSE-NEXT: {{.*}} cbnz w2, 0x{{[0-9[a-f]*}} <__bolt_instr_counter_incr> +# NOLSE-NEXT: {{.*}} ldp x1, x2, [sp], #0x10 +# NOLSE-NEXT: {{.*}} ret