[BOLT][AArch64][instr] Consider targeting ARM64 CPUs without LSE support #158738
`stadd` is only available on arm64 CPUs with LSE support (e.g., Cortex-A55 and Cortex-A75) and is not available on older arm64 CPUs without it (e.g., Cortex-A53 and Cortex-A73). Devices may contain a mixture of the two kinds of CPUs, so BOLT needs an option to generate an instrumentation sequence that emulates what `stadd` would do without relying on LSE. The implementation puts the counter increment into an injected helper function, both to avoid rebuilding the CFG of the function being instrumented and to keep the instrumentation-induced binary size increase small.
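As background for the two sequences in the patch, here is a minimal C++ sketch (illustrative only, not part of the change) of the counter update the instrumentation performs. Built with LSE available, the relaxed fetch-add lowers to a single `stadd`; built without LSE, the compiler emits an exclusive-monitor retry loop, which is the same shape as the injected `__bolt_instr_counter_incr` helper in the diff below.

```cpp
#include <cstdint>

// Sketch only: the counter update that BOLT instrumentation performs.
// Compiled for AArch64 with LSE (e.g. -march=armv8.1-a), the relaxed
// fetch-add below lowers to a single STADD, matching the default sequence.
// Compiled without LSE (-march=armv8-a), the compiler instead emits an
// exclusive load/store retry loop (ldxr / add / stxr / cbnz), which is the
// pattern the injected __bolt_instr_counter_incr helper encodes by hand
// using ldaxr/stlxr.
void bump_counter(uint64_t *counter) {
  __atomic_fetch_add(counter, 1, __ATOMIC_RELAXED);
}
```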
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-bolt

Author: YongKang Zhu (yozhu)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/158738.diff

6 Files Affected:
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 90129d475d870..5b711b0e27bab 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -51,6 +51,7 @@ class raw_ostream;
namespace bolt {
class BinaryBasicBlock;
+class BinaryContext;
class BinaryFunction;
/// Different types of indirect branches encountered during disassembly.
@@ -530,10 +531,15 @@ class MCPlusBuilder {
return 0;
}
+ /// Create a helper function to increment counter for Instrumentation
+ virtual void createInstrCounterIncrFunc(BinaryContext &BC) {
+ llvm_unreachable("not implemented");
+ }
+
/// Create increment contents of target by 1 for Instrumentation
- virtual InstructionListType
- createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
- unsigned CodePointerSize) const {
+ virtual InstructionListType createInstrIncMemory(const MCSymbol *Target,
+ MCContext *Ctx, bool IsLeaf,
+ unsigned CodePointerSize) {
llvm_unreachable("not implemented");
return InstructionListType();
}
diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp
index c2f876f0dff9e..150461b020f06 100644
--- a/bolt/lib/Passes/Instrumentation.cpp
+++ b/bolt/lib/Passes/Instrumentation.cpp
@@ -753,6 +753,8 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
createSimpleFunction("__bolt_fini_trampoline",
BC.MIB->createReturnInstructionList(BC.Ctx.get()));
}
+ if (BC.isAArch64())
+ BC.MIB->createInstrCounterIncrFunc(BC);
}
}
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index f972646aa12ea..a6589f8f9ee42 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -26,6 +26,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -35,6 +36,15 @@
using namespace llvm;
using namespace bolt;
+namespace opts {
+extern cl::OptionCategory BoltInstrCategory;
+static cl::opt<bool> NoLSEAtomics(
+ "no-lse-atomics",
+ cl::desc("generate instrumentation code sequence without using LSE atomic "
+ "instruction"),
+ cl::init(false), cl::Optional, cl::cat(BoltInstrCategory));
+} // namespace opts
+
namespace {
static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
@@ -106,7 +116,7 @@ static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) {
}
static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) {
- // NOTE: Supports only ARM with LSE extension
+ assert(!opts::NoLSEAtomics && "Supports only ARM with LSE extension");
Inst.setOpcode(AArch64::LDADDX);
Inst.clear();
Inst.addOperand(MCOperand::createReg(AArch64::XZR));
@@ -135,6 +145,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
public:
using MCPlusBuilder::MCPlusBuilder;
+ BinaryFunction *InstrCounterIncrFunc{nullptr};
+
std::unique_ptr<MCSymbolizer>
createTargetSymbolizer(BinaryFunction &Function,
bool CreateNewSymbols) const override {
@@ -2513,22 +2525,129 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
return Insts;
}
- InstructionListType
- createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
- unsigned CodePointerSize) const override {
+ // Instrumentation code sequence using LSE atomic instruction has a total of
+ // 6 instructions:
+ //
+ // stp x0, x1, [sp, #-0x10]!
+ // adrp x0, page_address(counter)
+ // add x0, x0, page_offset(counter)
+ // mov x1, #0x1
+ // stadd x1, [x0]
+ // ldp x0, x1, [sp], #0x10
+ //
+ // Instrumentation code sequence without using LSE atomic instruction has
+ // 8 instructions at instrumentation place, with 6 instructions in the helper:
+ //
+ // stp x0, x30, [sp, #-0x10]!
+ // stp x1, x2, [sp, #-0x10]!
+ // adrp x0, page_address(counter)
+ // add x0, x0, page_offset(counter)
+ // adrp x1, page_address(helper)
+ // add x1, x1, page_offset(helper)
+ // blr x1
+ // ldp x0, x30, [sp], #0x10
+ //
+ // <helper>:
+ // ldaxr x1, [x0]
+ // add x1, x1, #0x1
+ // stlxr w2, x1, [x0]
+ // cbnz w2, <helper>
+ // ldp x1, x2, [sp], #0x10
+ // ret
+
+ void createInstrCounterIncrFunc(BinaryContext &BC) override {
+ assert(InstrCounterIncrFunc == nullptr &&
+ "helper function of counter increment for instrumentation "
+ "has already been created");
+
+ if (!opts::NoLSEAtomics)
+ return;
+
+ MCContext *Ctx = BC.Ctx.get();
+ InstrCounterIncrFunc = BC.createInjectedBinaryFunction(
+ "__bolt_instr_counter_incr", /*IsSimple*/ false);
+ std::vector<std::unique_ptr<BinaryBasicBlock>> BBs;
+
+ BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock());
+ InstructionListType Instrs(4);
+ Instrs[0].setOpcode(AArch64::LDAXRX);
+ Instrs[0].clear();
+ Instrs[0].addOperand(MCOperand::createReg(AArch64::X1));
+ Instrs[0].addOperand(MCOperand::createReg(AArch64::X0));
+ Instrs[1].setOpcode(AArch64::ADDXri);
+ Instrs[1].clear();
+ Instrs[1].addOperand(MCOperand::createReg(AArch64::X1));
+ Instrs[1].addOperand(MCOperand::createReg(AArch64::X1));
+ Instrs[1].addOperand(MCOperand::createImm(1));
+ Instrs[1].addOperand(MCOperand::createImm(0));
+ Instrs[2].setOpcode(AArch64::STLXRX);
+ Instrs[2].clear();
+ Instrs[2].addOperand(MCOperand::createReg(AArch64::W2));
+ Instrs[2].addOperand(MCOperand::createReg(AArch64::X1));
+ Instrs[2].addOperand(MCOperand::createReg(AArch64::X0));
+ Instrs[3].setOpcode(AArch64::CBNZW);
+ Instrs[3].clear();
+ Instrs[3].addOperand(MCOperand::createReg(AArch64::W2));
+ Instrs[3].addOperand(MCOperand::createExpr(
+ MCSymbolRefExpr::create(BBs.back()->getLabel(), *Ctx)));
+ BBs.back()->addInstructions(Instrs.begin(), Instrs.end());
+ BBs.back()->setCFIState(0);
+
+ BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock());
+ InstructionListType InstrsEpilog(2);
+ createPopRegisters(InstrsEpilog[0], AArch64::X1, AArch64::X2);
+ createReturn(InstrsEpilog[1]);
+ BBs.back()->addInstructions(InstrsEpilog.begin(), InstrsEpilog.end());
+ BBs.back()->setCFIState(0);
+
+ BBs[0]->addSuccessor(BBs[0].get());
+ BBs[0]->addSuccessor(BBs[1].get());
+
+ InstrCounterIncrFunc->insertBasicBlocks(nullptr, std::move(BBs),
+ /*UpdateLayout*/ true,
+ /*UpdateCFIState*/ false);
+ InstrCounterIncrFunc->updateState(BinaryFunction::State::CFG_Finalized);
+
+ LLVM_DEBUG({
+ dbgs() << "BOLT-DEBUG: instrumentation counter increment helper:\n";
+ InstrCounterIncrFunc->dump();
+ });
+ }
+
+ InstructionListType createInstrIncMemory(const MCSymbol *Target,
+ MCContext *Ctx, bool IsLeaf,
+ unsigned CodePointerSize) override {
unsigned int I = 0;
- InstructionListType Instrs(6);
+ InstructionListType Instrs(opts::NoLSEAtomics ? 8 : 6);
+
+ if (opts::NoLSEAtomics) {
+ createPushRegisters(Instrs[I++], AArch64::X0, AArch64::LR);
+ createPushRegisters(Instrs[I++], AArch64::X1, AArch64::X2);
+ } else {
+ createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
+ }
- createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0);
assert(Addr.size() == 2 && "Invalid Addr size");
std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I);
I += Addr.size();
- InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1);
- assert(Insts.size() == 2 && "Invalid Insts size");
- std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
- I += Insts.size();
- createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
+
+ if (opts::NoLSEAtomics) {
+ const MCSymbol *Helper = InstrCounterIncrFunc->getSymbol();
+ InstructionListType HelperAddr =
+ materializeAddress(Helper, Ctx, AArch64::X1);
+ assert(HelperAddr.size() == 2 && "Invalid HelperAddr size");
+ std::copy(HelperAddr.begin(), HelperAddr.end(), Instrs.begin() + I);
+ I += HelperAddr.size();
+ createIndirectCallInst(Instrs[I++], /*IsTailCall*/ false, AArch64::X1);
+ } else {
+ InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1);
+ assert(Insts.size() == 2 && "Invalid Insts size");
+ std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
+ I += Insts.size();
+ }
+ createPopRegisters(Instrs[I++], AArch64::X0,
+ opts::NoLSEAtomics ? AArch64::LR : AArch64::X1);
return Instrs;
}
diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index 10b4913b6ab7f..7c4a8781fd57d 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -626,9 +626,9 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
return Insts;
}
- InstructionListType
- createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
- unsigned CodePointerSize) const override {
+ InstructionListType createInstrIncMemory(const MCSymbol *Target,
+ MCContext *Ctx, bool IsLeaf,
+ unsigned CodePointerSize) override {
// We need 2 scratch registers: one for the target address (x10), and one
// for the increment value (x11).
// addi sp, sp, -16
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 1842509dcc5e0..9026a9df7b5c2 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -3053,9 +3053,9 @@ class X86MCPlusBuilder : public MCPlusBuilder {
Inst.clear();
}
- InstructionListType
- createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
- unsigned CodePointerSize) const override {
+ InstructionListType createInstrIncMemory(const MCSymbol *Target,
+ MCContext *Ctx, bool IsLeaf,
+ unsigned CodePointerSize) override {
InstructionListType Instrs(IsLeaf ? 13 : 11);
unsigned int I = 0;
diff --git a/bolt/test/AArch64/instrumentation_sequence.s b/bolt/test/AArch64/instrumentation_sequence.s
new file mode 100644
index 0000000000000..371851fe9a8e3
--- /dev/null
+++ b/bolt/test/AArch64/instrumentation_sequence.s
@@ -0,0 +1,50 @@
+# This test is to validate instrumentation code sequence generated with
+# and without `--no-lse-atomics`.
+
+# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+# RUN: %clang %cflags -pie %s -o %t.so -Wl,-q -Wl,--init=_foo -Wl,--fini=_foo
+
+ .text
+ .global _foo
+ .type _foo, %function
+_foo:
+ ret
+
+ .global _start
+ .type _start, %function
+_start:
+ ret
+
+ # Dummy relocation to force relocation mode
+ .reloc 0, R_AARCH64_NONE
+
+# RUN: llvm-bolt %t.so -o %t.instr.so --instrument
+# RUN: llvm-objdump -d %t.instr.so | FileCheck %s --check-prefix=INLINE
+# INLINE: {{.*}} <_foo>:
+# INLINE-NEXT: {{.*}} stp x0, x1, [sp, #-0x10]!
+# INLINE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}}
+# INLINE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}}
+# INLINE-NEXT: {{.*}} mov x1, #0x1
+# INLINE-NEXT: {{.*}} stadd x1, [x0]
+# INLINE-NEXT: {{.*}} ldp x0, x1, [sp], #0x10
+
+# RUN: llvm-bolt %t.so -o %t.instr.no_lse.so --instrument \
+# RUN: --no-lse-atomics
+# RUN: llvm-objdump -d %t.instr.no_lse.so | FileCheck %s --check-prefix=NOLSE
+# NOLSE: {{.*}} <_foo>:
+# NOLSE-NEXT: {{.*}} stp x0, x30, [sp, #-0x10]!
+# NOLSE-NEXT: {{.*}} stp x1, x2, [sp, #-0x10]!
+# NOLSE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}}
+# NOLSE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}}
+# NOLSE-NEXT: {{.*}} adrp x1, 0x[[PAGEBASE:[0-9a-f]*]]000 {{.*}}
+# NOLSE-NEXT: {{.*}} add x1, x1, #0x[[PAGEOFF:[0-9a-f]*]]
+# NOLSE-NEXT: {{.*}} blr x1
+# NOLSE-NEXT: {{.*}} ldp x0, x30, [sp], #0x10
+# NOLSE: {{[0]*}}[[PAGEBASE]][[PAGEOFF]] <__bolt_instr_counter_incr>:
+# NOLSE-NEXT: {{.*}} ldaxr x1, [x0]
+# NOLSE-NEXT: {{.*}} add x1, x1, #0x1
+# NOLSE-NEXT: {{.*}} stlxr w2, x1, [x0]
+# NOLSE-NEXT: {{.*}} cbnz w2, 0x{{[0-9[a-f]*}} <__bolt_instr_counter_incr>
+# NOLSE-NEXT: {{.*}} ldp x1, x2, [sp], #0x10
+# NOLSE-NEXT: {{.*}} ret
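For anyone reproducing this outside of lit, the test above boils down to two invocations along these lines (a sketch; the input must be an AArch64 binary linked with `-Wl,-q` so BOLT can operate in relocation mode):

```sh
llvm-bolt app.so -o app.instr.so --instrument
llvm-bolt app.so -o app.instr.no_lse.so --instrument --no-lse-atomics
```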
Link BOLTUtils against the AArch64 target to support the new option that enables instrumentation without LSE (see #158738). This fixes shared library builds, e.g.: https://lab.llvm.org/staging/#/builders/220/builds/1537 (note: the link points to a collapsing builder).