From 4a0d3e8632ee2e77cb64c0fffbbde2e6bcd15c54 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 27 Aug 2025 14:19:30 +0300 Subject: [PATCH 01/12] [BOLT] Refactor MCInstReference and move it to Core (NFC) (#138655) Refactor MCInstReference class and move it from PAuth gadget scanner to Core. MCInstReference is a class representing a constant reference to an instruction inside a parent entity - either inside a basic block (which has a reference to its parent function) or directly inside a function (when CFG information is not available). --- bolt/include/bolt/Core/MCInstUtils.h | 155 +++++++++++++++ bolt/include/bolt/Passes/PAuthGadgetScanner.h | 176 +----------------- bolt/lib/Core/CMakeLists.txt | 1 + bolt/lib/Core/MCInstUtils.cpp | 56 ++++++ bolt/lib/Passes/PAuthGadgetScanner.cpp | 102 +++++----- 5 files changed, 256 insertions(+), 234 deletions(-) create mode 100644 bolt/include/bolt/Core/MCInstUtils.h create mode 100644 bolt/lib/Core/MCInstUtils.cpp diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h new file mode 100644 index 0000000000000..2e93dfaf4c275 --- /dev/null +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -0,0 +1,155 @@ +//===- bolt/Core/MCInstUtils.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_CORE_MCINSTUTILS_H +#define BOLT_CORE_MCINSTUTILS_H + +#include "bolt/Core/BinaryBasicBlock.h" +#include +#include + +namespace llvm { +namespace bolt { + +class BinaryFunction; + +/// MCInstReference represents a reference to a constant MCInst as stored either +/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock +/// (after a CFG is created). +class MCInstReference { + using nocfg_const_iterator = std::map::const_iterator; + + // Two cases are possible: + // * functions with CFG reconstructed - a function stores a collection of + // basic blocks, each basic block stores a contiguous vector of MCInst + // * functions without CFG - there are no basic blocks created, + // the instructions are directly stored in std::map in BinaryFunction + // + // In both cases, the direct parent of MCInst is stored together with an + // iterator pointing to the instruction. + + // Helper struct: CFG is available, the direct parent is a basic block, + // iterator's type is `MCInst *`. + struct RefInBB { + RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst) + : BB(BB), It(Inst) {} + RefInBB(const RefInBB &Other) = default; + RefInBB &operator=(const RefInBB &Other) = default; + + const BinaryBasicBlock *BB; + BinaryBasicBlock::const_iterator It; + + bool operator==(const RefInBB &Other) const { + return BB == Other.BB && It == Other.It; + } + }; + + // Helper struct: CFG is *not* available, the direct parent is a function, + // iterator's type is std::map::iterator (the mapped value + // is an instruction's offset). + struct RefInBF { + RefInBF(const BinaryFunction *BF, nocfg_const_iterator It) + : BF(BF), It(It) {} + RefInBF(const RefInBF &Other) = default; + RefInBF &operator=(const RefInBF &Other) = default; + + const BinaryFunction *BF; + nocfg_const_iterator It; + + bool operator==(const RefInBF &Other) const { + return BF == Other.BF && It->first == Other.It->first; + } + }; + + std::variant Reference; + + // Utility methods to be used like this: + // + // if (auto *Ref = tryGetRefInBB()) + // return Ref->doSomething(...); + // return getRefInBF().doSomethingElse(...); + const RefInBB *tryGetRefInBB() const { + assert(std::get_if(&Reference) || + std::get_if(&Reference)); + return std::get_if(&Reference); + } + const RefInBF &getRefInBF() const { + assert(std::get_if(&Reference)); + return *std::get_if(&Reference); + } + +public: + /// Constructs an empty reference. + MCInstReference() : Reference(RefInBB(nullptr, nullptr)) {} + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock *BB, const MCInst *Inst) + : Reference(RefInBB(BB, Inst)) { + assert(BB && Inst && "Neither BB nor Inst should be nullptr"); + } + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock *BB, unsigned Index) + : Reference(RefInBB(BB, &BB->getInstructionAtIndex(Index))) { + assert(BB && "Basic block should not be nullptr"); + } + /// Constructs a reference to the instruction inside the function without + /// CFG information. + MCInstReference(const BinaryFunction *BF, nocfg_const_iterator It) + : Reference(RefInBF(BF, It)) { + assert(BF && "Function should not be nullptr"); + } + + /// Locates an instruction inside a function and returns a reference. + static MCInstReference get(const MCInst *Inst, const BinaryFunction &BF); + + bool operator==(const MCInstReference &Other) const { + return Reference == Other.Reference; + } + + const MCInst &getMCInst() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) + return *Ref->It; + return getRefInBF().It->second; + } + + operator const MCInst &() const { return getMCInst(); } + + bool empty() const { + if (auto *Ref = tryGetRefInBB()) + return Ref->BB == nullptr; + return getRefInBF().BF == nullptr; + } + + bool hasCFG() const { return !empty() && tryGetRefInBB() != nullptr; } + + const BinaryFunction *getFunction() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) + return Ref->BB->getFunction(); + return getRefInBF().BF; + } + + const BinaryBasicBlock *getBasicBlock() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) + return Ref->BB; + return nullptr; + } + + raw_ostream &print(raw_ostream &OS) const; +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, + const MCInstReference &Ref) { + return Ref.print(OS); +} + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h index 721fd664a3253..cb865a725d72a 100644 --- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h +++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h @@ -11,187 +11,13 @@ #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryFunction.h" +#include "bolt/Core/MCInstUtils.h" #include "bolt/Passes/BinaryPasses.h" #include "llvm/Support/raw_ostream.h" #include namespace llvm { namespace bolt { - -/// @brief MCInstReference represents a reference to an MCInst as stored either -/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock -/// (after a CFG is created). It aims to store the necessary information to be -/// able to find the specific MCInst in either the BinaryFunction or -/// BinaryBasicBlock data structures later, so that e.g. the InputAddress of -/// the corresponding instruction can be computed. - -struct MCInstInBBReference { - BinaryBasicBlock *BB; - int64_t BBIndex; - MCInstInBBReference(BinaryBasicBlock *BB, int64_t BBIndex) - : BB(BB), BBIndex(BBIndex) {} - MCInstInBBReference() : BB(nullptr), BBIndex(0) {} - static MCInstInBBReference get(const MCInst *Inst, BinaryFunction &BF) { - for (BinaryBasicBlock &BB : BF) - for (size_t I = 0; I < BB.size(); ++I) - if (Inst == &BB.getInstructionAtIndex(I)) - return MCInstInBBReference(&BB, I); - return {}; - } - bool operator==(const MCInstInBBReference &RHS) const { - return BB == RHS.BB && BBIndex == RHS.BBIndex; - } - bool operator<(const MCInstInBBReference &RHS) const { - return std::tie(BB, BBIndex) < std::tie(RHS.BB, RHS.BBIndex); - } - operator MCInst &() const { - assert(BB != nullptr); - return BB->getInstructionAtIndex(BBIndex); - } - uint64_t getAddress() const { - // 4 bytes per instruction on AArch64. - // FIXME: the assumption of 4 byte per instruction needs to be fixed before - // this method gets used on any non-AArch64 binaries (but should be fine for - // pac-ret analysis, as that is an AArch64-specific feature). - return BB->getFunction()->getAddress() + BB->getOffset() + BBIndex * 4; - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &); - -struct MCInstInBFReference { - BinaryFunction *BF; - uint64_t Offset; - MCInstInBFReference(BinaryFunction *BF, uint64_t Offset) - : BF(BF), Offset(Offset) {} - - static MCInstInBFReference get(const MCInst *Inst, BinaryFunction &BF) { - for (auto &I : BF.instrs()) - if (Inst == &I.second) - return MCInstInBFReference(&BF, I.first); - return {}; - } - - MCInstInBFReference() : BF(nullptr), Offset(0) {} - bool operator==(const MCInstInBFReference &RHS) const { - return BF == RHS.BF && Offset == RHS.Offset; - } - bool operator<(const MCInstInBFReference &RHS) const { - return std::tie(BF, Offset) < std::tie(RHS.BF, RHS.Offset); - } - operator MCInst &() const { - assert(BF != nullptr); - return *BF->getInstructionAtOffset(Offset); - } - - uint64_t getOffset() const { return Offset; } - - uint64_t getAddress() const { return BF->getAddress() + getOffset(); } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &); - -struct MCInstReference { - enum Kind { FunctionParent, BasicBlockParent }; - Kind ParentKind; - union U { - MCInstInBBReference BBRef; - MCInstInBFReference BFRef; - U(MCInstInBBReference BBRef) : BBRef(BBRef) {} - U(MCInstInBFReference BFRef) : BFRef(BFRef) {} - } U; - MCInstReference(MCInstInBBReference BBRef) - : ParentKind(BasicBlockParent), U(BBRef) {} - MCInstReference(MCInstInBFReference BFRef) - : ParentKind(FunctionParent), U(BFRef) {} - MCInstReference(BinaryBasicBlock *BB, int64_t BBIndex) - : MCInstReference(MCInstInBBReference(BB, BBIndex)) {} - MCInstReference(BinaryFunction *BF, uint32_t Offset) - : MCInstReference(MCInstInBFReference(BF, Offset)) {} - - static MCInstReference get(const MCInst *Inst, BinaryFunction &BF) { - if (BF.hasCFG()) - return MCInstInBBReference::get(Inst, BF); - return MCInstInBFReference::get(Inst, BF); - } - - bool operator<(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return ParentKind < RHS.ParentKind; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef < RHS.U.BBRef; - case FunctionParent: - return U.BFRef < RHS.U.BFRef; - } - llvm_unreachable(""); - } - - bool operator==(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return false; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef == RHS.U.BBRef; - case FunctionParent: - return U.BFRef == RHS.U.BFRef; - } - llvm_unreachable(""); - } - - operator MCInst &() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef; - case FunctionParent: - return U.BFRef; - } - llvm_unreachable(""); - } - - operator bool() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.BB != nullptr; - case FunctionParent: - return U.BFRef.BF != nullptr; - } - llvm_unreachable(""); - } - - uint64_t getAddress() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.getAddress(); - case FunctionParent: - return U.BFRef.getAddress(); - } - llvm_unreachable(""); - } - - BinaryFunction *getFunction() const { - switch (ParentKind) { - case FunctionParent: - return U.BFRef.BF; - case BasicBlockParent: - return U.BBRef.BB->getFunction(); - } - llvm_unreachable(""); - } - - BinaryBasicBlock *getBasicBlock() const { - switch (ParentKind) { - case FunctionParent: - return nullptr; - case BasicBlockParent: - return U.BBRef.BB; - } - llvm_unreachable(""); - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &); - namespace PAuthGadgetScanner { // The report classes are designed to be used in an immutable manner. diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index fc72dc023c590..58cfcab370f16 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_library(LLVMBOLTCore GDBIndex.cpp HashUtilities.cpp JumpTable.cpp + MCInstUtils.cpp MCPlusBuilder.cpp ParallelUtilities.cpp Relocation.cpp diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp new file mode 100644 index 0000000000000..3cdb9673d4dc0 --- /dev/null +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -0,0 +1,56 @@ +//===- bolt/Passes/MCInstUtils.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Core/MCInstUtils.h" +#include "bolt/Core/BinaryBasicBlock.h" +#include "bolt/Core/BinaryFunction.h" + +#include + +using namespace llvm; +using namespace llvm::bolt; + +MCInstReference MCInstReference::get(const MCInst *Inst, + const BinaryFunction &BF) { + if (BF.hasCFG()) { + for (BinaryBasicBlock &BB : BF) + for (MCInst &MI : BB) + if (&MI == Inst) + return MCInstReference(&BB, Inst); + return {}; + } + + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) { + if (&I->second == Inst) + return MCInstReference(&BF, I); + } + return {}; +} + +raw_ostream &MCInstReference::print(raw_ostream &OS) const { + if (const RefInBB *Ref = tryGetRefInBB()) { + OS << "MCInstBBRef<"; + if (Ref->BB == nullptr) { + OS << "BB:(null)"; + } else { + unsigned IndexInBB = std::distance(Ref->BB->begin(), Ref->It); + OS << "BB:" << Ref->BB->getName() << ":" << IndexInBB; + } + OS << ">"; + return OS; + } + + const RefInBF &Ref = getRefInBF(); + OS << "MCInstBFRef<"; + if (Ref.BF == nullptr) + OS << "BF:(null)"; + else + OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.It->first; + OS << ">"; + return OS; +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 65c84ebc8c4f4..5d884e2d37354 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -24,39 +24,6 @@ namespace llvm { namespace bolt { - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &Ref) { - OS << "MCInstBBRef<"; - if (Ref.BB == nullptr) - OS << "BB:(null)"; - else - OS << "BB:" << Ref.BB->getName() << ":" << Ref.BBIndex; - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &Ref) { - OS << "MCInstBFRef<"; - if (Ref.BF == nullptr) - OS << "BF:(null)"; - else - OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.getOffset(); - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &Ref) { - switch (Ref.ParentKind) { - case MCInstReference::BasicBlockParent: - OS << Ref.U.BBRef; - return OS; - case MCInstReference::FunctionParent: - OS << Ref.U.BFRef; - return OS; - } - llvm_unreachable(""); -} - namespace PAuthGadgetScanner { [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label, @@ -91,10 +58,10 @@ template static void iterateOverInstrs(BinaryFunction &BF, T Fn) { if (BF.hasCFG()) { for (BinaryBasicBlock &BB : BF) for (int64_t I = 0, E = BB.size(); I < E; ++I) - Fn(MCInstInBBReference(&BB, I)); + Fn(MCInstReference(&BB, I)); } else { - for (auto I : BF.instrs()) - Fn(MCInstInBFReference(&BF, I.first)); + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) + Fn(MCInstReference(&BF, I)); } } @@ -566,7 +533,7 @@ class SrcSafetyAnalysis { std::vector Result; for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) { MCInstReference Ref = MCInstReference::get(Inst, BF); - assert(Ref && "Expected Inst to be found"); + assert(!Ref.empty() && "Expected Inst to be found"); Result.push_back(Ref); } return Result; @@ -1138,7 +1105,7 @@ class DstSafetyAnalysis { std::vector Result; for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) { MCInstReference Ref = MCInstReference::get(Inst, BF); - assert(Ref && "Expected Inst to be found"); + assert(!Ref.empty() && "Expected Inst to be found"); Result.push_back(Ref); } return Result; @@ -1345,8 +1312,7 @@ static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, // (such as isBranch at the time of writing this comment), some don't (such // as isCall). For that reason, call MCInstrDesc's methods explicitly when // it is important. - const MCInstrDesc &Desc = - BC.MII->get(static_cast(Inst).getOpcode()); + const MCInstrDesc &Desc = BC.MII->get(Inst.getMCInst().getOpcode()); // Tail call should be a branch (but not necessarily an indirect one). if (!Desc.isBranch()) return false; @@ -1541,7 +1507,7 @@ void FunctionAnalysisContext::findUnsafeUses( // This is printed as "[message] in function [name], basic block ..., // at address ..." when the issue is reported to the user. Reports.push_back(make_generic_report( - MCInstReference::get(FirstInst, BF), + MCInstReference(&BB, FirstInst), "Warning: possibly imprecise CFG, the analysis quality may be " "degraded in this function. According to BOLT, unreachable code is " "found" /* in function [name]... */)); @@ -1705,31 +1671,49 @@ void Analysis::runOnFunction(BinaryFunction &BF, } } +// Compute the instruction address for printing (may be slow). +static uint64_t getAddress(const MCInstReference &Inst) { + const BinaryFunction *BF = Inst.getFunction(); + + if (Inst.hasCFG()) { + const BinaryBasicBlock *BB = Inst.getBasicBlock(); + + auto It = static_cast(&Inst.getMCInst()); + unsigned IndexInBB = std::distance(BB->begin(), It); + + // FIXME: this assumes all instructions are 4 bytes in size. This is true + // for AArch64, but it might be good to extract this function so it can be + // used elsewhere and for other targets too. + return BF->getAddress() + BB->getOffset() + IndexInBB * 4; + } + + for (auto I = BF->instrs().begin(), E = BF->instrs().end(); I != E; ++I) { + if (&I->second == &Inst.getMCInst()) + return BF->getAddress() + I->first; + } + llvm_unreachable("Instruction not found in function"); +} + static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, size_t StartIndex = 0, size_t EndIndex = -1) { if (EndIndex == (size_t)-1) EndIndex = BB->size() - 1; const BinaryFunction *BF = BB->getFunction(); for (unsigned I = StartIndex; I <= EndIndex; ++I) { - // FIXME: this assumes all instructions are 4 bytes in size. This is true - // for AArch64, but it might be good to extract this function so it can be - // used elsewhere and for other targets too. - uint64_t Address = BB->getOffset() + BF->getAddress() + 4 * I; - const MCInst &Inst = BB->getInstructionAtIndex(I); + MCInstReference Inst(BB, I); if (BC.MIB->isCFI(Inst)) continue; - BC.printInstruction(outs(), Inst, Address, BF); + BC.printInstruction(outs(), Inst, getAddress(Inst), BF); } } static void reportFoundGadgetInSingleBBSingleRelatedInst( raw_ostream &OS, const BinaryContext &BC, const MCInstReference RelatedInst, const MCInstReference Location) { - BinaryBasicBlock *BB = Location.getBasicBlock(); - assert(RelatedInst.ParentKind == MCInstReference::BasicBlockParent); - assert(Location.ParentKind == MCInstReference::BasicBlockParent); - MCInstInBBReference RelatedInstBB = RelatedInst.U.BBRef; - if (BB == RelatedInstBB.BB) { + const BinaryBasicBlock *BB = Location.getBasicBlock(); + assert(RelatedInst.hasCFG()); + assert(Location.hasCFG()); + if (BB == RelatedInst.getBasicBlock()) { OS << " This happens in the following basic block:\n"; printBB(BC, BB); } @@ -1737,16 +1721,16 @@ static void reportFoundGadgetInSingleBBSingleRelatedInst( void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC, StringRef IssueKind) const { - BinaryFunction *BF = Location.getFunction(); - BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryFunction *BF = Location.getFunction(); OS << "\nGS-PAUTH: " << IssueKind; OS << " in function " << BF->getPrintName(); if (BB) OS << ", basic block " << BB->getName(); - OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n"; + OS << ", at address " << llvm::format("%x", getAddress(Location)) << "\n"; OS << " The instruction is "; - BC.printInstruction(OS, Location, Location.getAddress(), BF); + BC.printInstruction(OS, Location, getAddress(Location), BF); } void GadgetDiagnostic::generateReport(raw_ostream &OS, @@ -1762,19 +1746,19 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location, // Sort by address to ensure output is deterministic. SmallVector RI(RelatedInstrs); llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) { - return A.getAddress() < B.getAddress(); + return getAddress(A) < getAddress(B); }); for (unsigned I = 0; I < RI.size(); ++I) { MCInstReference InstRef = RI[I]; OS << " " << (I + 1) << ". "; - BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF); + BC.printInstruction(OS, InstRef, getAddress(InstRef), &BF); }; if (RelatedInstrs.size() == 1) { const MCInstReference RelatedInst = RelatedInstrs[0]; // Printing the details for the MCInstReference::FunctionParent case // is not implemented not to overcomplicate the code, as most functions // are expected to have CFG information. - if (RelatedInst.ParentKind == MCInstReference::BasicBlockParent) + if (RelatedInst.hasCFG()) reportFoundGadgetInSingleBBSingleRelatedInst(OS, BC, RelatedInst, Location); } From eb32eb8a1c3e6163d4efa606ce8bdd96dc73dd5a Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Thu, 28 Aug 2025 15:46:05 +0300 Subject: [PATCH 02/12] Move public members to the top of MCInstReference, make nocfg_const_iterator public --- bolt/include/bolt/Core/MCInstUtils.h | 121 ++++++++++++++------------- 1 file changed, 61 insertions(+), 60 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 2e93dfaf4c275..82660573afaf3 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -22,68 +22,9 @@ class BinaryFunction; /// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock /// (after a CFG is created). class MCInstReference { +public: using nocfg_const_iterator = std::map::const_iterator; - // Two cases are possible: - // * functions with CFG reconstructed - a function stores a collection of - // basic blocks, each basic block stores a contiguous vector of MCInst - // * functions without CFG - there are no basic blocks created, - // the instructions are directly stored in std::map in BinaryFunction - // - // In both cases, the direct parent of MCInst is stored together with an - // iterator pointing to the instruction. - - // Helper struct: CFG is available, the direct parent is a basic block, - // iterator's type is `MCInst *`. - struct RefInBB { - RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst) - : BB(BB), It(Inst) {} - RefInBB(const RefInBB &Other) = default; - RefInBB &operator=(const RefInBB &Other) = default; - - const BinaryBasicBlock *BB; - BinaryBasicBlock::const_iterator It; - - bool operator==(const RefInBB &Other) const { - return BB == Other.BB && It == Other.It; - } - }; - - // Helper struct: CFG is *not* available, the direct parent is a function, - // iterator's type is std::map::iterator (the mapped value - // is an instruction's offset). - struct RefInBF { - RefInBF(const BinaryFunction *BF, nocfg_const_iterator It) - : BF(BF), It(It) {} - RefInBF(const RefInBF &Other) = default; - RefInBF &operator=(const RefInBF &Other) = default; - - const BinaryFunction *BF; - nocfg_const_iterator It; - - bool operator==(const RefInBF &Other) const { - return BF == Other.BF && It->first == Other.It->first; - } - }; - - std::variant Reference; - - // Utility methods to be used like this: - // - // if (auto *Ref = tryGetRefInBB()) - // return Ref->doSomething(...); - // return getRefInBF().doSomethingElse(...); - const RefInBB *tryGetRefInBB() const { - assert(std::get_if(&Reference) || - std::get_if(&Reference)); - return std::get_if(&Reference); - } - const RefInBF &getRefInBF() const { - assert(std::get_if(&Reference)); - return *std::get_if(&Reference); - } - -public: /// Constructs an empty reference. MCInstReference() : Reference(RefInBB(nullptr, nullptr)) {} /// Constructs a reference to the instruction inside the basic block. @@ -142,6 +83,66 @@ class MCInstReference { } raw_ostream &print(raw_ostream &OS) const; + +private: + // Two cases are possible: + // * functions with CFG reconstructed - a function stores a collection of + // basic blocks, each basic block stores a contiguous vector of MCInst + // * functions without CFG - there are no basic blocks created, + // the instructions are directly stored in std::map in BinaryFunction + // + // In both cases, the direct parent of MCInst is stored together with an + // iterator pointing to the instruction. + + // Helper struct: CFG is available, the direct parent is a basic block, + // iterator's type is `MCInst *`. + struct RefInBB { + RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst) + : BB(BB), It(Inst) {} + RefInBB(const RefInBB &Other) = default; + RefInBB &operator=(const RefInBB &Other) = default; + + const BinaryBasicBlock *BB; + BinaryBasicBlock::const_iterator It; + + bool operator==(const RefInBB &Other) const { + return BB == Other.BB && It == Other.It; + } + }; + + // Helper struct: CFG is *not* available, the direct parent is a function, + // iterator's type is std::map::iterator (the mapped value + // is an instruction's offset). + struct RefInBF { + RefInBF(const BinaryFunction *BF, nocfg_const_iterator It) + : BF(BF), It(It) {} + RefInBF(const RefInBF &Other) = default; + RefInBF &operator=(const RefInBF &Other) = default; + + const BinaryFunction *BF; + nocfg_const_iterator It; + + bool operator==(const RefInBF &Other) const { + return BF == Other.BF && It->first == Other.It->first; + } + }; + + std::variant Reference; + + // Utility methods to be used like this: + // + // if (auto *Ref = tryGetRefInBB()) + // return Ref->doSomething(...); + // return getRefInBF().doSomethingElse(...); + const RefInBB *tryGetRefInBB() const { + assert(std::get_if(&Reference) || + std::get_if(&Reference)); + return std::get_if(&Reference); + } + const RefInBF &getRefInBF() const { + assert(std::get_if(&Reference)); + return *std::get_if(&Reference); + } }; static inline raw_ostream &operator<<(raw_ostream &OS, From 3d8ac8c8cd8d5015b4f5b4a1dff6c7455e1401fd Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Thu, 28 Aug 2025 16:19:56 +0300 Subject: [PATCH 03/12] Fix iterating over the instructions of BinaryBasicBlock --- bolt/include/bolt/Core/MCInstUtils.h | 15 ++++++-- bolt/lib/Core/MCInstUtils.cpp | 52 +++++++++++++++++++++++++- bolt/lib/Passes/PAuthGadgetScanner.cpp | 33 +++------------- 3 files changed, 66 insertions(+), 34 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 82660573afaf3..a2797c6b31aa2 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -13,6 +13,10 @@ #include #include +namespace llvm { +class MCCodeEmitter; +} + namespace llvm { namespace bolt { @@ -54,7 +58,7 @@ class MCInstReference { const MCInst &getMCInst() const { assert(!empty() && "Empty reference"); if (auto *Ref = tryGetRefInBB()) - return *Ref->It; + return *Ref->Inst; return getRefInBF().It->second; } @@ -82,6 +86,9 @@ class MCInstReference { return nullptr; } + // MCCodeEmitter is not thread safe. + uint64_t getAddress(const MCCodeEmitter *Emitter = nullptr) const; + raw_ostream &print(raw_ostream &OS) const; private: @@ -98,15 +105,15 @@ class MCInstReference { // iterator's type is `MCInst *`. struct RefInBB { RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst) - : BB(BB), It(Inst) {} + : BB(BB), Inst(Inst) {} RefInBB(const RefInBB &Other) = default; RefInBB &operator=(const RefInBB &Other) = default; const BinaryBasicBlock *BB; - BinaryBasicBlock::const_iterator It; + const MCInst *Inst; bool operator==(const RefInBB &Other) const { - return BB == Other.BB && It == Other.It; + return BB == Other.BB && Inst == Other.Inst; } }; diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 3cdb9673d4dc0..f721c259e9b28 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -9,12 +9,33 @@ #include "bolt/Core/MCInstUtils.h" #include "bolt/Core/BinaryBasicBlock.h" #include "bolt/Core/BinaryFunction.h" +#include "llvm/ADT/iterator.h" -#include +#include using namespace llvm; using namespace llvm::bolt; +// It is assumed in a few places that BinaryBasicBlock stores its instructions +// in a contiguous vector. Give this assumption a name to simplify marking the +// particular places with static_assert. +using BasicBlockStorageIsVector = + std::is_same::const_iterator>; + +namespace { +// Cannot reuse MCPlusBuilder::InstructionIterator because it has to be +// constructed from a non-const std::map iterator. +class mapped_mcinst_iterator + : public iterator_adaptor_base { +public: + mapped_mcinst_iterator(MCInstReference::nocfg_const_iterator It) + : iterator_adaptor_base(It) {} + const MCInst &operator*() const { return this->I->second; } +}; +} // anonymous namespace + MCInstReference MCInstReference::get(const MCInst *Inst, const BinaryFunction &BF) { if (BF.hasCFG()) { @@ -32,13 +53,40 @@ MCInstReference MCInstReference::get(const MCInst *Inst, return {}; } +uint64_t MCInstReference::getAddress(const MCCodeEmitter *Emitter) const { + assert(!empty() && "Taking instruction address by empty reference"); + + const BinaryContext &BC = getFunction()->getBinaryContext(); + if (auto *Ref = tryGetRefInBB()) { + static_assert(BasicBlockStorageIsVector::value, + "Cannot use 'const MCInst *' as iterator type"); + uint64_t AddressOfBB = getFunction()->getAddress() + Ref->BB->getOffset(); + const MCInst *FirstInstInBB = &*Ref->BB->begin(); + + uint64_t OffsetInBB = BC.computeCodeSize(FirstInstInBB, Ref->Inst, Emitter); + + return AddressOfBB + OffsetInBB; + } + + auto &Ref = getRefInBF(); + mapped_mcinst_iterator FirstInstInBF(Ref.BF->instrs().begin()); + mapped_mcinst_iterator ThisInst(Ref.It); + + uint64_t OffsetInBF = BC.computeCodeSize(FirstInstInBF, ThisInst, Emitter); + + return getFunction()->getAddress() + OffsetInBF; +} + raw_ostream &MCInstReference::print(raw_ostream &OS) const { if (const RefInBB *Ref = tryGetRefInBB()) { OS << "MCInstBBRef<"; if (Ref->BB == nullptr) { OS << "BB:(null)"; } else { - unsigned IndexInBB = std::distance(Ref->BB->begin(), Ref->It); + static_assert(BasicBlockStorageIsVector::value, + "Cannot use pointer arithmetic on 'const MCInst *'"); + const MCInst *FirstInstInBB = &*Ref->BB->begin(); + unsigned IndexInBB = Ref->Inst - FirstInstInBB; OS << "BB:" << Ref->BB->getName() << ":" << IndexInBB; } OS << ">"; diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 5d884e2d37354..287cbe31df3af 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1671,29 +1671,6 @@ void Analysis::runOnFunction(BinaryFunction &BF, } } -// Compute the instruction address for printing (may be slow). -static uint64_t getAddress(const MCInstReference &Inst) { - const BinaryFunction *BF = Inst.getFunction(); - - if (Inst.hasCFG()) { - const BinaryBasicBlock *BB = Inst.getBasicBlock(); - - auto It = static_cast(&Inst.getMCInst()); - unsigned IndexInBB = std::distance(BB->begin(), It); - - // FIXME: this assumes all instructions are 4 bytes in size. This is true - // for AArch64, but it might be good to extract this function so it can be - // used elsewhere and for other targets too. - return BF->getAddress() + BB->getOffset() + IndexInBB * 4; - } - - for (auto I = BF->instrs().begin(), E = BF->instrs().end(); I != E; ++I) { - if (&I->second == &Inst.getMCInst()) - return BF->getAddress() + I->first; - } - llvm_unreachable("Instruction not found in function"); -} - static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, size_t StartIndex = 0, size_t EndIndex = -1) { if (EndIndex == (size_t)-1) @@ -1703,7 +1680,7 @@ static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, MCInstReference Inst(BB, I); if (BC.MIB->isCFI(Inst)) continue; - BC.printInstruction(outs(), Inst, getAddress(Inst), BF); + BC.printInstruction(outs(), Inst, Inst.getAddress(), BF); } } @@ -1728,9 +1705,9 @@ void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC, OS << " in function " << BF->getPrintName(); if (BB) OS << ", basic block " << BB->getName(); - OS << ", at address " << llvm::format("%x", getAddress(Location)) << "\n"; + OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n"; OS << " The instruction is "; - BC.printInstruction(OS, Location, getAddress(Location), BF); + BC.printInstruction(OS, Location, Location.getAddress(), BF); } void GadgetDiagnostic::generateReport(raw_ostream &OS, @@ -1746,12 +1723,12 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location, // Sort by address to ensure output is deterministic. SmallVector RI(RelatedInstrs); llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) { - return getAddress(A) < getAddress(B); + return A.getAddress() < B.getAddress(); }); for (unsigned I = 0; I < RI.size(); ++I) { MCInstReference InstRef = RI[I]; OS << " " << (I + 1) << ". "; - BC.printInstruction(OS, InstRef, getAddress(InstRef), &BF); + BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF); }; if (RelatedInstrs.size() == 1) { const MCInstReference RelatedInst = RelatedInstrs[0]; From 2a25f3ac711f318195f92335333da3a2ac183421 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 17 Sep 2025 22:29:14 +0300 Subject: [PATCH 04/12] Use Index inside the BB instead of direct MCInst* in RefInBB --- bolt/include/bolt/Core/MCInstUtils.h | 40 ++++++++++++++++++---------- bolt/lib/Core/MCInstUtils.cpp | 22 +++++++-------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index a2797c6b31aa2..bd3b3016f706f 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -25,20 +25,21 @@ class BinaryFunction; /// MCInstReference represents a reference to a constant MCInst as stored either /// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock /// (after a CFG is created). +/// +/// The reference may be invalidated when the function containing the referenced +/// instruction is modified. class MCInstReference { public: using nocfg_const_iterator = std::map::const_iterator; /// Constructs an empty reference. - MCInstReference() : Reference(RefInBB(nullptr, nullptr)) {} + MCInstReference() : Reference(RefInBB(nullptr, /*Index=*/0)) {} /// Constructs a reference to the instruction inside the basic block. MCInstReference(const BinaryBasicBlock *BB, const MCInst *Inst) - : Reference(RefInBB(BB, Inst)) { - assert(BB && Inst && "Neither BB nor Inst should be nullptr"); - } + : Reference(RefInBB(BB, getInstIndexInBB(BB, Inst))) {} /// Constructs a reference to the instruction inside the basic block. MCInstReference(const BinaryBasicBlock *BB, unsigned Index) - : Reference(RefInBB(BB, &BB->getInstructionAtIndex(Index))) { + : Reference(RefInBB(BB, Index)) { assert(BB && "Basic block should not be nullptr"); } /// Constructs a reference to the instruction inside the function without @@ -57,8 +58,11 @@ class MCInstReference { const MCInst &getMCInst() const { assert(!empty() && "Empty reference"); - if (auto *Ref = tryGetRefInBB()) - return *Ref->Inst; + if (auto *Ref = tryGetRefInBB()) { + [[maybe_unused]] unsigned NumInstructions = Ref->BB->size(); + assert(Ref->Index < NumInstructions && "Invalid reference"); + return Ref->BB->getInstructionAtIndex(Ref->Index); + } return getRefInBF().It->second; } @@ -92,6 +96,15 @@ class MCInstReference { raw_ostream &print(raw_ostream &OS) const; private: + static unsigned getInstIndexInBB(const BinaryBasicBlock *BB, + const MCInst *Inst) { + assert(BB && Inst && "Neither BB nor Inst should be nullptr"); + // Usage of pointer arithmetic assumes the instructions are stored in a + // vector, see BasicBlockStorageIsVector in MCInstUtils.cpp. + const MCInst *FirstInstInBB = &*BB->begin(); + return Inst - FirstInstInBB; + } + // Two cases are possible: // * functions with CFG reconstructed - a function stores a collection of // basic blocks, each basic block stores a contiguous vector of MCInst @@ -99,21 +112,20 @@ class MCInstReference { // the instructions are directly stored in std::map in BinaryFunction // // In both cases, the direct parent of MCInst is stored together with an - // iterator pointing to the instruction. + // index or iterator pointing to the instruction. - // Helper struct: CFG is available, the direct parent is a basic block, - // iterator's type is `MCInst *`. + // Helper struct: CFG is available, the direct parent is a basic block. struct RefInBB { - RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst) - : BB(BB), Inst(Inst) {} + RefInBB(const BinaryBasicBlock *BB, unsigned Index) + : BB(BB), Index(Index) {} RefInBB(const RefInBB &Other) = default; RefInBB &operator=(const RefInBB &Other) = default; const BinaryBasicBlock *BB; - const MCInst *Inst; + unsigned Index; bool operator==(const RefInBB &Other) const { - return BB == Other.BB && Inst == Other.Inst; + return BB == Other.BB && Index == Other.Index; } }; diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index f721c259e9b28..2bd07882912f2 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -17,11 +17,11 @@ using namespace llvm; using namespace llvm::bolt; // It is assumed in a few places that BinaryBasicBlock stores its instructions -// in a contiguous vector. Give this assumption a name to simplify marking the -// particular places with static_assert. +// in a contiguous vector. using BasicBlockStorageIsVector = std::is_same::const_iterator>; +static_assert(BasicBlockStorageIsVector::value); namespace { // Cannot reuse MCPlusBuilder::InstructionIterator because it has to be @@ -58,12 +58,13 @@ uint64_t MCInstReference::getAddress(const MCCodeEmitter *Emitter) const { const BinaryContext &BC = getFunction()->getBinaryContext(); if (auto *Ref = tryGetRefInBB()) { - static_assert(BasicBlockStorageIsVector::value, - "Cannot use 'const MCInst *' as iterator type"); uint64_t AddressOfBB = getFunction()->getAddress() + Ref->BB->getOffset(); const MCInst *FirstInstInBB = &*Ref->BB->begin(); + const MCInst *ThisInst = &getMCInst(); - uint64_t OffsetInBB = BC.computeCodeSize(FirstInstInBB, Ref->Inst, Emitter); + // Usage of plain 'const MCInst *' as iterators assumes the instructions + // are stored in a vector, see BasicBlockStorageIsVector. + uint64_t OffsetInBB = BC.computeCodeSize(FirstInstInBB, ThisInst, Emitter); return AddressOfBB + OffsetInBB; } @@ -80,15 +81,10 @@ uint64_t MCInstReference::getAddress(const MCCodeEmitter *Emitter) const { raw_ostream &MCInstReference::print(raw_ostream &OS) const { if (const RefInBB *Ref = tryGetRefInBB()) { OS << "MCInstBBRef<"; - if (Ref->BB == nullptr) { + if (Ref->BB == nullptr) OS << "BB:(null)"; - } else { - static_assert(BasicBlockStorageIsVector::value, - "Cannot use pointer arithmetic on 'const MCInst *'"); - const MCInst *FirstInstInBB = &*Ref->BB->begin(); - unsigned IndexInBB = Ref->Inst - FirstInstInBB; - OS << "BB:" << Ref->BB->getName() << ":" << IndexInBB; - } + else + OS << "BB:" << Ref->BB->getName() << ":" << Ref->Index; OS << ">"; return OS; } From b06809883ed96d1c00793834bd05bcff51f032d9 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 23 Sep 2025 14:40:36 +0300 Subject: [PATCH 05/12] Rename getAddress to computeAddress, add a comment --- bolt/include/bolt/Core/MCInstUtils.h | 13 +++++++++++-- bolt/lib/Core/MCInstUtils.cpp | 2 +- bolt/lib/Passes/PAuthGadgetScanner.cpp | 21 ++++++++++++--------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index bd3b3016f706f..4226437463283 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -90,8 +90,17 @@ class MCInstReference { return nullptr; } - // MCCodeEmitter is not thread safe. - uint64_t getAddress(const MCCodeEmitter *Emitter = nullptr) const; + /// Computes the address of the instruction (or offset from base for PIC). + /// + /// This function is intended for the use cases like debug printing, as it + /// is only as precise as BinaryContext::computeCodeSize() is and requires + /// iterating over the prefix of the basic block (when CFG is available) or + /// of the function (when CFG is unavailable). + /// + /// MCCodeEmitter is not thread safe and the default instance from + /// BinaryContext is used by default, thus pass an instance explicitly if + /// this function may be called from multithreaded code. + uint64_t computeAddress(const MCCodeEmitter *Emitter = nullptr) const; raw_ostream &print(raw_ostream &OS) const; diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 2bd07882912f2..bc17d1c306122 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -53,7 +53,7 @@ MCInstReference MCInstReference::get(const MCInst *Inst, return {}; } -uint64_t MCInstReference::getAddress(const MCCodeEmitter *Emitter) const { +uint64_t MCInstReference::computeAddress(const MCCodeEmitter *Emitter) const { assert(!empty() && "Taking instruction address by empty reference"); const BinaryContext &BC = getFunction()->getBinaryContext(); diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 287cbe31df3af..d29c817407eae 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1680,7 +1680,7 @@ static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, MCInstReference Inst(BB, I); if (BC.MIB->isCFI(Inst)) continue; - BC.printInstruction(outs(), Inst, Inst.getAddress(), BF); + BC.printInstruction(outs(), Inst, Inst.computeAddress(), BF); } } @@ -1700,14 +1700,15 @@ void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC, StringRef IssueKind) const { const BinaryBasicBlock *BB = Location.getBasicBlock(); const BinaryFunction *BF = Location.getFunction(); + const uint64_t Address = Location.computeAddress(); OS << "\nGS-PAUTH: " << IssueKind; OS << " in function " << BF->getPrintName(); if (BB) OS << ", basic block " << BB->getName(); - OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n"; + OS << ", at address " << llvm::format("%x", Address) << "\n"; OS << " The instruction is "; - BC.printInstruction(OS, Location, Location.getAddress(), BF); + BC.printInstruction(OS, Location, Address, BF); } void GadgetDiagnostic::generateReport(raw_ostream &OS, @@ -1721,15 +1722,17 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location, const BinaryContext &BC = BF.getBinaryContext(); // Sort by address to ensure output is deterministic. - SmallVector RI(RelatedInstrs); - llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) { - return A.getAddress() < B.getAddress(); - }); + SmallVector> RI; + for (auto &InstRef : RelatedInstrs) + RI.push_back(std::make_pair(InstRef.computeAddress(), InstRef)); + llvm::sort(RI, [](auto A, auto B) { return A.first < B.first; }); + for (unsigned I = 0; I < RI.size(); ++I) { - MCInstReference InstRef = RI[I]; + auto [Address, InstRef] = RI[I]; OS << " " << (I + 1) << ". "; - BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF); + BC.printInstruction(OS, InstRef, InstRef.computeAddress(), &BF); }; + if (RelatedInstrs.size() == 1) { const MCInstReference RelatedInst = RelatedInstrs[0]; // Printing the details for the MCInstReference::FunctionParent case From 0558f1b3892087c43d8d2659cddc9aca6f3ef479 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 23 Sep 2025 14:51:04 +0300 Subject: [PATCH 06/12] Use references instead of pointers expected to be non-NULL --- bolt/include/bolt/Core/MCInstUtils.h | 29 ++++++++++++-------------- bolt/lib/Core/MCInstUtils.cpp | 10 ++++----- bolt/lib/Passes/PAuthGadgetScanner.cpp | 18 ++++++++-------- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 4226437463283..21ed2061a9edc 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -34,23 +34,21 @@ class MCInstReference { /// Constructs an empty reference. MCInstReference() : Reference(RefInBB(nullptr, /*Index=*/0)) {} + /// Constructs a reference to the instruction inside the basic block. - MCInstReference(const BinaryBasicBlock *BB, const MCInst *Inst) - : Reference(RefInBB(BB, getInstIndexInBB(BB, Inst))) {} + MCInstReference(const BinaryBasicBlock &BB, const MCInst &Inst) + : Reference(RefInBB(&BB, getInstIndexInBB(BB, Inst))) {} /// Constructs a reference to the instruction inside the basic block. - MCInstReference(const BinaryBasicBlock *BB, unsigned Index) - : Reference(RefInBB(BB, Index)) { - assert(BB && "Basic block should not be nullptr"); - } + MCInstReference(const BinaryBasicBlock &BB, unsigned Index) + : Reference(RefInBB(&BB, Index)) {} + /// Constructs a reference to the instruction inside the function without /// CFG information. - MCInstReference(const BinaryFunction *BF, nocfg_const_iterator It) - : Reference(RefInBF(BF, It)) { - assert(BF && "Function should not be nullptr"); - } + MCInstReference(const BinaryFunction &BF, nocfg_const_iterator It) + : Reference(RefInBF(&BF, It)) {} /// Locates an instruction inside a function and returns a reference. - static MCInstReference get(const MCInst *Inst, const BinaryFunction &BF); + static MCInstReference get(const MCInst &Inst, const BinaryFunction &BF); bool operator==(const MCInstReference &Other) const { return Reference == Other.Reference; @@ -105,13 +103,12 @@ class MCInstReference { raw_ostream &print(raw_ostream &OS) const; private: - static unsigned getInstIndexInBB(const BinaryBasicBlock *BB, - const MCInst *Inst) { - assert(BB && Inst && "Neither BB nor Inst should be nullptr"); + static unsigned getInstIndexInBB(const BinaryBasicBlock &BB, + const MCInst &Inst) { // Usage of pointer arithmetic assumes the instructions are stored in a // vector, see BasicBlockStorageIsVector in MCInstUtils.cpp. - const MCInst *FirstInstInBB = &*BB->begin(); - return Inst - FirstInstInBB; + const MCInst *FirstInstInBB = &*BB.begin(); + return &Inst - FirstInstInBB; } // Two cases are possible: diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index bc17d1c306122..6742dd4110775 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -36,19 +36,19 @@ class mapped_mcinst_iterator }; } // anonymous namespace -MCInstReference MCInstReference::get(const MCInst *Inst, +MCInstReference MCInstReference::get(const MCInst &Inst, const BinaryFunction &BF) { if (BF.hasCFG()) { for (BinaryBasicBlock &BB : BF) for (MCInst &MI : BB) - if (&MI == Inst) - return MCInstReference(&BB, Inst); + if (&MI == &Inst) + return MCInstReference(BB, Inst); return {}; } for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) { - if (&I->second == Inst) - return MCInstReference(&BF, I); + if (&I->second == &Inst) + return MCInstReference(BF, I); } return {}; } diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index d29c817407eae..e9520a9819939 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -58,10 +58,10 @@ template static void iterateOverInstrs(BinaryFunction &BF, T Fn) { if (BF.hasCFG()) { for (BinaryBasicBlock &BB : BF) for (int64_t I = 0, E = BB.size(); I < E; ++I) - Fn(MCInstReference(&BB, I)); + Fn(MCInstReference(BB, I)); } else { for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) - Fn(MCInstReference(&BF, I)); + Fn(MCInstReference(BF, I)); } } @@ -532,7 +532,7 @@ class SrcSafetyAnalysis { std::vector Result; for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) { - MCInstReference Ref = MCInstReference::get(Inst, BF); + MCInstReference Ref = MCInstReference::get(*Inst, BF); assert(!Ref.empty() && "Expected Inst to be found"); Result.push_back(Ref); } @@ -1104,7 +1104,7 @@ class DstSafetyAnalysis { std::vector Result; for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) { - MCInstReference Ref = MCInstReference::get(Inst, BF); + MCInstReference Ref = MCInstReference::get(*Inst, BF); assert(!Ref.empty() && "Expected Inst to be found"); Result.push_back(Ref); } @@ -1507,7 +1507,7 @@ void FunctionAnalysisContext::findUnsafeUses( // This is printed as "[message] in function [name], basic block ..., // at address ..." when the issue is reported to the user. Reports.push_back(make_generic_report( - MCInstReference(&BB, FirstInst), + MCInstReference(BB, *FirstInst), "Warning: possibly imprecise CFG, the analysis quality may be " "degraded in this function. According to BOLT, unreachable code is " "found" /* in function [name]... */)); @@ -1671,11 +1671,11 @@ void Analysis::runOnFunction(BinaryFunction &BF, } } -static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, +static void printBB(const BinaryContext &BC, const BinaryBasicBlock &BB, size_t StartIndex = 0, size_t EndIndex = -1) { if (EndIndex == (size_t)-1) - EndIndex = BB->size() - 1; - const BinaryFunction *BF = BB->getFunction(); + EndIndex = BB.size() - 1; + const BinaryFunction *BF = BB.getFunction(); for (unsigned I = StartIndex; I <= EndIndex; ++I) { MCInstReference Inst(BB, I); if (BC.MIB->isCFI(Inst)) @@ -1692,7 +1692,7 @@ static void reportFoundGadgetInSingleBBSingleRelatedInst( assert(Location.hasCFG()); if (BB == RelatedInst.getBasicBlock()) { OS << " This happens in the following basic block:\n"; - printBB(BC, BB); + printBB(BC, *BB); } } From 5a7414cf1b7ce5883af02cd6ab886614ed84f5c7 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 23 Sep 2025 15:54:34 +0300 Subject: [PATCH 07/12] Never return an empty reference from MCInstReference::get --- bolt/lib/Core/MCInstUtils.cpp | 7 ++++--- bolt/lib/Passes/PAuthGadgetScanner.cpp | 14 ++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 6742dd4110775..6f6b9ef638424 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -39,18 +39,19 @@ class mapped_mcinst_iterator MCInstReference MCInstReference::get(const MCInst &Inst, const BinaryFunction &BF) { if (BF.hasCFG()) { - for (BinaryBasicBlock &BB : BF) + for (BinaryBasicBlock &BB : BF) { for (MCInst &MI : BB) if (&MI == &Inst) return MCInstReference(BB, Inst); - return {}; + } + llvm_unreachable("Inst is not contained in BF"); } for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) { if (&I->second == &Inst) return MCInstReference(BF, I); } - return {}; + llvm_unreachable("Inst is not contained in BF"); } uint64_t MCInstReference::computeAddress(const MCCodeEmitter *Emitter) const { diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index e9520a9819939..bd990d8201679 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -531,11 +531,8 @@ class SrcSafetyAnalysis { const SrcState &S = getStateBefore(Inst); std::vector Result; - for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) { - MCInstReference Ref = MCInstReference::get(*Inst, BF); - assert(!Ref.empty() && "Expected Inst to be found"); - Result.push_back(Ref); - } + for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) + Result.push_back(MCInstReference::get(*Inst, BF)); return Result; } }; @@ -1103,11 +1100,8 @@ class DstSafetyAnalysis { const DstState &S = getStateAfter(Inst); std::vector Result; - for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) { - MCInstReference Ref = MCInstReference::get(*Inst, BF); - assert(!Ref.empty() && "Expected Inst to be found"); - Result.push_back(Ref); - } + for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) + Result.push_back(MCInstReference::get(*Inst, BF)); return Result; } }; From ec6bff9d37a2dba56bd38833b3d4c8be2b27f4e0 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Fri, 26 Sep 2025 18:55:01 +0300 Subject: [PATCH 08/12] Clarify the description of computeAddress() --- bolt/include/bolt/Core/MCInstUtils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 21ed2061a9edc..20682b3f57a13 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -88,7 +88,8 @@ class MCInstReference { return nullptr; } - /// Computes the address of the instruction (or offset from base for PIC). + /// Computes the original address of the instruction (or offset from base + /// for PIC), assuming the containing function was not modified. /// /// This function is intended for the use cases like debug printing, as it /// is only as precise as BinaryContext::computeCodeSize() is and requires From 2fc734ce57ead327840a5d62a7980e5eaa70eac3 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Mon, 29 Sep 2025 15:00:48 +0300 Subject: [PATCH 09/12] Obtain offset from std::map key when CFG is unavailable --- bolt/include/bolt/Core/MCInstUtils.h | 3 +-- bolt/lib/Core/MCInstUtils.cpp | 19 +------------------ 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h index 20682b3f57a13..eb56629c61c7d 100644 --- a/bolt/include/bolt/Core/MCInstUtils.h +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -93,8 +93,7 @@ class MCInstReference { /// /// This function is intended for the use cases like debug printing, as it /// is only as precise as BinaryContext::computeCodeSize() is and requires - /// iterating over the prefix of the basic block (when CFG is available) or - /// of the function (when CFG is unavailable). + /// iterating over the prefix of the basic block (when CFG is available). /// /// MCCodeEmitter is not thread safe and the default instance from /// BinaryContext is used by default, thus pass an instance explicitly if diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 6f6b9ef638424..472a5c6bf956b 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -9,7 +9,6 @@ #include "bolt/Core/MCInstUtils.h" #include "bolt/Core/BinaryBasicBlock.h" #include "bolt/Core/BinaryFunction.h" -#include "llvm/ADT/iterator.h" #include @@ -23,19 +22,6 @@ using BasicBlockStorageIsVector = std::vector::const_iterator>; static_assert(BasicBlockStorageIsVector::value); -namespace { -// Cannot reuse MCPlusBuilder::InstructionIterator because it has to be -// constructed from a non-const std::map iterator. -class mapped_mcinst_iterator - : public iterator_adaptor_base { -public: - mapped_mcinst_iterator(MCInstReference::nocfg_const_iterator It) - : iterator_adaptor_base(It) {} - const MCInst &operator*() const { return this->I->second; } -}; -} // anonymous namespace - MCInstReference MCInstReference::get(const MCInst &Inst, const BinaryFunction &BF) { if (BF.hasCFG()) { @@ -71,10 +57,7 @@ uint64_t MCInstReference::computeAddress(const MCCodeEmitter *Emitter) const { } auto &Ref = getRefInBF(); - mapped_mcinst_iterator FirstInstInBF(Ref.BF->instrs().begin()); - mapped_mcinst_iterator ThisInst(Ref.It); - - uint64_t OffsetInBF = BC.computeCodeSize(FirstInstInBF, ThisInst, Emitter); + uint64_t OffsetInBF = Ref.It->first; return getFunction()->getAddress() + OffsetInBF; } From 66e0fb44f94f35a3e370fe6900a406e9ffa8abcf Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 30 Sep 2025 11:21:18 +0300 Subject: [PATCH 10/12] Add `const`s and apply clang-format --- bolt/lib/Core/MCInstUtils.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index 472a5c6bf956b..b7416b75dc54b 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -45,19 +45,21 @@ uint64_t MCInstReference::computeAddress(const MCCodeEmitter *Emitter) const { const BinaryContext &BC = getFunction()->getBinaryContext(); if (auto *Ref = tryGetRefInBB()) { - uint64_t AddressOfBB = getFunction()->getAddress() + Ref->BB->getOffset(); + const uint64_t AddressOfBB = + getFunction()->getAddress() + Ref->BB->getOffset(); const MCInst *FirstInstInBB = &*Ref->BB->begin(); const MCInst *ThisInst = &getMCInst(); // Usage of plain 'const MCInst *' as iterators assumes the instructions // are stored in a vector, see BasicBlockStorageIsVector. - uint64_t OffsetInBB = BC.computeCodeSize(FirstInstInBB, ThisInst, Emitter); + const uint64_t OffsetInBB = + BC.computeCodeSize(FirstInstInBB, ThisInst, Emitter); return AddressOfBB + OffsetInBB; } auto &Ref = getRefInBF(); - uint64_t OffsetInBF = Ref.It->first; + const uint64_t OffsetInBF = Ref.It->first; return getFunction()->getAddress() + OffsetInBF; } From 9590ffcfea8362c4c7df357ffd2d8ff93b274cea Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 30 Sep 2025 12:03:00 +0300 Subject: [PATCH 11/12] nit: fix the file name in the header of MCInstUtils.cpp --- bolt/lib/Core/MCInstUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp index b7416b75dc54b..f505bf73c64eb 100644 --- a/bolt/lib/Core/MCInstUtils.cpp +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/MCInstUtils.cpp ----------------------------------------===// +//===- bolt/Core/MCInstUtils.cpp ------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. From 5923babb604b0470c96e66ed5bc606d829e08774 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 30 Sep 2025 12:24:01 +0300 Subject: [PATCH 12/12] nit: do not recompute InstRef.computeAddress() --- bolt/lib/Passes/PAuthGadgetScanner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index bd990d8201679..cfe4b6ba785e4 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -1724,7 +1724,7 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location, for (unsigned I = 0; I < RI.size(); ++I) { auto [Address, InstRef] = RI[I]; OS << " " << (I + 1) << ". "; - BC.printInstruction(OS, InstRef, InstRef.computeAddress(), &BF); + BC.printInstruction(OS, InstRef, Address, &BF); }; if (RelatedInstrs.size() == 1) {