[MTE] Pin the tagged base pointer to one of the stack slots.
Summary:
Pin the tagged base pointer to one of the stack slots, and (if
necessary) rewrite tag offsets so that an object that occupies that
slot has both address and tag offsets of 0. This allows ADDG
instructions for that object to be eliminated and their uses replaced
with the tagged base pointer itself.
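For illustration, a before/after sketch distilled from the test updates in this
commit (register numbers are illustrative, not prescribed output):

  ; before: the tagged address of the slot is materialized with ADDG
  irg  x8, sp
  addg x0, x8, #0, #1        ; address offset 0, tag offset 1
  ; after: the slot is pinned to address/tag offsets (0, 0), so the
  ; IRG def is the slot's tagged address and the ADDG disappears
  irg  x0, sp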

This optimization must be done at the machine-instruction level rather than in
the IR instrumentation pass, because referring to a stack slot through an IRG
pointer would confuse the stack coloring pass.

The optimization makes a (pretty naive) attempt to find the slot that would
benefit the most, by counting the uses of each slot's tagged address in the
function.

Reviewers: ostannard, pcc

Subscribers: merge_guards_bot, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D72365
eugenis committed Oct 15, 2020
1 parent d1beb95 commit 2f63e57
Showing 5 changed files with 219 additions and 21 deletions.
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -767,6 +767,10 @@ def int_aarch64_irg_sp : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty],
// ptr1 = tagp(ptr0, baseptr, tag_offset) returns a pointer where
// * address is the address in ptr0
// * tag is a function of (tag in baseptr, tag_offset).
// ** Beware, this is not the same function as implemented by the ADDG instruction!
// Backend optimizations may change tag_offset; the only guarantee is that calls
// to tagp with the same pair of (baseptr, tag_offset) will produce pointers
// with the same tag value, assuming the set of excluded tags has not changed.
// Address bits in baseptr and tag bits in ptr0 are ignored.
// When offset between ptr0 and baseptr is a compile time constant, this can be emitted as
// ADDG ptr1, baseptr, (ptr0 - baseptr), tag_offset
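For context, a minimal IR sketch of how these intrinsics are used together,
mirroring the tests in irg_sp_tagp.ll below (%a is assumed to be an alloca):

  %base = call i8* @llvm.aarch64.irg.sp(i64 0)
  %p = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %base, i64 1)
  ; %p carries the address of %a and a tag derived from (tag of %base, 1)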
8 changes: 6 additions & 2 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1070,9 +1070,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
return;

// Set tagged base pointer to the bottom of the stack frame.
// Set tagged base pointer to the requested stack slot.
// Ideally it should match SP value after prologue.
AFI->setTaggedBasePointerOffset(MFI.getStackSize());
Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
if (TBPI)
AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
else
AFI->setTaggedBasePointerOffset(MFI.getStackSize());

const StackOffset &SVEStackSize = getSVEStackSize(MF);

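To make the new computation concrete, a sketch with assumed numbers (not taken
from the commit), using the usual convention that frame object offsets are
negative values relative to SP at function entry:

  // 32-byte frame, pinned slot's object at offset -16:
  //   no pinned slot: Offset = MFI.getStackSize()       = 32
  //     -> tagged base pointer = SP-at-entry - 32, the bottom of the frame
  //   pinned slot FI: Offset = -MFI.getObjectOffset(FI) = -(-16) = 16
  //     -> tagged base pointer = SP-at-entry - 16, the pinned slot itself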
16 changes: 12 additions & 4 deletions llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -128,10 +128,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// that must be forwarded to every musttail call.
SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;

// Offset from SP-at-entry to the tagged base pointer.
// Tagged base pointer is set up to point to the first (lowest address) tagged
// stack slot.
unsigned TaggedBasePointerOffset = 0;
/// FrameIndex for the tagged base pointer.
Optional<int> TaggedBasePointerIndex;

/// Offset from SP-at-entry to the tagged base pointer.
/// Tagged base pointer is set up to point to the first (lowest address)
/// tagged stack slot.
unsigned TaggedBasePointerOffset;

/// OutliningStyle denotes, if a function was outlined, how it was outlined,
/// e.g. Tail Call, Thunk, or Function if none apply.
@@ -343,6 +346,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
return ForwardedMustTailRegParms;
}

Optional<int> getTaggedBasePointerIndex() const {
return TaggedBasePointerIndex;
}
void setTaggedBasePointerIndex(int Index) { TaggedBasePointerIndex = Index; }

unsigned getTaggedBasePointerOffset() const {
return TaggedBasePointerOffset;
}
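Taken together with the frame lowering change above, the intended flow is
roughly the following (a condensed sketch of code from this commit, not a new
API):

  // Pre-RA stack tagging pass: pin the highest-scoring slot, if any.
  if (Optional<int> FI = findFirstSlotCandidate())
    AFI->setTaggedBasePointerIndex(*FI);

  // emitPrologue: resolve the pinned index to an SP-at-entry-relative offset.
  if (Optional<int> TBPI = AFI->getTaggedBasePointerIndex())
    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
  else
    AFI->setTaggedBasePointerOffset(MFI.getStackSize());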
167 changes: 166 additions & 1 deletion llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -13,7 +13,6 @@
#include "AArch64InstrInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,6 +49,12 @@ cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
"apply unchecked-ld-st when the target is definitely within range"),
clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));

static cl::opt<bool>
ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
cl::ZeroOrMore,
cl::desc("Apply first slot optimization for stack tagging "
"(eliminate ADDG Rt, Rn, 0, 0)."));

namespace {

class AArch64StackTaggingPreRA : public MachineFunctionPass {
@@ -71,6 +76,7 @@ class AArch64StackTaggingPreRA : public MachineFunctionPass {
bool mayUseUncheckedLoadStore();
void uncheckUsesOf(unsigned TaggedReg, int FI);
void uncheckLoadsAndStores();
Optional<int> findFirstSlotCandidate();

bool runOnMachineFunction(MachineFunction &Func) override;
StringRef getPassName() const override {
@@ -197,6 +203,141 @@ void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
}
}

struct SlotWithTag {
int FI;
int Tag;
SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
explicit SlotWithTag(const MachineInstr &MI)
: FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
bool operator==(const SlotWithTag &Other) const {
return FI == Other.FI && Tag == Other.Tag;
}
};

namespace llvm {
template <> struct DenseMapInfo<SlotWithTag> {
static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
static unsigned getHashValue(const SlotWithTag &V) {
return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
DenseMapInfo<int>::getHashValue(V.Tag));
}
static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
return A == B;
}
};
} // namespace llvm

static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
return MFI->getUseLocalStackAllocationBlock() &&
MFI->isObjectPreAllocated(FI);
}

// Pin one of the tagged slots to offset 0 from the tagged base pointer.
// This would make its address available in a virtual register (IRG's def), as
// opposed to requiring an ADDG instruction to materialize. This effectively
// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
// live almost everywhere anyway), and therefore needs to happen before
// regalloc.
Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
// Find the best (FI, Tag) pair to pin to offset 0.
// Looking at the possible uses of a tagged address, the advantage of pinning
// is:
// - COPY to physical register.
// Does not matter, this would trade a MOV instruction for an ADDG.
// - ST*G instructions matter, but those mostly appear near the function prologue where all
// the tagged addresses need to be materialized anyway; also, counting ST*G
// uses would overweight large allocas that require more than one ST*G
// instruction.
// - Uses in the address operand of a load/store instruction do not require a
// tagged pointer, so they do not benefit. Such operands have already been
// eliminated (see uncheckLoadsAndStores), so all remaining load/store
// uses count.
// - Any other instruction may benefit from being pinned to offset 0.
LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
if (!ClFirstSlot)
return None;

DenseMap<SlotWithTag, int> RetagScore;
SlotWithTag MaxScoreST{-1, -1};
int MaxScore = -1;
for (auto *I : ReTags) {
SlotWithTag ST{*I};
if (isSlotPreAllocated(MFI, ST.FI))
continue;

Register RetagReg = I->getOperand(0).getReg();
if (!Register::isVirtualRegister(RetagReg))
continue;

int Score = 0;
SmallVector<Register, 8> WorkList;
WorkList.push_back(RetagReg);

while (!WorkList.empty()) {
Register UseReg = WorkList.back();
WorkList.pop_back();
for (auto &UseI : MRI->use_instructions(UseReg)) {
unsigned Opcode = UseI.getOpcode();
if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset ||
Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
Opcode == AArch64::STZGloop_wback)
continue;
if (UseI.isCopy()) {
Register DstReg = UseI.getOperand(0).getReg();
if (Register::isVirtualRegister(DstReg))
WorkList.push_back(DstReg);
continue;
}
LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %"
<< Register::virtReg2Index(UseReg) << " in " << UseI
<< "\n");
Score++;
}
}

int TotalScore = RetagScore[ST] += Score;
if (TotalScore > MaxScore ||
(TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
MaxScore = TotalScore;
MaxScoreST = ST;
}
}

if (MaxScoreST.FI < 0)
return None;

// If FI's tag is already 0, we are done.
if (MaxScoreST.Tag == 0)
return MaxScoreST.FI;

// Otherwise, find a random victim pair (FI, Tag) where Tag == 0.
SlotWithTag SwapST{-1, -1};
for (auto *I : ReTags) {
SlotWithTag ST{*I};
if (ST.Tag == 0) {
SwapST = ST;
break;
}
}

// Swap tags between the victim and the highest scoring pair.
// If SwapST is still (-1, -1), that's fine, too - we'll simply claim tag 0 for
// the highest-scoring slot without changing anything else.
for (auto *&I : ReTags) {
SlotWithTag ST{*I};
MachineOperand &TagOp = I->getOperand(4);
if (ST == MaxScoreST) {
TagOp.setImm(0);
} else if (ST == SwapST) {
TagOp.setImm(MaxScoreST.Tag);
}
}
return MaxScoreST.FI;
}

bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
MRI = &MF->getRegInfo();
@@ -225,11 +366,35 @@ bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
}
}

// Take over from SSP. It does nothing for tagged slots, and should not really
// have been enabled in the first place.
for (int FI : TaggedSlots)
MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);

if (ReTags.empty())
return false;

if (mayUseUncheckedLoadStore())
uncheckLoadsAndStores();

// Find a slot that is used with zero tag offset, like ADDG #fi, 0.
// If the tagged base pointer is set up to point to this slot,
// the ADDG instruction can be eliminated.
Optional<int> BaseSlot = findFirstSlotCandidate();
if (BaseSlot)
AFI->setTaggedBasePointerIndex(*BaseSlot);

for (auto *I : ReTags) {
int FI = I->getOperand(1).getIndex();
int Tag = I->getOperand(4).getImm();
Register Base = I->getOperand(3).getReg();
if (Tag == 0 && FI == BaseSlot) {
BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
I->getOperand(0).getReg())
.addReg(Base);
I->eraseFromParent();
}
}

return true;
}
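A worked example of the selection and swap logic above, with hypothetical frame
indexes, tags, and use counts (not taken from the commit):

  // Retag instructions scored by uses outside ST*G, COPY-to-physreg, and
  // load/store address operands:
  //   {FI 0, Tag 1} -> score 7   // highest score: becomes the pinned slot
  //   {FI 1, Tag 0} -> score 2   // the only Tag == 0 slot: chosen as victim
  //   {FI 2, Tag 3} -> score 1
  // After the swap the tag operands read:
  //   {FI 0, Tag 0}, {FI 1, Tag 1}, {FI 2, Tag 3}
  // findFirstSlotCandidate() returns 0; the retag of FI 0 is then replaced
  // with a COPY from the IRG-defined base register in runOnMachineFunction.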
45 changes: 31 additions & 14 deletions llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
@@ -3,29 +3,48 @@
define i8* @small_alloca() {
entry:
; CHECK-LABEL: small_alloca:
; CHECK: irg x0, sp{{$}}
; CHECK: ret
%a = alloca i8, align 16
%q = call i8* @llvm.aarch64.irg.sp(i64 0)
%q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
ret i8* %q1
}

@sink = global i8* null, align 8

; Check that IRG is pinned to %b: the store needs %b's tagged address in an
; arbitrary (non-fixed) register, so it benefits from that address being equal
; to the tagged base pointer.
define i8* @small_allocas() {
entry:
; CHECK-LABEL: small_allocas:
; CHECK: irg [[R:x[0-9]+]], sp{{$}}
; CHECK-NEXT: addg x0, [[R]], #0, #1
; CHECK: addg x0, [[R]], #16, #1
; CHECK: str [[R]], {{.*}}sink
; CHECK: ret
%a = alloca i8, align 16
%b = alloca i8, align 16
%q = call i8* @llvm.aarch64.irg.sp(i64 0)
%q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
%q2 = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 2)
store i8* %q2, i8** @sink, align 8
ret i8* %q1
}

; Two large allocas. One's offset overflows the ADDG immediate range.
define void @huge_allocas() {
entry:
; CHECK-LABEL: huge_allocas:
; CHECK: irg [[R:x[0-9]+]], sp{{$}}
; CHECK: add [[TMP:x[0-9]+]], [[R]], #3088
; CHECK: irg x1, sp{{$}}
; CHECK: add [[TMP:x[0-9]+]], x1, #3088
; CHECK: addg x0, [[TMP]], #1008, #1
; CHECK: addg x1, [[R]], #0, #2
; CHECK: bl use2
%a = alloca i8, i64 4096, align 16
%b = alloca i8, i64 4096, align 16
%base = call i8* @llvm.aarch64.irg.sp(i64 0)
%a_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %base, i64 1)
%b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 2)
%b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 0)
call void @use2(i8* %a_t, i8* %b_t)
ret void
}
Expand All @@ -37,8 +56,7 @@ entry:
; CHECK-LABEL: realign:
; CHECK: mov x29, sp
; CHECK: and sp, x{{[0-9]*}}, #0xffffffffffffffc0
; CHECK: irg [[R:x[0-9]+]], sp{{$}}
; CHECK: addg x0, [[R]], #0, #1
; CHECK: irg x0, sp{{$}}
; CHECK: bl use
%a = alloca i8, i64 4096, align 64
%base = call i8* @llvm.aarch64.irg.sp(i64 0)
@@ -52,10 +70,9 @@
define void @dynamic_alloca(i64 %size) {
entry:
; CHECK-LABEL: dynamic_alloca:
; CHECK: sub [[R:x[0-9]+]], x29, #[[OFS:[0-9]+]]
; CHECK: irg [[R]], [[R]]
; CHECK: addg x1, [[R]], #0, #1
; CHECK: sub x0, x29, #[[OFS]]
; CHECK: sub x1, x29, #[[OFS:[0-9]+]]
; CHECK: irg x1, x1
; CHECK-DAG: sub x0, x29, #[[OFS]]
; CHECK: bl use2
%base = call i8* @llvm.aarch64.irg.sp(i64 0)
%a = alloca i128, i64 %size, align 16
@@ -74,9 +91,9 @@
; CHECK-LABEL: dynamic_alloca_and_realign:
; CHECK: and sp, x{{.*}}, #0xffffffffffffffc0
; CHECK: mov x19, sp
; CHECK: irg [[R:x[0-9]+]], x19
; CHECK: addg x1, [[R]], #[[OFS:[0-9]+]], #1
; CHECK: add x0, x19, #[[OFS]]
; CHECK: add x1, x19, #[[OFS:[0-9]+]]
; CHECK: irg x1, x1
; CHECK-DAG: add x0, x19, #[[OFS]]
; CHECK: bl use2
%base = call i8* @llvm.aarch64.irg.sp(i64 0)
%a = alloca i128, i64 %size, align 64