Skip to content

Commit

Permalink
Support prefix padding for alignment purposes (Relaxable instructions…
Browse files Browse the repository at this point in the history
… only)

Now that D75203 has landed and baked for a few days, extend the basic approach to prefix padding as well. The patch itself is fairly straight forward.

For the moment, this patch adds the functional support and some basic testing there of, but defaults to not enabling prefix padding. I want to be able to phrase a separate patch which adds the target specific reasoning and test it cleanly. I haven't decided whether I want to common it with the nop logic or not.

Differential Revision: https://reviews.llvm.org/D75300
  • Loading branch information
preames committed Mar 16, 2020
1 parent f84beee commit a79863f
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 10 deletions.
108 changes: 99 additions & 9 deletions llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
Expand Up @@ -105,6 +105,10 @@ cl::opt<bool> X86AlignBranchWithin32BBoundaries(
"assumptions about labels corresponding to particular instructions, "
"and should be used with caution."));

cl::opt<unsigned> X86PadMaxPrefixSize(
"x86-pad-max-prefix-size", cl::init(0),
cl::desc("Maximum number of prefixes to use for padding"));

cl::opt<bool> X86PadForAlign(
"x86-pad-for-align", cl::init(true), cl::Hidden,
cl::desc("Pad previous instructions to implement align directives"));
Expand Down Expand Up @@ -186,8 +190,16 @@ class X86AsmBackend : public MCAsmBackend {
void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
MCInst &Res) const override;

bool padInstructionViaRelaxation(MCRelaxableFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;

bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;

bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;

void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;

bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
Expand Down Expand Up @@ -734,19 +746,86 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst,
Res.setOpcode(RelaxedOp);
}

static bool canBeRelaxedForPadding(const MCRelaxableFragment &RF) {
// TODO: There are lots of other tricks we could apply for increasing
// encoding size without impacting performance.
/// Return true if this instruction has been fully relaxed into it's most
/// general available form.
static bool isFullyRelaxed(const MCRelaxableFragment &RF) {
auto &Inst = RF.getInst();
auto &STI = *RF.getSubtargetInfo();
bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
return getRelaxedOpcode(Inst, Is16BitMode) != Inst.getOpcode();
return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode();
}

bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const {
if (!canBeRelaxedForPadding(RF))

static bool shouldAddPrefix(const MCInst &Inst, const MCInstrInfo &MCII) {
// Linker may rewrite the instruction with variant symbol operand.
return !hasVariantSymbol(Inst);
}

static unsigned getRemainingPrefixSize(const MCInst &Inst,
const MCSubtargetInfo &STI,
MCCodeEmitter &Emitter) {
SmallString<256> Code;
raw_svector_ostream VecOS(Code);
Emitter.emitPrefix(Inst, VecOS, STI);
assert(Code.size() < 15 && "The number of prefixes must be less than 15.");

// TODO: It turns out we need a decent amount of plumbing for the target
// specific bits to determine number of prefixes its safe to add. Various
// targets (older chips mostly, but also Atom family) encounter decoder
// stalls with too many prefixes. For testing purposes, we set the value
// externally for the moment.
unsigned ExistingPrefixSize = Code.size();
unsigned TargetPrefixMax = X86PadMaxPrefixSize;
if (TargetPrefixMax <= ExistingPrefixSize)
return 0;
return TargetPrefixMax - ExistingPrefixSize;
}

bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const {
if (!shouldAddPrefix(RF.getInst(), *MCII))
return false;
// If the instruction isn't fully relaxed, shifting it around might require a
// larger value for one of the fixups then can be encoded. The outer loop
// will also catch this before moving to the next instruction, but we need to
// prevent padding this single instruction as well.
if (!isFullyRelaxed(RF))
return false;

const unsigned OldSize = RF.getContents().size();
if (OldSize == 15)
return false;

const unsigned MaxPossiblePad = std::min(15 - OldSize, RemainingSize);
const unsigned PrefixBytesToAdd =
std::min(MaxPossiblePad,
getRemainingPrefixSize(RF.getInst(), STI, Emitter));
if (PrefixBytesToAdd == 0)
return false;

const uint8_t Prefix = determinePaddingPrefix(RF.getInst());

SmallString<256> Code;
Code.append(PrefixBytesToAdd, Prefix);
Code.append(RF.getContents().begin(), RF.getContents().end());
RF.getContents() = Code;

// Adjust the fixups for the change in offsets
for (auto &F : RF.getFixups()) {
F.setOffset(F.getOffset() + PrefixBytesToAdd);
}

RemainingSize -= PrefixBytesToAdd;
return true;
}

bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const {
if (isFullyRelaxed(RF))
// TODO: There are lots of other tricks we could apply for increasing
// encoding size without impacting performance.
return false;

MCInst Relaxed;
Expand All @@ -769,6 +848,17 @@ bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
return true;
}

bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
MCCodeEmitter &Emitter,
unsigned &RemainingSize) const {
bool Changed = false;
if (RemainingSize != 0)
Changed |= padInstructionViaRelaxation(RF, Emitter, RemainingSize);
if (RemainingSize != 0)
Changed |= padInstructionViaPrefix(RF, Emitter, RemainingSize);
return Changed;
}

void X86AsmBackend::finishLayout(MCAssembler const &Asm,
MCAsmLayout &Layout) const {
// See if we can further relax some instructions to cut down on the number of
Expand Down Expand Up @@ -851,7 +941,7 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
// We don't need to worry about larger positive offsets as none of the
// possible offsets between this and our align are visible, and the
// ones afterwards aren't changing.
if (mayNeedRelaxation(RF.getInst(), *RF.getSubtargetInfo()))
if (!isFullyRelaxed(RF))
break;
}
Relaxable.clear();
Expand Down
76 changes: 76 additions & 0 deletions llvm/test/MC/X86/align-via-padding.s
@@ -0,0 +1,76 @@
# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s -x86-pad-max-prefix-size=5 | llvm-objdump -d --section=.text - | FileCheck %s

# This test file highlights the interactions between prefix padding and
# relaxation padding.

.file "test.c"
.text
.section .text
# We can both relax and prefix for padding purposes, but the moment, we
# can't prefix without first having relaxed.
# CHECK: .text
# CHECK: 0: eb 1f jmp
# CHECK: 2: eb 1d jmp
# CHECK: 4: eb 1b jmp
# CHECK: 6: eb 19 jmp
# CHECK: 8: eb 17 jmp
# CHECK: a: 2e 2e 2e 2e 2e e9 0d 00 00 00 jmp
# CHECK: 14: 2e 2e 2e 2e 2e e9 03 00 00 00 jmp
# CHECK: 1e: 66 90 nop
# CHECK: 20: cc int3
.p2align 4
jmp foo
jmp foo
jmp foo
jmp foo
jmp foo
jmp foo
jmp foo
.p2align 5
int3
foo:
ret

# Canonical toy loop to show benefit - we can align the loop header with
# fewer nops by relaxing the branch, even though we don't need to
# CHECK: <loop_preheader>:
# CHECK: 45: 48 85 c0 testq %rax, %rax
# CHECK: 48: 2e 2e 2e 2e 0f 8e 1e 00 00 00 jle 30 <loop_exit>
# CHECK: 52: 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
# CHECK: <loop_header>:
# CHECK: 60: 48 83 e8 01 subq $1, %rax
# CHECK: 64: 48 85 c0 testq %rax, %rax
# CHECK: 67: 7e 07 jle 7 <loop_exit>
# CHECK: 69: 2e 2e e9 f0 ff ff ff jmp
# CHECK: <loop_exit>:
# CHECK: 70: c3 retq
.p2align 5
.skip 5
loop_preheader:
testq %rax, %rax
jle loop_exit
.p2align 5
loop_header:
subq $1, %rax
testq %rax, %rax
jle loop_exit
jmp loop_header
.p2align 4
loop_exit:
ret

# Correctness cornercase - can't prefix pad jmp without having relaxed it
# first as doing so would make the relative offset too large
# CHECK: fd: cc int3
# CHECK: fe: eb 80 jmp -128 <loop_exit+0x10>
# CHECK: 100: cc int3
.p2align 5
.L1:
.rept 126
int3
.endr
jmp .L1
.rept 30
int3
.endr
.p2align 5
4 changes: 3 additions & 1 deletion llvm/test/MC/X86/align-via-relaxation.s
@@ -1,5 +1,7 @@
# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s
# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu -x86-pad-max-prefix-size=0 %s | llvm-objdump -d --section=.text - | FileCheck %s

# This test exercises only the padding via relaxation logic. The interaction
# etween prefix padding and relaxation logic can be seen in align-via-padding.s

.file "test.c"
.text
Expand Down

0 comments on commit a79863f

Please sign in to comment.