From a79863f2f72747e3abd60326359d0e34f93a6921 Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Sun, 15 Mar 2020 18:10:50 -0700
Subject: [PATCH] Support prefix padding for alignment purposes (Relaxable
 instructions only)

Now that D75203 has landed and baked for a few days, extend the basic approach to prefix padding as well. The patch itself is fairly straight forward.

For the moment, this patch adds the functional support and some basic testing there of, but defaults to not enabling prefix padding. I want to be able to phrase a separate patch which adds the target specific reasoning and test it cleanly. I haven't decided whether I want to common it with the nop logic or not.

Differential Revision: https://reviews.llvm.org/D75300
---
 .../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 108 ++++++++++++++++--
 llvm/test/MC/X86/align-via-padding.s          |  76 ++++++++++++
 llvm/test/MC/X86/align-via-relaxation.s       |   4 +-
 3 files changed, 178 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/MC/X86/align-via-padding.s
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index b4a86ea65fa3c..af915b12c8d02 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -105,6 +105,10 @@ cl::opt<bool> X86AlignBranchWithin32BBoundaries(
         "assumptions about labels corresponding to particular instructions, "
         "and should be used with caution."));
 
+cl::opt<unsigned> X86PadMaxPrefixSize(
+    "x86-pad-max-prefix-size", cl::init(0),
+    cl::desc("Maximum number of prefixes to use for padding"));
+
 cl::opt<bool> X86PadForAlign(
     "x86-pad-for-align", cl::init(true), cl::Hidden,
     cl::desc("Pad previous instructions to implement align directives"));
@@ -186,8 +190,16 @@ class X86AsmBackend : public MCAsmBackend {
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override;
 
+  bool padInstructionViaRelaxation(MCRelaxableFragment &RF,
+                                   MCCodeEmitter &Emitter,
+                                   unsigned &RemainingSize) const;
+
+  bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+                               unsigned &RemainingSize) const;
+
   bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
                               unsigned &RemainingSize) const;
+
   void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
 
   bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
@@ -734,19 +746,86 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst,
   Res.setOpcode(RelaxedOp);
 }
 
-static bool canBeRelaxedForPadding(const MCRelaxableFragment &RF) {
-  // TODO: There are lots of other tricks we could apply for increasing
-  // encoding size without impacting performance.
+/// Return true if this instruction has been fully relaxed into it's most
+/// general available form.
+static bool isFullyRelaxed(const MCRelaxableFragment &RF) {
   auto &Inst = RF.getInst();
   auto &STI = *RF.getSubtargetInfo();
   bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
-  return getRelaxedOpcode(Inst, Is16BitMode) != Inst.getOpcode();
+  return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode();
 }
 
-bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
-                                           MCCodeEmitter &Emitter,
-                                           unsigned &RemainingSize) const {
-  if (!canBeRelaxedForPadding(RF))
+
+static bool shouldAddPrefix(const MCInst &Inst, const MCInstrInfo &MCII) {
+  // Linker may rewrite the instruction with variant symbol operand.
+  return !hasVariantSymbol(Inst);
+}
+
+static unsigned getRemainingPrefixSize(const MCInst &Inst,
+                                       const MCSubtargetInfo &STI,
+                                       MCCodeEmitter &Emitter) {
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+  Emitter.emitPrefix(Inst, VecOS, STI);
+  assert(Code.size() < 15 && "The number of prefixes must be less than 15.");
+
+  // TODO: It turns out we need a decent amount of plumbing for the target
+  // specific bits to determine number of prefixes its safe to add.  Various
+  // targets (older chips mostly, but also Atom family) encounter decoder
+  // stalls with too many prefixes.  For testing purposes, we set the value
+  // externally for the moment.
+  unsigned ExistingPrefixSize = Code.size();
+  unsigned TargetPrefixMax = X86PadMaxPrefixSize;
+  if (TargetPrefixMax <= ExistingPrefixSize)
+    return 0;
+  return TargetPrefixMax - ExistingPrefixSize;
+}
+
+bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
+                                            MCCodeEmitter &Emitter,
+                                            unsigned &RemainingSize) const {
+  if (!shouldAddPrefix(RF.getInst(), *MCII))
+    return false;
+  // If the instruction isn't fully relaxed, shifting it around might require a
+  // larger value for one of the fixups then can be encoded.  The outer loop
+  // will also catch this before moving to the next instruction, but we need to
+  // prevent padding this single instruction as well.
+  if (!isFullyRelaxed(RF))
+    return false;
+
+  const unsigned OldSize = RF.getContents().size();
+  if (OldSize == 15)
+    return false;
+
+  const unsigned MaxPossiblePad = std::min(15 - OldSize, RemainingSize);
+  const unsigned PrefixBytesToAdd =
+    std::min(MaxPossiblePad,
+             getRemainingPrefixSize(RF.getInst(), STI, Emitter));
+  if (PrefixBytesToAdd == 0)
+    return false;
+
+  const uint8_t Prefix = determinePaddingPrefix(RF.getInst());
+
+  SmallString<256> Code;
+  Code.append(PrefixBytesToAdd, Prefix);
+  Code.append(RF.getContents().begin(), RF.getContents().end());
+  RF.getContents() = Code;
+
+  // Adjust the fixups for the change in offsets
+  for (auto &F : RF.getFixups()) {
+    F.setOffset(F.getOffset() + PrefixBytesToAdd);
+  }
+
+  RemainingSize -= PrefixBytesToAdd;
+  return true;
+}
+
+bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
+                                                MCCodeEmitter &Emitter,
+                                                unsigned &RemainingSize) const {
+  if (isFullyRelaxed(RF))
+    // TODO: There are lots of other tricks we could apply for increasing
+    // encoding size without impacting performance.
     return false;
 
   MCInst Relaxed;
@@ -769,6 +848,17 @@ bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
   return true;
 }
 
+bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+                                           MCCodeEmitter &Emitter,
+                                           unsigned &RemainingSize) const {
+  bool Changed = false;
+  if (RemainingSize != 0)
+    Changed |= padInstructionViaRelaxation(RF, Emitter, RemainingSize);
+  if (RemainingSize != 0)
+    Changed |= padInstructionViaPrefix(RF, Emitter, RemainingSize);
+  return Changed;
+}
+
 void X86AsmBackend::finishLayout(MCAssembler const &Asm,
                                  MCAsmLayout &Layout) const {
   // See if we can further relax some instructions to cut down on the number of
@@ -851,7 +941,7 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
         // We don't need to worry about larger positive offsets as none of the
         // possible offsets between this and our align are visible, and the
         // ones afterwards aren't changing.
-        if (mayNeedRelaxation(RF.getInst(), *RF.getSubtargetInfo()))
+        if (!isFullyRelaxed(RF))
           break;
       }
       Relaxable.clear();
diff --git a/llvm/test/MC/X86/align-via-padding.s b/llvm/test/MC/X86/align-via-padding.s
new file mode 100644
index 0000000000000..572af4b02961e
--- /dev/null
+++ b/llvm/test/MC/X86/align-via-padding.s
@@ -0,0 +1,76 @@
+# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s -x86-pad-max-prefix-size=5 | llvm-objdump -d --section=.text - | FileCheck %s
+
+# This test file highlights the interactions between prefix padding and
+# relaxation padding.
+
+  .file "test.c"
+  .text
+  .section  .text
+  # We can both relax and prefix for padding purposes, but the moment, we
+  # can't prefix without first having relaxed.
+  # CHECK: .text
+  # CHECK:  0: eb 1f                         jmp
+  # CHECK:  2: eb 1d                         jmp
+  # CHECK:  4: eb 1b                         jmp
+  # CHECK:  6: eb 19                         jmp
+  # CHECK:  8: eb 17                         jmp
+  # CHECK:  a: 2e 2e 2e 2e 2e e9 0d 00 00 00 jmp
+  # CHECK: 14: 2e 2e 2e 2e 2e e9 03 00 00 00 jmp
+  # CHECK: 1e: 66 90                         nop
+  # CHECK: 20: cc                            int3
+  .p2align 4
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  .p2align 5
+  int3
+foo:
+  ret
+
+  # Canonical toy loop to show benefit - we can align the loop header with
+  # fewer nops by relaxing the branch, even though we don't need to
+  # CHECK: <loop_preheader>:
+  # CHECK: 45: 48 85 c0                       testq %rax, %rax
+  # CHECK: 48: 2e 2e 2e 2e 0f 8e 1e 00 00 00  jle 30 <loop_exit>
+  # CHECK: 52: 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00    	nopw	%cs:(%rax,%rax)
+  # CHECK: <loop_header>:
+  # CHECK: 60: 48 83 e8 01                    subq $1, %rax
+  # CHECK: 64: 48 85 c0                       testq %rax, %rax
+  # CHECK: 67: 7e 07                          jle 7 <loop_exit>
+  # CHECK: 69: 2e 2e e9 f0 ff ff ff           jmp
+  # CHECK: <loop_exit>:
+  # CHECK: 70: c3                             retq
+  .p2align 5
+  .skip 5
+loop_preheader:
+  testq %rax, %rax
+  jle loop_exit
+  .p2align 5
+loop_header:
+  subq $1, %rax
+  testq %rax, %rax
+  jle loop_exit
+  jmp loop_header
+  .p2align 4
+loop_exit:
+  ret
+
+  # Correctness cornercase - can't prefix pad jmp without having relaxed it
+  # first as doing so would make the relative offset too large
+  # CHECK: fd: cc                             int3
+  # CHECK: fe: eb 80                          jmp -128 <loop_exit+0x10>
+  # CHECK: 100: cc                           	int3
+.p2align 5
+.L1:
+.rept 126
+  int3
+.endr
+  jmp .L1
+.rept 30
+  int3
+.endr
+.p2align 5
diff --git a/llvm/test/MC/X86/align-via-relaxation.s b/llvm/test/MC/X86/align-via-relaxation.s
index f81e4c73e4f22..7f372bba0f20c 100644
--- a/llvm/test/MC/X86/align-via-relaxation.s
+++ b/llvm/test/MC/X86/align-via-relaxation.s
@@ -1,5 +1,7 @@
-# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s
+# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu -x86-pad-max-prefix-size=0 %s | llvm-objdump -d --section=.text - | FileCheck %s
 
+# This test exercises only the padding via relaxation logic.  The  interaction
+# etween prefix padding and relaxation logic can be seen in align-via-padding.s
 
   .file "test.c"
   .text