[X86] Support AMX fast register allocation
Differential Revision: https://reviews.llvm.org/D100026
xiangzh1 committed May 8, 2021
1 parent 72bd011 commit d4bdeca
Showing 24 changed files with 6,950 additions and 29 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsX86_64.def
@@ -101,6 +101,7 @@ TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")

// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
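These _internal builtins are not meant to be called directly; they back the __tile1024i helper API in clang's amxintrin.h. As a rough sketch only (not part of this commit, and assuming that helper API), this is the kind of source that reaches the fast register allocation path when built at -O0 with -mamx-tile -mamx-int8:

#include <immintrin.h>
#include <stddef.h>

void tile_dot(const void *a, const void *b, void *c, size_t stride) {
  __tile1024i ta = {16, 64}, tb = {16, 64}, tc = {16, 64};
  __tile_loadd(&ta, a, stride);  // lowers to llvm.x86.tileloadd64.internal
  __tile_loadd(&tb, b, stride);
  __tile_dpbssd(&tc, ta, tb);    // lowers to llvm.x86.tdpbssd.internal
  __tile_stored(c, stride, tc);  // lowers to llvm.x86.tilestored64.internal
}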
3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/Passes.h
@@ -507,6 +507,9 @@ namespace llvm {
/// or split the data to two <128 x i32>.
FunctionPass *createX86LowerAMXTypePass();

/// The pass inserts tile config intrinsics for AMX fast register allocation.
FunctionPass *createX86PreAMXConfigPass();

/// The pass transforms AMX intrinsics to scalar operations if the function has
/// the optnone attribute or it is compiled at O0.
FunctionPass *createX86LowerAMXIntrinsicsPass();
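X86TargetMachine.cpp, where the new creator is wired into the pipeline, is among the 24 changed files but is not shown in this excerpt. Roughly, the wiring is expected to look like the sketch below; the O0-only guard and exact placement are assumptions:

void X86PassConfig::addIRPasses() {
  // ... existing IR passes ...
  addPass(createX86LowerAMXTypePass());
  if (TM->getOptLevel() == CodeGenOpt::None)
    addPass(createX86PreAMXConfigPass()); // pre-insert ldtilecfg for the fast-RA path
  TargetPassConfig::addIRPasses();
  // ...
}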
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -406,6 +406,10 @@ class TargetPassConfig : public ImmutablePass {
return false;
}

/// addPostFastRegAllocRewrite - Add passes to the fast register allocation
/// pipeline after fast register allocation is complete.
virtual bool addPostFastRegAllocRewrite() { return false; }

/// Add passes to be run immediately after virtual registers are rewritten
/// to physical registers.
virtual void addPostRewrite() { }
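The X86 override of this hook also lives in X86TargetMachine.cpp and is not shown here; a sketch of how a target is expected to use it:

bool X86PassConfig::addPostFastRegAllocRewrite() {
  // Rewrite the pre-inserted ldtilecfg once physical tile registers are known.
  addPass(createX86FastTileConfigPass());
  return true;
}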
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5042,6 +5042,9 @@ let TargetPrefix = "x86" in {
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
ImmArg<ArgIndex<2>>]>;
// AMX - internal intrinsics
def int_x86_ldtilecfg_internal :
GCCBuiltin<"__builtin_ia32_tile_loadconfig_internal">,
Intrinsic<[], [llvm_ptr_ty], []>;
def int_x86_tileloadd64_internal :
GCCBuiltin<"__builtin_ia32_tileloadd64_internal">,
Intrinsic<[llvm_x86amx_ty],
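For illustration only, a hypothetical helper (the name emitTileConfig and its setup are not from the commit) showing how an IR pass such as X86PreAMXConfig could materialize the new intrinsic with IRBuilder:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsX86.h"
using namespace llvm;

// Emit ldtilecfg at the top of BB, reading the tile configuration from the
// 64-byte buffer that CfgBuf points to (assumed to be filled in beforehand).
static void emitTileConfig(BasicBlock &BB, Value *CfgBuf) {
  IRBuilder<> Builder(&*BB.getFirstInsertionPt());
  Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, {}, {CfgBuf});
}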
4 changes: 4 additions & 0 deletions llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1321,6 +1321,10 @@ bool TargetPassConfig::addRegAssignAndRewriteFast() {
report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");

addPass(createRegAllocPass(false));

// Allow targets to change the register assignments after
// fast register allocation.
addPostFastRegAllocRewrite();
return true;
}

2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/CMakeLists.txt
@@ -34,8 +34,10 @@ set(sources
X86DiscriminateMemOps.cpp
X86LowerTileCopy.cpp
X86LowerAMXType.cpp
X86PreAMXConfig.cpp
X86LowerAMXIntrinsics.cpp
X86TileConfig.cpp
X86FastTileConfig.cpp
X86PreTileConfig.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86.h
@@ -79,6 +79,9 @@ FunctionPass *createX86WinAllocaExpander();
/// Return a pass that configures the tile registers.
FunctionPass *createX86TileConfigPass();

/// Return a pass that configures the tile registers after fast register allocation.
FunctionPass *createX86FastTileConfigPass();

/// Return a pass that inserts a pseudo tile config instruction.
FunctionPass *createX86PreTileConfigPass();

@@ -172,8 +175,10 @@ void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86FastTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86PreAMXConfigPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);

4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -478,6 +478,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
case X86::PLDTILECFGV: {
MI.setDesc(TII->get(X86::LDTILECFG));
return true;
}
case X86::PTILELOADDV: {
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
306 changes: 306 additions & 0 deletions llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -0,0 +1,306 @@
//===-- X86FastTileConfig.cpp - Fast Tile Register Configure --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to configure the shapes of AMX physical registers.
/// AMX registers need to be configured before use. The ldtilecfg instruction is
/// inserted before the fast register allocation pass, but at that point the
/// shape of each physical tile register is still unknown because register
/// allocation has not been done yet. This pass runs after register allocation.
/// It collects the shape information of each physical tile register and stores
/// the shapes in the stack slot that is allocated for loading the config into
/// the tile config register.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "fasttileconfig"

namespace {

class X86FastTileConfig : public MachineFunctionPass {
// context
MachineFunction *MF = nullptr;
const X86Subtarget *ST = nullptr;
const TargetRegisterInfo *TRI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;

MachineInstr *getTileConfigPoint();
void tileConfig();

public:
X86FastTileConfig() : MachineFunctionPass(ID) {}

bool fastTileConfig();
bool isTileLoad(MachineInstr &MI);
bool isTileStore(MachineInstr &MI);
bool isAMXInstr(MachineInstr &MI);
void getTileStoreShape(MachineInstr &MI,
SmallVector<MachineOperand *> &ShapedTiles);

MachineInstr *getKeyAMXInstr(MachineInstr *MI);
void getTileShapesCfg(MachineInstr *MI,
SmallVector<MachineOperand *> &ShapedTiles);
void getShapeCfgInstrs(MachineInstr *MI,
std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs);

/// Return the pass name.
StringRef getPassName() const override {
return "Fast Tile Register Configure";
}

void materializeTileCfg(MachineInstr *MI);

void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles,
std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs);

/// Configure the tile registers after fast register allocation.
bool runOnMachineFunction(MachineFunction &MFunc) override;

MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoPHIs);
}

static char ID;
};

} // end anonymous namespace

char X86FastTileConfig::ID = 0;

INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
"Fast Tile Register Configure", false, false)
INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
"Fast Tile Register Configure", false, false)

static bool isTilePhysReg(MachineOperand &Op) {
if (!Op.isReg())
return false;

Register Reg = Op.getReg();
if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
return true;
return false;
}

static unsigned getTilePhysRegIdx(MachineOperand *Op) {
assert(isTilePhysReg(*Op) && "Tile Operand is invalid");
return Op->getReg() - X86::TMM0;
}
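
// The two helpers below retarget a shape store emitted by X86PreAMXConfig to
// the slot of physical tile register TIdx in the 64-byte tile config buffer:
// byte 48 + TIdx holds tileN.rows and bytes 16 + 2 * TIdx hold tileN.colsb
// (see the layout comment above rewriteTileCfg()). Operand 3 of the store is
// its address displacement.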

static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) {
unsigned Offset = 48 + TIdx;
MI->getOperand(3).ChangeToImmediate(Offset);
}

static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
unsigned Offset = 16 + TIdx * 2;
MI->getOperand(3).ChangeToImmediate(Offset);
}

bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
return MI.getOpcode() == X86::PTILELOADDV;
}
bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
return MI.getOpcode() == X86::PTILESTOREDV;
}
bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) {
// TODO: May need to handle some special non-tile AMX instructions.
if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr())
return false;

for (MachineOperand &MO : MI.operands())
if (isTilePhysReg(MO))
return true;

return false;
}

MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) {
auto Cfg = MachineBasicBlock::iterator(MI);
MachineBasicBlock *MBB = MI->getParent();
MachineInstr *KeyMI = nullptr;
int KeyAMXNum = 0;

for (auto II = Cfg; II != MBB->end(); II++) {
if (isTileLoad(*II)) {
KeyMI = &*II;
continue;
}

if (isTileStore(*II)) {
assert(KeyMI && "Key AMX should be found before the tile store!");
break;
}

if (isAMXInstr(*II)) {
assert((KeyAMXNum == 0) && "Too many key AMX instructions!");
KeyAMXNum++;
KeyMI = &*II;
}
}
assert(KeyMI && "There must be an AMX instruction.");
return KeyMI;
}

// Collect the tiles in the key AMX instruction in order, uses before defs.
void X86FastTileConfig::getTileShapesCfg(
MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) {
MachineInstr *KeyMI = getKeyAMXInstr(CfgMI);

SmallVector<MachineOperand *> DefTiles;
for (MachineOperand &MO : KeyMI->operands()) {
if (!isTilePhysReg(MO))
continue;
if (MO.isDef())
DefTiles.push_back(&MO);
else
ShapedTiles.push_back(&MO);
}
ShapedTiles.append(DefTiles);
}

// We pre-configured the shapes at locations named "amx.tmm.N.shape.row*" and
// "amx.tmm.N.shape.col*" in the "Pre AMX Tile Config" pass.
// The 'N' gives the order of the tiles in the key AMX intrinsic.
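// For example, a store whose pointer operand is named "amx.tmm.0.shape.row"
// carries the row count of the 0th shaped tile in the key AMX instruction, and
// one named "amx.tmm.0.shape.col" carries its column size.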
void X86FastTileConfig::getShapeCfgInstrs(
MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs) {
auto Cfg = MachineBasicBlock::iterator(MI);
MachineBasicBlock *MBB = MI->getParent();

for (auto II = Cfg; II != MBB->begin(); II--) {
if (isAMXInstr(*II) || II->isTerminator() || II->isCall())
break;
if (!II->mayStore() || !II->hasOneMemOperand())
continue;
const Value *MemPtr = II->memoperands()[0]->getValue();
if (!MemPtr)
continue;

StringRef Name = MemPtr->getName();
if (!Name.startswith("amx.tmm."))
continue;

// Get the 'N'th tile shape config in the key AMX instruction.
auto N = Name.find(".shape");
StringRef STileIdx = Name.slice(8, N);
unsigned Idx;
STileIdx.getAsInteger(10, Idx);

// And relate them to their store instructions.
if (Name.contains("row"))
RowCfgs[Idx] = &*II;
else if (Name.contains("col"))
ColCfgs[Idx] = &*II;
else
llvm_unreachable("Invalid tile shape info!");
}
assert((RowCfgs.size() == ColCfgs.size()) &&
"The numbers of tile rows and cols must be equal!");
}

// Here is the data format for the tile config.
// 0 palette = 1 now.
// 1 start_row = 0 now.
// 2-15 reserved, must be zero
// 16-17 tile0.colsb Tile 0 bytes per row.
// 18-19 tile1.colsb Tile 1 bytes per row.
// 20-21 tile2.colsb Tile 2 bytes per row.
// ... (sequence continues)
// 30-31 tile7.colsb Tile 7 bytes per row.
// 32-47 reserved, must be zero
// 48 tile0.rows Tile 0 rows.
// 49 tile1.rows Tile 1 rows.
// 50 tile2.rows Tile 2 rows.
// ... (sequence continues)
// 55 tile7.rows Tile 7 rows.
// 56-63 reserved, must be zero
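// For example, if the shape recorded for the 1st shaped tile ends up assigned
// to physical register tmm3, its row store is redirected to byte 48 + 3 = 51
// and its column store to bytes 16 + 2 * 3 = 22..23, matching adjustRowCfg()
// and adjustColCfg() above.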
void X86FastTileConfig::rewriteTileCfg(
SmallVector<MachineOperand *> &ShapedTiles,
std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs) {
assert((RowCfgs.size() == ShapedTiles.size()) &&
"The number of tile shapes is not equal to the number of tiles!");

// Orderly get the tiles and adjust the shape config.
for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) {
MachineOperand *MO = ShapedTiles[I];
unsigned TmmIdx = getTilePhysRegIdx(MO);
if (I == TmmIdx)
continue;
adjustRowCfg(TmmIdx, RowCfgs[I]);
adjustColCfg(TmmIdx, ColCfgs[I]);
}
}

// We have already pre-configured the shapes before fast register allocation in
// X86PreAMXConfig::preWriteTileCfg(). Now that fast register allocation is
// done, the pre-written shapes may no longer correspond to the correct tmm
// registers, so we need to adjust them.
void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) {
SmallVector<MachineOperand *> ShapedTiles;
std::map<unsigned, MachineInstr *> RowCfgs;
std::map<unsigned, MachineInstr *> ColCfgs;

// Keep the tile uses and defs in order in ShapedTiles.
getTileShapesCfg(CfgMI, ShapedTiles);
assert(ShapedTiles.size() && "No shape config found!");

getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs);

rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs);
}

bool X86FastTileConfig::fastTileConfig() {
bool Changed = false;

for (MachineBasicBlock &MBB : *MF) {
SmallVector<MachineInstr *, 2> CFGs;
for (MachineInstr &MI : MBB)
if (MI.getOpcode() == X86::PLDTILECFGV)
CFGs.push_back(&MI);
for (auto *MI : CFGs)
materializeTileCfg(MI);
if (!CFGs.empty())
Changed = true;
}
return Changed;
}

bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
MF = &MFunc;
MRI = &MFunc.getRegInfo();
ST = &MFunc.getSubtarget<X86Subtarget>();
TRI = ST->getRegisterInfo();
TII = MFunc.getSubtarget().getInstrInfo();

return fastTileConfig();
}

FunctionPass *llvm::createX86FastTileConfigPass() {
return new X86FastTileConfig();
}
