[X86] Support AMX fast register allocation
Differential Revision: https://reviews.llvm.org/D100026
xiangzh1 committed May 8, 2021
1 parent 72bd011 commit d4bdeca
Showing 24 changed files with 6,950 additions and 29 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsX86_64.def
@@ -101,6 +101,7 @@ TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")

// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
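These _internal builtins are not meant to be called directly; they back the __tile1024i helper API in clang's amxintrin.h. As a rough sketch only (not part of this commit, and assuming that helper API), this is the kind of source that reaches the fast register allocation path when built at -O0 with -mamx-tile -mamx-int8:

#include <immintrin.h>
#include <stddef.h>

void tile_dot(const void *a, const void *b, void *c, size_t stride) {
  __tile1024i ta = {16, 64}, tb = {16, 64}, tc = {16, 64};
  __tile_loadd(&ta, a, stride);  // lowers to llvm.x86.tileloadd64.internal
  __tile_loadd(&tb, b, stride);
  __tile_dpbssd(&tc, ta, tb);    // lowers to llvm.x86.tdpbssd.internal
  __tile_stored(c, stride, tc);  // lowers to llvm.x86.tilestored64.internal
}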
3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/Passes.h
@@ -507,6 +507,9 @@ namespace llvm {
/// or split the data to two <128 x i32>.
FunctionPass *createX86LowerAMXTypePass();

/// The pass inserts tile config intrinsics for AMX fast register allocation.
FunctionPass *createX86PreAMXConfigPass();

/// The pass transforms AMX intrinsics to scalar operations if the function has
/// the optnone attribute or it is compiled at O0.
FunctionPass *createX86LowerAMXIntrinsicsPass();
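X86TargetMachine.cpp, where the new creator is wired into the pipeline, is among the 24 changed files but is not shown in this excerpt. Roughly, the wiring is expected to look like the sketch below; the O0-only guard and exact placement are assumptions:

void X86PassConfig::addIRPasses() {
  // ... existing IR passes ...
  addPass(createX86LowerAMXTypePass());
  if (TM->getOptLevel() == CodeGenOpt::None)
    addPass(createX86PreAMXConfigPass()); // pre-insert ldtilecfg for the fast-RA path
  TargetPassConfig::addIRPasses();
  // ...
}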
4 changes: 4 additions & 0 deletions llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -406,6 +406,10 @@ class TargetPassConfig : public ImmutablePass {
return false;
}

/// addPostFastRegAllocRewrite - Add passes to the fast register allocation
/// pipeline after fast register allocation is complete.
virtual bool addPostFastRegAllocRewrite() { return false; }

/// Add passes to be run immediately after virtual registers are rewritten
/// to physical registers.
virtual void addPostRewrite() { }
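The X86 override of this hook also lives in X86TargetMachine.cpp and is not shown here; a sketch of how a target is expected to use it:

bool X86PassConfig::addPostFastRegAllocRewrite() {
  // Rewrite the pre-inserted ldtilecfg once physical tile registers are known.
  addPass(createX86FastTileConfigPass());
  return true;
}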
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5042,6 +5042,9 @@ let TargetPrefix = "x86" in {
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
ImmArg<ArgIndex<2>>]>;
// AMX - internal intrinsics
def int_x86_ldtilecfg_internal :
GCCBuiltin<"__builtin_ia32_tile_loadconfig_internal">,
Intrinsic<[], [llvm_ptr_ty], []>;
def int_x86_tileloadd64_internal :
GCCBuiltin<"__builtin_ia32_tileloadd64_internal">,
Intrinsic<[llvm_x86amx_ty],
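For illustration only, a hypothetical helper (the name emitTileConfig and its setup are not from the commit) showing how an IR pass such as X86PreAMXConfig could materialize the new intrinsic with IRBuilder:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsX86.h"
using namespace llvm;

// Emit ldtilecfg at the top of BB, reading the tile configuration from the
// 64-byte buffer that CfgBuf points to (assumed to be filled in beforehand).
static void emitTileConfig(BasicBlock &BB, Value *CfgBuf) {
  IRBuilder<> Builder(&*BB.getFirstInsertionPt());
  Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, {}, {CfgBuf});
}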
4 changes: 4 additions & 0 deletions llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1321,6 +1321,10 @@ bool TargetPassConfig::addRegAssignAndRewriteFast() {
report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");

addPass(createRegAllocPass(false));

// Allow targets to change the register assignments after
// fast register allocation.
addPostFastRegAllocRewrite();
return true;
}

2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/CMakeLists.txt
@@ -34,8 +34,10 @@ set(sources
X86DiscriminateMemOps.cpp
X86LowerTileCopy.cpp
X86LowerAMXType.cpp
X86PreAMXConfig.cpp
X86LowerAMXIntrinsics.cpp
X86TileConfig.cpp
X86FastTileConfig.cpp
X86PreTileConfig.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86.h
@@ -79,6 +79,9 @@ FunctionPass *createX86WinAllocaExpander();
/// Return a pass that configures the tile registers.
FunctionPass *createX86TileConfigPass();

/// Return a pass that configures the tile registers after fast register allocation.
FunctionPass *createX86FastTileConfigPass();

/// Return a pass that inserts a pseudo tile config instruction.
FunctionPass *createX86PreTileConfigPass();

@@ -172,8 +175,10 @@ void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
void initializeX86PreTileConfigPass(PassRegistry &);
void initializeX86FastTileConfigPass(PassRegistry &);
void initializeX86TileConfigPass(PassRegistry &);
void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
void initializeX86PreAMXConfigPassPass(PassRegistry &);
void initializeX86LowerTileCopyPass(PassRegistry &);
void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);

4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -478,6 +478,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
case X86::PLDTILECFGV: {
MI.setDesc(TII->get(X86::LDTILECFG));
return true;
}
case X86::PTILELOADDV: {
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
306 changes: 306 additions & 0 deletions llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -0,0 +1,306 @@
//===-- X86FastTileConfig.cpp - Fast Tile Register Configure --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Pass to configure the shapes of AMX physical registers.
/// AMX registers need to be configured before use. The ldtilecfg instruction is
/// inserted before the fast register allocation pass, but at that point the
/// shape of each physical tile register is still unknown because register
/// allocation has not been done yet. This pass runs after register allocation.
/// It collects the shape information of each physical tile register and stores
/// the shapes in the stack slot that is allocated for loading the config into
/// the tile config register.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "fasttileconfig"

namespace {

class X86FastTileConfig : public MachineFunctionPass {
// context
MachineFunction *MF = nullptr;
const X86Subtarget *ST = nullptr;
const TargetRegisterInfo *TRI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;

MachineInstr *getTileConfigPoint();
void tileConfig();

public:
X86FastTileConfig() : MachineFunctionPass(ID) {}

bool fastTileConfig();
bool isTileLoad(MachineInstr &MI);
bool isTileStore(MachineInstr &MI);
bool isAMXInstr(MachineInstr &MI);
void getTileStoreShape(MachineInstr &MI,
SmallVector<MachineOperand *> &ShapedTiles);

MachineInstr *getKeyAMXInstr(MachineInstr *MI);
void getTileShapesCfg(MachineInstr *MI,
SmallVector<MachineOperand *> &ShapedTiles);
void getShapeCfgInstrs(MachineInstr *MI,
std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs);

/// Return the pass name.
StringRef getPassName() const override {
return "Fast Tile Register Configure";
}

void materializeTileCfg(MachineInstr *MI);

void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles,
std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs);

/// Configure the tile registers after fast register allocation.
bool runOnMachineFunction(MachineFunction &MFunc) override;

MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoPHIs);
}

static char ID;
};

} // end anonymous namespace

char X86FastTileConfig::ID = 0;

INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
"Fast Tile Register Configure", false, false)
INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
"Fast Tile Register Configure", false, false)

static bool isTilePhysReg(MachineOperand &Op) {
if (!Op.isReg())
return false;

Register Reg = Op.getReg();
if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
return true;
return false;
}

static unsigned getTilePhysRegIdx(MachineOperand *Op) {
assert(isTilePhysReg(*Op) && "Tile Operand is invalid");
return Op->getReg() - X86::TMM0;
}
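
// The two helpers below retarget a shape store emitted by X86PreAMXConfig to
// the slot of physical tile register TIdx in the 64-byte tile config buffer:
// byte 48 + TIdx holds tileN.rows and bytes 16 + 2 * TIdx hold tileN.colsb
// (see the layout comment above rewriteTileCfg()). Operand 3 of the store is
// its address displacement.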

static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) {
unsigned Offset = 48 + TIdx;
MI->getOperand(3).ChangeToImmediate(Offset);
}

static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
unsigned Offset = 16 + TIdx * 2;
MI->getOperand(3).ChangeToImmediate(Offset);
}

bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
return MI.getOpcode() == X86::PTILELOADDV;
}
bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
return MI.getOpcode() == X86::PTILESTOREDV;
}
bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) {
// TODO: May need to handle some special non-tile AMX instructions.
if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr())
return false;

for (MachineOperand &MO : MI.operands())
if (isTilePhysReg(MO))
return true;

return false;
}

MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) {
auto Cfg = MachineBasicBlock::iterator(MI);
MachineBasicBlock *MBB = MI->getParent();
MachineInstr *KeyMI = nullptr;
int KeyAMXNum = 0;

for (auto II = Cfg; II != MBB->end(); II++) {
if (isTileLoad(*II)) {
KeyMI = &*II;
continue;
}

if (isTileStore(*II)) {
assert(KeyMI && "Key AMX should be found before the tile store!");
break;
}

if (isAMXInstr(*II)) {
assert((KeyAMXNum == 0) && "Too many key AMX instructions!");
KeyAMXNum++;
KeyMI = &*II;
}
}
assert(KeyMI && "There must be an AMX instruction.");
return KeyMI;
}

// Collect the tiles in the key AMX instruction in order, uses before defs.
void X86FastTileConfig::getTileShapesCfg(
MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) {
MachineInstr *KeyMI = getKeyAMXInstr(CfgMI);

SmallVector<MachineOperand *> DefTiles;
for (MachineOperand &MO : KeyMI->operands()) {
if (!isTilePhysReg(MO))
continue;
if (MO.isDef())
DefTiles.push_back(&MO);
else
ShapedTiles.push_back(&MO);
}
ShapedTiles.append(DefTiles);
}

// We pre-configured the shapes at locations named "amx.tmm.N.shape.row*" and
// "amx.tmm.N.shape.col*" in the "Pre AMX Tile Config" pass.
// The 'N' gives the order of the tiles in the key AMX intrinsic.
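// For example, a store whose pointer operand is named "amx.tmm.0.shape.row"
// carries the row count of the 0th shaped tile in the key AMX instruction, and
// one named "amx.tmm.0.shape.col" carries its column size.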
void X86FastTileConfig::getShapeCfgInstrs(
MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs) {
auto Cfg = MachineBasicBlock::iterator(MI);
MachineBasicBlock *MBB = MI->getParent();

for (auto II = Cfg; II != MBB->begin(); II--) {
if (isAMXInstr(*II) || II->isTerminator() || II->isCall())
break;
if (!II->mayStore() || !II->hasOneMemOperand())
continue;
const Value *MemPtr = II->memoperands()[0]->getValue();
if (!MemPtr)
continue;

StringRef Name = MemPtr->getName();
if (!Name.startswith("amx.tmm."))
continue;

// Get the 'N'th tile shape config in the key AMX instruction.
auto N = Name.find(".shape");
StringRef STileIdx = Name.slice(8, N);
unsigned Idx;
STileIdx.getAsInteger(10, Idx);

// And relate them to their store instructions.
if (Name.contains("row"))
RowCfgs[Idx] = &*II;
else if (Name.contains("col"))
ColCfgs[Idx] = &*II;
else
llvm_unreachable("Invalid tile shape info!");
}
assert((RowCfgs.size() == ColCfgs.size()) &&
"The numbers of tile rows and cols must be equal!");
}

// Here is the data format for the tile config.
// 0 palette = 1 now.
// 1 start_row = 0 now.
// 2-15 reserved, must be zero
// 16-17 tile0.colsb Tile 0 bytes per row.
// 18-19 tile1.colsb Tile 1 bytes per row.
// 20-21 tile2.colsb Tile 2 bytes per row.
// ... (sequence continues)
// 30-31 tile7.colsb Tile 7 bytes per row.
// 32-47 reserved, must be zero
// 48 tile0.rows Tile 0 rows.
// 49 tile1.rows Tile 1 rows.
// 50 tile2.rows Tile 2 rows.
// ... (sequence continues)
// 55 tile7.rows Tile 7 rows.
// 56-63 reserved, must be zero
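// For example, if the shape recorded for the 1st shaped tile ends up assigned
// to physical register tmm3, its row store is redirected to byte 48 + 3 = 51
// and its column store to bytes 16 + 2 * 3 = 22..23, matching adjustRowCfg()
// and adjustColCfg() above.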
void X86FastTileConfig::rewriteTileCfg(
SmallVector<MachineOperand *> &ShapedTiles,
std::map<unsigned, MachineInstr *> &RowCfgs,
std::map<unsigned, MachineInstr *> &ColCfgs) {
assert((RowCfgs.size() == ShapedTiles.size()) &&
"The number of tile shapes is not equal to the number of tiles!");

// Orderly get the tiles and adjust the shape config.
for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) {
MachineOperand *MO = ShapedTiles[I];
unsigned TmmIdx = getTilePhysRegIdx(MO);
if (I == TmmIdx)
continue;
adjustRowCfg(TmmIdx, RowCfgs[I]);
adjustColCfg(TmmIdx, ColCfgs[I]);
}
}

// We have already pre-configured the shapes before fast register allocation in
// X86PreAMXConfig::preWriteTileCfg(). Now that fast register allocation is
// done, the pre-written shapes may no longer correspond to the correct tmm
// registers, so we need to adjust them.
void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) {
SmallVector<MachineOperand *> ShapedTiles;
std::map<unsigned, MachineInstr *> RowCfgs;
std::map<unsigned, MachineInstr *> ColCfgs;

// Keep the tile uses and defs in order in ShapedTiles.
getTileShapesCfg(CfgMI, ShapedTiles);
assert(ShapedTiles.size() && "No shape config found!");

getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs);

rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs);
}

bool X86FastTileConfig::fastTileConfig() {
bool Changed = false;

for (MachineBasicBlock &MBB : *MF) {
SmallVector<MachineInstr *, 2> CFGs;
for (MachineInstr &MI : MBB)
if (MI.getOpcode() == X86::PLDTILECFGV)
CFGs.push_back(&MI);
for (auto *MI : CFGs)
materializeTileCfg(MI);
if (!CFGs.empty())
Changed = true;
}
return Changed;
}

bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
MF = &MFunc;
MRI = &MFunc.getRegInfo();
ST = &MFunc.getSubtarget<X86Subtarget>();
TRI = ST->getRegisterInfo();
TII = MFunc.getSubtarget().getInstrInfo();

return fastTileConfig();
}

FunctionPass *llvm::createX86FastTileConfigPass() {
return new X86FastTileConfig();
}
