Skip to content

Commit

Permalink
AMDGPU/GlobalISel: Add support for wide loads >= 256-bits
Browse files Browse the repository at this point in the history
Summary:
This adds support for the most commonly used wide load types:
<8xi32>, <16xi32>, <4xi64>, and <8xi64>

Reviewers: arsenm

Reviewed By: arsenm

Subscribers: hiraditya, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, kristof.beyls, dstuttard, tpr, t-tye, volkan, Petar.Avramovic, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D57399

llvm-svn: 365586
  • Loading branch information
tstellar committed Jul 10, 2019
1 parent 693936a commit d0ba79f
Show file tree
Hide file tree
Showing 7 changed files with 767 additions and 37 deletions.
72 changes: 72 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
Expand Up @@ -161,5 +161,77 @@ const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
return &ValMappingsSGPR64OnlyVGPR32[2];
}

// Partial mappings used to break a wide (>= 256-bit) load into pieces when
// its result must be assigned to the VGPR bank.  Entries are grouped by load
// shape; ValMappingsLoadSGPROnly below indexes the first entry of each group:
//   [0]  unsplit 256-bit SGPR load
//   [1]  unsplit 512-bit SGPR load
//   [2]  8 x 32-bit pieces   (<8 x s32>)
//   [10] 16 x 32-bit pieces  (<16 x s32>)
//   [26] 4 x 64-bit pieces   (<4 x s64>)
//   [30] 8 x 64-bit pieces   (<8 x s64>)
//   [38] 2 x 128-bit pieces  (currently unused, see FIXME below)
//   [40] 4 x 128-bit pieces  (currently unused, see FIXME below)
const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
  /* 256-bit load */   {0, 256, SGPRRegBank},
  /* 512-bit load */   {0, 512, SGPRRegBank},
  /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
                       {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
                       {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
                       {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
  /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
                        {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
                        {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
                        {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
                        {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
                        {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
                        {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
                        {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
  /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
                       {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
  /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
                       {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
                       {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
                       {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},

  /* FIXME: The generic register bank select does not support complex
   * break downs where the number of vector elements does not equal the
   * number of breakdowns.
   * FIXME: register bank select now tries to handle complex break downs,
   * but it emits an illegal instruction:
   * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
   */
  /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
  /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
                        {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
};

// One ValueMapping per wide-load shape handled by
// getValueMappingLoadSGPROnly.  BreakDown points at the first
// PartialMapping of the corresponding group in LoadSGPROnlyBreakDown, and
// NumBreakDowns is the number of pieces in that group (1 means the load is
// left unsplit).
const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
  /* 256-bit load */    {&LoadSGPROnlyBreakDown[0], 1},
  /* 512-bit load */    {&LoadSGPROnlyBreakDown[1], 1},
  /* <8 x i32> load */  {&LoadSGPROnlyBreakDown[2], 8},
  /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
  /* <4 x i64> load */  {&LoadSGPROnlyBreakDown[26], 4},
  /* <8 x i64> load */  {&LoadSGPROnlyBreakDown[30], 8}
};

// Pick a value mapping for a load result.  Narrow (< 256-bit) results and
// SGPR results use the ordinary mappings; wide VGPR results of a known
// vector type get a split mapping from ValMappingsLoadSGPROnly.
const RegisterBankInfo::ValueMapping *
getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
  const unsigned Size = SizeTy.getSizeInBits();
  if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
    return getValueMapping(BankID, Size);

  assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);

  // Entries 0/1 are the unsplit 256/512-bit mappings.  They remain the
  // fallback when the vector type is not one we know how to break down.
  unsigned Idx = (Size == 256) ? 0 : 1;

  // A load with a vgpr pointer must be split into smaller pieces.
  if (BankID == AMDGPU::VGPRRegBankID) {
    struct SplitEntry {
      unsigned NumElts;
      unsigned EltBits;
      unsigned MapIdx;
    };
    const SplitEntry Table[] = {
        {8, 32, 2}, {16, 32, 3}, {4, 64, 4}, {8, 64, 5}};
    for (const SplitEntry &E : Table) {
      if (SizeTy == LLT::vector(E.NumElts, E.EltBits)) {
        Idx = E.MapIdx;
        break;
      }
    }
  }

  return &ValMappingsLoadSGPROnly[Idx];
}


} // End AMDGPU namespace.
} // End llvm namespace.
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Expand Up @@ -517,7 +517,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,

case 256:
case 512:
// TODO: constant loads
// TODO: Possibly support loads of i256 and i512. This will require
// adding i256 and i512 types to MVT in order to be able to use
// TableGen.
// TODO: Add support for other vector types, this will require
// defining more value mappings for the new types.
return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
Ty0.getScalarType().getSizeInBits() == 64);

default:
return false;
}
Expand Down
172 changes: 136 additions & 36 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Expand Up @@ -37,22 +37,23 @@ using namespace llvm;
namespace {

// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplySALUMapping final : public GISelChangeObserver {
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
MachineRegisterInfo &MRI;
const RegisterBank *NewBank;
SmallVector<MachineInstr *, 4> NewInsts;

public:
ApplySALUMapping(MachineRegisterInfo &MRI_)
: MRI(MRI_) {}
ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
: MRI(MRI_), NewBank(RB) {}

~ApplySALUMapping() {
~ApplyRegBankMapping() {
for (MachineInstr *MI : NewInsts)
applySALUBank(*MI);
applyBank(*MI);
}

/// Set any registers that don't have a set register class or bank to SALU.
void applySALUBank(MachineInstr &MI) {
void applyBank(MachineInstr &MI) {
for (MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
Expand All @@ -61,10 +62,13 @@ class ApplySALUMapping final : public GISelChangeObserver {
if (MRI.getRegClassOrRegBank(Reg))
continue;

const RegisterBank *RB = NewBank;
// FIXME: This might not be enough to detect when SCC should be used.
const RegisterBank &RB = MRI.getType(Reg) == LLT::scalar(1) ?
AMDGPU::SCCRegBank : AMDGPU::SGPRRegBank;
MRI.setRegBank(Reg, RB);
if (MRI.getType(Reg) == LLT::scalar(1))
RB = (NewBank == &AMDGPU::SGPRRegBank ?
&AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

MRI.setRegBank(Reg, *RB);
}
}

Expand All @@ -80,7 +84,6 @@ class ApplySALUMapping final : public GISelChangeObserver {
};

}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
: AMDGPUGenRegisterBankInfo(),
TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
Expand Down Expand Up @@ -128,6 +131,12 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
const ValueMapping &ValMapping,
const RegisterBank *CurBank) const {
// Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
// VGPR.
// FIXME: Is there a better way to do this?
if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
return 10; // This is expensive.

assert(ValMapping.NumBreakDowns == 2 &&
ValMapping.BreakDown[0].Length == 32 &&
ValMapping.BreakDown[0].StartIdx == 0 &&
Expand Down Expand Up @@ -302,6 +311,14 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
}
}

static bool isInstrUniform(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;

const MachineMemOperand *MMO = *MI.memoperands_begin();
return AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
Expand Down Expand Up @@ -356,29 +373,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
}
case TargetOpcode::G_LOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
// FIXME: Should we be hard coding the size for these mappings?
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
if (isInstrUniform(MI)) {
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
}

const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
{AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);

// FIXME: Should this be the pointer-size (64-bits) or the size of the
// register that will hold the buffer resource (128-bits).
const InstructionMapping &VSMapping = getInstructionMapping(
3, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&VSMapping);
// It may be possible to have a vgpr = load sgpr mapping here, because
// the mubuf instructions support this kind of load, but probably for only
// gfx7 and older. However, the addressing mode matching in the instruction
// selector should be able to do a better job of detecting and selecting
// these kinds of loads from the vgpr = load vgpr mapping.

return AltMappings;

Expand Down Expand Up @@ -874,6 +891,91 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
MI.getOperand(OpIdx).setReg(SGPR);
}

// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the instructions that
// originally wrote to the repaired registers. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
// Return a definition of \p Reg other than \p MI itself, or null when
// \p MI is the only instruction defining it.
// Is there some way we can assert that there are exactly 2 def instructions?
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  MachineInstr *OtherDef = nullptr;
  for (MachineInstr &Def : MRI.def_instructions(Reg)) {
    if (&Def == &MI)
      continue;
    OtherDef = &Def;
    break;
  }
  return OtherDef;
}

// Split a wide (> 128-bit) G_LOAD whose pointer register was repaired into
// VGPRs.  The load is broken into 128-bit pieces via
// LegalizerHelper::fewerElementsVector, and the partial result registers
// that RegBankSelect expects are filled by extracting elements from the
// recombined value.  Returns true if MI was rewritten, false when nothing
// needed to change (narrow load, or SGPR pointer).
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                                  MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty())
    return false;

  // The partial mappings only cover whole multiples of 128 bits.
  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  // Number of elements that fit in one 128-bit piece; the legalizer splits
  // the load into LoadSize / 128 loads of this vector type.
  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  // Assign the VGPR bank to every register the legalizer creates.
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads.  At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.
  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  // Feed each partial result register the mapping expects by extracting the
  // corresponding element from the combined temporary value.
  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}

// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static void substituteSimpleCopyRegs(
Expand Down Expand Up @@ -1008,7 +1110,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
ApplySALUMapping ApplySALU(MRI);
ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);

Expand All @@ -1028,7 +1130,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(

MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
ApplySALUMapping ApplySALU(MRI);
ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);

Expand Down Expand Up @@ -1212,21 +1314,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
break;
}
case AMDGPU::G_LOAD: {
if (applyMappingWideLoad(MI, OpdMapper, MRI))
return;
break;
}
default:
break;
}

return applyDefaultMapping(OpdMapper);
}

static bool isInstrUniform(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;

const MachineMemOperand *MMO = *MI.memoperands_begin();
return AMDGPUInstrInfo::isUniformMMO(MMO);
}

bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
Expand Down Expand Up @@ -1322,6 +1421,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

const ValueMapping *ValMapping;
Expand All @@ -1332,7 +1432,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {
ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
// FIXME: What would happen if we used SGPRRegBankID here?
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
Expand Up @@ -44,6 +44,9 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {

void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
unsigned OpIdx) const;
bool applyMappingWideLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const;

/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
Expand Down

0 comments on commit d0ba79f

Please sign in to comment.