AMDGPU/GISel: Factor out AMDGPURegisterBankInfo::buildReadFirstLane
A later change will add a 3rd user, so factoring out the common code
seems useful.
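
For orientation, here is a minimal sketch of the factored-out helper and its two current call sites, condensed from the diff below; the declaration matches the one added to AMDGPURegisterBankInfo.h, and the call sites are paraphrased rather than drop-in code.

```cpp
// Shared helper: reads a (possibly multi-part) VGPR value into SGPRs,
// 32 bits at a time, and returns the resulting scalar register.
Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                            Register Src) const;

// Call site 1 (executeInWaterfallLoop): read the current lane's value.
Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

// Call site 2 (constrainOpWithReadfirstlane): rewrite a VGPR operand to SGPR.
Reg = buildReadFirstLane(B, MRI, Reg);
MI.getOperand(OpIdx).setReg(Reg);
```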

Reorganizing executeInWaterfallLoop causes some additional COPYs to be
generated, but they all fold away during instruction selection.
The comparisons are now built with generic instructions instead of
target machine instructions, which admittedly shouldn't make a
difference (though it should make it easier to move the waterfall loop
generation elsewhere).
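
A rough sketch of the new per-part comparison chain, mirroring the hunk in executeInWaterfallLoop below (register-bank bookkeeping for the parts themselves and surrounding loop setup are elided; this is illustrative, not a drop-in snippet):

```cpp
// G_ICMP/G_AND on the VCC bank replace V_CMP_EQ_U32/U64_e64 and
// S_AND_B32/B64; selection is expected to produce the same instructions.
LLT S1 = LLT::scalar(1);
Register CondReg; // accumulated "all parts equal" condition
for (unsigned i = 0; i < NumParts; ++i) {
  Register CmpReg =
      B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i], OpParts[i])
          .getReg(0);
  MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
  if (!CondReg) {
    CondReg = CmpReg;
  } else {
    CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
    MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
  }
}
```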

(Resubmit with missing test added.)

Differential Revision: https://reviews.llvm.org/D125324
nhaehnle committed May 25, 2022
1 parent afc9010 commit affa1b1
Showing 43 changed files with 3,584 additions and 3,170 deletions.
262 changes: 98 additions & 164 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -678,6 +678,62 @@ static LLT getHalfSizedType(LLT Ty) {
return LLT::scalar(Ty.getScalarSizeInBits() / 2);
}

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
                                                    MachineRegisterInfo &MRI,
                                                    Register Src) const {
  LLT Ty = MRI.getType(Src);
  const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

  if (Bank == &AMDGPU::SGPRRegBank)
    return Src;

  unsigned Bits = Ty.getSizeInBits();
  assert(Bits % 32 == 0);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Src = B.buildCopy(Ty, Src).getReg(0);
    MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
  }

  LLT S32 = LLT::scalar(32);
  unsigned NumParts = Bits / 32;
  SmallVector<Register, 8> SrcParts;
  SmallVector<Register, 8> DstParts;

  if (Bits == 32) {
    SrcParts.push_back(Src);
  } else {
    auto Unmerge = B.buildUnmerge(S32, Src);
    for (unsigned i = 0; i < NumParts; ++i)
      SrcParts.push_back(Unmerge.getReg(i));
  }

  for (unsigned i = 0; i < NumParts; ++i) {
    Register SrcPart = SrcParts[i];
    Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    MRI.setType(DstPart, NumParts == 1 ? Ty : S32);

    const TargetRegisterClass *Constrained =
        constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
    (void)Constrained;
    assert(Constrained && "Failed to constrain readfirstlane src reg");

    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});

    DstParts.push_back(DstPart);
  }

  if (Bits == 32)
    return DstParts[0];

  Register Dst = B.buildMerge(Ty, DstParts).getReg(0);
  MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
  return Dst;
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
@@ -710,8 +766,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineFunction *MF = &B.getMF();

const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned WaveAndOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
const unsigned MovExecOpc =
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const unsigned MovExecTermOpc =
@@ -783,9 +837,9 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
auto NewEnd = BodyBB->end();

MachineBasicBlock::iterator I = LoopBB->end();
B.setMBB(*LoopBB);

LLT S1 = LLT::scalar(1);
Register CondReg;

assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
@@ -819,161 +873,59 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setMBB(*LoopBB);
}

unsigned OpSize = OpTy.getSizeInBits();
Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);

// Can only do a readlane of 32-bit pieces.
if (OpSize == 32) {
// Avoid extra copies in the simple case of one 32-bit register.
Register CurrentLaneOpReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MRI.setType(CurrentLaneOpReg, OpTy);

constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpReg)
.addReg(OpReg);

Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;

// Compare the just read M0 value to all possible Idx values.
B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
.addReg(OpReg);
Op.setReg(CurrentLaneOpReg);

if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);

// If there are multiple operands to consider, and the conditions.
B.buildInstr(WaveAndOpc)
.addDef(AndReg)
.addReg(NewCondReg)
.addReg(CondReg);
CondReg = AndReg;
}
// Build the comparison(s).
unsigned OpSize = OpTy.getSizeInBits();
bool Is64 = OpSize % 64 == 0;
unsigned PartSize = Is64 ? 64 : 32;
LLT PartTy = LLT::scalar(PartSize);
unsigned NumParts = OpSize / PartSize;
SmallVector<Register, 8> OpParts;
SmallVector<Register, 8> CurrentLaneParts;

if (NumParts == 1) {
OpParts.push_back(OpReg);
CurrentLaneParts.push_back(CurrentLaneReg);
} else {
LLT S32 = LLT::scalar(32);
SmallVector<Register, 8> ReadlanePieces;

// The compares can be done as 64-bit, but the extract needs to be done
// in 32-bit pieces.

bool Is64 = OpSize % 64 == 0;

unsigned UnmergeTySize = Is64 ? 64 : 32;
unsigned CmpOp =
Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;

// Insert the unmerge before the loop.

B.setMBB(MBB);
unsigned NumPieces = OpSize / UnmergeTySize;
SmallVector<Register, 8> UnmergePieces;
if (NumPieces == 1) {
UnmergePieces.push_back(OpReg);
} else {
LLT UnmergeTy = LLT::scalar(UnmergeTySize);
MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
for (unsigned i = 0; i < NumParts; ++i) {
OpParts.push_back(UnmergeOp.getReg(i));
CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
}
B.setMBB(*LoopBB);

for (Register UnmergePiece : UnmergePieces) {
Register CurrentLaneOpReg;
if (Is64) {
Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpRegLo)
.addReg(UnmergePiece, 0, AMDGPU::sub0);

// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpRegHi)
.addReg(UnmergePiece, 0, AMDGPU::sub1);

CurrentLaneOpReg =
B.buildMerge(LLT::scalar(64),
{CurrentLaneOpRegLo, CurrentLaneOpRegHi})
.getReg(0);

MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

if (OpTy.getScalarSizeInBits() == 64) {
// If we need to produce a 64-bit element vector, so use the
// merged pieces
ReadlanePieces.push_back(CurrentLaneOpReg);
} else {
// 32-bit element type.
ReadlanePieces.push_back(CurrentLaneOpRegLo);
ReadlanePieces.push_back(CurrentLaneOpRegHi);
}
} else {
CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpReg)
.addReg(UnmergePiece);
ReadlanePieces.push_back(CurrentLaneOpReg);
}
}

Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;

B.buildInstr(CmpOp)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
.addReg(UnmergePiece);

if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);

// If there are multiple operands to consider, and the conditions.
B.buildInstr(WaveAndOpc)
.addDef(AndReg)
.addReg(NewCondReg)
.addReg(CondReg);
CondReg = AndReg;
}
}
for (unsigned i = 0; i < NumParts; ++i) {
auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
OpParts[i]).getReg(0);
MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);

// FIXME: Build merge seems to switch to CONCAT_VECTORS but not
// BUILD_VECTOR
if (OpTy.isVector()) {
auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
} else if (ReadlanePieces.size() > 1) {
auto Merge = B.buildMerge(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
if (!CondReg) {
CondReg = CmpReg;
} else {
Op.setReg(ReadlanePieces[0]);
CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
}
}

Op.setReg(CurrentLaneReg);

// Make sure we don't re-process this register again.
WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
}
}

// The ballot becomes a no-op during instruction selection.
CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
{LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
false)
.addReg(CondReg)
.getReg(0);
MRI.setRegClass(CondReg, WaveRC);

// Update EXEC, save the original EXEC value to VCC.
B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
@@ -1061,28 +1013,10 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
if (Bank == &AMDGPU::SGPRRegBank)
return;

LLT Ty = MRI.getType(Reg);
MachineIRBuilder B(MI);

if (Bank != &AMDGPU::VGPRRegBank) {
// We need to copy from AGPR to VGPR
Reg = B.buildCopy(Ty, Reg).getReg(0);
MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
}

Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
.addDef(SGPR)
.addReg(Reg);

MRI.setType(SGPR, Ty);

const TargetRegisterClass *Constrained =
constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
(void)Constrained;
assert(Constrained && "Failed to constrain readfirstlane src reg");

MI.getOperand(OpIdx).setReg(SGPR);
Reg = buildReadFirstLane(B, MRI, Reg);
MI.getOperand(OpIdx).setReg(Reg);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -59,6 +59,9 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
SmallSet<Register, 4> &SGPROperandRegs,
MachineRegisterInfo &MRI) const;

Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Src) const;

bool executeInWaterfallLoop(MachineIRBuilder &B,
MachineInstr &MI,
MachineRegisterInfo &MRI,