[AMDGPU] Lowering VGPR to SGPR copies to v_readfirstlane_b32 if profitable.

Since divergence-driven instruction selection has been enabled for AMDGPU, all uniform instructions are expected to be selected to SALU form, except those that have no SALU equivalent. VGPR to SGPR copies appear in MIR to connect value producers and consumers. This change implements an algorithm that strikes a reasonable tradeoff between the profit gained from keeping uniform instructions in SALU form and the overhead introduced by transferring data between VGPRs and SGPRs.
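
The profitability decision reduces to a per-copy score computed in needToBeConvertedToVALU() in the diff below. The following minimal C++ sketch restates that cost model under assumed, illustrative names (V2SCopyCost and shouldMoveToVALU are not part of the pass's interface):

// Minimal sketch of the patch's cost model; illustrative names only, the real
// logic lives in SIFixSGPRCopies::lowerVGPR2SGPRCopies() in the diff below.
#include <cstddef>

struct V2SCopyCost {
  std::size_t SChainSize;     // SALU instructions reachable from the copy
  unsigned NumSVCopies;       // SGPR-to-VGPR copies moving results back to VALU
  unsigned NumReadfirstlanes; // v_readfirstlane_b32 count, i.e. copy width / 32
  unsigned SiblingPenalty;    // other V2S copies feeding the same SALU chain
};

// Mirrors needToBeConvertedToVALU(): keep the copy scalar only when the SALU
// chain clearly outweighs the lowering overhead.
inline bool shouldMoveToVALU(const V2SCopyCost &C) {
  unsigned Penalty = C.NumSVCopies + C.SiblingPenalty + C.NumReadfirstlanes;
  unsigned Profit = static_cast<unsigned>(C.SChainSize);
  unsigned Score = Penalty > Profit ? 0 : Profit - Penalty;
  return Score < 3; // threshold used by the patch
}

A copy whose net benefit falls below the threshold is handed to moveToVALU together with its chain; the remaining copies are rewritten to v_readfirstlane_b32, one per 32 bits of copy width.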

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D128252
alex-t committed Jul 14, 2022
1 parent 5b0788f commit 2e29b01
Showing 27 changed files with 3,033 additions and 2,273 deletions.
323 changes: 285 additions & 38 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -67,6 +67,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -81,9 +82,9 @@ static cl::opt<bool> EnableM0Merge(
cl::init(true));

namespace {

class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
unsigned NextVGPRToSGPRCopyID;

public:
static char ID;
@@ -92,9 +93,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;

SIFixSGPRCopies() : MachineFunctionPass(ID) {}
SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}

bool runOnMachineFunction(MachineFunction &MF) override;
unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
void lowerVGPR2SGPRCopies(MachineFunction &MF);
// Handles copies whose source register is:
// 1. Physical register
// 2. AGPR
// 3. Defined by an instruction that merely moves an immediate
bool lowerSpecialCase(MachineInstr &MI);

MachineBasicBlock *processPHINode(MachineInstr &MI);

@@ -569,6 +577,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();

// We have to lower VGPR to SGPR copies before the main loop
// because the REG_SEQUENCE and PHI lowering in the main loop
// converts the def-use chains to VALU and closes the opportunities
// for keeping them scalar.
// TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
// will address their lowering and unify the processing in one main loop.
lowerVGPR2SGPRCopies(MF);

for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock *MBB = &*BI;
@@ -640,42 +656,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
}

if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
Register SrcReg = MI.getOperand(1).getReg();
if (!SrcReg.isVirtual()) {
MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
if (NewBB && NewBB != MBB) {
MBB = NewBB;
E = MBB->end();
BI = MachineFunction::iterator(MBB);
BE = MF.end();
}
assert((!NewBB || NewBB == I->getParent()) &&
"moveToVALU did not return the right basic block");
break;
}

MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
// s_mov_b32.
if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
MI.getOperand(1).ChangeToImmediate(Imm);
MI.addImplicitDefUseOperands(MF);
MI.setDesc(TII->get(SMovOp));
break;
}
MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
if (NewBB && NewBB != MBB) {
MBB = NewBB;
E = MBB->end();
BI = MachineFunction::iterator(MBB);
BE = MF.end();
}
assert((!NewBB || NewBB == I->getParent()) &&
"moveToVALU did not return the right basic block");
} else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}

@@ -916,3 +897,269 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
return CreatedBB;
}

bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

// We return true to indicate that no further processing is needed
if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
return true;

Register SrcReg = MI.getOperand(1).getReg();
if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
TII->moveToVALU(MI, MDT);
return true;
}

unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
// s_mov_b32.
if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
MI.getOperand(1).ChangeToImmediate(Imm);
MI.addImplicitDefUseOperands(*MBB->getParent());
MI.setDesc(TII->get(SMovOp));
return true;
}
return false;
}

class V2SCopyInfo {
public:
// VGPR to SGPR copy being processed
MachineInstr *Copy;
// All SALU instructions reachable from this copy in the SSA graph
DenseSet<MachineInstr *> SChain;
// Number of SGPR to VGPR copies that are used to put the SALU computation
// results back to VALU.
unsigned NumSVCopies;

unsigned Score;
// Actual count of v_readfirstlane_b32 instructions
// that need to be inserted to keep the SChain in SALU form
unsigned NumReadfirstlanes;
// Current score state, cached to speed up selecting V2SCopyInfos for processing
bool NeedToBeConvertedToVALU = false;
// Unique ID. Used as a key for mapping to keep permanent order.
unsigned ID;

// Count of other VGPR to SGPR copies that contribute to the
// current copy's SChain
unsigned SiblingPenalty = 0;
SetVector<unsigned> Siblings;
V2SCopyInfo() : Copy(nullptr), ID(0){};
V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
: Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump() {
dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
<< "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
<< "\nScore: " << Score << "\n";
}
#endif
};

void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

DenseMap<unsigned, V2SCopyInfo> Copies;
DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;

// The main function that computes the VGPR to SGPR copy score
// and decides how the copy is further lowered: v_readfirstlane_b32 or moveToVALU
auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
if (I->SChain.empty())
return true;
I->Siblings = SiblingPenalty[*std::max_element(
I->SChain.begin(), I->SChain.end(),
[&](MachineInstr *A, MachineInstr *B) -> bool {
return SiblingPenalty[A].size() < SiblingPenalty[B].size();
})];
I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
// The loop below computes the number of other VGPR to SGPR copies
// that contribute to the current copy's SALU chain. We assume that all the
// copies with the same source virtual register will be squashed to one by
// regalloc. We also take care of copies of different subregs of the
// same register.
SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
for (auto J : I->Siblings) {
auto InfoIt = Copies.find(J);
if (InfoIt != Copies.end()) {
MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
if (SiblingCopy->isImplicitDef())
// the COPY has already been moved to VALU
continue;

SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
SiblingCopy->getOperand(1).getSubReg()));
}
}
I->SiblingPenalty = SrcRegs.size();

unsigned Penalty =
I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
unsigned Profit = I->SChain.size();
I->Score = Penalty > Profit ? 0 : Profit - Penalty;
I->NeedToBeConvertedToVALU = I->Score < 3;
return I->NeedToBeConvertedToVALU;
};

auto needProcessing = [](MachineInstr &MI) -> bool {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::WQM:
case AMDGPU::STRICT_WQM:
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM:
return true;
default:
return false;
}
};

for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
++BI) {
MachineBasicBlock *MBB = &*BI;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
++I) {
MachineInstr &MI = *I;
if (!needProcessing(MI))
continue;
if (lowerSpecialCase(MI))
continue;

// Compute the COPY width to pass to the V2SCopyInfo constructor
Register DstReg = MI.getOperand(0).getReg();

const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);

V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
TRI->getRegSizeInBits(*DstRC));

SmallVector<MachineInstr *, 8> AnalysisWorklist;
// Needed because the SSA is not a tree but a graph and may have
// forks and joins. We should not go the same way twice.
DenseSet<MachineInstr *> Visited;
AnalysisWorklist.push_back(&MI);
while (!AnalysisWorklist.empty()) {

MachineInstr *Inst = AnalysisWorklist.pop_back_val();

if (!Visited.insert(Inst).second)
continue;

// Copies and REG_SEQUENCE do not contribute to the final assembly,
// so skip them, but keep the bookkeeping for SGPR to VGPR copies.
if (Inst->isCopy() || Inst->isRegSequence()) {
if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
if (!Inst->isCopy() ||
!tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
In.NumSVCopies++;
continue;
}
}
}

SiblingPenalty[Inst].insert(In.ID);

SmallVector<MachineInstr *, 4> Users;
if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
(Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
auto I = Inst->getIterator();
auto E = Inst->getParent()->end();
while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
if (I->readsRegister(AMDGPU::SCC))
Users.push_back(&*I);
}
} else if (Inst->getNumExplicitDefs() != 0) {
Register Reg = Inst->getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, Reg))
for (auto &U : MRI->use_instructions(Reg))
Users.push_back(&U);
}
for (auto U : Users) {
if (TII->isSALU(*U))
In.SChain.insert(U);
AnalysisWorklist.push_back(U);
}
}
Copies[In.ID] = In;
}
}

SmallVector<unsigned, 8> LoweringWorklist;
for (auto &C : Copies) {
if (needToBeConvertedToVALU(&C.second))
LoweringWorklist.push_back(C.second.ID);
}

while (!LoweringWorklist.empty()) {
unsigned CurID = LoweringWorklist.pop_back_val();
auto CurInfoIt = Copies.find(CurID);
if (CurInfoIt != Copies.end()) {
V2SCopyInfo C = CurInfoIt->getSecond();
LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
for (auto S : C.Siblings) {
auto SibInfoIt = Copies.find(S);
if (SibInfoIt != Copies.end()) {
V2SCopyInfo &SI = SibInfoIt->getSecond();
LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
if (!SI.NeedToBeConvertedToVALU) {
set_subtract(SI.SChain, C.SChain);
if (needToBeConvertedToVALU(&SI))
LoweringWorklist.push_back(SI.ID);
}
SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
}
}
LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
<< " is being turned to VALU\n");
Copies.erase(C.ID);
TII->moveToVALU(*C.Copy, MDT);
}
}

// Now do the actual lowering
for (auto C : Copies) {
MachineInstr *MI = C.second.Copy;
MachineBasicBlock *MBB = MI->getParent();
// We decide to turn the V2S copy into v_readfirstlane_b32;
// remove it from the V2SCopies and from all its siblings
LLVM_DEBUG(dbgs() << "V2S copy " << *MI
<< " is being turned to v_readfirstlane_b32"
<< " Score: " << C.second.Score << "\n");
Register DstReg = MI->getOperand(0).getReg();
Register SrcReg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
if (SrcSize == 16) {
// HACK to handle a possible 16-bit VGPR source
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
} else if (SrcSize == 32) {
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
MIB.addReg(SrcReg, 0, SubReg);
} else {
auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::REG_SEQUENCE), DstReg);
int N = TRI->getRegSizeInBits(*SrcRC) / 32;
for (int i = 0; i < N; i++) {
Register PartialSrc = TII->buildExtractSubReg(
Result, *MRI, MI->getOperand(1), SrcRC,
TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
Register PartialDst =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, *Result, Result->getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
.addReg(PartialSrc);
Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
}
}
MI->eraseFromParent();
}
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/add3.ll
@@ -223,7 +223,7 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
; VI-NEXT: v_mov_b32_e32 v2, 0x40400000
; VI-NEXT: v_add_f32_e32 v2, s4, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: add3_uniform_vgpr:
