Skip to content

Commit

Permalink
AMDGPU: Add SIWholeQuadMode pass
Browse files Browse the repository at this point in the history
Summary:
Whole quad mode is already enabled for pixel shaders that compute
derivatives, but it must be suspended for instructions that cause a
shader to have side effects (e.g. stores and atomics).

This pass addresses the issue by storing the real (initial) live mask
in a register, masking EXEC before instructions that require exact
execution and (re-)enabling WQM where required.

This pass is run before register coalescing so that we can use
machine SSA for analysis.

The changes in this patch expose a problem with the second machine
scheduling pass: target independent instructions like COPY implicitly
use EXEC when they operate on VGPRs, but this fact is not encoded in
the MIR. This can lead to miscompilation because instructions are
moved past changes to EXEC.

This patch fixes the problem by adding use-implicit operands to
target independent instructions. Some general codegen passes are
relaxed to work with such implicit use operands.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: MatzeB, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18162

llvm-svn: 263982
  • Loading branch information
nhaehnle committed Mar 21, 2016
1 parent b14f4fd commit 213e87f
Show file tree
Hide file tree
Showing 10 changed files with 863 additions and 15 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Expand Up @@ -44,6 +44,7 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
Expand All @@ -70,6 +71,9 @@ extern char &SILowerI1CopiesID;
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;

void initializeSIWholeQuadModePass(PassRegistry &);
extern char &SIWholeQuadModeID;

void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowPassID;

Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
Expand Up @@ -62,7 +62,6 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;


/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Expand Up @@ -57,6 +57,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertNopsPass(*PR);
initializeSIInsertWaitsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
}

Expand Down Expand Up @@ -346,6 +347,7 @@ void GCNPassConfig::addPreRegAlloc() {
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Expand Up @@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
SIWholeQuadMode.cpp
)

add_subdirectory(AsmParser)
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Expand Up @@ -1248,6 +1248,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
.addImm(0); // omod
}

bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
// Target-independent instructions do not have an implicit-use of EXEC, even
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
return true;

return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
}

bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
int64_t SVal = Imm.getSExtValue();
if (SVal >= -16 && SVal <= 64)
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Expand Up @@ -149,6 +149,10 @@ class SIInstrInfo final : public AMDGPUInstrInfo {
MachineBasicBlock::iterator &MI,
LiveVariables *LV) const override;

bool isSchedulingBoundary(const MachineInstr *MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const override;

static bool isSALU(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SALU;
}
Expand Down
33 changes: 21 additions & 12 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
Expand Up @@ -78,7 +78,7 @@ class SILowerControlFlow : public MachineFunctionPass {
void SkipIfDead(MachineInstr &MI);

void If(MachineInstr &MI);
void Else(MachineInstr &MI);
void Else(MachineInstr &MI, bool ExecModified);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
Expand Down Expand Up @@ -215,7 +215,7 @@ void SILowerControlFlow::If(MachineInstr &MI) {
MI.eraseFromParent();
}

void SILowerControlFlow::Else(MachineInstr &MI) {
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
Expand All @@ -225,6 +225,15 @@ void SILowerControlFlow::Else(MachineInstr &MI) {
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC

if (ExecModified) {
// Adjust the saved exec to account for the modifications during the flow
// block that contains the ELSE. This can happen when WQM mode is switched
// off.
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
}

BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
Expand Down Expand Up @@ -488,7 +497,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

bool HaveKill = false;
bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;

Expand All @@ -498,17 +506,24 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *EmptyMBBAtEnd = NULL;
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
bool ExecModified = false;

for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);

MachineInstr &MI = *I;
if (TII->isWQM(MI) || TII->isDS(MI))
NeedWQM = true;

// Flat uses m0 in case it needs to access LDS.
if (TII->isFLAT(MI))
NeedFlat = true;

for (const auto &Def : I->defs()) {
if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
ExecModified = true;
break;
}
}

switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
Expand All @@ -517,7 +532,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
break;

case AMDGPU::SI_ELSE:
Else(MI);
Else(MI, ExecModified);
break;

case AMDGPU::SI_BREAK:
Expand Down Expand Up @@ -599,12 +614,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
}
}

if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
}

if (NeedFlat && MFI->IsKernel) {
// TODO: What to use with function calls?
// We will need to Initialize the flat scratch register pair.
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Expand Up @@ -72,9 +72,12 @@ struct SIRegisterInfo final : public AMDGPURegisterInfo {
}

/// \returns true if \p Reg (virtual or physical) belongs to an SGPR
/// register class.
///
/// The scraped diff fused the removed and added lines of this hunk into
/// invalid code (two unconditional returns plus an orphaned `else`); this
/// is the coherent post-commit form: resolve the register class first,
/// then apply the single SGPR-class test.
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
  const TargetRegisterClass *RC;
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    RC = MRI.getRegClass(Reg);   // Virtual registers track their class in MRI.
  else
    RC = getPhysRegClass(Reg);   // Physical registers are mapped by the target.
  return isSGPRClass(RC);
}

/// \returns true if this class contains VGPR registers.
Expand Down

0 comments on commit 213e87f

Please sign in to comment.