Skip to content

Commit

Permalink
AMDGPU: Add SIWholeQuadMode pass
Browse files Browse the repository at this point in the history
Summary:
Whole quad mode is already enabled for pixel shaders that compute
derivatives, but it must be suspended for instructions that cause a
shader to have side effects (e.g. stores and atomics).

This pass addresses the issue by storing the real (initial) live mask
in a register, masking EXEC before instructions that require exact
execution and (re-)enabling WQM where required.

This pass is run before register coalescing so that we can use
machine SSA for analysis.

The changes in this patch expose a problem with the second machine
scheduling pass: target independent instructions like COPY implicitly
use EXEC when they operate on VGPRs, but this fact is not encoded in
the MIR. This can lead to miscompilation because instructions are
moved past changes to EXEC.

This patch fixes the problem by adding use-implicit operands to
target independent instructions. Some general codegen passes are
relaxed to work with such implicit use operands.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: MatzeB, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18162

llvm-svn: 263982
  • Loading branch information
nhaehnle committed Mar 21, 2016
1 parent b14f4fd commit 213e87f
Show file tree
Hide file tree
Showing 10 changed files with 863 additions and 15 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Expand Up @@ -44,6 +44,7 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
Expand All @@ -70,6 +71,9 @@ extern char &SILowerI1CopiesID;
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;

void initializeSIWholeQuadModePass(PassRegistry &);
extern char &SIWholeQuadModeID;

void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowPassID;

Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
Expand Up @@ -62,7 +62,6 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;


/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Expand Up @@ -57,6 +57,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertNopsPass(*PR);
initializeSIInsertWaitsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
}

Expand Down Expand Up @@ -346,6 +347,7 @@ void GCNPassConfig::addPreRegAlloc() {
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Expand Up @@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
SIWholeQuadMode.cpp
)

add_subdirectory(AsmParser)
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Expand Up @@ -1248,6 +1248,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
.addImm(0); // omod
}

bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
// Target-independent instructions do not have an implicit-use of EXEC, even
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
return true;

return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
}

bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
int64_t SVal = Imm.getSExtValue();
if (SVal >= -16 && SVal <= 64)
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Expand Up @@ -149,6 +149,10 @@ class SIInstrInfo final : public AMDGPUInstrInfo {
MachineBasicBlock::iterator &MI,
LiveVariables *LV) const override;

bool isSchedulingBoundary(const MachineInstr *MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const override;

static bool isSALU(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::SALU;
}
Expand Down
33 changes: 21 additions & 12 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
Expand Up @@ -78,7 +78,7 @@ class SILowerControlFlow : public MachineFunctionPass {
void SkipIfDead(MachineInstr &MI);

void If(MachineInstr &MI);
void Else(MachineInstr &MI);
void Else(MachineInstr &MI, bool ExecModified);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
Expand Down Expand Up @@ -215,7 +215,7 @@ void SILowerControlFlow::If(MachineInstr &MI) {
MI.eraseFromParent();
}

void SILowerControlFlow::Else(MachineInstr &MI) {
void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
Expand All @@ -225,6 +225,15 @@ void SILowerControlFlow::Else(MachineInstr &MI) {
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC

if (ExecModified) {
// Adjust the saved exec to account for the modifications during the flow
// block that contains the ELSE. This can happen when WQM mode is switched
// off.
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
}

BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
Expand Down Expand Up @@ -488,7 +497,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

bool HaveKill = false;
bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;

Expand All @@ -498,17 +506,24 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *EmptyMBBAtEnd = NULL;
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
bool ExecModified = false;

for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);

MachineInstr &MI = *I;
if (TII->isWQM(MI) || TII->isDS(MI))
NeedWQM = true;

// Flat uses m0 in case it needs to access LDS.
if (TII->isFLAT(MI))
NeedFlat = true;

for (const auto &Def : I->defs()) {
if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
ExecModified = true;
break;
}
}

switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
Expand All @@ -517,7 +532,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
break;

case AMDGPU::SI_ELSE:
Else(MI);
Else(MI, ExecModified);
break;

case AMDGPU::SI_BREAK:
Expand Down Expand Up @@ -599,12 +614,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
}
}

if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
}

if (NeedFlat && MFI->IsKernel) {
// TODO: What to use with function calls?
// We will need to Initialize the flat scratch register pair.
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Expand Up @@ -72,9 +72,12 @@ struct SIRegisterInfo final : public AMDGPURegisterInfo {
}

/// \returns true if \p Reg (virtual or physical) belongs to an SGPR
/// register class.
///
/// The scraped diff fused the removed and added lines of this hunk into
/// invalid code (two unconditional returns plus an orphaned `else`); this
/// is the coherent post-commit form: resolve the register class first,
/// then apply the single SGPR-class test.
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
  const TargetRegisterClass *RC;
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    RC = MRI.getRegClass(Reg);   // Virtual registers track their class in MRI.
  else
    RC = getPhysRegClass(Reg);   // Physical registers are mapped by the target.
  return isSGPRClass(RC);
}

/// \returns true if this class contains VGPR registers.
Expand Down

0 comments on commit 213e87f

Please sign in to comment.