[AMDGPU] Bundle loads before post-RA scheduler
We were relying on artificial DAG edges inserted by the
MemOpClusterMutation to keep loads and stores together in the
post-RA scheduler. This does not always work, since it still
allows a completely independent instruction to be scheduled in
the middle of the cluster.

Removed the DAG mutation and added a pass to bundle the already
clustered instructions. These bundles are unpacked right before
the memory legalizer, both because the legalizer does not handle
bundles and because unpacking lets waitcnts be inserted in the
middle of a store cluster.

Removing the artificial edges also allows more relaxed scheduling.

Differential Revision: https://reviews.llvm.org/D72737
rampitec committed Jan 24, 2020
1 parent b35b7da commit 555d8f4
Showing 50 changed files with 627 additions and 421 deletions.
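
The new pass is built around the generic finalizeBundle() helper declared in
llvm/CodeGen/MachineInstrBundle.h. A minimal sketch of that core idea follows;
bundleClusteredRange is a hypothetical name used only for illustration and is
not part of this commit.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include <iterator>

using namespace llvm;

// Glue an already-clustered run of memory instructions [First, Last) into a
// single BUNDLE so the post-RA scheduler cannot move unrelated instructions
// into the middle of the cluster.
static void bundleClusteredRange(MachineBasicBlock &MBB,
                                 MachineBasicBlock::instr_iterator First,
                                 MachineBasicBlock::instr_iterator Last) {
  // Bundling a single instruction would only add overhead; require a real
  // cluster of at least two instructions.
  if (First == Last || std::next(First) == Last)
    return;
  // finalizeBundle() inserts a BUNDLE header before First and marks the
  // instructions in [First, Last) as bundled with their neighbors.
  finalizeBundle(MBB, First, Last);
}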
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
@@ -229,6 +230,9 @@ extern char &SIInsertWaitcntsID;
void initializeSIFormMemoryClausesPass(PassRegistry&);
extern char &SIFormMemoryClausesID;

void initializeSIPostRABundlerPass(PassRegistry&);
extern char &SIPostRABundlerID;

void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;

48 changes: 0 additions & 48 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -754,53 +754,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;

MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

void apply(ScheduleDAGInstrs *DAG) override {
SUnit *SUa = nullptr;
// Search for two consequent memory operations and link them
// to prevent scheduler from moving them apart.
// In DAG pre-process SUnits are in the original order of
// the instructions before scheduling.
for (SUnit &SU : DAG->SUnits) {
MachineInstr &MI2 = *SU.getInstr();
if (!MI2.mayLoad() && !MI2.mayStore()) {
SUa = nullptr;
continue;
}
if (!SUa) {
SUa = &SU;
continue;
}

MachineInstr &MI1 = *SUa->getInstr();
if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
(TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
(TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
(TII->isDS(MI1) && TII->isDS(MI2))) {
SU.addPredBarrier(SUa);

for (const SDep &SI : SU.Preds) {
if (SI.getSUnit() != SUa)
SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
}

if (&SU != &DAG->ExitSU) {
for (const SDep &SI : SUa->Succs) {
if (SI.getSUnit() != &SU)
SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
}
}
}

SUa = &SU;
}
}
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;

@@ -927,7 +880,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {

void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
initializeSIPostRABundlerPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -980,6 +981,7 @@ void GCNPassConfig::addPostRegAlloc() {
}

void GCNPassConfig::addPreSched2() {
addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -118,6 +118,7 @@ add_llvm_target(AMDGPUCodeGen
SIOptimizeExecMasking.cpp
SIOptimizeExecMaskingPreRA.cpp
SIPeepholeSDWA.cpp
SIPostRABundler.cpp
SIRegisterInfo.cpp
SIRemoveShortExecBranches.cpp
SIShrinkInstructions.cpp
16 changes: 0 additions & 16 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1663,22 +1663,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
case TargetOpcode::BUNDLE: {
if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
return false;

// If it is a load it must be a memory clause
for (MachineBasicBlock::instr_iterator I = MI.getIterator();
I->isBundledWithSucc(); ++I) {
I->unbundleFromSucc();
for (MachineOperand &MO : I->operands())
if (MO.isReg())
MO.setIsInternalRead(false);
}

MI.eraseFromParent();
break;
}
}
return true;
}
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1289,6 +1289,21 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {

for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
MachineBasicBlock::instr_iterator II(MI->getIterator());
for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
I != E && I->isBundledWithPred(); ++I) {
I->unbundleFromPred();
for (MachineOperand &MO : I->operands())
if (MO.isReg())
MO.setIsInternalRead(false);
}

MI->eraseFromParent();
MI = II->getIterator();
}

if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;

138 changes: 138 additions & 0 deletions llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -0,0 +1,138 @@
//===-- SIPostRABundler.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass creates bundles of memory instructions to protect adjacent loads
/// and stores from being rescheduled apart from each other post-RA.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-post-ra-bundler"

namespace {

class SIPostRABundler : public MachineFunctionPass {
public:
static char ID;

public:
SIPostRABundler() : MachineFunctionPass(ID) {
initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry());
}

bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {
return "SI post-RA bundler";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}

private:
const SIRegisterInfo *TRI;

SmallSet<Register, 16> Defs;

bool isDependentLoad(const MachineInstr &MI) const;

};

} // End anonymous namespace.

INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)

char SIPostRABundler::ID = 0;

char &llvm::SIPostRABundlerID = SIPostRABundler::ID;

FunctionPass *llvm::createSIPostRABundlerPass() {
return new SIPostRABundler();
}

bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
if (!MI.mayLoad())
return false;

for (const MachineOperand &Op : MI.explicit_operands()) {
if (!Op.isReg())
continue;
Register Reg = Op.getReg();
for (const Register Def : Defs)
if (TRI->regsOverlap(Reg, Def))
return true;
}

return false;
}

bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;

TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
bool Changed = false;
const unsigned MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
SIInstrFlags::SMRD | SIInstrFlags::DS |
SIInstrFlags::FLAT | SIInstrFlags::MIMG;

for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::instr_iterator Next;
MachineBasicBlock::instr_iterator B = MBB.instr_begin();
MachineBasicBlock::instr_iterator E = MBB.instr_end();
for (auto I = B; I != E; I = Next) {
Next = std::next(I);

if (I->isBundled() || !I->mayLoadOrStore() ||
B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
(B->getDesc().TSFlags & MemFlags) !=
(I->getDesc().TSFlags & MemFlags) ||
isDependentLoad(*I)) {

if (B != I) {
if (std::next(B) != I) {
finalizeBundle(MBB, B, I);
Changed = true;
}
Next = I;
}

B = Next;
Defs.clear();
continue;
}

if (I->getNumExplicitDefs() == 0)
continue;

Defs.insert(I->defs().begin()->getReg());
}

if (B != E && std::next(B) != E) {
finalizeBundle(MBB, B, E);
Changed = true;
}

Defs.clear();
}

return Changed;
}
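
For reference, the result of the pass above is a plain BUNDLE header followed
by the clustered memory instructions, linked through their bundle flags. A
minimal sketch of walking such a bundle is shown below; countBundledInstrs is
a hypothetical helper, not part of this commit.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>

using namespace llvm;

// Count the instructions glued under a BUNDLE header such as the ones
// SIPostRABundler emits.  Assumes MI.isBundle() and that MI is linked into
// its parent basic block.
static unsigned countBundledInstrs(MachineInstr &MI) {
  unsigned N = 0;
  for (MachineBasicBlock::instr_iterator I = std::next(MI.getIterator()),
                                         E = MI.getParent()->instr_end();
       I != E && I->isBundledWithPred(); ++I)
    ++N;
  return N;
}

In a MIR test the pass should be runnable in isolation via llc's
-run-pass=si-post-ra-bundler, the name it is registered under above.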
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -807,9 +807,9 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; MOVREL-NEXT: v_mov_b32_e32 v34, s19
; MOVREL-NEXT: v_mov_b32_e32 v33, s18
; MOVREL-NEXT: v_mov_b32_e32 v32, s17
; MOVREL-NEXT: v_mov_b32_e32 v31, s16
; MOVREL-NEXT: v_mov_b32_e32 v33, s18
; MOVREL-NEXT: v_mov_b32_e32 v30, s15
; MOVREL-NEXT: v_mov_b32_e32 v29, s14
; MOVREL-NEXT: v_mov_b32_e32 v28, s13
@@ -1113,10 +1113,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
; MOVREL-NEXT: s_endpgm
entry:
@@ -2053,24 +2053,24 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
; MOVREL-NEXT: s_movreld_b64 s[2:3], s[18:19]
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v4, s4
; MOVREL-NEXT: v_mov_b32_e32 v8, s8
; MOVREL-NEXT: v_mov_b32_e32 v12, s12
; MOVREL-NEXT: v_mov_b32_e32 v1, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s3
; MOVREL-NEXT: v_mov_b32_e32 v8, s8
; MOVREL-NEXT: v_mov_b32_e32 v5, s5
; MOVREL-NEXT: v_mov_b32_e32 v6, s6
; MOVREL-NEXT: v_mov_b32_e32 v7, s7
; MOVREL-NEXT: v_mov_b32_e32 v12, s12
; MOVREL-NEXT: v_mov_b32_e32 v9, s9
; MOVREL-NEXT: v_mov_b32_e32 v10, s10
; MOVREL-NEXT: v_mov_b32_e32 v11, s11
; MOVREL-NEXT: v_mov_b32_e32 v13, s13
; MOVREL-NEXT: v_mov_b32_e32 v14, s14
; MOVREL-NEXT: v_mov_b32_e32 v15, s15
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
; MOVREL-NEXT: s_endpgm
entry:
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -374,10 +374,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
@@ -399,10 +399,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
@@ -695,10 +695,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
@@ -720,10 +720,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
