[AMDGPU] Bundle loads before post-RA scheduler
We were relying on artificial DAG edges inserted by the
MemOpClusterMutation to keep loads and stores together in the
post-RA scheduler. This does not always work, since it still
allows a completely independent instruction to be scheduled in
the middle of the cluster.

Removed the DAG mutation and added a pass to bundle the already
clustered instructions. These bundles are unpacked right before
the memory legalizer, both because the legalizer does not handle
bundles and because unpacking lets waitcnts be inserted in the
middle of a store cluster.

Removing the artificial edges also allows more relaxed scheduling.

Differential Revision: https://reviews.llvm.org/D72737
rampitec committed Jan 24, 2020
1 parent b35b7da commit 555d8f4
Showing 50 changed files with 627 additions and 421 deletions.
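
The new pass is built around the generic finalizeBundle() helper declared in
llvm/CodeGen/MachineInstrBundle.h. A minimal sketch of that core idea follows;
bundleClusteredRange is a hypothetical name used only for illustration and is
not part of this commit.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include <iterator>

using namespace llvm;

// Glue an already-clustered run of memory instructions [First, Last) into a
// single BUNDLE so the post-RA scheduler cannot move unrelated instructions
// into the middle of the cluster.
static void bundleClusteredRange(MachineBasicBlock &MBB,
                                 MachineBasicBlock::instr_iterator First,
                                 MachineBasicBlock::instr_iterator Last) {
  // Bundling a single instruction would only add overhead; require a real
  // cluster of at least two instructions.
  if (First == Last || std::next(First) == Last)
    return;
  // finalizeBundle() inserts a BUNDLE header before First and marks the
  // instructions in [First, Last) as bundled with their neighbors.
  finalizeBundle(MBB, First, Last);
}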
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
@@ -229,6 +230,9 @@ extern char &SIInsertWaitcntsID;
void initializeSIFormMemoryClausesPass(PassRegistry&);
extern char &SIFormMemoryClausesID;

void initializeSIPostRABundlerPass(PassRegistry&);
extern char &SIPostRABundlerID;

void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;

48 changes: 0 additions & 48 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -754,53 +754,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;

MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

void apply(ScheduleDAGInstrs *DAG) override {
SUnit *SUa = nullptr;
// Search for two consequent memory operations and link them
// to prevent scheduler from moving them apart.
// In DAG pre-process SUnits are in the original order of
// the instructions before scheduling.
for (SUnit &SU : DAG->SUnits) {
MachineInstr &MI2 = *SU.getInstr();
if (!MI2.mayLoad() && !MI2.mayStore()) {
SUa = nullptr;
continue;
}
if (!SUa) {
SUa = &SU;
continue;
}

MachineInstr &MI1 = *SUa->getInstr();
if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
(TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
(TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
(TII->isDS(MI1) && TII->isDS(MI2))) {
SU.addPredBarrier(SUa);

for (const SDep &SI : SU.Preds) {
if (SI.getSUnit() != SUa)
SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
}

if (&SU != &DAG->ExitSU) {
for (const SDep &SI : SUa->Succs) {
if (SI.getSUnit() != &SU)
SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
}
}
}

SUa = &SU;
}
}
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;

@@ -927,7 +880,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {

void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
initializeSIPostRABundlerPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -980,6 +981,7 @@ void GCNPassConfig::addPostRegAlloc() {
}

void GCNPassConfig::addPreSched2() {
addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -118,6 +118,7 @@ add_llvm_target(AMDGPUCodeGen
SIOptimizeExecMasking.cpp
SIOptimizeExecMaskingPreRA.cpp
SIPeepholeSDWA.cpp
SIPostRABundler.cpp
SIRegisterInfo.cpp
SIRemoveShortExecBranches.cpp
SIShrinkInstructions.cpp
16 changes: 0 additions & 16 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1663,22 +1663,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
case TargetOpcode::BUNDLE: {
if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
return false;

// If it is a load it must be a memory clause
for (MachineBasicBlock::instr_iterator I = MI.getIterator();
I->isBundledWithSucc(); ++I) {
I->unbundleFromSucc();
for (MachineOperand &MO : I->operands())
if (MO.isReg())
MO.setIsInternalRead(false);
}

MI.eraseFromParent();
break;
}
}
return true;
}
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1289,6 +1289,21 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {

for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {

if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
MachineBasicBlock::instr_iterator II(MI->getIterator());
for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
I != E && I->isBundledWithPred(); ++I) {
I->unbundleFromPred();
for (MachineOperand &MO : I->operands())
if (MO.isReg())
MO.setIsInternalRead(false);
}

MI->eraseFromParent();
MI = II->getIterator();
}

if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;

138 changes: 138 additions & 0 deletions llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -0,0 +1,138 @@
//===-- SIPostRABundler.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass creates bundles of memory instructions to protect adjacent loads
/// and stores from being rescheduled apart from each other post-RA.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-post-ra-bundler"

namespace {

class SIPostRABundler : public MachineFunctionPass {
public:
static char ID;

public:
SIPostRABundler() : MachineFunctionPass(ID) {
initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry());
}

bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {
return "SI post-RA bundler";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}

private:
const SIRegisterInfo *TRI;

SmallSet<Register, 16> Defs;

bool isDependentLoad(const MachineInstr &MI) const;

};

} // End anonymous namespace.

INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)

char SIPostRABundler::ID = 0;

char &llvm::SIPostRABundlerID = SIPostRABundler::ID;

FunctionPass *llvm::createSIPostRABundlerPass() {
return new SIPostRABundler();
}

bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
if (!MI.mayLoad())
return false;

for (const MachineOperand &Op : MI.explicit_operands()) {
if (!Op.isReg())
continue;
Register Reg = Op.getReg();
for (const Register Def : Defs)
if (TRI->regsOverlap(Reg, Def))
return true;
}

return false;
}

bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;

TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
bool Changed = false;
const unsigned MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
SIInstrFlags::SMRD | SIInstrFlags::DS |
SIInstrFlags::FLAT | SIInstrFlags::MIMG;

for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::instr_iterator Next;
MachineBasicBlock::instr_iterator B = MBB.instr_begin();
MachineBasicBlock::instr_iterator E = MBB.instr_end();
for (auto I = B; I != E; I = Next) {
Next = std::next(I);

if (I->isBundled() || !I->mayLoadOrStore() ||
B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
(B->getDesc().TSFlags & MemFlags) !=
(I->getDesc().TSFlags & MemFlags) ||
isDependentLoad(*I)) {

if (B != I) {
if (std::next(B) != I) {
finalizeBundle(MBB, B, I);
Changed = true;
}
Next = I;
}

B = Next;
Defs.clear();
continue;
}

if (I->getNumExplicitDefs() == 0)
continue;

Defs.insert(I->defs().begin()->getReg());
}

if (B != E && std::next(B) != E) {
finalizeBundle(MBB, B, E);
Changed = true;
}

Defs.clear();
}

return Changed;
}
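
For reference, the result of the pass above is a plain BUNDLE header followed
by the clustered memory instructions, linked through their bundle flags. A
minimal sketch of walking such a bundle is shown below; countBundledInstrs is
a hypothetical helper, not part of this commit.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>

using namespace llvm;

// Count the instructions glued under a BUNDLE header such as the ones
// SIPostRABundler emits.  Assumes MI.isBundle() and that MI is linked into
// its parent basic block.
static unsigned countBundledInstrs(MachineInstr &MI) {
  unsigned N = 0;
  for (MachineBasicBlock::instr_iterator I = std::next(MI.getIterator()),
                                         E = MI.getParent()->instr_end();
       I != E && I->isBundledWithPred(); ++I)
    ++N;
  return N;
}

In a MIR test the pass should be runnable in isolation via llc's
-run-pass=si-post-ra-bundler, the name it is registered under above.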
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -807,9 +807,9 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; MOVREL-NEXT: v_mov_b32_e32 v34, s19
; MOVREL-NEXT: v_mov_b32_e32 v33, s18
; MOVREL-NEXT: v_mov_b32_e32 v32, s17
; MOVREL-NEXT: v_mov_b32_e32 v31, s16
; MOVREL-NEXT: v_mov_b32_e32 v33, s18
; MOVREL-NEXT: v_mov_b32_e32 v30, s15
; MOVREL-NEXT: v_mov_b32_e32 v29, s14
; MOVREL-NEXT: v_mov_b32_e32 v28, s13
@@ -1113,10 +1113,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
; MOVREL-NEXT: s_endpgm
entry:
@@ -2053,24 +2053,24 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
; MOVREL-NEXT: s_movreld_b64 s[2:3], s[18:19]
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v4, s4
; MOVREL-NEXT: v_mov_b32_e32 v8, s8
; MOVREL-NEXT: v_mov_b32_e32 v12, s12
; MOVREL-NEXT: v_mov_b32_e32 v1, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s3
; MOVREL-NEXT: v_mov_b32_e32 v8, s8
; MOVREL-NEXT: v_mov_b32_e32 v5, s5
; MOVREL-NEXT: v_mov_b32_e32 v6, s6
; MOVREL-NEXT: v_mov_b32_e32 v7, s7
; MOVREL-NEXT: v_mov_b32_e32 v12, s12
; MOVREL-NEXT: v_mov_b32_e32 v9, s9
; MOVREL-NEXT: v_mov_b32_e32 v10, s10
; MOVREL-NEXT: v_mov_b32_e32 v11, s11
; MOVREL-NEXT: v_mov_b32_e32 v13, s13
; MOVREL-NEXT: v_mov_b32_e32 v14, s14
; MOVREL-NEXT: v_mov_b32_e32 v15, s15
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
; MOVREL-NEXT: s_endpgm
entry:
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -374,10 +374,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
@@ -399,10 +399,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
@@ -695,10 +695,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
@@ -720,10 +720,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
