Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Late temporal divergence lowering for SDAG #67033

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions llvm/include/llvm/ADT/GenericUniformityImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
using DivergenceDescriptorT =
typename SyncDependenceAnalysisT::DivergenceDescriptor;
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
using UseOutsideCycleInfoT =
typename std::tuple<ConstValueRefT, const InstructionT *,
SmallVector<BlockT *, 4>>;

GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI)
Expand Down Expand Up @@ -396,6 +399,8 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {

void print(raw_ostream &out) const;

iterator_range<const UseOutsideCycleInfoT *> uses_outside_cycle() const;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it correct that this only tracks uses outside a cycle with divergent exits? This should be in the name or at least in a comment.


protected:
/// \brief Value/block pair representing a single phi input.
struct PhiInput {
Expand Down Expand Up @@ -427,6 +432,7 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {

// Recognized cycles with divergent exits.
SmallPtrSet<const CycleT *, 16> DivergentExitCycles;
SmallVector<UseOutsideCycleInfoT, 4> UsesOutsideCycle;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it correct that this only tracks uses outside a cycle with divergent exits? This should be in the name or at least in a comment.


// Cycles assumed to be divergent.
//
Expand Down Expand Up @@ -470,6 +476,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
/// \brief Whether \p Def is divergent when read in \p ObservingBlock.
bool isTemporalDivergent(const BlockT &ObservingBlock,
const InstructionT &Def) const;

void recordUseOutsideCycle(ConstValueRefT Src, const InstructionT *UserInstr,
const CycleT &DefCycle);
};

template <typename ImplT>
Expand Down Expand Up @@ -1210,6 +1219,18 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
}
}

template <typename ContextT>
using UseOutsideCycleInfoT =
typename std::tuple<typename ContextT::ConstValueRefT,
const typename ContextT::InstructionT *,
SmallVector<typename ContextT::BlockT *, 4>>;

template <typename ContextT>
iterator_range<const UseOutsideCycleInfoT<ContextT> *>
GenericUniformityAnalysisImpl<ContextT>::uses_outside_cycle() const {
return make_range(UsesOutsideCycle.begin(), UsesOutsideCycle.end());
}

template <typename ContextT>
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
return DA->hasDivergence();
Expand Down Expand Up @@ -1248,6 +1269,12 @@ void GenericUniformityInfo<ContextT>::print(raw_ostream &out) const {
DA->print(out);
}

template <typename ContextT>
iterator_range<const UseOutsideCycleInfoT<ContextT> *>
GenericUniformityInfo<ContextT>::uses_outside_cycle() const {
return DA->uses_outside_cycle();
}

template <typename ContextT>
void llvm::ModifiedPostOrder<ContextT>::computeStackPO(
SmallVectorImpl<const BlockT *> &Stack, const CycleInfoT &CI,
Expand Down Expand Up @@ -1367,6 +1394,14 @@ void llvm::ModifiedPostOrder<ContextT>::compute(const CycleInfoT &CI) {
computeStackPO(Stack, CI, nullptr, Finalized);
}

template <typename ContextT>
void GenericUniformityAnalysisImpl<ContextT>::recordUseOutsideCycle(
ConstValueRefT Src, const InstructionT *UserInstr, const CycleT &DefCycle) {
SmallVector<BlockT *, 4> TmpExitBlocks;
DefCycle.getExitBlocks(TmpExitBlocks);
UsesOutsideCycle.push_back({Src, UserInstr, TmpExitBlocks});
}

} // namespace llvm

#undef DEBUG_TYPE
Expand Down
5 changes: 5 additions & 0 deletions llvm/include/llvm/ADT/GenericUniformityInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ template <typename ContextT> class GenericUniformityInfo {

using CycleInfoT = GenericCycleInfo<ContextT>;
using CycleT = typename CycleInfoT::CycleT;
using UseOutsideCycleInfoT =
typename std::tuple<ConstValueRefT, const InstructionT *,
SmallVector<BlockT *, 4>>;

GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI = nullptr);
Expand Down Expand Up @@ -78,6 +81,8 @@ template <typename ContextT> class GenericUniformityInfo {

void print(raw_ostream &Out) const;

iterator_range<const UseOutsideCycleInfoT *> uses_outside_cycle() const;

private:
using ImplT = GenericUniformityAnalysisImpl<ContextT>;

Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Analysis/UniformityAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,16 @@ template <>
void llvm::GenericUniformityAnalysisImpl<
SSAContext>::propagateTemporalDivergence(const Instruction &I,
const Cycle &DefCycle) {
if (isDivergent(I))
return;
for (auto *User : I.users()) {
auto *UserInstr = cast<Instruction>(User);
if (DefCycle.contains(UserInstr->getParent()))
continue;

recordUseOutsideCycle(cast<Value>(&I), UserInstr, DefCycle);

if (isDivergent(I))
continue;

markDivergent(*UserInstr);
}
}
Expand Down
15 changes: 4 additions & 11 deletions llvm/lib/CodeGen/MachineSink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
if (!Reg)
continue;
if (MO.isUse()) {
if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
if (Reg.isPhysical() &&
(TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg))))
continue;
if (PI->modifiesRegister(Reg, TRI))
return true;
Expand Down Expand Up @@ -1006,24 +1007,16 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
if (MBB == SuccToSinkTo)
return nullptr;

if (!SuccToSinkTo)
return nullptr;

// It's not safe to sink instructions to EH landing pad. Control flow into
// landing pad is implicitly defined.
if (SuccToSinkTo->isEHPad())
if (SuccToSinkTo && SuccToSinkTo->isEHPad())
return nullptr;

// It ought to be okay to sink instructions into an INLINEASM_BR target, but
// only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
// the source block (which this code does not yet do). So for now, forbid
// doing so.
if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
return nullptr;

MachineBasicBlock::const_iterator InsertPos =
SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
return nullptr;

return SuccToSinkTo;
Expand Down
9 changes: 7 additions & 2 deletions llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,16 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
if (!Op.getReg().isVirtual())
continue;
auto Reg = Op.getReg();
if (isDivergent(Reg))
continue;

for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
if (DefCycle.contains(UserInstr.getParent()))
continue;

recordUseOutsideCycle(Reg, &UserInstr, DefCycle);

if (isDivergent(Reg))
continue;

markDivergent(UserInstr);
}
}
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createAMDGPUTemporalDivergenceLoweringPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
Expand Down Expand Up @@ -151,6 +152,9 @@ extern char &SILowerWWMCopiesID;
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;

void initializeAMDGPUTemporalDivergenceLoweringPass(PassRegistry &);
extern char &AMDGPUTemporalDivergenceLoweringID;

void initializeSILowerSGPRSpillsPass(PassRegistry &);
extern char &SILowerSGPRSpillsID;

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeAMDGPUTemporalDivergenceLoweringPass(*PR);
initializeSILowerWWMCopiesPass(*PR);
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
Expand Down Expand Up @@ -1240,6 +1241,8 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
}

void GCNPassConfig::addPreRegAlloc() {
addPass(createAMDGPUTemporalDivergenceLoweringPass());

if (LateCFGStructurize) {
addPass(createAMDGPUMachineCFGStructurizerPass());
}
Expand Down
121 changes: 121 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTemporalDivergenceLowering.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
//===- AMDGPUTemporalDivergenceLowering.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "temporal-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUTemporalDivergenceLowering : public MachineFunctionPass {
public:
static char ID;

public:
AMDGPUTemporalDivergenceLowering() : MachineFunctionPass(ID) {
initializeAMDGPUTemporalDivergenceLoweringPass(
*PassRegistry::getPassRegistry());
}

bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {
return "Temporal divergence lowering";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineCycleInfoWrapperPass>();
AU.addRequired<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE,
"Temporal divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(AMDGPUTemporalDivergenceLowering, DEBUG_TYPE,
"Temporal divergence lowering", false, false)

char AMDGPUTemporalDivergenceLowering::ID = 0;

char &llvm::AMDGPUTemporalDivergenceLoweringID =
AMDGPUTemporalDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUTemporalDivergenceLoweringPass() {
return new AMDGPUTemporalDivergenceLowering();
}

static void replaceUseRegisterWith(const MachineInstr *MI, Register Reg,
Register Newreg) {
for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
const MachineOperand &Op = MI->getOperand(i);
if (Op.isReg() && Op.getReg() == Reg) {
const_cast<MachineInstr *>(MI)->getOperand(i).setReg(Newreg);
}
}
}
// Get poiners to build instruction just after MI (skips phis if needed)
static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
getInsertAfterPtrs(MachineInstr *MI) {
MachineBasicBlock *InsertMBB = MI->getParent();
return std::make_pair(
InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
}

bool AMDGPUTemporalDivergenceLowering::runOnMachineFunction(
MachineFunction &MF) {

MachineCycleInfo &CycleInfo =
getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();

MachineUniformityInfo MUI =
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), true);

MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();

// Temporal divergence lowering is required for uniform UniformSourceReg
// and divergent UserInstr. UserInstr is uniform only when loop is uniform.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes should consistently use the term "cycle" and not "loop" everywhere.

What does this comment mean? Is it a pre-condition or a post-condition for this pass? By "loop is uniform", do you mean the cycle does not have divergent exits? Since UserInstr is outside the cycle, it could be divergent for any other reason too, like some operand which is not inside the cycle.

Copy link
Collaborator Author

@petar-avramovic petar-avramovic Sep 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was meant to be a simple comment explaining when we need to insert a copy to vgpr, also it is there to give better context for fix me comment below. UserInstr in reported as uniform in some cases (cycle had divergent exit)

I would expect that it was enough to ask if SrcReg was uniform (UserInstr should be divergent because of temporal divergence unless it was a uniform loop)

However, machine uniformity analysis detects all temporal divergence cases that require lowering.

By "loop is uniform", do you mean the cycle does not have divergent exits?

I am not sure about the terminology can you point me some reference? But at this point cycles have exactly one entry point (is that good enough to call them natural loops?) and exactly one exit. I only looked in tests where something was wrong and all of them had "divergent exit from the cycle"

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I finally parsed this statement correctly: "UserInstr is uniform only when loop is uniform." I got confused by the "only when". About the term "loop", how do you know that every cycle is reducible at this point? This pass is fairly generic, and could be moved around. If this pass actually assumes that every cycle is reducible, then there should be asserts about that. Also separately, instead of saying "is uniform", it's less confusing to say "has divergent exits".

for (auto [SrcReg, UserInstr, CycleExitBlocks] : MUI.uses_outside_cycle()) {
if (!MUI.isUniform(SrcReg) || !MUI.isDivergent(UserInstr))
continue;

MachineInstr *UniformSourceInstr = MRI.getVRegDef(SrcReg);

// FixMe: SrcReg is lane mask in this case. Find a better way to detect it.
if (UniformSourceInstr->getOpcode() == AMDGPU::SI_IF_BREAK ||
UserInstr->getOpcode() == AMDGPU::SI_IF)
continue;

unsigned Size = TRI.getRegSizeInBits(*MRI.getRegClassOrNull(SrcReg));
Register VgprDst =
MRI.createVirtualRegister(TRI.getVGPRClassForBitWidth(Size));

auto [MBB, AfterUniformSourceReg] = getInsertAfterPtrs(UniformSourceInstr);
BuildMI(*MBB, AfterUniformSourceReg, {}, TII.get(AMDGPU::COPY))
.addDef(VgprDst)
.addReg(SrcReg)
.addReg(AMDGPU::EXEC, RegState::Implicit);

replaceUseRegisterWith(UserInstr, SrcReg, VgprDst);
}

return true;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUTargetMachine.cpp
AMDGPUTargetObjectFile.cpp
AMDGPUTargetTransformInfo.cpp
AMDGPUTemporalDivergenceLowering.cpp
AMDGPUUnifyDivergentExitNodes.cpp
AMDGPUUnifyMetadata.cpp
R600MachineCFGStructurizer.cpp
Expand Down
13 changes: 11 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,12 @@
; GCN-O0-NEXT: Finalize ISel and expand pseudo-instructions
; GCN-O0-NEXT: Local Stack Slot Allocation
; GCN-O0-NEXT: Register Usage Information Propagation
; GCN-O0-NEXT: Machine Cycle Info Analysis
; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Temporal divergence lowering
; GCN-O0-NEXT: Eliminate PHI nodes for register allocation
; GCN-O0-NEXT: SI Lower control flow pseudo instructions
; GCN-O0-NEXT: Two-Address instruction pass
; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
; GCN-O0-NEXT: MachinePostDominator Tree Construction
Expand Down Expand Up @@ -322,12 +324,13 @@
; GCN-O1-NEXT: Remove dead machine instructions
; GCN-O1-NEXT: SI Shrink Instructions
; GCN-O1-NEXT: Register Usage Information Propagation
; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: Temporal divergence lowering
; GCN-O1-NEXT: Detect Dead Lanes
; GCN-O1-NEXT: Remove dead machine instructions
; GCN-O1-NEXT: Process Implicit Definitions
; GCN-O1-NEXT: Remove unreachable machine basic blocks
; GCN-O1-NEXT: Live Variable Analysis
; GCN-O1-NEXT: MachineDominator Tree Construction
; GCN-O1-NEXT: SI Optimize VGPR LiveRange
; GCN-O1-NEXT: Eliminate PHI nodes for register allocation
; GCN-O1-NEXT: SI Lower control flow pseudo instructions
Expand Down Expand Up @@ -616,6 +619,8 @@
; GCN-O1-OPTS-NEXT: Remove dead machine instructions
; GCN-O1-OPTS-NEXT: SI Shrink Instructions
; GCN-O1-OPTS-NEXT: Register Usage Information Propagation
; GCN-O1-OPTS-NEXT: Machine Cycle Info Analysis
; GCN-O1-OPTS-NEXT: Temporal divergence lowering
; GCN-O1-OPTS-NEXT: Detect Dead Lanes
; GCN-O1-OPTS-NEXT: Remove dead machine instructions
; GCN-O1-OPTS-NEXT: Process Implicit Definitions
Expand Down Expand Up @@ -919,6 +924,8 @@
; GCN-O2-NEXT: Remove dead machine instructions
; GCN-O2-NEXT: SI Shrink Instructions
; GCN-O2-NEXT: Register Usage Information Propagation
; GCN-O2-NEXT: Machine Cycle Info Analysis
; GCN-O2-NEXT: Temporal divergence lowering
; GCN-O2-NEXT: Detect Dead Lanes
; GCN-O2-NEXT: Remove dead machine instructions
; GCN-O2-NEXT: Process Implicit Definitions
Expand Down Expand Up @@ -1235,6 +1242,8 @@
; GCN-O3-NEXT: Remove dead machine instructions
; GCN-O3-NEXT: SI Shrink Instructions
; GCN-O3-NEXT: Register Usage Information Propagation
; GCN-O3-NEXT: Machine Cycle Info Analysis
; GCN-O3-NEXT: Temporal divergence lowering
; GCN-O3-NEXT: Detect Dead Lanes
; GCN-O3-NEXT: Remove dead machine instructions
; GCN-O3-NEXT: Process Implicit Definitions
Expand Down
Loading