-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU/GlobalISelDivergenceLowering: select divergent i1 phis #78482
AMDGPU/GlobalISelDivergenceLowering: select divergent i1 phis #78482
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic) ChangesImplement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper. Use machine uniformity analysis to find divergent i1 phis and select them as lane mask phis in same way SILowerI1Copies select VReg_1 phis. Note that divergent i1 phis include phis created by LCSSA and all cases of uses outside of cycle are actually covered by "lowering LCSSA phis". GlobalISel lane masks are registers with sgpr register class and S1 LLT. TODO: General goal is that instructions created in this pass are fully instruction-selected so that selection of lane mask phis is not split across multiple passes. patch 3 from: #73337 Patch is 132.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78482.diff 19 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
index e6da099751e7ae..1039ac4e5189b3 100644
--- a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
+++ b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
@@ -32,6 +32,25 @@ MachineUniformityInfo computeMachineUniformityInfo(
MachineFunction &F, const MachineCycleInfo &cycleInfo,
const MachineDomTree &domTree, bool HasBranchDivergence);
+/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
+class MachineUniformityAnalysisPass : public MachineFunctionPass {
+ MachineUniformityInfo UI;
+
+public:
+ static char ID;
+
+ MachineUniformityAnalysisPass();
+
+ MachineUniformityInfo &getUniformityInfo() { return UI; }
+ const MachineUniformityInfo &getUniformityInfo() const { return UI; }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void print(raw_ostream &OS, const Module *M = nullptr) const override;
+
+ // TODO: verify analysis
+};
+
} // namespace llvm
#endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 3e0fe2b1ba087f..131138e0649e4c 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -165,25 +165,6 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo(
namespace {
-/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
-class MachineUniformityAnalysisPass : public MachineFunctionPass {
- MachineUniformityInfo UI;
-
-public:
- static char ID;
-
- MachineUniformityAnalysisPass();
-
- MachineUniformityInfo &getUniformityInfo() { return UI; }
- const MachineUniformityInfo &getUniformityInfo() const { return UI; }
-
- bool runOnMachineFunction(MachineFunction &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- void print(raw_ostream &OS, const Module *M = nullptr) const override;
-
- // TODO: verify analysis
-};
-
class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
public:
static char ID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index 4cd8b1ec1051f4..9ac74ef55b275f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -16,7 +16,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "SILowerI1Copies.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
@@ -42,14 +45,152 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
+class DivergenceLoweringHelper : public PhiLoweringHelper {
+public:
+ DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT,
+ MachineUniformityInfo *MUI);
+
+private:
+ MachineUniformityInfo *MUI = nullptr;
+
+public:
+ void markAsLaneMask(Register DstReg) const override;
+ void getCandidatesForLowering(
+ SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
+ void collectIncomingValuesFromPhi(
+ const MachineInstr *MI,
+ SmallVectorImpl<Incoming> &Incomings) const override;
+ void replaceDstReg(Register NewReg, Register OldReg,
+ MachineBasicBlock *MBB) override;
+ void buildMergeLaneMasks(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ Register DstReg, Register PrevReg,
+ Register CurReg) override;
+ void constrainAsLaneMask(Incoming &In) override;
+};
+
+DivergenceLoweringHelper::DivergenceLoweringHelper(
+ MachineFunction *MF, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
+ : PhiLoweringHelper(MF, DT, PDT), MUI(MUI) {}
+
+// _(s1) -> SReg_32/64(s1)
+void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
+ assert(MRI->getType(DstReg) == LLT::scalar(1));
+
+ if (MRI->getRegClassOrNull(DstReg)) {
+ MRI->constrainRegClass(DstReg, ST->getBoolRC());
+ return;
+ }
+
+ MRI->setRegClass(DstReg, ST->getBoolRC());
+}
+
+void DivergenceLoweringHelper::getCandidatesForLowering(
+ SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
+ LLT S1 = LLT::scalar(1);
+
+ // Add divergent i1 phis to the list
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB.phis()) {
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
+ Vreg1Phis.push_back(&MI);
+ }
+ }
+}
+
+void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
+ const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
+ for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+ Incomings.emplace_back(MI->getOperand(i).getReg(),
+ MI->getOperand(i + 1).getMBB(), Register());
+ }
+}
+
+void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
+ MachineBasicBlock *MBB) {
+ BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
+ .addReg(NewReg);
+}
+
+// Get pointers to build instruction just after MI (skips phis if needed)
+static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
+getInsertAfterPtrs(MachineInstr *MI) {
+ MachineBasicBlock *InsertMBB = MI->getParent();
+ return {InsertMBB,
+ InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator()))};
+}
+
+// bb.previous
+// %PrevReg = ...
+//
+// bb.current
+// %CurReg = ...
+//
+// %DstReg - not defined
+//
+// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
+//
+// bb.previous
+// %PrevReg = ...
+// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
+//
+// bb.current
+// %CurReg = ...
+// %CurRegCopy:sreg_32(s1) = COPY %CurReg
+// ...
+// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
+// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
+// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
+//
+// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
+void DivergenceLoweringHelper::buildMergeLaneMasks(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
+ Register DstReg, Register PrevReg, Register CurReg) {
+ // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
+ // TODO: check if inputs are constants or results of a compare.
+
+ Register PrevRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+ auto [PrevMBB, AfterPrevReg] = getInsertAfterPtrs(MRI->getVRegDef(PrevReg));
+ BuildMI(*PrevMBB, AfterPrevReg, DL, TII->get(AMDGPU::COPY), PrevRegCopy)
+ .addReg(PrevReg);
+ Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+ BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
+ .addReg(PrevRegCopy)
+ .addReg(ExecReg);
+
+ Register CurRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+ auto [CurMBB, AfterCurReg] = getInsertAfterPtrs(MRI->getVRegDef(CurReg));
+ BuildMI(*CurMBB, AfterCurReg, DL, TII->get(AMDGPU::COPY), CurRegCopy)
+ .addReg(CurReg);
+ Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+ BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
+ .addReg(ExecReg)
+ .addReg(CurRegCopy);
+
+ BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
+ .addReg(PrevMaskedReg)
+ .addReg(CurMaskedReg);
+}
+
+void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; }
+
} // End anonymous namespace.
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
"AMDGPU GlobalISel divergence lowering", false, false)
@@ -64,5 +205,14 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
MachineFunction &MF) {
- return false;
+ MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
+ MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
+ MachineUniformityInfo &MUI =
+ getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
+
+ DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
+
+ bool Changed = false;
+ Changed |= Helper.lowerPhis();
+ return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 1d31c6b8fde93a..5c37f163e37075 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -210,6 +210,14 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
+ // Lane mask PHIs, PHI where all register operands have sgpr register class
+ // with S1 LLT, are already selected in divergence lowering pass.
+ if (I.getOpcode() == AMDGPU::PHI) {
+ assert(MRI->getType(DefReg) == LLT::scalar(1));
+ assert(TRI.isSGPRClass(MRI->getRegClass(DefReg)));
+ return true;
+ }
+
if (DefTy == LLT::scalar(1)) {
if (!AllowRiskySelect) {
LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index cfa0c21def791d..db06ef0250470d 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper {
MachineBasicBlock::iterator I, const DebugLoc &DL,
Register DstReg, Register PrevReg,
Register CurReg) override;
- void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
+ void constrainAsLaneMask(Incoming &In) override;
bool lowerCopiesFromI1();
bool lowerCopiesToI1();
@@ -619,7 +619,7 @@ bool PhiLoweringHelper::lowerPhis() {
for (auto &Incoming : Incomings) {
MachineBasicBlock &IMBB = *Incoming.Block;
if (PIA.isSource(IMBB)) {
- constrainIncomingRegisterTakenAsIs(Incoming);
+ constrainAsLaneMask(Incoming);
SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
} else {
Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
@@ -911,6 +911,4 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
}
}
-void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
- return;
-}
+void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) { return; }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
index 5099d39c2d1415..f0fe9bb4d03469 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
@@ -91,7 +91,7 @@ class PhiLoweringHelper {
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DstReg,
Register PrevReg, Register CurReg) = 0;
- virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
+ virtual void constrainAsLaneMask(Incoming &In) = 0;
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 7a68aec1a1c555..06a8f80e6aa34a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; REQUIRES: do-not-run-me
; Divergent phis that don't require lowering using lane mask merging
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index d314ebe355f51d..55f22b0bbb4df6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -1,5 +1,9 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
+
+# Test is updated but copies between S1-register-with-reg-class and
+# register-with-reg-class-no-LLT fail machine verification
+# REQUIRES: do-not-run-me-with-machine-verifier
--- |
define void @divergent_i1_phi_uniform_branch() {ret void}
@@ -46,7 +50,7 @@ body: |
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.4(0x80000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
; GFX10-NEXT: G_BR %bb.4
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.3:
@@ -126,6 +130,7 @@ body: |
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2
@@ -136,12 +141,17 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
; GFX10-NEXT: S_ENDPGM 0
bb.0:
@@ -191,30 +201,37 @@ body: |
; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
- ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
; GFX10-NEXT: SI_RETURN
bb.0:
@@ -279,25 +296,28 @@ body: |
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; GFX10-NEXT: ...
[truncated]
|
// Lane mask PHIs, PHI where all register operands have sgpr register class | ||
// with S1 LLT, are already selected in divergence lowering pass. | ||
if (I.getOpcode() == AMDGPU::PHI) { | ||
assert(MRI->getType(DefReg) == LLT::scalar(1)); | ||
assert(TRI.isSGPRClass(MRI->getRegClass(DefReg))); | ||
return true; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove this, we shouldn't be trying to select any PHI as-is. You should instead remove the special case isPHI handling in ::select and just make it a regular switch case over G_PHI
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just have some nits to make the pass easier to follow, as I think this is going to be a core component of the GISel flow so the code should be as approachable as possible to encourage contributions and make bugfixing easier
void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) { | ||
return; | ||
} | ||
void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) { return; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can omit return;
if (MRI->getRegClassOrNull(DstReg)) { | ||
MRI->constrainRegClass(DstReg, ST->getBoolRC()); | ||
return; | ||
} | ||
|
||
MRI->setRegClass(DstReg, ST->getBoolRC()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you need to check the return result of constrainRegClass
as it can return nullptr
on failure
// bb.previous | ||
// %PrevReg = ... | ||
// | ||
// bb.current | ||
// %CurReg = ... | ||
// | ||
// %DstReg - not defined | ||
// | ||
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT) | ||
// | ||
// bb.previous | ||
// %PrevReg = ... | ||
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg | ||
// | ||
// bb.current | ||
// %CurReg = ... | ||
// %CurRegCopy:sreg_32(s1) = COPY %CurReg | ||
// ... | ||
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0 | ||
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0 | ||
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg | ||
// | ||
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indent the code in the BBs so it's easier to follow
// bb.previous | |
// %PrevReg = ... | |
// | |
// bb.current | |
// %CurReg = ... | |
// | |
// %DstReg - not defined | |
// | |
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT) | |
// | |
// bb.previous | |
// %PrevReg = ... | |
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg | |
// | |
// bb.current | |
// %CurReg = ... | |
// %CurRegCopy:sreg_32(s1) = COPY %CurReg | |
// ... | |
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0 | |
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0 | |
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg | |
// | |
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg | |
// bb.previous | |
// %PrevReg = ... | |
// | |
// bb.current | |
// %CurReg = ... | |
// | |
// %DstReg - not defined | |
// | |
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT) | |
// | |
// bb.previous | |
// %PrevReg = ... | |
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg | |
// | |
// bb.current | |
// %CurReg = ... | |
// %CurRegCopy:sreg_32(s1) = COPY %CurReg | |
// ... | |
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0 | |
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0 | |
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg | |
// | |
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg |
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg | ||
// | ||
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg | ||
void DivergenceLoweringHelper::buildMergeLaneMasks( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Change getInsertAfterPtrs
so it's a method that takes a Register
, and make it directly return the RegCopy
: make it call createLaneMaskReg
+ BuildMI.
It's only ever used in that context anyway. You could name it something like buildRegCopy
?
Then the function becomes a bit easier to follow.
Also does the order work? You're inserting everything before I
. Don't the AndN2/And/Or need to follow each other in order? I'd use a MachineIRBuilder
to further remove noise from the function (no need to pass MMB/I/DL everytime)
bool Changed = false; | ||
Changed |= Helper.lowerPhis(); | ||
return Changed; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bool Changed = false; | |
Changed |= Helper.lowerPhis(); | |
return Changed; | |
return Helper.lowerPhis(); |
Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper. Use machine uniformity analysis to find divergent i1 phis and select them as lane mask phis in same way SILowerI1Copies select VReg_1 phis. Note that divergent i1 phis include phis created by LCSSA and all cases of uses outside of cycle are actually covered by "lowering LCSSA phis". GlobalISel lane masks are registers with sgpr register class and S1 LLT. TODO: General goal is that instructions created in this pass are fully instruction-selected so that selection of lane mask phis is not split across multiple passes. patch 3 from: llvm#73337
376ef1c
to
eeebfe9
Compare
Please check #73337 for remaining two patches. |
This results in a stack trace in AMDGPU OpenMP Offload bots. |
Thanks! If you need assistance in reproducing / looking into, feel free to reach out. |
Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper. Use machine uniformity analysis to find divergent i1 phis and select them as lane mask phis in same way SILowerI1Copies select VReg_1 phis. Note that divergent i1 phis include phis created by LCSSA and all cases of uses outside of cycle are actually covered by "lowering LCSSA phis". GlobalISel lane masks are registers with sgpr register class and S1 LLT.
TODO: General goal is that instructions created in this pass are fully instruction-selected so that selection of lane mask phis is not split across multiple passes.
patch 3 from: #73337