-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Refactor hazard recognizer for VALU-pipeline hazards. NFCI. #168801
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[AMDGPU] Refactor hazard recognizer for VALU-pipeline hazards. NFCI. #168801
Conversation
This is in preparation of handling these in scheduler. I do not expect any changes to the produce code here, it is just an infrastructure. Our current problem with the VALU pipeline hazards is that we only insert V_NOP instructions in the hazard recognizer mode, but ignore it during scheduling. This patch is meant to create a mechanism to actually account for that during scheduling.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesThis is in preparation of handling these in scheduler. I do not expect Full diff: https://github.com/llvm/llvm-project/pull/168801.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 29d22f27a2d8e..d07909251dcfb 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -437,9 +437,6 @@ void GCNHazardRecognizer::RecedeCycle() {
enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };
-using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
-using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;
-
// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
@@ -546,11 +543,14 @@ hasHazard(StateT InitialState,
// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
-static int getWaitStatesSince(
- GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
- MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
- IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
- GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
+static int
+getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ const MachineBasicBlock *MBB,
+ MachineBasicBlock::const_reverse_instr_iterator I,
+ int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired,
+ DenseSet<const MachineBasicBlock *> &Visited,
+ GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
+ SIInstrInfo::getNumWaitStates) {
for (auto E = MBB->instr_rend(); I != E; ++I) {
// Don't add WaitStates for parent BUNDLE instructions.
if (I->isBundle())
@@ -582,20 +582,26 @@ static int getWaitStatesSince(
return MinWaitStates;
}
-static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
- const MachineInstr *MI, IsExpiredFn IsExpired) {
+static int
+getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ const MachineInstr *MI,
+ GCNHazardRecognizer::IsExpiredFn IsExpired,
+ GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates =
+ SIInstrInfo::getNumWaitStates) {
DenseSet<const MachineBasicBlock *> Visited;
return getWaitStatesSince(IsHazard, MI->getParent(),
std::next(MI->getReverseIterator()), 0, IsExpired,
- Visited, SIInstrInfo::getNumWaitStates);
+ Visited, GetNumWaitStates);
}
-int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+int GCNHazardRecognizer::getWaitStatesSince(
+ IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
if (IsHazardRecognizerMode) {
auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
return WaitStates >= Limit;
};
- return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+ return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
+ GetNumWaitStates);
}
int WaitStates = 0;
@@ -607,7 +613,7 @@ int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
if (MI->isInlineAsm())
continue;
}
- ++WaitStates;
+ WaitStates += MI ? GetNumWaitStates(*MI) : 1;
if (WaitStates >= Limit)
break;
@@ -1243,6 +1249,20 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
+// emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we need
+// to insert, negative means not needed.
+bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
+ if (WaitStatesNeeded <= 0)
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ for (int I = 0; I < WaitStatesNeeded; ++I)
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_NOP_e32));
+
+ return true;
+}
+
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVMEMtoScalarWriteHazards(MI);
fixVcmpxPermlaneHazards(MI);
@@ -1257,7 +1277,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVALUTransUseHazard(MI);
fixVALUTransCoexecutionHazards(MI);
fixWMMAHazards(MI); // fall-through if co-execution is enabled.
- fixWMMACoexecutionHazards(MI);
+ emitVNops(MI, checkWMMACoexecutionHazards(MI));
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
fixRequiredExportPriority(MI);
@@ -2045,13 +2065,13 @@ static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
return false;
}
-bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
+int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!AMDGPU::isGFX1250(ST))
- return false;
+ return 0;
const SIInstrInfo *TII = ST.getInstrInfo();
if (!TII->isXDLWMMA(*MI) && !isCoexecutableVALUInst(*MI))
- return false;
+ return 0;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -2129,9 +2149,6 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
};
int Limit = 0;
- auto IsExpiredFn = [&Limit](const MachineInstr &, int WaitStates) {
- return WaitStates >= Limit;
- };
auto GetWaitStatesFn = [](const MachineInstr &I) {
return SIInstrInfo::isVALU(I) ? 1 : 0;
@@ -2141,38 +2158,26 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
if (TII->isXDLWMMA(*MI)) {
for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
Limit = WMMAWaitStates[Category]; // for IsExpiredFn.
- DenseSet<const MachineBasicBlock *> Visited;
- // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+ // 'getWaitStatesSince' returns the number of VALUs in between if hazard
// exists, and INT_MAX if there is no hazard. As a result, a negative
// WaitStatesNeeded here means no hazard, and we will continue to search
// for other categories.
WaitStatesNeeded =
- Limit - ::getWaitStatesSince(IsWMMAHazardFn, MI->getParent(),
- std::next(MI->getReverseIterator()), 0,
- IsExpiredFn, Visited, GetWaitStatesFn);
+ Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
}
} else { // Must be a co-executable VALU.
for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
Limit = VALUWaitStates[Category]; // for IsExpiredFn.
- DenseSet<const MachineBasicBlock *> Visited;
- // '::getWaitStatesSince' returns the number of VALUs in between if hazard
+ // 'getWaitStatesSince' returns the number of VALUs in between if hazard
// exists, and INT_MAX if there is no hazard. As a result, a negative
// WaitStatesNeeded here means no hazard, and we will continue to search
// for other categories.
WaitStatesNeeded =
- Limit - ::getWaitStatesSince(IsVALUHazardFn, MI->getParent(),
- std::next(MI->getReverseIterator()), 0,
- IsExpiredFn, Visited, GetWaitStatesFn);
+ Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
}
}
- // WaitStatesNeeded now is the number of V_NOPs we need to insert, negative
- // means not needed.
- for (int i = 0; i < WaitStatesNeeded; i++)
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_NOP_e32));
-
- return true;
+ return WaitStatesNeeded;
}
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 67beffadc0913..e5df71ca3788c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#include "SIInstrInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
@@ -25,13 +26,14 @@ class MachineFunction;
class MachineInstr;
class MachineOperand;
class MachineRegisterInfo;
-class SIInstrInfo;
class SIRegisterInfo;
class GCNSubtarget;
class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
public:
typedef function_ref<bool(const MachineInstr &)> IsHazardFn;
+ typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
+ typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
private:
// Distinguish if we are called from scheduler or hazard recognizer
@@ -74,7 +76,9 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
// used on a newly inserted instruction before returning from PreEmitNoops.
void runOnInstruction(MachineInstr *MI);
- int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
+ int getWaitStatesSince(
+ IsHazardFn IsHazard, int Limit,
+ GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates);
int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
@@ -94,6 +98,9 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkReadM0Hazards(MachineInstr *SMovRel);
int checkNSAtoVMEMHazard(MachineInstr *MI);
int checkFPAtomicToDenormModeHazard(MachineInstr *MI);
+ // emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we
+ // need to insert, negative means not needed.
+ bool emitVNops(MachineInstr *MI, int WaitStatesNeeded);
void fixHazards(MachineInstr *MI);
bool fixVcmpxPermlaneHazards(MachineInstr *MI);
bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
@@ -106,7 +113,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixVALUTransUseHazard(MachineInstr *MI);
bool fixVALUTransCoexecutionHazards(MachineInstr *MI);
bool fixWMMAHazards(MachineInstr *MI);
- bool fixWMMACoexecutionHazards(MachineInstr *MI);
+ int checkWMMACoexecutionHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
bool fixRequiredExportPriority(MachineInstr *MI);
|
🐧 Linux x64 Test Results
|
| int checkNSAtoVMEMHazard(MachineInstr *MI); | ||
| int checkFPAtomicToDenormModeHazard(MachineInstr *MI); | ||
| // emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we | ||
| // need to insert, negative means not needed. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems silly to use int here. Zero already means "not needed" so you don't need negative values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree, but all other check* functions return int as well.
| #ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H | ||
| #define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H | ||
|
|
||
| #include "SIInstrInfo.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't see why you would need to move the include here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I need SIInstrInfo::getNumWaitStates as a default value in getWaitStatesSince declaration.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also just not use the defaulted argument and be explicit everywhere
| int checkReadM0Hazards(MachineInstr *SMovRel); | ||
| int checkNSAtoVMEMHazard(MachineInstr *MI); | ||
| int checkFPAtomicToDenormModeHazard(MachineInstr *MI); | ||
| // emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| // emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we | |
| // Emit V_NOP instructions. \p WaitStatesNeeded is the number of V_NOPs we |
| #ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H | ||
| #define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H | ||
|
|
||
| #include "SIInstrInfo.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also just not use the defaulted argument and be explicit everywhere

This is in preparation of handling these in scheduler. I do not expect
any changes to the produced code here, it is just an infrastructure.
Our current problem with the VALU pipeline hazards is that we only
insert V_NOP instructions in the hazard recognizer mode, but ignore
it during scheduling. This patch is meant to create a mechanism to
actually account for that during scheduling.