-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Introduce iglp_opt(2): Generalized exp/mfma interleaving for select kernels #81342
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Jeffrey Byrnes (jrbyrnes) Changes: This is a redesign of #80370. The implementation is a bit more organized and less repetitive. This implements the basic pipelining structure of exp/mfma interleaving for better extensibility. While it does have improved extensibility, there are controls which only enable it for DAGs with certain characteristics (matching the DAGs it has been designed against). It is still a WIP while I do some last-minute … Patch is 326.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81342.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 4462cd8a31f13e..244252cfdeef9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -140,8 +140,6 @@ class SchedGroup {
// Count of the number of created SchedGroups, used to initialize SGID.
static unsigned NumSchedGroups;
- const SIInstrInfo *TII;
-
// Try to add an edge from SU A to SU B.
bool tryAddEdge(SUnit *A, SUnit *B);
@@ -154,6 +152,7 @@ class SchedGroup {
SmallVector<SUnit *, 32> Collection;
ScheduleDAGInstrs *DAG;
+ const SIInstrInfo *TII;
// Returns true if SU can be added to this SchedGroup.
bool canAddSU(SUnit &SU) const;
@@ -234,13 +233,13 @@ class SchedGroup {
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
+ : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
SGID = NumSchedGroups++;
}
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
+ : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
SGID = NumSchedGroups++;
}
};
@@ -838,6 +837,7 @@ void PipelineSolver::solve() {
enum IGLPStrategyID : int {
MFMASmallGemmOptID = 0,
MFMASmallGemmSingleWaveOptID = 1,
+ MFMAExpInterleave = 2 // Strategy added for iglp_opt(2): exp/mfma interleaving.
};
// Implement an IGLP scheduling strategy.
@@ -852,7 +852,7 @@ class IGLPStrategy {
virtual void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) = 0;
+ IGLPPhase Phase) = 0;
// Returns true if this strategy should be applied to a ScheduleDAG.
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -871,7 +871,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) override;
+ IGLPPhase Phase) override;
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
@@ -884,7 +884,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
void MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
- bool IsReentry) {
+ IGLPPhase Phase) {
// Count the number of MFMA instructions.
unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)
@@ -904,6 +904,859 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
}
}
+class MFMAExpInterleaveOpt final : public IGLPStrategy {
+private:
+ // Compute the heuristics for the pipeline, returning whether or not the DAG
+ // is well formatted for the mutation
+ bool analyzeDAG(SmallVectorImpl<SUnit *> &MFMAChainSeeds,
+ const SIInstrInfo *TII);
+
+  /// Rule matching instructions that are transitive predecessors of some MFMA
+  /// (or WMMA) instruction in the DAG.
+  class IsPipeExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+
+      // Populate the cache on first use with every MFMA/WMMA SUnit, walking
+      // the DAG's SUnits from last to first.
+      if (Cache->empty()) {
+        for (auto It = DAG->SUnits.rbegin(), End = DAG->SUnits.rend();
+             It != End; ++It)
+          if (TII->isMFMAorWMMA(*It->getInstr()))
+            Cache->push_back(&*It);
+      }
+
+      // No MFMA/WMMA in the DAG: nothing can match.
+      if (Cache->empty())
+        return false;
+
+      // Accept SU as soon as one cached MFMA passes the reachability query.
+      for (SUnit *TargetSU : *Cache)
+        if (DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU)))
+          return true;
+
+      return false;
+    }
+
+    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction enables the \p Number th MFMA that occurs
+  /// after the first TRANS instruction in the DAG's SUnit order.
+  class EnablesNthMFMA final : public InstructionRule {
+  private:
+    // 1-based index of the MFMA (counted after the first TRANS) to match.
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto DAG = SyncPipe[0].DAG;
+
+      // Locate the target MFMA once and remember it in the cache.
+      if (Cache->empty()) {
+        auto TII = SyncPipe[0].TII;
+        bool FoundTrans = false;
+        unsigned Counter = 1;
+
+        for (SUnit &CandSU : DAG->SUnits) {
+          // Skip everything up to and including the first TRANS instruction.
+          if (!FoundTrans) {
+            FoundTrans = TII->isTRANS(CandSU.getInstr()->getOpcode());
+            continue;
+          }
+          if (!TII->isMFMAorWMMA(*CandSU.getInstr()))
+            continue;
+          if (Counter == Number) {
+            Cache->push_back(&CandSU);
+            break;
+          }
+          ++Counter;
+        }
+      }
+
+      // Fewer than Number MFMAs follow the first TRANS (or there is no TRANS).
+      if (Cache->empty())
+        return false;
+
+      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+    }
+
+    EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  /// Whether or not the instruction enables the MFMA found \p Number steps
+  /// down the chain of MFMA successors that starts at \p ChainSeed.
+  class EnablesNthMFMAInChain final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+    SUnit *ChainSeed;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+
+      // Reject a null SU and a seed that is not itself an MFMA/WMMA.
+      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
+        return false;
+
+      if (Cache->empty()) {
+        SUnit *ChainSU = ChainSeed;
+        // Follow the first MFMA successor Number times; bail out (leaving the
+        // cache empty) when the chain is shorter than that.
+        for (unsigned Step = 0; Step != Number; ++Step) {
+          auto NextIt = std::find_if(
+              ChainSU->Succs.begin(), ChainSU->Succs.end(),
+              [TII](const SDep &Succ) {
+                return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+              });
+          if (NextIt == ChainSU->Succs.end())
+            return false;
+          ChainSU = NextIt->getSUnit();
+        }
+
+        Cache->push_back(ChainSU);
+      }
+      if (Cache->empty())
+        return false;
+
+      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+    }
+
+    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
+                          const SIInstrInfo *TII, unsigned SGID,
+                          bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
+          ChainSeed(ChainSeed) {}
+  };
+
+  /// Whether or not the instruction has less than \p Size immediate data
+  /// successors. If \p HasIntermediary is true, every immediate successor of
+  /// the SUnit must also have less than \p Size data successors.
+  class LessThanNSuccs final : public InstructionRule {
+  private:
+    unsigned Size = 1;
+    bool HasIntermediary = false;
+
+    // Number of SDep::Data edges leaving \p Node.
+    static unsigned countDataSuccs(const SUnit *Node) {
+      return static_cast<unsigned>(std::count_if(
+          Node->Succs.begin(), Node->Succs.end(),
+          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }));
+    }
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (SyncPipe.empty())
+        return false;
+
+      if (countDataSuccs(SU) >= Size)
+        return false;
+
+      if (HasIntermediary) {
+        // Bind by reference to avoid copying each SDep. Every successor edge
+        // is visited here, not only the data edges.
+        for (const SDep &Succ : SU->Succs)
+          if (countDataSuccs(Succ.getSUnit()) >= Size)
+            return false;
+      }
+
+      return true;
+    }
+
+    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
+                   bool HasIntermediary = false, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
+          HasIntermediary(HasIntermediary) {}
+  };
+
+  // Whether or not the instruction is a V_CVT instruction (only the
+  // V_CVT_F16_F32_e32 and V_CVT_I32_F32_e32 opcodes are recognised).
+  class IsCvt final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      switch (SU->getInstr()->getOpcode()) {
+      case AMDGPU::V_CVT_F16_F32_e32:
+      case AMDGPU::V_CVT_I32_F32_e32:
+        return true;
+      default:
+        return false;
+      }
+    }
+
+    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether or not the instruction is a V_FMA_F32 instruction (either
+  // V_FMA_F32_e64 or the packed V_PK_FMA_F32).
+  class IsFMA final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      switch (SU->getInstr()->getOpcode()) {
+      case AMDGPU::V_FMA_F32_e64:
+      case AMDGPU::V_PK_FMA_F32:
+        return true;
+      default:
+        return false;
+      }
+    }
+
+    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction is an immediate RAW (data) successor of
+  /// a member of the SchedGroup \p Distance steps before.
+  class IsSuccOfPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (SyncPipe.empty())
+        return false;
+
+      // Look up the group whose SGID is Distance below this rule's SGID.
+      const unsigned WantedSGID = SGID - Distance;
+      SchedGroup *OtherGroup = nullptr;
+      for (SchedGroup &PipeSG : SyncPipe)
+        if (static_cast<unsigned>(PipeSG.getSGID()) == WantedSGID)
+          OtherGroup = &PipeSG;
+
+      if (!OtherGroup)
+        return false;
+      // When that group is empty, the SU is accepted.
+      if (OtherGroup->Collection.empty())
+        return true;
+
+      // Match if any member of the group has a data edge ending at SU.
+      return std::any_of(
+          OtherGroup->Collection.begin(), OtherGroup->Collection.end(),
+          [SU](SUnit *OtherEle) {
+            return std::any_of(OtherEle->Succs.begin(), OtherEle->Succs.end(),
+                               [SU](const SDep &Succ) {
+                                 return Succ.getSUnit() == SU &&
+                                        Succ.getKind() == SDep::Data;
+                               });
+          });
+    }
+
+    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                         unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction is a transitive successor of any
+  /// instruction in the SchedGroup \p Distance steps before.
+  class IsReachableFromPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (SyncPipe.empty())
+        return false;
+
+      // Look up the group whose SGID is Distance below this rule's SGID.
+      const unsigned WantedSGID = SGID - Distance;
+      SchedGroup *OtherGroup = nullptr;
+      for (SchedGroup &PipeSG : SyncPipe)
+        if (static_cast<unsigned>(PipeSG.getSGID()) == WantedSGID)
+          OtherGroup = &PipeSG;
+
+      if (!OtherGroup)
+        return false;
+      // When that group is empty, the SU is accepted.
+      if (OtherGroup->Collection.empty())
+        return true;
+
+      auto DAG = SyncPipe[0].DAG;
+
+      // Match if the reachability query succeeds for any member of the group.
+      return std::any_of(
+          OtherGroup->Collection.begin(), OtherGroup->Collection.end(),
+          [SU, DAG](SUnit *OtherEle) {
+            return DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle);
+          });
+    }
+
+    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                                unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction's NodeNum is at least \p Number, i.e. it
+  /// is the SU with NodeNum \p Number or occurs after it.
+  class OccursAfterDSRead final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      const bool IsBeforeThreshold = SU->NodeNum < Number;
+      return !IsBeforeThreshold;
+    }
+
+    OccursAfterDSRead(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+                      bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  /// Whether or not the SU is exactly the MFMA found \p Number steps down the
+  /// chain of MFMA successors that starts at \p ChainSeed.
+  class IsExactMFMA final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+    SUnit *ChainSeed;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto TII = SyncPipe[0].TII;
+
+      // Reject a null SU and a seed that is not itself an MFMA/WMMA.
+      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
+        return false;
+
+      if (Cache->empty()) {
+        SUnit *ChainSU = ChainSeed;
+        // Follow the first MFMA successor Number times; bail out (leaving the
+        // cache empty) when the chain is shorter than that.
+        for (unsigned Step = 0; Step != Number; ++Step) {
+          auto NextIt = std::find_if(
+              ChainSU->Succs.begin(), ChainSU->Succs.end(),
+              [TII](const SDep &Succ) {
+                return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+              });
+          if (NextIt == ChainSU->Succs.end())
+            return false;
+          ChainSU = NextIt->getSUnit();
+        }
+        Cache->push_back(ChainSU);
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return SU == (*Cache)[0];
+    }
+
+    IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
+                unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
+          ChainSeed(ChainSeed) {}
+  };
+
+  // Whether the instruction occurs after the first TRANS instruction. This
+  // implies the instruction can not be a predecessor of the first TRANS
+  // instruction
+  class OccursAfterExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (Cache->empty()) {
+        auto DAG = SyncPipe[0].DAG;
+        auto TII = SyncPipe[0].TII;
+        // Cache only the first TRANS instruction in SUnit order. The loop
+        // variable is named distinctly so it does not shadow the SU parameter.
+        for (SUnit &CandSU : DAG->SUnits) {
+          if (TII->isTRANS(CandSU.getInstr()->getOpcode())) {
+            Cache->push_back(&CandSU);
+            break;
+          }
+        }
+      }
+
+      // No TRANS instruction in the DAG: nothing occurs "after" it.
+      if (Cache->empty())
+        return false;
+
+      return SU->NodeNum > (*Cache)[0]->NodeNum;
+    }
+
+    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+public:
+ void applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ IGLPPhase Phase) override;
+
+ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } // Always reports the strategy as applicable.
+
+ MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : IGLPStrategy(DAG, TII) {
+ IsBottomUp = 0; // NOTE(review): presumably selects top-down processing of the pipeline — confirm against IGLPStrategy.
+ }
+};
+
+// File-scope heuristics shared by the MFMAExpInterleaveOpt analysis.
+// The count of TRANS SUs involved in the interleaved pipeline
+static unsigned TransPipeCount = 0;
+// The count of MFMA SUs involved in the interleaved pipeline
+static unsigned MFMAPipeCount = 0;
+// The number of transitive MFMA successors for each TRANS SU
+static unsigned MFMAEnablement = 0;
+// The number of transitive TRANS predecessors for each MFMA SU
+static unsigned ExpRequirement = 0;
+// The count of independent "chains" of MFMA instructions in the pipeline
+static unsigned MFMAChains = 0;
+// The length of each independent "chain" of MFMA instructions
+static unsigned MFMAChainLength = 0;
+// Whether or not the pipeline has V_CVT instructions
+static bool HasCvt = false;
+// Whether or not there are instructions between the TRANS instruction and V_CVT
+// (initialized explicitly, like the other flags)
+static bool HasChainBetweenCvt = false;
+// The first occurring DS_READ which feeds an MFMA chain.
+// NOTE(review): the visible part of analyzeDAG only ever assigns this value and
+// never clears it — confirm it cannot go stale from one DAG to the next.
+static std::optional<unsigned> FirstPipeDSR;
+
+bool MFMAExpInterleaveOpt::analyzeDAG(SmallVectorImpl<SUnit *> &MFMAChainSeeds,
+ const SIInstrInfo *TII) {
+ SmallVector<SUnit *, 10> ExpPipeCands;
+ SmallVector<SUnit *, 10> MFMAPipeCands;
+ SmallVector<SUnit *, 10> MFMAPipeSUs;
+ SmallVector<SUnit *, 10> PackSUs;
+ SmallVector<SUnit *, 10> CvtSUs;
+
+ auto isBitPack = [](unsigned Opc) {
+ return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
+ };
+
+ auto isCvt = [](unsigned Opc) {
+ return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
+ };
+
+ for (SUnit &SU : DAG->SUnits) {
+ auto Opc = SU.getInstr()->getOpcode();
+ if (TII->isTRANS(Opc)) {
+ // Avoid counting a potential bonus V_EXP which all the MFMA depend on
+ if (SU.Succs.size() >= 7)
+ continue;
+ for (auto &Succ : SU.Succs) {
+ if (Succ.getSUnit()->Succs.size() >= 7)
+ continue;
+ }
+ ExpPipeCands.push_back(&SU);
+ }
+
+ if (TII->isMFMAorWMMA(*SU.getInstr()))
+ MFMAPipeCands.push_back(&SU);
+
+ if (isBitPack(Opc))
+ PackSUs.push_back(&SU);
+
+ if (isCvt(Opc))
+ CvtSUs.push_back(&SU);
+ }
+
+ if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
+ return false;
+
+ TransPipeCount = 0;
+
+ std::optional<SUnit *> TempMFMA;
+ std::optional<SUnit *> TempExp;
+ // Count the number of EXPs that reach an MFMA
+ for (auto &PredSU : ExpPipeCands) {
+ for (auto &SuccSU : MFMAPipeCands) {
+ if (DAG->IsReachable(SuccSU, PredSU)) {
+ if (!TempExp.has_value()) {
+ TempExp = PredSU;
+ TempMFMA = SuccSU;
+ }
+ MFMAPipeSUs.push_back(SuccSU);
+ ++TransPipeCount;
+ break;
+ }
+ }
+ }
+
+ if (!TempExp.has_value())
+ return false;
+
+ HasChainBetweenCvt =
+ std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
+ [&isCvt](SDep &Succ) {
+ return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
+ }) == (*TempExp)->Succs.end();
+
+ // Count the number of MFMAs that are reached by an EXP
+ MFMAPipeCount = 0;
+ for (auto &SuccSU : MFMAPipeCands) {
+ if (MFMAPipeSUs.size() &&
+ std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
+ [&SuccSU](SUnit *PotentialMatch) {
+ return PotentialMatch->NodeNum == SuccSU->NodeNum;
+ }) != MFMAPipeSUs.end()) {
+ ++MFMAPipeCount;
+ continue;
+ }
+ for (auto &PredSU : ExpPipeCands) {
+ if (DAG->IsReachable(SuccSU, PredSU)) {
+ MFMAPipeSUs.push_back(SuccSU);
+ ++MFMAPipeCount;
+ break;
+ }
+ }
+ }
+
+ if (!TempMFMA.has_value() || !TempExp.has_value())
+ return false;
+
+ std::optional<SUnit *> TempCvt;
+ for (auto &SuccSU : CvtSUs) {
+ if (DAG->IsReachable(SuccSU, *TempExp)) {
+ TempCvt = SuccSU;
+ break;
+ }
+ }
+
+ HasCvt = false;
+ if (TempCvt.has_value()) {
+ for (auto &SuccSU : MFMAPipeSUs) {
+ if (DAG->IsReachable(SuccSU, *TempCvt)) {
+ HasCvt = true;
+ break;
+ }
+ }
+ }
+
+ MFMAChains = 0;
+ for (auto &MFMAPipeSU : MFMAPipeSUs) {
+ if (MFMAChainSeeds.size() &&
+ std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) !=
+ MFMAChainSeeds.end())
+ continue;
+ if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
+ [&TII](SDep &Succ) {
+ return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+ })) {
+ MFMAChainSeeds.push_back(MFMAPipeSU);
+ ++MFMAChains;
+ }
+ }
+
+ if (!MFMAChains)
+ return false;
+
+ for (auto Pred : MFMAChainSeeds[0]->Preds) {
+ if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
+ Pred.getSUnit()->getInstr()->mayLoad())
+ FirstPipeDSR = Pred.getSUnit()->NodeNum;
+ }
+
+ MFMAChainLength = MFMAPipeCount / MFMAChains;
+
+ // The number of bit pack operations that depend on a single V_EXP
+ unsigned PackSuccCount = s...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
04fde67
to
4fc6479
Compare
4fc6479
to
db4ec74
Compare
Delete support for tiny kernel since the kernel itself will not be released until next iteration. |
f7965c4
to
4ceb707
Compare
Don't allow multiple entries into initIGLPOpt |
4ceb707
to
055ed0c
Compare
055ed0c
to
7eb410c
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with NIT
@@ -1726,6 +1749,8 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy( | |||
// In loop MFMAs | |||
auto MFMAInLoop = MFMAPipeCount - (MFMAEnablement * 2); | |||
auto MFMALoopCount = MFMAInLoop / MFMARatio; | |||
auto VALUOps = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe just double-check whether any of these divs need a check for div-by-zero.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a couple checks to be safe, but we should be good
c7ad84a
to
74e7f54
Compare
Pushing to use GUI for merge. |
… select kernels Change-Id: If721ff16173e858f607ce604a9d0f40baaef0a2c
74e7f54
to
aeda1b4
Compare
… select kernels (llvm#81342) This implements the basic pipelining structure of exp/mfma interleaving for better extensibility. While it does have improved extensibility, there are controls which only enable it for DAGs with certain characteristics (matching the DAGs it has been designed against). Change-Id: I13b7d61298786fbba9165bff668d5139d4c8c37b
… select kernels (llvm#81342) This implements the basic pipelining structure of exp/mfma interleaving for better extensibility. While it does have improved extensibility, there are controls which only enable it for DAGs with certain characteristics (matching the DAGs it has been designed against). Change-Id: I5bacb525cfde499690ed5a9fb5b34e94a48aab7e
This is a redesign of #80370 . The implementation is a bit more organized and less repetitive.
This implements the basic pipelining structure of exp/mfma interleaving for better extensibility. While it does have improved extensibility, there are controls which only enable it for DAGs with certain characteristics (matching the DAGs it has been designed against).