[AMDGPU] Add iglp_opt(2) to provide initial MFMA/Exp interleaving #80370

Closed
wants to merge 3 commits

Conversation

jrbyrnes
Contributor

@jrbyrnes jrbyrnes commented Feb 2, 2024

This provides the iglp_opt(2) builtin, which applies specialized mutations to produce custom scheduling.

It was designed against and specifically targets a subset of fused attention V_EXP -> V_MFMA kernels.

The intent is to provide initial support and performance improvements on a subset of kernels, with a more robust, behavior-maintaining implementation to follow in a later patch.

Change-Id: I3690e082b98b57392075cac783b853f3fb48b0e5
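
For reference, a minimal usage sketch (hedged: the kernel shape and all names are illustrative assumptions; only `__builtin_amdgcn_iglp_opt` and the new value 2 come from this patch):

```c++
// Hypothetical HIP kernel using the new scheduling hint. Everything here
// except __builtin_amdgcn_iglp_opt(2) is illustrative.
__global__ void fused_attn_tile(/* tile arguments elided */) {
  // ... compute the V_EXP operands and MFMA inputs for this tile ...

  // Request the MFMA/Exp interleaving mutation for the enclosing
  // scheduling region; values 0 and 1 select the existing
  // small-GEMM strategies.
  __builtin_amdgcn_iglp_opt(2);

  // ... MFMA accumulation ...
}
```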
@llvmbot
Collaborator

llvmbot commented Feb 2, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

This provides the iglp_opt(2) builtin, which applies specialized mutations to produce custom scheduling.

It was designed against and specifically targets a subset of fused attention V_EXP -> V_MFMA kernels.

The intent is to provide initial support and performance improvements on a subset of kernels, with a more robust, behavior-maintaining implementation to follow in a later patch.


Patch is 332.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80370.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (+889-32)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h (+5-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+3-3)
  • (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+4-3)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir (+2055)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir (+900)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.tiny.mir (+646)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 4462cd8a31f13..175356b55a931 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -140,8 +140,6 @@ class SchedGroup {
   // Count of the number of created SchedGroups, used to initialize SGID.
   static unsigned NumSchedGroups;
 
-  const SIInstrInfo *TII;
-
   // Try to add an edge from SU A to SU B.
   bool tryAddEdge(SUnit *A, SUnit *B);
 
@@ -154,6 +152,7 @@ class SchedGroup {
   SmallVector<SUnit *, 32> Collection;
 
   ScheduleDAGInstrs *DAG;
+  const SIInstrInfo *TII;
 
   // Returns true if SU can be added to this SchedGroup.
   bool canAddSU(SUnit &SU) const;
@@ -234,13 +233,13 @@ class SchedGroup {
 
   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
+      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
     SGID = NumSchedGroups++;
   }
 
   SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
              ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
-      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
+      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
     SGID = NumSchedGroups++;
   }
 };
@@ -442,7 +441,8 @@ void PipelineSolver::convertSyncMapsToArrays() {
 template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
   for (; I != E; ++I) {
     auto &GroupA = *I;
-    for (auto J = std::next(I); J != E; ++J) {
+    auto J = std::next(I);
+    for (; J != E; ++J) {
       auto &GroupB = *J;
       GroupA.link(GroupB);
     }
@@ -488,7 +488,9 @@ int PipelineSolver::linkSUnit(
       continue;
     }
     auto Group = *I;
-    AddedCost += Group.link(*SU, MakePred, AddedEdges);
+    auto Temp = Group.link(*SU, MakePred, AddedEdges);
+
+    AddedCost += Temp;
     assert(AddedCost >= 0);
   }
   return AddedCost;
@@ -633,6 +635,7 @@ bool PipelineSolver::solveExact() {
   assert(static_cast<size_t>(CurrConflInstNo) <
          PipelineInstrs[CurrSyncGroupIdx].size());
   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+
   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                     << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
 
@@ -785,6 +788,7 @@ bool PipelineSolver::solveGreedy() {
 
   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+
     IsBottomUp
         ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
         : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
@@ -838,6 +842,7 @@ void PipelineSolver::solve() {
 enum IGLPStrategyID : int {
   MFMASmallGemmOptID = 0,
   MFMASmallGemmSingleWaveOptID = 1,
+  MFMAExpInterleave = 2
 };
 
 // Implement an IGLP scheduling strategy.
@@ -852,7 +857,7 @@ class IGLPStrategy {
   virtual void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) = 0;
+      IGLPPhase Phase) = 0;
 
   // Returns true if this strategy should be applied to a ScheduleDAG.
   virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -871,7 +876,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
   void applyIGLPStrategy(
       DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
       DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-      bool IsReentry) override;
+      IGLPPhase Phase) override;
 
   bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
 
@@ -884,7 +889,7 @@ class MFMASmallGemmOpt final : public IGLPStrategy {
 void MFMASmallGemmOpt::applyIGLPStrategy(
     DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
     DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
-    bool IsReentry) {
+    IGLPPhase Phase) {
   // Count the number of MFMA instructions.
   unsigned MFMACount = 0;
   for (const MachineInstr &I : *DAG)
@@ -904,6 +909,854 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
   }
 }
 
+class MFMAExpInterleaveOpt final : public IGLPStrategy {
+private:
+  /// Whether or not the instruction is a transitive predecessor of an MFMA
+  /// instruction
+  class IsPipeExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+
+      if (Cache->empty()) {
+        auto I = DAG->SUnits.rbegin();
+        auto E = DAG->SUnits.rend();
+        for (; I != E; I++) {
+          if (TII->isMFMAorWMMA(*(I->getInstr())))
+            Cache->push_back(&*I);
+        }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      auto Reaches = (std::any_of(
+          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
+            return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+          }));
+
+      return Reaches;
+    }
+    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction is a transitive predecessor of the same
+  /// MFMA instruction as an instruction in a SchedGroup \p Number steps before
+  class ProduceSameMFMAWithPrevN final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Number) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      auto DAG = SyncPipe[0].DAG;
+
+      if (Cache->empty()) {
+        auto TII = SyncPipe[0].TII;
+        SmallVector<SUnit *, 8> Worklist;
+
+        auto I = DAG->SUnits.rbegin();
+        auto E = DAG->SUnits.rend();
+        for (; I != E; I++)
+          if (TII->isMFMAorWMMA(*(I->getInstr())))
+            Worklist.push_back(&*I);
+
+        for (auto BaseSU : OtherGroup->Collection) {
+          if (!Cache->empty())
+            break;
+          for (auto CandSU : Worklist) {
+            if (DAG->IsReachable(CandSU, BaseSU)) {
+              Cache->push_back(CandSU);
+              break;
+            }
+          }
+        }
+      }
+      if (Cache->empty())
+        return false;
+
+      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+    }
+
+    ProduceSameMFMAWithPrevN(unsigned Number, const SIInstrInfo *TII,
+                             unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  /// Whether or not the instruction has fewer than \p Size immediate successors
+  class LessThanNSuccs final : public InstructionRule {
+  private:
+    unsigned Size = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (!SyncPipe.size())
+        return false;
+
+      return SU->Succs.size() < Size;
+    }
+    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Size(Size) {}
+  };
+
+  // Whether or not the instruction is a V_CVT instruction.
+  class IsCvt final : public InstructionRule {
+  private:
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto Opc = SU->getInstr()->getOpcode();
+      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
+             Opc == AMDGPU::V_CVT_F16_F32_e32_gfx10 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32_gfx10 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32_gfx11;
+    }
+    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether or not the instruction is a V_FMA_F32 instruction.
+  class IsFMA final : public InstructionRule {
+  private:
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64;
+    }
+    IsFMA(unsigned Val, const SIInstrInfo *TII, unsigned SGID,
+          bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction is an immediate RAW successor
+  /// of the SchedGroup \p Distance steps before.
+  class IsSuccOfPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      if (!SyncPipe.size())
+        return false;
+
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      for (auto &OtherEle : OtherGroup->Collection) {
+        for (auto &Succ : OtherEle->Succs) {
+          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
+            return true;
+        }
+      }
+
+      return false;
+    }
+    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                         unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction is a transitive successor of any
+  /// instruction in the SchedGroup \p Distance steps before.
+  class IsReachableFromPrevNthGroup final : public InstructionRule {
+  private:
+    unsigned Distance = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      if (!SyncPipe.size())
+        return false;
+
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      auto DAG = SyncPipe[0].DAG;
+
+      for (auto &OtherEle : OtherGroup->Collection)
+        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
+          return true;
+
+      return false;
+    }
+    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+                                unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+  };
+
+  /// Whether or not the instruction is the \p Number th occurring DS_READ
+  /// instruction
+  class IsNthDSR final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+      unsigned Counter = 0;
+      if (Cache->empty()) {
+        for (auto &ParseSU : DAG->SUnits) {
+          auto MI = ParseSU.getInstr();
+          if (TII->isDS(MI->getOpcode()) && MI->mayLoad()) {
+            if (Counter == Number) {
+              Cache->push_back(&ParseSU);
+              break;
+            }
+
+            ++Counter;
+          }
+        }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return (*Cache)[0]->NodeNum <= SU->NodeNum;
+    }
+    IsNthDSR(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
+             bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  // Whether the instruction is not a transitive predecessor of any TRANS
+  // instruction
+  class IsPipeMFMA final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      SmallVector<SUnit *, 12> Worklist;
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+      if (Cache->empty()) {
+        for (auto &SU : DAG->SUnits)
+          if (TII->isTRANS(SU.getInstr()->getOpcode()))
+            Cache->push_back(&SU);
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return !(std::any_of(
+          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *BaseSU) {
+            return DAG->IsReachable(BaseSU, const_cast<SUnit *>(SU));
+          }));
+    }
+
+    IsPipeMFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether the instruction occurs after the first TRANS instruction. This
+  // implies the instruction cannot be a predecessor of the first TRANS
+  // instruction
+  class OccursAfterExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      SmallVector<SUnit *, 12> Worklist;
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+      if (Cache->empty()) {
+        for (auto &SU : DAG->SUnits)
+          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
+            Cache->push_back(&SU);
+            break;
+          }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      return SU->NodeNum > (*Cache)[0]->NodeNum;
+    }
+
+    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether the SU is not a successor of any element in the previous
+  // SchedGroup
+  class IsNotSuccOfPrevGroup final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      // Does the previous VALU have this DS_Write as a successor
+      return !(std::any_of(OtherGroup->Collection.begin(),
+                           OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
+                             return std::any_of(Elt->Succs.begin(),
+                                                Elt->Succs.end(),
+                                                [&SU](SDep &Succ) {
+                                                  return Succ.getSUnit() == SU;
+                                                });
+                           }));
+    }
+    IsNotSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
+                         bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+public:
+  void applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      IGLPPhase Phase) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+
+  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = 0;
+  }
+};
+
+static unsigned TransPipeCount = 0;
+static unsigned MFMAPipeCount = 0;
+static unsigned MFMAEnablement = 0;
+static unsigned ExpRequirement = 0;
+
+void MFMAExpInterleaveOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+    IGLPPhase Phase) {
+
+  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  if (Phase == IGLPPhase::Initial) {
+    SmallVector<SUnit *, 10> ExpPipeCands;
+    SmallVector<SUnit *, 10> MFMAPipeCands;
+    SmallVector<SUnit *, 10> MFMAPipeSUs;
+    SmallVector<SUnit *, 10> PackSUs;
+
+    auto isBitPack = [](unsigned Opc) {
+      return Opc == AMDGPU::V_PACK_B32_F16_e64 ||
+             Opc == AMDGPU::V_PACK_B32_F16_gfx10 ||
+             Opc == AMDGPU::V_PACK_B32_F16_e64_gfx11 ||
+             Opc == AMDGPU::V_PERM_B32_e64 ||
+             Opc == AMDGPU::V_PERM_B32_e64_gfx11;
+    };
+    for (SUnit &SU : DAG->SUnits) {
+      auto Opc = SU.getInstr()->getOpcode();
+      if (TII->isTRANS(Opc)) {
+        // Avoid counting a potential bonus V_EXP which all the MFMAs depend on
+        if (SU.Succs.size() >= 7)
+          continue;
+        ExpPipeCands.push_back(&SU);
+      }
+
+      if (TII->isMFMAorWMMA(*SU.getInstr()))
+        MFMAPipeCands.push_back(&SU);
+
+      if (isBitPack(Opc))
+        PackSUs.push_back(&SU);
+    }
+
+    if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
+      return;
+
+    TransPipeCount = 0;
+    MFMAPipeCount = 0;
+    MFMAEnablement = 0;
+    ExpRequirement = 0;
+
+    std::optional<SUnit *> TempMFMA;
+    std::optional<SUnit *> TempExp;
+    // Count the number of EXPs that reach an MFMA
+    for (auto &PredSU : ExpPipeCands) {
+      for (auto &SuccSU : MFMAPipeCands) {
+        if (DAG->IsReachable(SuccSU, PredSU)) {
+          if (!TempExp.has_value()) {
+            TempExp = PredSU;
+            TempMFMA = SuccSU;
+          }
+          MFMAPipeSUs.push_back(SuccSU);
+          ++TransPipeCount;
+          break;
+        }
+      }
+    }
+
+    if (!TempExp.has_value())
+      return;
+
+    // Count the number of MFMAs that are reached by an EXP
+    for (auto &SuccSU : MFMAPipeCands) {
+      if (std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
+                       [&SuccSU](SUnit *PotentialMatch) {
+                         return PotentialMatch == SuccSU;
+                       }) != MFMAPipeSUs.end()) {
+        ++MFMAPipeCount;
+        continue;
+      }
+      for (auto &PredSU : ExpPipeCands) {
+        if (DAG->IsReachable(SuccSU, PredSU)) {
+          ++MFMAPipeCount;
+          break;
+        }
+      }
+    }
+
+    if (!TempMFMA.has_value() || !TempExp.has_value())
+      return;
+
+    // The number of bit pack operations that depend on a single V_EXP
+    unsigned PackSuccCount = std::count_if(
+        PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
+          return DAG->IsReachable(VPack, *TempExp);
+        });
+
+    // The number of bit pack operations an MFMA depends on
+    unsigned PackPredCount =
+        std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+                      [&isBitPack](SDep &Pred) {
+                        auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+                        return isBitPack(Opc);
+                      });
+
+    auto PackPred =
+        std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
+                     [&isBitPack](SDep &Pred) {
+                       auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
+                       return isBitPack(Opc);
+                     });
+
+    // How many MFMAs depend on a single bit pack operation
+    MFMAEnablement =
+        std::count_if(PackPred->getSUnit()->Succs.begin(),
+                      PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
+                        return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+                      });
+
+    // The number of MFMAs that depend on a single V_EXP
+    MFMAEnablement *= PackSuccCount;
+
+    // The number of V_EXPs required to resolve all dependencies for an MFMA
+    ExpRequirement =
+        std::count_if(ExpPipeCands.begin(), ExpPipeCands...
[truncated]


github-actions bot commented Feb 2, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

SmallVectorImpl<SchedGroup> &SyncPipe) override {
auto Opc = SU->getInstr()->getOpcode();
return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
Opc == AMDGPU::V_CVT_F16_F32_e32_gfx10 ||
Contributor

There are way more conversion instructions than this, and you shouldn't really need to worry about the encoded forms

Contributor Author

The rule name is confusing I think -- it is not an attempt to flag any v_cvt instruction, just the ones that are relevant to the f16 pipeline the mutation was designed against (F16_F32, F32_I32).

Comment on lines 1037 to 1038
// Whether or not the instruction is an V_FMA_F32 instruction.
class IsFMA final : public InstructionRule {
Contributor

Why just f32?

Contributor Author

It is the only variant we need to control in the pipeline.

Contributor

But why?

Contributor Author

@jrbyrnes jrbyrnes Feb 8, 2024

The per-item flow is:
fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16

Some variants lower into v_perm, and others may have some arithmetic between the exp and cvt, but this basic structure is universal from what I've seen. The main parameters (in terms of DAG structure) are: the number of mfmas and exps, the mfma type, and how many exps feed into a single mfma.

By adding fma into the pipeline derived from the mutation, we are able to interleave (as is the main function of pipelining). However, the main benefit is actually reduced RP and better postRA scheduling.
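
In source terms, a sketch of that chain (hedged: variable names are illustrative, and the HIP calls `fmaf`/`__expf`/`__float2half` are assumptions standing in for whatever produces these instructions):

```c++
// Per-item chain described above:
//   fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16
float s  = fmaf(q, k, acc);     // v_fma_f32
float e  = __expf(s);           // lowers to V_EXP (TRANS unit)
__half h = __float2half(e);     // v_cvt_f16_f32
// pairs of halves are then packed (v_pack_b32_f16, or v_perm in some
// variants) and consumed by an f16 MFMA
```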

unsigned PipelineSyncID = 0;
SchedGroup *SG = nullptr;

if (IsSmallKernelType && Phase != IGLPPhase::PostRA) {
Contributor

This function is way too long and repetitive. Do we really need so many instances of these rules?

Contributor Author

I'll see if anything can be done regarding the amount of SchedGroups / rules.

We do plan to replace the implementation with a cleaner generalized version which will address this concern. That said, we want to offer an incremental version to capture some performance.

Contributor Author

@jrbyrnes jrbyrnes Feb 9, 2024

I have a version of the generalized implementation which resolves this issue (too long and repetitive), while capturing the same performance. I plan to push that (generalized implementation with limited kernel support) as a separate PR to compete with this one.

Contributor Author

See #81342

Change-Id: Ib5fac752b9302e8817d7dcc1aded264e0d0b1b7f
@jrbyrnes jrbyrnes force-pushed the ExpInterleavePatch3 branch 2 times, most recently from b92a978 to 31611ba Compare February 14, 2024 23:08
Change-Id: I2754be185b8d2ccb186b6f6596d4cf7c423c5872