[AMDGPU] Tune scheduler on GFX10 and GFX11 for regions with spilling

Unlike older ASICs GFX10+ have a lot of VGPRs. Therefore, it is possible to achieve high occupancy even with all or almost all addressable VGPRs used. Our scheduler was never tuned for this scenario. The VGPR Critical Limit threshold always comes very high, even if maximum occupancy is targeted. For example on gfx1100 it is set to 192 registers even with the requested occupancy 16. As a result scheduler starts prioritizing register pressure reduction very late and we easily end up spilling. This patch makes VGPR critical limit similar to what we would have on pre-gfx10 targets with much more limited VGPR budget while still trying to maintain occupancy as it does now. Pre-gfx10 ASICs shall not be affected as the limit shall be the same as before, and on gfx10+ it shall only affect regions where we have to spill. Fixes: SWDEV-377300 Differential Revision: https://reviews.llvm.org/D141876
llvm · Jan 23, 2023 · d1c0feb · d1c0feb
1 parent cc9fa50
commit d1c0feb
Show file tree

Hide file tree

Showing 3 changed files with 328 additions and 3 deletions.
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -70,8 +70,23 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   TargetOccupancy = MFI.getOccupancy();
   SGPRCriticalLimit =
       std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
-  VGPRCriticalLimit =
-      std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
+
+  if (!KnownExcessRP) {
+    VGPRCriticalLimit =
+        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
+  } else {
+    // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
+    // returns a reasonably small number for targets with lots of VGPRs, such
+    // as GFX10 and GFX11.
+    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
+                         "VGPRCriticalLimit calculation method.\n");
+
+    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
+    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
+    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
+    VGPRBudget = std::max(VGPRBudget, Granule);
+    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
+  }
 
   // Subtract error margin and bias from register limits and avoid overflow.
   SGPRCriticalLimit =
@@ -86,6 +101,11 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   SGPRExcessLimit = std::min(SGPRExcessLimit - ErrorMargin, SGPRExcessLimit);
   VGPRExcessLimit = std::min(VGPRExcessLimit - VGPRLimitBias, VGPRExcessLimit);
   VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
+
+  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
+                    << ", VGPRExcessLimit = " << VGPRExcessLimit
+                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
+                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
 }
 
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -802,6 +822,7 @@ bool GCNSchedStage::initGCNRegion() {
              << "Region register pressure: " << print(PressureBefore));
 
   S.HasHighPressure = false;
+  S.KnownExcessRP = isRegionWithExcessRP();
 
   if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
       StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
@@ -1142,7 +1163,7 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
   if (WavesAfter <= MFI.getMinWavesPerEU() &&
       !PressureAfter.less(ST, PressureBefore) &&
-      DAG.RegionsWithExcessRP[RegionIdx]) {
+      isRegionWithExcessRP()) {
     LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
     return true;
   }

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -75,6 +75,10 @@ class GCNSchedStrategy : public GenericScheduler {
   // track register pressure for actual scheduling heuristics.
   bool HasHighPressure;
 
+  // Schedule known to have excess register pressure. Be more conservative in
+  // increasing ILP and preserving VGPRs.
+  bool KnownExcessRP = false;
+
   // An error margin is necessary because of poor performance of the generic RP
   // tracker and can be adjusted up for tuning heuristics to try and more
   // aggressively reduce register pressure.
@@ -302,6 +306,11 @@ class GCNSchedStage {
   // Returns true if scheduling should be reverted.
   virtual bool shouldRevertScheduling(unsigned WavesAfter);
 
+  // Returns true if current region has known excess pressure.
+  bool isRegionWithExcessRP() const {
+    return DAG.RegionsWithExcessRP[RegionIdx];
+  }
+
   // Returns true if the new schedule may result in more spilling.
   bool mayCauseSpilling(unsigned WavesAfter);