Skip to content

Commit

Permalink
[AMDGPU] Callee must always spill writelane VGPRs
Browse files Browse the repository at this point in the history
Since the writelane instruction used for SGPR spills can
modify inactive lanes, the callee must preserve the VGPR
that this instruction modifies, even if that register is
marked caller-saved.

Reviewed By: arsenm, nhaehnle

Differential Revision: https://reviews.llvm.org/D124192
  • Loading branch information
cdevadas committed Dec 17, 2022
1 parent cc037e1 commit 5692a7e
Show file tree
Hide file tree
Showing 17 changed files with 455 additions and 387 deletions.
57 changes: 40 additions & 17 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Expand Up @@ -98,10 +98,12 @@ static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
// 3: There's no free lane to spill, and no free register to save FP/BP,
// so we're forced to spill another VGPR to use for the spill.
auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
MFI->allocateWWMSpill(MF, Spill.VGPR);

FrameIndex = NewFI;

LLVM_DEBUG(
auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
} else {
Expand Down Expand Up @@ -777,17 +779,14 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
std::optional<int> BPSaveIndex = FuncInfo->BasePointerSaveIndex;

// VGPRs used for SGPR->VGPR spills
for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI)
continue;

for (const auto &Reg : FuncInfo->getWWMSpills()) {
Register VGPR = Reg.first;
int FI = Reg.second;
if (!ScratchExecCopy)
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
/*IsProlog*/ true);

buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, Reg.VGPR,
*Reg.FI);
buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR, FI);
}

for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
Expand Down Expand Up @@ -1054,17 +1053,15 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
}

Register ScratchExecCopy;
for (const SIMachineFunctionInfo::SGPRSpillVGPR &Reg :
FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI)
continue;

for (const auto &Reg : FuncInfo->getWWMSpills()) {
Register VGPR = Reg.first;
int FI = Reg.second;
if (!ScratchExecCopy)
ScratchExecCopy =
buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ false);

buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
Reg.VGPR, *Reg.FI);
buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, VGPR,
FI);
}

for (auto ReservedWWM : FuncInfo->wwmAllocation()) {
Expand Down Expand Up @@ -1262,6 +1259,32 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();

for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
// WRITELANE instructions used for SGPR spills can overwrite the inactive
// lanes of VGPRs and callee must spill and restore them even if they are
// marked Caller-saved.

// TODO: Handle this elsewhere at an early point. Walking through all MBBs
// here would be a bad heuristic. A better way should be by calling
// allocateWWMSpill during the regalloc pipeline whenever a physical
// register is allocated for the intended virtual registers. That will
// also help excluding the general use of WRITELANE/READLANE intrinsics
// that won't really need any such special handling.
if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
}
}

for (MachineBasicBlock &MBB : MF) {
for (auto &Reg : MFI->getWWMSpills())
MBB.addLiveIn(Reg.first);

MBB.sortUniqueLiveIns();
}

// Ignore the SGPRs the default implementation found.
SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

Expand All @@ -1285,8 +1308,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,

// VGPRs used for SGPR spilling need to be specially inserted in the prolog,
// so don't allow the default insertion to handle them.
for (auto SSpill : MFI->getSGPRSpillVGPRs())
SavedVGPRs.reset(SSpill.VGPR);
for (auto &Reg : MFI->getWWMSpills())
SavedVGPRs.reset(Reg.first);

LivePhysRegs LiveRegs;
LiveRegs.init(*TRI);
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
Expand Up @@ -309,8 +309,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {

// FIXME: Adding to live-ins redundant with reserving registers.
for (MachineBasicBlock &MBB : MF) {
for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(SSpill.VGPR);
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(Reg);
MBB.sortUniqueLiveIns();

// FIXME: The dead frame indices are replaced with a null register from
Expand Down
21 changes: 12 additions & 9 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Expand Up @@ -270,6 +270,16 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
return ArgInfo.LDSKernelId.getRegister();
}

void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Entry functions have no callers whose inactive lanes need preserving, so
  // no prolog/epilog spill slot is required for them.
  if (isEntryFunction())
    return;

  // A slot has already been reserved for this register; nothing to do.
  if (WWMSpills.count(VGPR))
    return;

  // Reserve a fresh spill stack slot and remember the (VGPR, frame index)
  // pairing so the prolog/epilog insertion can save/restore it.
  int FrameIdx = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
  WWMSpills.insert(std::make_pair(VGPR, FrameIdx));
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
MCPhysReg Reg) {
for (unsigned I = 0; CSRegs[I]; ++I) {
Expand Down Expand Up @@ -340,21 +350,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return false;
}

std::optional<int> SpillFI;
// We need to preserve inactive lanes, so always save, even caller-save
// registers.
if (!isEntryFunction()) {
SpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
}

SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI));
SpillVGPRs.push_back(LaneVGPR);

// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
for (MachineBasicBlock &BB : MF)
BB.addLiveIn(LaneVGPR);
} else {
LaneVGPR = SpillVGPRs.back().VGPR;
LaneVGPR = SpillVGPRs.back();
}

SpillLanes.push_back(SIRegisterInfo::SpilledReg(LaneVGPR, VGPRIndex));
Expand Down
27 changes: 14 additions & 13 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Expand Up @@ -431,17 +431,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
MCPhysReg getNextSystemSGPR() const;

public:
struct SGPRSpillVGPR {
// VGPR used for SGPR spills
Register VGPR;

// If the VGPR is used for SGPR spills in a non-entrypoint function, the
// stack slot used to save/restore it in the prolog/epilog.
std::optional<int> FI;

SGPRSpillVGPR(Register V, std::optional<int> F) : VGPR(V), FI(F) {}
};

struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
Expand Down Expand Up @@ -471,7 +460,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// frameindex key.
DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRToVGPRSpills;
unsigned NumVGPRSpillLanes = 0;
SmallVector<SGPRSpillVGPR, 2> SpillVGPRs;
SmallVector<Register, 2> SpillVGPRs;
using WWMSpillsMap = MapVector<Register, int>;
// To track the registers used in instructions that can potentially modify the
// inactive lanes. The WWM instructions and the writelane instructions for
// spilling SGPRs to VGPRs fall under such category of operations. The VGPRs
// modified by them should be spilled/restored at function prolog/epilog to
// avoid any undesired outcome. Each entry in this map holds a pair of values,
// the VGPR and its stack slot index.
WWMSpillsMap WWMSpills;

DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;

Expand Down Expand Up @@ -540,7 +537,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
: makeArrayRef(I->second);
}

ArrayRef<SGPRSpillVGPR> getSGPRSpillVGPRs() const { return SpillVGPRs; }
ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }

void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
Align Alignment = Align(4));

ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
return SpillAGPR;
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Expand Up @@ -711,8 +711,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);

for (auto SSpill : MFI->getSGPRSpillVGPRs())
reserveRegisterTuples(Reserved, SSpill.VGPR);
for (auto Reg : MFI->getSGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);

return Reserved;
}
Expand Down
48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
Expand Up @@ -8,7 +8,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: s_or_saveexec_b32 s4, -1
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: v_mov_b32_e32 v15, v1
; CHECK-NEXT: v_mov_b32_e32 v14, v2
Expand All @@ -25,14 +25,14 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: v_mov_b32_e32 v5, v11
; CHECK-NEXT: v_mov_b32_e32 v6, v10
; CHECK-NEXT: v_mov_b32_e32 v7, v9
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s4, s8
; CHECK-NEXT: s_mov_b32 s5, s8
Expand All @@ -47,19 +47,19 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_mov_b32 s5, s6
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_writelane_b32 v8, s4, 4
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v7, v9
; CHECK-NEXT: v_mov_b32_e32 v6, v10
Expand Down Expand Up @@ -115,8 +115,8 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
; CHECK-NEXT: v_writelane_b32 v8, s4, 13
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s4, v8, 13
; CHECK-NEXT: v_readlane_b32 s8, v8, 5
; CHECK-NEXT: v_readlane_b32 s9, v8, 6
Expand All @@ -133,20 +133,20 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_readlane_b32 s4, v8, 4
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: s_or_saveexec_b32 s4, -1
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
Expand Down

0 comments on commit 5692a7e

Please sign in to comment.