-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Insert spill codes for the SGPRs used for EXEC copy #79428
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Christudasan Devadasan (cdevadas) ChangesThe SGPR registers used for preserving EXEC mask while lowering the whole-wave register spills and copies should be preserved at the prolog and epilog if they are in the CSR range. It isn't happening when there is only wwm-copy lowered and there are no wwm-spills. This patch addresses that problem. Full diff: https://github.com/llvm/llvm-project/pull/79428.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9d062eb156d5c5a..9bd8996f22fca25 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1487,8 +1487,7 @@ void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
// The special SGPR spills like the one needed for FP, BP or any reserved
// registers delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
- MachineFunction &MF, BitVector &SavedVGPRs,
- bool NeedExecCopyReservedReg) const {
+ MachineFunction &MF, BitVector &SavedVGPRs) const {
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1504,9 +1503,10 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
- if (NeedExecCopyReservedReg) {
+ if (MFI->shouldPreserveExecCopyReservedReg()) {
Register ReservedReg = MFI->getSGPRForEXECCopy();
assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
+ MRI.reserveReg(ReservedReg, TRI);
Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
if (UnusedScratchReg) {
// If found any unused scratch SGPR, reserve the register itself for Exec
@@ -1570,7 +1570,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- bool NeedExecCopyReservedReg = false;
+ bool NeedsExecCopyReservedReg = false;
MachineInstr *ReturnMI = nullptr;
for (MachineBasicBlock &MBB : MF) {
@@ -1588,7 +1588,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
- NeedExecCopyReservedReg = true;
+ NeedsExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
(MFI->isChainFunction() &&
@@ -1602,6 +1602,10 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
+ // If found any wwm-spills, preserve the register(s) used for exec copy.
+ if (NeedsExecCopyReservedReg)
+ MFI->setPreserveExecCopyReservedReg();
+
// Remove any VGPRs used in the return value because these do not need to be saved.
// This prevents CSR restore from clobbering return VGPRs.
if (ReturnMI) {
@@ -1620,7 +1624,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!ST.hasGFX90AInsts())
SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
- determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
+ determinePrologEpilogSGPRSaves(MF, SavedVGPRs);
// The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
// allow the default insertion to handle them.
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index b3feb759ed811fb..926801dcecc3386 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -34,8 +34,8 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
RegScavenger *RS = nullptr) const override;
void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const;
- void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs,
- bool NeedExecCopyReservedReg) const;
+ void determinePrologEpilogSGPRSaves(MachineFunction &MF,
+ BitVector &SavedRegs) const;
void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc &DL,
LiveRegUnits &LiveUnits, Register FrameReg,
diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
index 9c3cd1bbd6b0fb6..acea9103a925d66 100644
--- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
@@ -137,5 +137,10 @@ bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // SGPRs reserved for exec copy should be preserved as we encountered WWM_COPY
+ // instances.
+ if (Changed)
+ MFI->setPreserveExecCopyReservedReg();
+
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b94d143a75e5edf..803feaf9cc5a968 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -671,8 +671,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
- GDSSize(MFI.getGDSSize()),
- DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+ GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
+ IsEntryFunction(MFI.isEntryFunction()),
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
@@ -684,9 +684,9 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
BytesInStackArgArea(MFI.getBytesInStackArgArea()),
ReturnsVoid(MFI.returnsVoid()),
+ PreserveExecCopyReservedReg(MFI.shouldPreserveExecCopyReservedReg()),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
- PSInputAddr(MFI.getPSInputAddr()),
- PSInputEnable(MFI.getPSInputEnable()),
+ PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
Mode(MFI.getMode()) {
for (Register Reg : MFI.getWWMReservedRegs())
WWMReservedRegs.push_back(regToString(Reg, TRI));
@@ -728,6 +728,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
ReturnsVoid = YamlMFI.ReturnsVoid;
+ PreserveExecCopyReservedReg = YamlMFI.PreserveExecCopyReservedReg;
if (YamlMFI.ScavengeFI) {
auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0336ec4985ea747..fcd6422ab2cc2a7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -277,6 +277,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
unsigned BytesInStackArgArea = 0;
bool ReturnsVoid = true;
+ bool PreserveExecCopyReservedReg = false;
std::optional<SIArgumentInfo> ArgInfo;
@@ -321,6 +322,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue("$sp_reg"));
YamlIO.mapOptional("bytesInStackArgArea", MFI.BytesInStackArgArea, 0u);
YamlIO.mapOptional("returnsVoid", MFI.ReturnsVoid, true);
+ YamlIO.mapOptional("preserveExecCopyReservedReg",
+ MFI.PreserveExecCopyReservedReg, false);
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
@@ -530,6 +533,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// To save/restore EXEC MASK around WWM spills and copies.
Register SGPRForEXECCopy;
+ // To save/restore the exec copy reserved register at function entry/exit.
+ // Since the handling of wwm copies and spills are scattered across multiple
+ // passes, this field would help efficiently insert its spills.
+ bool PreserveExecCopyReservedReg = false;
+
DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
// AGPRs used for VGPR spills.
@@ -688,6 +696,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; }
+ void setPreserveExecCopyReservedReg() { PreserveExecCopyReservedReg = true; }
+
+ bool shouldPreserveExecCopyReservedReg() const {
+ return PreserveExecCopyReservedReg;
+ }
+
ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const {
return SpillVGPR;
}
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index b63c8038f9c81bd..5d580f838e0e0e7 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -32,9 +32,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_writelane_b32 v2, s24, 5
; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11]
; GFX906-NEXT: v_writelane_b32 v2, s26, 6
+; GFX906-NEXT: v_writelane_b32 v41, s34, 2
; GFX906-NEXT: v_writelane_b32 v2, s27, 7
+; GFX906-NEXT: v_writelane_b32 v41, s35, 3
; GFX906-NEXT: v_writelane_b32 v2, s8, 8
-; GFX906-NEXT: v_writelane_b32 v41, s16, 2
+; GFX906-NEXT: v_writelane_b32 v41, s16, 4
; GFX906-NEXT: v_writelane_b32 v2, s9, 9
; GFX906-NEXT: v_writelane_b32 v41, s30, 0
; GFX906-NEXT: v_writelane_b32 v2, s4, 10
@@ -338,7 +340,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_readlane_b32 s31, v41, 1
; GFX906-NEXT: v_readlane_b32 s30, v41, 0
; GFX906-NEXT: ; kill: killed $vgpr40
-; GFX906-NEXT: v_readlane_b32 s4, v41, 2
+; GFX906-NEXT: v_readlane_b32 s34, v41, 2
+; GFX906-NEXT: v_readlane_b32 s35, v41, 3
+; GFX906-NEXT: v_readlane_b32 s4, v41, 4
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -379,23 +383,27 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: s_mov_b64 exec, -1
; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
; GFX908-NEXT: s_mov_b64 exec, s[18:19]
-; GFX908-NEXT: v_mov_b32_e32 v3, s16
+; GFX908-NEXT: v_mov_b32_e32 v3, s34
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; GFX908-NEXT: v_mov_b32_e32 v3, s35
+; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; GFX908-NEXT: v_mov_b32_e32 v3, s16
+; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
; GFX908-NEXT: s_addk_i32 s32, 0x2c00
; GFX908-NEXT: s_mov_b64 s[16:17], exec
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: v_writelane_b32 v2, s30, 0
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[16:17]
; GFX908-NEXT: s_mov_b64 s[16:17], exec
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: v_writelane_b32 v2, s31, 0
; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[16:17]
; GFX908-NEXT: ; implicit-def: $vgpr2
@@ -729,25 +737,31 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readlane_b32 s31, v0, 0
-; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
; GFX908-NEXT: s_mov_b64 s[4:5], exec
; GFX908-NEXT: s_mov_b64 exec, 1
-; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readlane_b32 s30, v0, 0
-; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
; GFX908-NEXT: ; kill: killed $vgpr40
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s34, v0
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s35, v0
+; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s4, v0
; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index cfb1cffdb509648..f680bbdd05cdd22 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -18,7 +18,9 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX90A-NEXT: s_mov_b64 exec, s[18:19]
-; GFX90A-NEXT: v_writelane_b32 v40, s16, 2
+; GFX90A-NEXT: v_writelane_b32 v40, s28, 2
+; GFX90A-NEXT: v_writelane_b32 v40, s29, 3
+; GFX90A-NEXT: v_writelane_b32 v40, s16, 4
; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
; GFX90A-NEXT: s_addk_i32 s32, 0x400
@@ -46,7 +48,9 @@ define void @vector_reg_liverange_split() #0 {
; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
; GFX90A-NEXT: ; kill: killed $vgpr0
-; GFX90A-NEXT: v_readlane_b32 s4, v40, 2
+; GFX90A-NEXT: v_readlane_b32 s28, v40, 2
+; GFX90A-NEXT: v_readlane_b32 s29, v40, 3
+; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: s_mov_b64 exec, -1
|
// SGPRs reserved for exec copy should be preserved as we encountered WWM_COPY | ||
// instances. | ||
if (Changed) | ||
MFI->setPreserveExecCopyReservedReg(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you really need to track a separate field for this? Can't you just check if RegForExecCopy is used?
The SGPR registers used for preserving EXEC mask while lowering the whole-wave register spills and copies should be preserved at the prolog and epilog if they are in the CSR range. It isn't happening when there is only wwm-copy lowered and there are no wwm-spills. This patch addresses that problem.
3c39b01
to
0a4e12d
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a mir test?
MIR test added. |
) The SGPR registers used for preserving EXEC mask while lowering the whole-wave register spills and copies should be preserved at the prolog and epilog if they are in the CSR range. It isn't happening when there is only wwm-copy lowered and there are no wwm-spills. This patch addresses that problem.
The SGPR registers used for preserving EXEC mask while lowering the whole-wave register spills and copies should be preserved at the prolog and epilog if they are in the CSR range. It isn't happening when there is only wwm-copy lowered and there are no wwm-spills. This patch addresses that problem.