[AMDGPU] Allow any SGPRs for chain callees #168345
Conversation
Chain calls never return and don't need to preserve any SGPRs. Therefore, we don't need to limit the registers used for callees to the CCR_SGPR_64 register class - any available SGPR pair will do.

Also introduce a new pseudo, SI_TCRETURN_CHAIN, which takes a plain SGPR_64 operand. This is necessary because we can no longer lower SI_CS_CHAIN_TC to SI_TCRETURN: the former's callee operand now accepts a wider range of registers than the latter's.
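For context, a minimal IR sketch (function names hypothetical, not taken from the patch) of the kind of call this affects: an amdgpu_cs_chain function hands off to the next shader through llvm.amdgcn.cs.chain and never returns, so the SGPR pair holding the callee address has no callee-saved constraints to respect.

declare amdgpu_cs_chain void @next_shader(<3 x i32> inreg, { i32, i32 })

define amdgpu_cs_chain void @entry(<3 x i32> inreg %sgpr, { i32, i32 } %vgpr) {
  ; The callee address is materialized into an SGPR pair; after this change
  ; it may be allocated from the full SGPR_64 class rather than CCR_SGPR_64.
  call void (ptr, i32, <3 x i32>, { i32, i32 }, i32, ...)
      @llvm.amdgcn.cs.chain(ptr @next_shader, i32 -1, <3 x i32> inreg %sgpr,
                            { i32, i32 } %vgpr, i32 0)
  unreachable
}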
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Diana Picus (rovka)

Changes: as described above.

Patch is 82.71 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/168345.diff

9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index dee3dff3bf575..bf9b4297bd435 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Src);
return;
} else if (Opcode == AMDGPU::SI_TCRETURN ||
- Opcode == AMDGPU::SI_TCRETURN_GFX) {
+ Opcode == AMDGPU::SI_TCRETURN_GFX ||
+ Opcode == AMDGPU::SI_TCRETURN_CHAIN) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
} else if (AMDGPU::getT16D16Helper(Opcode)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6cc9b3cc67530..edb839c9a50e9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -815,9 +815,8 @@ def SI_CALL : SPseudoInstSI <
let isConvergent = 1;
}
-class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
- (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
- [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
+class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []>
+ : SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> {
let Size = 4;
let FixedSize = 1;
let isCall = 1;
@@ -831,8 +830,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
}
// Tail call handling pseudo
-def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
-def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
+def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64,
+ [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64,
+ [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+
+// Tail call for chain calling conventions.
+// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls
+// never return and don't need to preserve any SGPRs.
+def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>;
// Handle selecting indirect tail calls
def : GCNPat<
@@ -862,13 +868,13 @@ multiclass SI_CS_CHAIN_TC<
// This is essentially a tail call, but it also takes a mask to put in EXEC
// right before jumping to the callee.
def NAME: SPseudoInstSI <(outs),
- (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
+ (ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
// Same as above, but it will first try to reallocate the VGPRs, and choose an
// EXEC mask and a callee depending on the success of the reallocation attempt.
def _DVGPR : SPseudoInstSI <(outs),
- (ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
- SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>;
+ (ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
+ SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>;
} // End FixedSize = 0 etc
}
@@ -880,7 +886,7 @@ multiclass si_cs_chain_tc_pattern<
dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
def : GCNPat<
(AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
- (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
+ (tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
>;
}
@@ -907,8 +913,8 @@ multiclass si_cs_chain_tc_dvgpr_patterns<
(AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff),
execvt:$exec, i32:$numvgprs,
execvt:$fbexec, i64:$fbcallee),
- (tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
- SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)
+ (tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
+ SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)
>;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index 6537b79d58021..340c9f682971c 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -186,7 +186,7 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI,
for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
MI.removeOperand(OpIdx);
- MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
+ MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN));
}
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
index d4b485a379184..3043484b48717 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll
@@ -22,7 +22,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+ ; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -51,7 +51,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+ ; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -86,7 +86,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+ ; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -115,7 +115,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+ ; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
index 3e80a58bda4a0..c59989a68dd4d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
@@ -4,7 +4,6 @@
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
-declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn
define amdgpu_cs_chain void @dynamic_vgprs(i32 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 inreg %num_vgpr) {
; GISEL-GFX12-LABEL: dynamic_vgprs:
@@ -94,4 +93,45 @@ define amdgpu_cs_chain void @constants(<3 x i32> inreg %sgpr, { i32, ptr addrspa
unreachable
}
+define amdgpu_cs_chain void @high_sgpr_pressure(<30 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
+; GISEL-GFX12-LABEL: high_sgpr_pressure:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: s_mov_b32 s30, callee_high_sgpr@abs32@lo
+; GISEL-GFX12-NEXT: s_mov_b32 s31, callee_high_sgpr@abs32@hi
+; GISEL-GFX12-NEXT: s_mov_b32 s34, retry_vgpr_alloc@abs32@lo
+; GISEL-GFX12-NEXT: s_mov_b32 s35, retry_vgpr_alloc@abs32@hi
+; GISEL-GFX12-NEXT: s_alloc_vgpr 64
+; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; GISEL-GFX12-NEXT: s_cselect_b64 s[30:31], s[30:31], s[34:35]
+; GISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
+; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL-GFX12-LABEL: high_sgpr_pressure:
+; DAGISEL-GFX12: ; %bb.0:
+; DAGISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-GFX12-NEXT: s_mov_b32 s31, retry_vgpr_alloc@abs32@hi
+; DAGISEL-GFX12-NEXT: s_mov_b32 s30, retry_vgpr_alloc@abs32@lo
+; DAGISEL-GFX12-NEXT: s_mov_b32 s35, callee_high_sgpr@abs32@hi
+; DAGISEL-GFX12-NEXT: s_mov_b32 s34, callee_high_sgpr@abs32@lo
+; DAGISEL-GFX12-NEXT: s_alloc_vgpr 64
+; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; DAGISEL-GFX12-NEXT: s_cselect_b64 s[34:35], s[34:35], s[30:31]
+; DAGISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
+; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; DAGISEL-GFX12-NEXT: s_setpc_b64 s[34:35]
+ call void(ptr, i32, <30 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_high_sgpr, i32 7, <30 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 1, i32 inreg 64, i32 inreg -1, ptr @retry_vgpr_alloc)
+ unreachable
+}
+
+declare amdgpu_cs_chain void @callee_high_sgpr(<30 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @retry_vgpr_alloc(<3 x i32> inreg %sgpr)
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index ece86627cbd92..43ba2925914a0 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -35,7 +35,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+ ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_64 = COPY [[REG_SEQUENCE]]
; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY10]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
;
; GISEL-GFX10-LABEL: name: chain_to_chain
@@ -67,7 +67,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+ ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_64 = COPY [[REG_SEQUENCE]]
; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY11]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
;
; DAGISEL-GFX11-LABEL: name: chain_to_chain
@@ -83,7 +83,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
@@ -112,7 +112,7 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad
; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
@@ -161,7 +161,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+ ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_64 = COPY [[REG_SEQUENCE]]
; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY10]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
;
; GISEL-GFX10-LABEL: name: cs_to_chain
@@ -193,7 +193,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+ ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_64 = COPY [[REG_SEQUENCE]]
; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY11]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51
;
; DAGISEL-GFX11-LABEL: name: cs_to_chain
@@ -209,7 +209,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
- ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
@@ -238,7 +238,7 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5
; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
- ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
@@ -287,7 +287,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]]
+ ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_64 = COPY [[REG_SEQUENCE]]
; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY10]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
;
; GISEL-GFX10-LABEL: name: chain_to_chain_preserve
@@ -319,7 +319,7 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3
; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve
; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve
; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GISEL-GFX10-NEXT: [[C...
[truncated]
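For reference, a hedged sketch of the new lowering path (register numbers and MIR operand details are illustrative, condensed from the patch's tests, not quoted verbatim): SILateBranchLowering::expandChainCall now retargets the chain pseudo to SI_TCRETURN_CHAIN instead of SI_TCRETURN, and AMDGPUMCInstLower emits that as s_setpc_b64.

; MIR before SILateBranchLowering (wave32, trailing operands elided):
SI_CS_CHAIN_TC_W32 %callee:sgpr_64, @callee, 0, -1, amdgpu_allvgprs
; after expandChainCall (EXEC mask set up separately, extra operands removed):
SI_TCRETURN_CHAIN %callee:sgpr_64, @callee, 0
; final assembly from AMDGPUMCInstLower:
s_setpc_b64 s[4:5]   ; any SGPR pair, no longer limited to CCR_SGPR_64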