3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -229,7 +229,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Src);
return;
} else if (Opcode == AMDGPU::SI_TCRETURN ||
-             Opcode == AMDGPU::SI_TCRETURN_GFX) {
+             Opcode == AMDGPU::SI_TCRETURN_GFX ||
+             Opcode == AMDGPU::SI_TCRETURN_CHAIN) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
} else if (AMDGPU::getT16D16Helper(Opcode)) {
28 changes: 17 additions & 11 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -815,9 +815,8 @@ def SI_CALL : SPseudoInstSI <
let isConvergent = 1;
}

-class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
-  (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
-  [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
+class SI_TCRETURN_Pseudo<RegisterClass rc, list<dag> pattern = []>
+  : SPseudoInstSI <(outs), (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), pattern> {
let Size = 4;
let FixedSize = 1;
let isCall = 1;
@@ -831,8 +830,15 @@ class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
}

// Tail call handling pseudo
-def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
-def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
+def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64,
+  [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64,
+  [(AMDGPUtc_return_gfx i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]>;
+
+// Tail call for chain calling conventions.
+// Uses unrestricted SGPR_64 instead of CCR_SGPR_64 because chain calls
+// never return and don't need to preserve any SGPRs.
+def SI_TCRETURN_CHAIN : SI_TCRETURN_Pseudo<SGPR_64>;

// Handle selecting indirect tail calls
def : GCNPat<
@@ -862,13 +868,13 @@ multiclass SI_CS_CHAIN_TC<
// This is essentially a tail call, but it also takes a mask to put in EXEC
// right before jumping to the callee.
def NAME: SPseudoInstSI <(outs),
-    (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;
+    (ins SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)>;

// Same as above, but it will first try to reallocate the VGPRs, and choose an
// EXEC mask and a callee depending on the success of the reallocation attempt.
def _DVGPR : SPseudoInstSI <(outs),
-    (ins CCR_SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
-         SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)>;
+    (ins SGPR_64:$src0, i64imm:$callee, i32imm:$fpdiff, execrc:$exec,
+         SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)>;
} // End FixedSize = 0 etc
}

@@ -880,7 +886,7 @@ multiclass si_cs_chain_tc_pattern<
dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
def : GCNPat<
(AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
-    (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
+    (tc SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
>;
}

@@ -907,8 +913,8 @@ multiclass si_cs_chain_tc_dvgpr_patterns<
(AMDGPUtc_return_chain_dvgpr i64:$src0, callee, (i32 timm:$fpdiff),
execvt:$exec, i32:$numvgprs,
execvt:$fbexec, i64:$fbcallee),
-    (tc CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
-        SSrc_b32:$numvgprs, execrc:$fbexec, CCR_SGPR_64:$fbcallee)
+    (tc SGPR_64:$src0, (i64 0), i32imm:$fpdiff, execrc:$exec,
+        SSrc_b32:$numvgprs, execrc:$fbexec, SGPR_64:$fbcallee)
>;
}
}
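For orientation, here is a minimal LLVM IR sketch, patterned on the amdgpu_cs_chain tests touched later in this PR (the function names are illustrative), of a chain call that ends up as the new SI_TCRETURN_CHAIN. Note that SI_TCRETURN_CHAIN takes the default empty pattern list: it is not selected directly, but produced from the SI_CS_CHAIN_TC pseudos by SILateBranchLowering (next file). Because a chain call never returns, its callee address can live in any SGPR_64 pair rather than the restricted CCR classes:

```llvm
; Chain callee: never returns to the caller.
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })

define amdgpu_cs_chain void @chain_call_sketch(<3 x i32> inreg %sgpr,
                                               { i32, ptr addrspace(5), i32, i32 } %vgpr) {
  ; Selected as a chain tail call; after expansion it is emitted as
  ; s_setpc_b64 to @callee, with the i32 -1 mask moved into exec just
  ; before the jump.
  call void (ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...)
        @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr,
                              { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
  unreachable
}
```

The exec-mask argument (i32 -1 here) is what the SI_CS_CHAIN_TC pseudo carries in its $exec operand until expandChainCall strips it and rewrites the instruction to SI_TCRETURN_CHAIN.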
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -186,7 +186,7 @@ void SILateBranchLowering::expandChainCall(MachineInstr &MI,
for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
MI.removeOperand(OpIdx);

-  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
+  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN_CHAIN));
}

void SILateBranchLowering::earlyTerm(MachineInstr &MI,
[Test file: GlobalISel IR translator checks for amdgpu_cs_chain calls (filename not shown in this view)]
@@ -22,7 +22,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+ ; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -51,7 +51,7 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee
+ ; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -86,7 +86,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+ ; GFX11-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
@@ -115,7 +115,7 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p
; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
+ ; GFX10-NEXT: [[GV1:%[0-9]+]]:sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
[Test file: dynamic-VGPR chain-call codegen checks (filename not shown in this view)]
@@ -4,7 +4,6 @@

declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
-declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn

define amdgpu_cs_chain void @dynamic_vgprs(i32 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 inreg %num_vgpr) {
; GISEL-GFX12-LABEL: dynamic_vgprs:
@@ -94,4 +93,45 @@ define amdgpu_cs_chain void @constants(<3 x i32> inreg %sgpr, { i32, ptr addrspa
unreachable
}

define amdgpu_cs_chain void @high_sgpr_pressure(<30 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
; GISEL-GFX12-LABEL: high_sgpr_pressure:
; GISEL-GFX12: ; %bb.0:
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
; GISEL-GFX12-NEXT: s_mov_b32 s30, callee_high_sgpr@abs32@lo
; GISEL-GFX12-NEXT: s_mov_b32 s31, callee_high_sgpr@abs32@hi
; GISEL-GFX12-NEXT: s_mov_b32 s34, retry_vgpr_alloc@abs32@lo
; GISEL-GFX12-NEXT: s_mov_b32 s35, retry_vgpr_alloc@abs32@hi
; GISEL-GFX12-NEXT: s_alloc_vgpr 64
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_cselect_b64 s[30:31], s[30:31], s[34:35]
; GISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
;
; DAGISEL-GFX12-LABEL: high_sgpr_pressure:
; DAGISEL-GFX12: ; %bb.0:
; DAGISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_expcnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
; DAGISEL-GFX12-NEXT: s_mov_b32 s31, retry_vgpr_alloc@abs32@hi
; DAGISEL-GFX12-NEXT: s_mov_b32 s30, retry_vgpr_alloc@abs32@lo
; DAGISEL-GFX12-NEXT: s_mov_b32 s35, callee_high_sgpr@abs32@hi
; DAGISEL-GFX12-NEXT: s_mov_b32 s34, callee_high_sgpr@abs32@lo
; DAGISEL-GFX12-NEXT: s_alloc_vgpr 64
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_cselect_b64 s[34:35], s[34:35], s[30:31]
; DAGISEL-GFX12-NEXT: s_cselect_b32 exec_lo, 7, -1
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_setpc_b64 s[34:35]
call void(ptr, i32, <30 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_high_sgpr, i32 7, <30 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 1, i32 inreg 64, i32 inreg -1, ptr @retry_vgpr_alloc)
unreachable
}

declare amdgpu_cs_chain void @callee_high_sgpr(<30 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
declare amdgpu_cs_chain_preserve void @retry_vgpr_alloc(<3 x i32> inreg %sgpr)
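As a hypothetical companion to the new test above (not part of this PR), the same 30-element inreg SGPR payload with a plain, non-dynamic-VGPR chain call is sketched below. With SI_TCRETURN_CHAIN and the SI_CS_CHAIN_TC pseudos drawing the callee from the full SGPR_64 class, the address can be materialized in any aligned SGPR pair (the test above ends up using s[30:31] and s[34:35]) even though s0-s29 are taken by the inreg arguments:

```llvm
; Hypothetical variant: plain chain call (no VGPR reallocation) under the same
; SGPR pressure as the high_sgpr_pressure test above. Names are illustrative.
declare amdgpu_cs_chain void @callee_high_sgpr(<30 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })

define amdgpu_cs_chain void @high_sgpr_simple(<30 x i32> inreg %sgpr,
                                              { i32, ptr addrspace(5), i32, i32 } %vgpr) {
  call void (ptr, i32, <30 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...)
        @llvm.amdgcn.cs.chain(ptr @callee_high_sgpr, i32 -1, <30 x i32> inreg %sgpr,
                              { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
  unreachable
}
```

The dynamic-VGPR form in the test additionally needs a second pair for the fallback callee ($fbcallee), which is why the old CCR_SGPR_64 restriction was especially tight in that case.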