diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 848cd7471be23..768358c345f0a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1187,6 +1187,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    performs subtraction only if the memory value is greater than or
                                                    equal to the data value.
 
+  llvm.amdgcn.s.getpc                              Provides access to the s_getpc_b64 instruction, but with the return value
+                                                   sign-extended from the width of the underlying PC hardware register even on
+                                                   processors where the s_getpc_b64 instruction returns a zero-extended value.
+
   ==============================================   ==========================================================
 
 .. TODO::
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 53d9b97e7edf1..18dceb6ceac47 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1814,6 +1814,8 @@ def int_amdgcn_s_setreg :
 // not cross a 4Gb address boundary. Use for any other purpose may not
 // produce the desired results as optimizations may cause code movement,
 // especially as we explicitly use IntrNoMem to allow optimizations.
+// This intrinsic always returns PC sign-extended from 48 bits even if the
+// s_getpc_b64 instruction returns a zero-extended value.
 def int_amdgcn_s_getpc :
   ClangBuiltin<"__builtin_amdgcn_s_getpc">,
   DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 98bfc408a93fe..dabbd540a10b4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1278,6 +1278,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// values.
   bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
 
+  // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
+  // of sign-extending.
+  bool hasGetPCZeroExtension() const { return GFX12Insts; }
+
   /// \returns SGPR allocation granularity supported by the subtarget.
   unsigned getSGPRAllocGranule() const {
     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df1444866..a02c2a4659082 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -188,7 +188,7 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
         .addImm(MFI->getGITPtrHigh())
         .addReg(TargetReg, RegState::ImplicitDefine);
   } else {
-    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
     BuildMI(MBB, I, DL, GetPC64, TargetReg);
   }
   Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a67a37fcc518d..48fdb803e9d73 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2410,13 +2410,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
    // instruction.
 
+    int64_t Adjust = 0;
+    if (ST.hasGetPCZeroExtension()) {
+      // Fix up hardware that does not sign-extend the 48-bit PC value by
+      // inserting: s_sext_i32_i16 reghi, reghi
+      Bundler.append(
+          BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
+      Adjust += 4;
+    }
+
     if (OpLo.isGlobal())
-      OpLo.setOffset(OpLo.getOffset() + 4);
+      OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
     Bundler.append(
         BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
 
     if (OpHi.isGlobal())
-      OpHi.setOffset(OpHi.getOffset() + 12);
+      OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                        .addReg(RegHi)
                        .add(OpHi));
@@ -2480,6 +2489,19 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::S_MUL_I64_I32_PSEUDO:
     MI.setDesc(get(AMDGPU::S_MUL_U64));
     break;
+
+  case AMDGPU::S_GETPC_B64_pseudo:
+    MI.setDesc(get(AMDGPU::S_GETPC_B64));
+    if (ST.hasGetPCZeroExtension()) {
+      Register Dst = MI.getOperand(0).getReg();
+      Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+      // Fix up hardware that does not sign-extend the 48-bit PC value by
+      // inserting: s_sext_i32_i16 dsthi, dsthi
+      BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
+              DstHi)
+          .addReg(DstHi);
+    }
+    break;
   }
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d914c3d9032f5..e05e966c31579 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -292,8 +292,11 @@ def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
 def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
 def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
 
+def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
+// PSEUDO includes a workaround for a hardware anomaly where some ASICs
+// zero-extend the result from 48 bits instead of sign-extending.
 let isReMaterializable = 1 in
-def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
+def S_GETPC_B64_pseudo : SOP1_64_0 <"s_getpc_b64",
   [(set i64:$sdst, (int_amdgcn_s_getpc))]
 >;
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
index 47b09d8fb6754..ee4e06bb53ad7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
@@ -723,8 +723,9 @@ define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32
 ; GFX12-LABEL: s_buffer_load_index_across_bb:
 ; GFX12:       ; %bb.0: ; %main_body
 ; GFX12-NEXT:    s_getpc_b64 s[4:5]
-; GFX12-NEXT:    s_add_co_u32 s4, s4, gv@gotpcrel32@lo+4
-; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+12
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, gv@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+16
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
index e41c42c4f40b8..81aa3a39de42f 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir
@@ -581,16 +581,16 @@ body:             |
   bb.0:
 
     ; GCN-LABEL: name: test_remat_s_getpc_b64
-    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64
+    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_pseudo
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3
-    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1
     ; GCN-NEXT: S_ENDPGM 0
-    %0:sgpr_64 = S_GETPC_B64
-    %1:sgpr_64 = S_GETPC_B64
-    %2:sgpr_64 = S_GETPC_B64
+    %0:sgpr_64 = S_GETPC_B64_pseudo
+    %1:sgpr_64 = S_GETPC_B64_pseudo
+    %2:sgpr_64 = S_GETPC_B64_pseudo
     S_NOP 0, implicit %0
     S_NOP 0, implicit %1
     S_NOP 0, implicit %2
@@ -604,15 +604,15 @@ body:             |
   bb.0:
 
     ; GCN-LABEL: name: test_remat_s_getpc_b64_2
-    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64
-    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64
+    ; GCN: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
+    ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_GETPC_B64_pseudo
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.3, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.3, addrspace 5)
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = COPY renamable $sgpr2
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr3
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64
+    ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.5, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.5, addrspace 5)
     ; GCN-NEXT: renamable $sgpr0 = COPY killed renamable $sgpr1
     ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.4, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.4, addrspace 5)
@@ -635,9 +635,9 @@ body:             |
     ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.4, addrspace 5)
     ; GCN-NEXT: dead renamable $sgpr0 = S_ADDC_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc, implicit $scc
     ; GCN-NEXT: S_ENDPGM 0
-    %0:sreg_64 = S_GETPC_B64
-    %1:sreg_64 = S_GETPC_B64
-    %2:sreg_64 = S_GETPC_B64
+    %0:sreg_64 = S_GETPC_B64_pseudo
+    %1:sreg_64 = S_GETPC_B64_pseudo
+    %2:sreg_64 = S_GETPC_B64_pseudo
     %4:sreg_32 = COPY %0.sub0:sreg_64
     %5:sreg_32 = COPY %0.sub1:sreg_64
     %6:sreg_32 = COPY %1.sub0:sreg_64
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 598d7a8033c2e..84953b70c3bb9 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -1,32 +1,86 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s
-
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -stress-regalloc=2 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
 define void @test_remat_s_getpc_b64() {
-; CHECK-LABEL: test_remat_s_getpc_b64:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v0, s30, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    v_writelane_b32 v0, s31, 1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_mov_b32_e32 v2, s5
-; CHECK-NEXT:    global_store_dwordx2 v[1:2], v[1:2], off
-; CHECK-NEXT:    v_readlane_b32 s31, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v0, 0
-; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_remat_s_getpc_b64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    global_store_dwordx2 v[1:2], v[1:2], off
+; GFX9-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_remat_s_getpc_b64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX11-NEXT:    global_store_b64 v[1:2], v[1:2], off
+; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_remat_s_getpc_b64:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    v_writelane_b32 v0, s30, 0
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    v_writelane_b32 v0, s31, 1
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s31, v0, 1
+; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    v_readlane_b32 s30, v0, 0
+; GFX12-NEXT:    global_store_b64 v[1:2], v[1:2], off
+; GFX12-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
+; GFX12-NEXT:    s_mov_b32 exec_lo, s0
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i64 @llvm.amdgcn.s.getpc()
   tail call void asm sideeffect "", "s"(i64 %0)
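
Note (illustration only, not part of the patch): the sequence below sketches what the new S_GETPC_B64_pseudo expands to on GFX12 for a GOT access, assembled from the llvm.amdgcn.s.buffer.load.ll checks above; the register numbers and the gv symbol are simply the ones used in that test. s_getpc_b64 returns the address of the instruction that follows it, so the 4-byte s_sext_i32_i16 fixup inserted between s_getpc_b64 and the add pair moves those adds 4 bytes further from that base, which is why the expected relocation addends grow from lo+4/hi+12 to lo+8/hi+16.

    s_getpc_b64     s[4:5]                       ; 64-bit PC; GFX12 hardware zero-extends it from 48 bits
    s_sext_i32_i16  s5, s5                       ; workaround: sign-extend PC bits [47:32] held in the high half
    s_add_co_u32    s4, s4, gv@gotpcrel32@lo+8   ; old addend was 4, plus 4 for the inserted s_sext_i32_i16
    s_add_co_ci_u32 s5, s5, gv@gotpcrel32@hi+16  ; old addend was 12, plus 4
    s_load_b64      s[4:5], s[4:5], 0x0          ; fetch the real pointer from the GOT entry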