diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 89c7b6ab9ee43..50c63621b8e41 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1927,6 +1927,11 @@ def int_amdgcn_inverse_ballot :
   Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
             [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// Lowers to S_BITREPLICATE_B64_B32.
+// The argument must be uniform; otherwise, the result is undefined.
+def int_amdgcn_s_bitreplicate :
+  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
+
 class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     [data_ty],
     [
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index e409a24007a6b..0c5ed649bcdbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2994,6 +2994,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       applyMappingBFE(B, OpdMapper, false);
       return;
     case Intrinsic::amdgcn_inverse_ballot:
+    case Intrinsic::amdgcn_s_bitreplicate:
       applyDefaultMapping(OpdMapper);
       constrainOpWithReadfirstlane(B, MI, 2); // Mask
       return;
@@ -4546,6 +4547,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
       break;
     }
+    case Intrinsic::amdgcn_s_bitreplicate:
+      Register MaskReg = MI.getOperand(2).getReg();
+      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
+      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
     }
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f1e375ee52cb8..1bd7a28ca650e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6473,6 +6473,14 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
     return CreatedBB;
   }
 
+  // Legalize S_BITREPLICATE
+  if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
+    MachineOperand &Src = MI.getOperand(1);
+    if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
+      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+    return CreatedBB;
+  }
+
   // Legalize MIMG and MUBUF/MTBUF for shaders.
   //
   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2f3b0ff2f7621..c419b5f7a5711 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -362,7 +362,8 @@ let SubtargetPredicate = isGFX9Plus in {
   } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
 
   let isReMaterializable = 1 in
-  def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
+  def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32",
+    [(set i64:$sdst, (int_amdgcn_s_bitreplicate i32:$src0))]>;
 } // End SubtargetPredicate = isGFX9Plus
 
 let SubtargetPredicate = isGFX10Plus in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
new file mode 100644
index 0000000000000..027c9ef5e7cc3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i64 @llvm.amdgcn.s.bitreplicate(i32)
+
+define i64 @test_s_bitreplicate_constant() {
+; GFX11-LABEL: test_s_bitreplicate_constant:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
+  ret i64 %br
+}
+
+define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1) %out) {
+; GFX11-LABEL: test_s_bitreplicate_sgpr:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+  store i64 %br, ptr addrspace(1) %out
+  ret void
+}
+
+define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
+; GFX11-LABEL: test_s_bitreplicate_vgpr:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+  ret i64 %br
+}
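
For reference, S_BITREPLICATE_B64_B32 doubles each bit of its 32-bit source: bit i of the input is copied to bits 2*i and 2*i+1 of the 64-bit result. Below is a minimal C++ sketch of that behavior, not part of the patch; the function name bitreplicate_ref is chosen here for illustration and is not an LLVM helper.

#include <cstdint>

// Reference model of the bit-replication semantics: each of the 32 source
// bits is written to two adjacent bits of the 64-bit result.
static uint64_t bitreplicate_ref(uint32_t Src) {
  uint64_t Result = 0;
  for (unsigned I = 0; I < 32; ++I) {
    uint64_t Bit = (Src >> I) & 1;   // extract bit I of the source
    Result |= Bit << (2 * I);        // place it at bit 2*I ...
    Result |= Bit << (2 * I + 1);    // ... and at bit 2*I + 1
  }
  return Result;
}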