diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 0daa5a71340d6..600fbabd93615 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1937,6 +1937,12 @@ def int_amdgcn_s_bitreplicate : def int_amdgcn_s_quadmask : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; +// Lowers to S_WQM_B{32,64} +// The argument must be uniform; otherwise, the result is undefined. +// Does not enable whole quad mode (WQM); merely calculates the bitmask. +def int_amdgcn_s_wqm : + DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; + class AMDGPUWaveReduce : Intrinsic< [data_ty], [ diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 259af55885fc0..a78eeef621620 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2996,6 +2996,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_inverse_ballot: case Intrinsic::amdgcn_s_bitreplicate: case Intrinsic::amdgcn_s_quadmask: + case Intrinsic::amdgcn_s_wqm: applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 2); // Mask return; @@ -4538,7 +4539,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } - case Intrinsic::amdgcn_s_quadmask: { + case Intrinsic::amdgcn_s_quadmask: + case Intrinsic::amdgcn_s_wqm: { Register MaskReg = MI.getOperand(2).getReg(); unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 731969253c2b9..c5d96b68ddc9e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6484,10 +6484,12 @@ 
SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } - // Legalize S_BITREPLICATE and S_QUADMASK + // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM: move a VGPR source to SGPR. if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 || MI.getOpcode() == AMDGPU::S_QUADMASK_B32 || - MI.getOpcode() == AMDGPU::S_QUADMASK_B64) { + MI.getOpcode() == AMDGPU::S_QUADMASK_B64 || + MI.getOpcode() == AMDGPU::S_WQM_B32 || + MI.getOpcode() == AMDGPU::S_WQM_B64) { MachineOperand &Src = MI.getOperand(1); if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index edbfd79db3fdb..2af92e8e38184 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -216,8 +216,10 @@ let Defs = [SCC] in { def S_NOT_B64 : SOP1_64 <"s_not_b64", [(set i64:$sdst, (UniformUnaryFrag i64:$src0))] >; - def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; - def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; + def S_WQM_B32 : SOP1_32 <"s_wqm_b32", + [(set i32:$sdst, (int_amdgcn_s_wqm i32:$src0))]>; + def S_WQM_B64 : SOP1_64 <"s_wqm_b64", + [(set i64:$sdst, (int_amdgcn_s_wqm i64:$src0))]>; } // End Defs = [SCC] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll new file mode 100644 index 0000000000000..6676dac19ba79 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + +declare i32 @llvm.amdgcn.s.wqm.i32(i32) +declare i64 @llvm.amdgcn.s.wqm.i64(i64) + +define i32 
@test_s_wqm_constant_i32() { +; GFX11-LABEL: test_s_wqm_constant_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_wqm_b32 s0, 0x85fe3a92 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 u0x85FE3A92) + ret i32 %br +} + +define amdgpu_cs void @test_s_wqm_sgpr_i32(i32 inreg %mask, ptr addrspace(1) %out) { +; GFX11-LABEL: test_s_wqm_sgpr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_wqm_b32 s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask) + store i32 %br, ptr addrspace(1) %out + ret void +} + +define i32 @test_s_wqm_vgpr_i32(i32 %mask) { +; GFX11-LABEL: test_s_wqm_vgpr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_wqm_b32 s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask) + ret i32 %br +} + +define i64 @test_s_wqm_constant_i64() { +; GFX11-LABEL: test_s_wqm_constant_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0x85fe3a92 +; GFX11-NEXT: s_mov_b32 s1, 0x3a9285fe +; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 u0x3A9285FE85FE3A92) + ret i64 %br +} + +define amdgpu_cs void @test_s_wqm_sgpr_i64(i64 inreg %mask, ptr addrspace(1) %out) { +; GFX11-LABEL: test_s_wqm_sgpr_i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_nop 0 +; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask) + store i64 %br, ptr addrspace(1) %out + ret void +} + +define i64 @test_s_wqm_vgpr_i64(i64 %mask) { +; GFX11-LABEL: test_s_wqm_vgpr_i64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask) + ret i64 %br +}