Skip to content

Commit

Permalink
[AMDGPU] - Add s_bitreplicate intrinsic (#69209)
Browse files Browse the repository at this point in the history
Add intrinsic for s_bitreplicate. Lower to S_BITREPLICATE_B64_B32
machine instruction in both GISel and Selection DAG.

Support VGPR arguments by inserting a `v_readfirstlane`.
  • Loading branch information
OutOfCache committed Oct 31, 2023
1 parent 6a06155 commit b8d3ccd
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 1 deletion.
5 changes: 5 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1927,6 +1927,11 @@ def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Lowers to S_BITREPLICATE_B64_B32.
// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_bitreplicate :
DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
[data_ty],
[
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2994,6 +2994,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingBFE(B, OpdMapper, false);
return;
case Intrinsic::amdgcn_inverse_ballot:
case Intrinsic::amdgcn_s_bitreplicate:
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2); // Mask
return;
Expand Down Expand Up @@ -4546,6 +4547,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
break;
}
case Intrinsic::amdgcn_s_bitreplicate:
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
}
break;
}
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6473,6 +6473,14 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}

// Legalize S_BITREPLICATE
if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
MachineOperand &Src = MI.getOperand(1);
if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
return CreatedBB;
}

// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,8 @@ let SubtargetPredicate = isGFX9Plus in {
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]

let isReMaterializable = 1 in
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32",
[(set i64:$sdst, (int_amdgcn_s_bitreplicate i32:$src0))]>;
} // End SubtargetPredicate = isGFX9Plus

let SubtargetPredicate = isGFX10Plus in {
Expand Down
45 changes: 45 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

declare i64 @llvm.amdgcn.s.bitreplicate(i32)

define i64 @test_s_bitreplicate_constant() {
; GFX11-LABEL: test_s_bitreplicate_constant:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
ret i64 %br
}

define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1) %out) {
; GFX11-LABEL: test_s_bitreplicate_sgpr:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
store i64 %br, ptr addrspace(1) %out
ret void
}

define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
; GFX11-LABEL: test_s_bitreplicate_vgpr:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
ret i64 %br
}

0 comments on commit b8d3ccd

Please sign in to comment.