Skip to content

Commit

Permalink
[AMDGPU][GISel] Add inverse ballot intrinsic
Browse files Browse the repository at this point in the history
The inverse ballot intrinsic takes in a boolean mask for all lanes and
returns the boolean for the current lane. See SPIR-V's
`subgroupInverseBallot()` in the [[ https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt | GL_KHR_shader_subgroup extension ]].
This allows decision making via branch and select instructions with a manually
manipulated mask.

Implemented in both GlobalISel and SelectionDAG, since both are currently supported.
The SelectionDAG path requires pseudo instructions so that it can use the custom inserter.

The boolean mask needs to be uniform for all lanes.
Therefore we expect SGPR input. In case the source is in a
VGPR, we insert one or more `v_readfirstlane` instructions.

Reviewed By: nhaehnle

Differential Revision: https://reviews.llvm.org/D146287
  • Loading branch information
OutOfCache committed Apr 6, 2023
1 parent 51b5b29 commit 04317d4
Show file tree
Hide file tree
Showing 10 changed files with 471 additions and 0 deletions.
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Expand Up @@ -1662,6 +1662,10 @@ def int_amdgcn_ballot :
Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Inverse of llvm.amdgcn.ballot: takes a boolean mask for all lanes and
// returns the boolean for the current lane. The mask needs to be uniform
// across all lanes.
def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_readfirstlane :
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Expand Up @@ -28,6 +28,7 @@
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
Expand Down Expand Up @@ -2528,6 +2529,18 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
return;
case Intrinsic::amdgcn_inverse_ballot:
// Pick the pseudo instruction by the bit width of the mask (operand 1).
// Only 32-bit and 64-bit masks are handled; any other width is treated as
// unreachable.
switch (N->getOperand(1).getValueSizeInBits()) {
case 32:
Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
break;
case 64:
Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
break;
default:
llvm_unreachable("Unsupported size for inverse ballot mask.");
}
break;
default:
SelectCode(N);
return;
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Expand Up @@ -1046,6 +1046,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectIntrinsicCmp(I);
case Intrinsic::amdgcn_ballot:
return selectBallot(I);
case Intrinsic::amdgcn_inverse_ballot:
return selectInverseBallot(I);
case Intrinsic::amdgcn_reloc_constant:
return selectRelocConstant(I);
case Intrinsic::amdgcn_groupstaticsize:
Expand Down Expand Up @@ -1351,6 +1353,17 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
return true;
}

/// Select llvm.amdgcn.inverse.ballot.
///
/// Emits a COPY from the mask (operand 2 of \p I) into the result register
/// (operand 0 of \p I) immediately before \p I, then erases \p I.
///
/// \returns true unconditionally.
bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
  const Register Dst = I.getOperand(0).getReg();
  const Register Mask = I.getOperand(2).getReg();
  MachineBasicBlock &MBB = *I.getParent();

  // The intrinsic itself becomes a plain register copy of the mask.
  BuildMI(MBB, &I, I.getDebugLoc(), TII.get(AMDGPU::COPY), Dst).addReg(Mask);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Expand Up @@ -112,6 +112,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDivScale(MachineInstr &MI) const;
bool selectIntrinsicCmp(MachineInstr &MI) const;
bool selectBallot(MachineInstr &I) const;
bool selectInverseBallot(MachineInstr &I) const;
bool selectRelocConstant(MachineInstr &I) const;
bool selectGroupStaticSize(MachineInstr &I) const;
bool selectReturnAddress(MachineInstr &I) const;
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Expand Up @@ -3004,6 +3004,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_ubfe:
applyMappingBFE(OpdMapper, false);
return;
case Intrinsic::amdgcn_inverse_ballot:
// Apply the default mapping first, then constrain the mask (operand 2) with
// readfirstlane: the mask is expected in an SGPR, but a VGPR source is
// accepted by the instruction mapping.
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(MI, MRI, 2); // Mask
return;
case Intrinsic::amdgcn_ballot:
// Use default handling and insert copy to vcc source.
break;
Expand Down Expand Up @@ -4494,6 +4498,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
break;
}
case Intrinsic::amdgcn_inverse_ballot: {
// The mask (operand 2) must be an SGPR, but accept a VGPR here as well.
// The mask is mapped to whatever bank getRegBankID reports for it
// (SGPRRegBankID is passed as the third argument, presumably the fallback
// bank - TODO confirm), at the mask's own bit width.
Register MaskReg = MI.getOperand(2).getReg();
unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
// The 1-bit result (operand 0) is mapped to the VCC bank.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
}
break;
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Expand Up @@ -376,6 +376,7 @@ def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
def : SourceOfDivergence<int_amdgcn_if>;
def : SourceOfDivergence<int_amdgcn_else>;
def : SourceOfDivergence<int_amdgcn_loop>;
// inverse_ballot returns the boolean for the current lane, so its result is
// per-lane even though the mask operand is uniform.
def : SourceOfDivergence<int_amdgcn_inverse_ballot>;

foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
Expand Down
19 changes: 19 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -4477,6 +4477,25 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(

return BB;
}
case AMDGPU::S_INVERSE_BALLOT_U32:
case AMDGPU::S_INVERSE_BALLOT_U64: {
  // Both inverse-ballot pseudos expand to a COPY of the mask (operand 1)
  // into the destination (operand 0); the pseudo itself is then removed.
  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = Subtarget.getRegisterInfo();
  const Register Dst = MI.getOperand(0).getReg();
  Register Mask = MI.getOperand(1).getReg();

  // A mask held in a vector register is first read back into SGPRs.
  if (RI->isVectorRegister(RegInfo, Mask))
    Mask = TII->readlaneVGPRToSGPR(Mask, MI, RegInfo);

  BuildMI(*BB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Dst)
      .addReg(Mask);
  MI.eraseFromParent();
  return BB;
}
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -189,6 +189,12 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let mayStore = 0;
}

// Pseudo instructions for llvm.amdgcn.inverse.ballot with a 32-bit or 64-bit
// mask. They are marked usesCustomInserter so that the custom inserter
// expands them.
let usesCustomInserter = 1 in {
def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>;

def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
} // End usesCustomInserter = 1

// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
let Uses = [EXEC];
Expand Down
159 changes: 159 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -0,0 +1,159 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG %s

declare i1 @llvm.amdgcn.inverse.ballot(i32)

; Test inverse ballot with an all-zero constant mask
define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
; GFX11-LABEL: constant_false_inverse_ballot:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0)
%sel = select i1 %ballot, i32 1, i32 0
store i32 %sel, ptr addrspace(1) %out
ret void
}

; Test inverse ballot with an all-ones constant mask (u0xFFFFFFFF)

define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
; GFX11-LABEL: constant_true_inverse_ballot:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF)
%sel = select i1 %ballot, i32 1, i32 0
store i32 %sel, ptr addrspace(1) %out
ret void
}

; Test inverse ballot with a constant mask that has a single bit set (u0x00001000)
define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
; GFX11-LABEL: constant_mask_inverse_ballot:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_movk_i32 s0, 0x1000
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000)
%sel = select i1 %ballot, i32 1, i32 0
store i32 %sel, ptr addrspace(1) %out
ret void
}

; Test inverse ballot using a vgpr as input

define amdgpu_cs void @vgpr_inverse_ballot(i32 %input, ptr addrspace(1) %out) {
; GFX11-LABEL: vgpr_inverse_ballot:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
%sel = select i1 %ballot, i32 1, i32 0
store i32 %sel, ptr addrspace(1) %out
ret void
}

; Test inverse ballot using an sgpr (inreg argument) as input
define amdgpu_cs void @sgpr_inverse_ballot(i32 inreg %input, ptr addrspace(1) %out) {
; GFX11-LABEL: sgpr_inverse_ballot:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
%sel = select i1 %ballot, i32 1, i32 0
store i32 %sel, ptr addrspace(1) %out
ret void
}

; Test inverse ballot after phi
define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) {
; GFX11-LABEL: phi_uniform:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_cbranch_scc1 .LBB5_2
; GFX11-NEXT: ; %bb.1: ; %if
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: .LBB5_2: ; %endif
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%cc = icmp ne i32 %s2, 0
br i1 %cc, label %endif, label %if

if:
%tmp = add i32 %s0_1, 1
br label %endif

endif:
%input = phi i32 [ %s0_1, %entry ], [ %tmp, %if ]

%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
%sel = select i1 %ballot, i32 1, i32 0
store i32 %sel, ptr addrspace(1) %out
ret void
}

; Test for branching
; GISel implementation is currently incorrect.
; The change in the branch affects all lanes, not just the branching ones.
; This test will be fixed once GISel correctly takes uniformity analysis into account.
define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) {
; GISEL-LABEL: inverse_ballot_branch:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: s_xor_b32 s2, s1, -1
; GISEL-NEXT: s_and_saveexec_b32 s1, s2
; GISEL-NEXT: ; %bb.1: ; %if
; GISEL-NEXT: s_add_i32 s0, s0, 1
; GISEL-NEXT: ; %bb.2: ; %endif
; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GISEL-NEXT: v_mov_b32_e32 v2, s0
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-NEXT: s_endpgm
;
; SDAG-LABEL: inverse_ballot_branch:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_mov_b32_e32 v2, s0
; SDAG-NEXT: s_xor_b32 s2, s1, -1
; SDAG-NEXT: s_and_saveexec_b32 s1, s2
; SDAG-NEXT: ; %bb.1: ; %if
; SDAG-NEXT: s_add_i32 s0, s0, 1
; SDAG-NEXT: v_mov_b32_e32 v2, s0
; SDAG-NEXT: ; %bb.2: ; %endif
; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
; SDAG-NEXT: global_store_b32 v[0:1], v2, off
; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-NEXT: s_endpgm
entry:
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2)
br i1 %ballot, label %endif, label %if

if:
%tmp = add i32 %s0_1, 1
br label %endif

endif:
%input = phi i32 [ %s0_1, %entry ], [ %tmp, %if ]
store i32 %input, ptr addrspace(1) %out
ret void
}

0 comments on commit 04317d4

Please sign in to comment.