Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AMDGPU][GISel] Add inverse ballot intrinsic
The inverse ballot intrinsic takes a boolean mask covering all lanes and returns the boolean for the current lane. See GLSL's `subgroupInverseBallot()` (SPIR-V `OpGroupNonUniformInverseBallot`) in the [GL_KHR_shader_subgroup extension](https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt). This allows decision making via branch and select instructions with a manually manipulated mask. Implemented in both GlobalISel and SelectionDAG, since both are currently supported. The SelectionDAG implementation required pseudo instructions in order to use the custom inserter. The boolean mask needs to be uniform across all lanes, so an SGPR input is expected. If the source is in a VGPR, one or more `v_readfirstlane` instructions are inserted. Reviewed By: nhaehnle Differential Revision: https://reviews.llvm.org/D146287
- Loading branch information
1 parent
51b5b29
commit 04317d4
Showing
10 changed files
with
471 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
159 changes: 159 additions & 0 deletions
159
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 | ||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL %s | ||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG %s | ||
|
||
; Intrinsic under test: takes an i32 lane mask and returns an i1. | ||
; The llc invocations at the top of this file force wavefrontsize32. | ||
declare i1 @llvm.amdgcn.inverse.ballot(i32) | ||
|
||
; Test inverse ballot with a constant all-zero mask (i32 0). | ||
; The i1 result drives a select between 1 and 0 that is stored to %out. | ||
; The checks expect the constant to be materialized in s0 and then used as | ||
; the condition operand of v_cndmask. | ||
define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) { | ||
; GFX11-LABEL: constant_false_inverse_ballot: | ||
; GFX11: ; %bb.0: ; %entry | ||
; GFX11-NEXT: s_mov_b32 s0, 0 | ||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 | ||
; GFX11-NEXT: global_store_b32 v[0:1], v2, off | ||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GFX11-NEXT: s_endpgm | ||
entry: | ||
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0) | ||
%sel = select i1 %ballot, i32 1, i32 0 | ||
store i32 %sel, ptr addrspace(1) %out | ||
ret void | ||
} | ||
|
||
; Test inverse ballot with a constant all-ones mask (u0xFFFFFFFF), i.e. every | ||
; bit of the 32-bit mask is set (not the literal value 1). | ||
; The checks expect the mask to be materialized as s_mov_b32 s0, -1. | ||
|
||
define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) { | ||
; GFX11-LABEL: constant_true_inverse_ballot: | ||
; GFX11: ; %bb.0: ; %entry | ||
; GFX11-NEXT: s_mov_b32 s0, -1 | ||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 | ||
; GFX11-NEXT: global_store_b32 v[0:1], v2, off | ||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GFX11-NEXT: s_endpgm | ||
entry: | ||
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF) | ||
%sel = select i1 %ballot, i32 1, i32 0 | ||
store i32 %sel, ptr addrspace(1) %out | ||
ret void | ||
} | ||
|
||
; Test inverse ballot with a constant mask that has a single bit set | ||
; (u0x00001000, bit 12). | ||
; The checks expect the constant to be materialized with s_movk_i32. | ||
define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) { | ||
; GFX11-LABEL: constant_mask_inverse_ballot: | ||
; GFX11: ; %bb.0: ; %entry | ||
; GFX11-NEXT: s_movk_i32 s0, 0x1000 | ||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 | ||
; GFX11-NEXT: global_store_b32 v[0:1], v2, off | ||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GFX11-NEXT: s_endpgm | ||
entry: | ||
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000) | ||
%sel = select i1 %ballot, i32 1, i32 0 | ||
store i32 %sel, ptr addrspace(1) %out | ||
ret void | ||
} | ||
|
||
; Test inverse ballot using a vgpr as input. | ||
; %input is not marked inreg; the checks expect a v_readfirstlane_b32 that | ||
; moves it from v0 into s0 before s0 is used as the v_cndmask condition. | ||
|
||
define amdgpu_cs void @vgpr_inverse_ballot(i32 %input, ptr addrspace(1) %out) { | ||
; GFX11-LABEL: vgpr_inverse_ballot: | ||
; GFX11: ; %bb.0: ; %entry | ||
; GFX11-NEXT: v_readfirstlane_b32 s0, v0 | ||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 | ||
; GFX11-NEXT: global_store_b32 v[1:2], v0, off | ||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GFX11-NEXT: s_endpgm | ||
entry: | ||
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) | ||
%sel = select i1 %ballot, i32 1, i32 0 | ||
store i32 %sel, ptr addrspace(1) %out | ||
ret void | ||
} | ||
|
||
; Test inverse ballot using an sgpr as input. | ||
; %input is marked inreg; the checks expect s0 to be used directly as the | ||
; v_cndmask condition, with no v_readfirstlane in the output. | ||
define amdgpu_cs void @sgpr_inverse_ballot(i32 inreg %input, ptr addrspace(1) %out) { | ||
; GFX11-LABEL: sgpr_inverse_ballot: | ||
; GFX11: ; %bb.0: ; %entry | ||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 | ||
; GFX11-NEXT: global_store_b32 v[0:1], v2, off | ||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GFX11-NEXT: s_endpgm | ||
entry: | ||
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) | ||
%sel = select i1 %ballot, i32 1, i32 0 | ||
store i32 %sel, ptr addrspace(1) %out | ||
ret void | ||
} | ||
|
||
; Test inverse ballot after phi. | ||
; The mask is a phi of %s0_1 and %s0_1 + 1, both derived from inreg arguments; | ||
; which one is used depends on a branch on %s2 != 0. | ||
; The checks expect a scalar compare and branch (s_cmp_lg_u32, s_cbranch_scc1) | ||
; and s0 used directly as the v_cndmask condition. | ||
define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) { | ||
; GFX11-LABEL: phi_uniform: | ||
; GFX11: ; %bb.0: ; %entry | ||
; GFX11-NEXT: s_cmp_lg_u32 s1, 0 | ||
; GFX11-NEXT: s_cbranch_scc1 .LBB5_2 | ||
; GFX11-NEXT: ; %bb.1: ; %if | ||
; GFX11-NEXT: s_add_i32 s0, s0, 1 | ||
; GFX11-NEXT: .LBB5_2: ; %endif | ||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 | ||
; GFX11-NEXT: global_store_b32 v[0:1], v2, off | ||
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GFX11-NEXT: s_endpgm | ||
entry: | ||
%cc = icmp ne i32 %s2, 0 | ||
br i1 %cc, label %endif, label %if | ||
|
||
if: | ||
%tmp = add i32 %s0_1, 1 | ||
br label %endif | ||
|
||
endif: | ||
%input = phi i32 [ %s0_1, %entry ], [ %tmp, %if ] | ||
|
||
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) | ||
%sel = select i1 %ballot, i32 1, i32 0 | ||
store i32 %sel, ptr addrspace(1) %out | ||
ret void | ||
} | ||
|
||
; Test branching directly on the inverse ballot result. | ||
; The i1 returned by the intrinsic is the branch condition; the value stored to | ||
; %out is a phi of %s0_1 (branch taken) and %s0_1 + 1 (fall through to %if). | ||
; | ||
; The GlobalISel implementation is currently incorrect. | ||
; The change made in the branch affects all lanes, not just the branching ones: | ||
; in the GlobalISel checks the increment is an s_add_i32 on s0 inside the | ||
; exec-masked region and s0 is copied to v2 only after exec is restored, whereas | ||
; the SelectionDAG checks write v2 both before and inside the masked region. | ||
; This test will be fixed once GlobalISel correctly takes uniformity analysis | ||
; into account. | ||
define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) { | ||
; GISEL-LABEL: inverse_ballot_branch: | ||
; GISEL: ; %bb.0: ; %entry | ||
; GISEL-NEXT: s_xor_b32 s2, s1, -1 | ||
; GISEL-NEXT: s_and_saveexec_b32 s1, s2 | ||
; GISEL-NEXT: ; %bb.1: ; %if | ||
; GISEL-NEXT: s_add_i32 s0, s0, 1 | ||
; GISEL-NEXT: ; %bb.2: ; %endif | ||
; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 | ||
; GISEL-NEXT: v_mov_b32_e32 v2, s0 | ||
; GISEL-NEXT: global_store_b32 v[0:1], v2, off | ||
; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GISEL-NEXT: s_endpgm | ||
; | ||
; SDAG-LABEL: inverse_ballot_branch: | ||
; SDAG: ; %bb.0: ; %entry | ||
; SDAG-NEXT: v_mov_b32_e32 v2, s0 | ||
; SDAG-NEXT: s_xor_b32 s2, s1, -1 | ||
; SDAG-NEXT: s_and_saveexec_b32 s1, s2 | ||
; SDAG-NEXT: ; %bb.1: ; %if | ||
; SDAG-NEXT: s_add_i32 s0, s0, 1 | ||
; SDAG-NEXT: v_mov_b32_e32 v2, s0 | ||
; SDAG-NEXT: ; %bb.2: ; %endif | ||
; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 | ||
; SDAG-NEXT: global_store_b32 v[0:1], v2, off | ||
; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; SDAG-NEXT: s_endpgm | ||
entry: | ||
%ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2) | ||
br i1 %ballot, label %endif, label %if | ||
|
||
if: | ||
%tmp = add i32 %s0_1, 1 | ||
br label %endif | ||
|
||
endif: | ||
%input = phi i32 [ %s0_1, %entry ], [ %tmp, %if ] | ||
store i32 %input, ptr addrspace(1) %out | ||
ret void | ||
} |
Oops, something went wrong.