[AMDGPU] Make bfi patterns divergence-aware
This tends to increase code size, but more importantly it reduces VGPR usage
and can avoid costly readfirstlanes when the result needs to be in an SGPR.

Differential Revision: https://reviews.llvm.org/D88245
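For illustration (not part of the commit), a minimal LLVM IR sketch of the uniform case this change is about, using a hypothetical function name that mirrors the bfi_def test updated below: all three operands are scalar kernel arguments, so the i32 'or' is uniform and selection is now expected to stay on the SALU (s_andn2_b32, s_and_b32, s_or_b32) instead of producing v_bfi_b32 in a VGPR.

; Uniform bitfield select: every operand is a kernel argument, so the 'or'
; node is uniform and the V_BFI_B32 patterns intentionally no longer match.
define amdgpu_kernel void @uniform_bfi_sketch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
  %not.x = xor i32 %x, -1      ; ~x
  %and0 = and i32 %y, %x       ; y & x
  %and1 = and i32 %z, %not.x   ; z & ~x
  %sel = or i32 %and0, %and1   ; (y & x) | (z & ~x)
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}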
jayfoad committed Sep 28, 2020
1 parent 286d3fc commit 2806f58
Showing 6 changed files with 118 additions and 93 deletions.
15 changes: 8 additions & 7 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1551,18 +1551,17 @@ def : IMad24Pat<V_MAD_I32_I24, 1>;
def : UMad24Pat<V_MAD_U32_U24, 1>;

// BFI patterns
// FIXME: This should only be done for VALU inputs

// Definition from ISA doc:
// (y & x) | (z & ~x)
def : AMDGPUPat <
(or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
(DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
(V_BFI_B32 $x, $y, $z)
>;

// 64-bit version
def : AMDGPUPat <
(or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
(DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
(REG_SEQUENCE SReg_64,
(V_BFI_B32 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
@@ -1575,13 +1574,13 @@ def : AMDGPUPat <
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPat <
(xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
(DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
(V_BFI_B32 $x, $y, $z)
>;

// 64-bit version
def : AMDGPUPat <
(xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
(REG_SEQUENCE SReg_64,
(V_BFI_B32 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
@@ -2305,12 +2304,14 @@ defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;

// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
def : AMDGPUPat <
(or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
(DivergentBinFrag<or> (and i32:$x, i32:$z),
(and i32:$y, (or i32:$x, i32:$z))),
(V_BFI_B32 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y)
>;

def : AMDGPUPat <
(or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))),
(DivergentBinFrag<or> (and i64:$x, i64:$z),
(and i64:$y, (or i64:$x, i64:$z))),
(REG_SEQUENCE SReg_64,
(V_BFI_B32 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG SReg_64:$y, sub0))),
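For contrast, a hedged sketch (hypothetical function, not taken from the commit) of the divergent case that the DivergentBinFrag<or> patterns above still target: %y and %tid vary per lane, so the 'or' is divergent and v_bfi_b32 should still be selected.

declare i32 @llvm.amdgcn.workitem.id.x()

; Divergent bitfield select: %y is loaded per lane and %tid is the lane's
; workitem id, so the 'or' node is divergent and the DivergentBinFrag<or>
; pattern should still produce v_bfi_b32.
define amdgpu_kernel void @divergent_bfi_sketch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %y = load i32, i32 addrspace(1)* %gep
  %not.x = xor i32 %x, -1      ; ~x
  %and0 = and i32 %y, %x       ; y & x
  %and1 = and i32 %tid, %not.x ; z & ~x, with z = %tid
  %sel = or i32 %and0, %and1   ; (y & x) | (z & ~x)
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %sel, i32 addrspace(1)* %out.gep
  ret void
}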
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -429,6 +429,18 @@ class UniformBinFrag<SDPatternOperator Op> : PatFrag <
let GISelPredicateCode = [{return true;}];
}

class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0, node:$src1),
(Op $src0, $src1),
[{ return N->isDivergent(); }]> {
// This check is unnecessary as it's captured by the result register
// bank constraint.
//
// FIXME: Should add a way for the emitter to recognize this is a
// trivially true predicate to eliminate the check.
let GISelPredicateCode = [{return true;}];
}

let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
def S_ADD_U32 : SOP2_32 <"s_add_u32">;
51 changes: 23 additions & 28 deletions llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -8,7 +8,9 @@
; FUNC-LABEL: {{^}}bfi_def:
; R600: BFI_INT

; GCN: v_bfi_b32
; GCN: s_andn2_b32
; GCN: s_and_b32
; GCN: s_or_b32
define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = xor i32 %x, -1
@@ -24,7 +26,9 @@ entry:
; FUNC-LABEL: {{^}}bfi_sha256_ch:
; R600: BFI_INT

; GCN: v_bfi_b32
; GCN: s_xor_b32
; GCN: s_and_b32
; GCN: s_xor_b32
define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = xor i32 %y, %z
@@ -40,8 +44,10 @@
; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W

; GCN: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
; GCN: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
; GCN: s_and_b32
; GCN: s_or_b32
; GCN: s_and_b32
; GCN: s_or_b32
define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = and i32 %x, %z
@@ -117,12 +123,9 @@ entry:

; FIXME: Should leave as 64-bit SALU ops
; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN-DAG: v_bfi_b32
; GCN: s_and_b64
; GCN: s_andn2_b64
; GCN: s_or_b64
define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@@ -134,12 +137,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
}

; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN-DAG: v_bfi_b32
; GCN: s_xor_b64
; GCN: s_and_b64
; GCN: s_xor_b64
define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
@@ -151,12 +151,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
}

; FUNC-LABEL: {{^}}s_bitselect_i64_pat_2:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN: s_xor_b64
; GCN: s_and_b64
; GCN: s_xor_b64
define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
@@ -168,12 +165,10 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
}

; FUNC-LABEL: {{^}}s_bfi_sha256_ma_i64:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_xor_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_xor_b32
; GCN-DAG: v_bfi_b32
; GCN: v_bfi_b32
; GCN: s_and_b64
; GCN: s_or_b64
; GCN: s_and_b64
; GCN: s_or_b64
define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
entry:
%and0 = and i64 %x, %z
27 changes: 16 additions & 11 deletions llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -125,10 +125,11 @@ entry:
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
; GCN: s_andn2_b64
; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x3c003c00
; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN: s_or_b64
define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
entry:
%v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
@@ -142,7 +143,9 @@ entry:
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_andn2_b32
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c003c00
; GCN: s_or_b32
define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
entry:
%v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
@@ -184,10 +187,11 @@ entry:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x10001
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], [[K]], v{{[0-9]+}}
; GCN: s_andn2_b32
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10001
; GCN: s_or_b32
define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
entry:
%v = insertelement <2 x i16> %vec, i16 1, i32 %sel
@@ -201,10 +205,11 @@ entry:
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_mov_b32 [[K:s[0-9]+]], 0x10001
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], [[K]]
; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
; GCN: v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}
; GCN: s_andn2_b64
; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x10001
; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]]
; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]]
; GCN: s_or_b64
define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
entry:
%v = insertelement <4 x i16> %vec, i16 1, i32 %sel