Skip to content

Commit

Permalink
[AMDGPU] Add bfi immediate pattern
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D88246
  • Loading branch information
jayfoad committed Sep 28, 2020
1 parent 2806f58 commit bab1a17
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 66 deletions.
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -1552,13 +1552,30 @@ def : UMad24Pat<V_MAD_U32_U24, 1>;

// BFI patterns

// Pattern fragment for the constant-mask form of a bitfield insert:
//   (y & x) | (z & imm)
// as an i32 'or' wrapped in DivergentBinFrag. The structural match alone does
// not relate the two masks, so the predicate below accepts the node only when
// the second operand of each 'and' is a constant and the two constants are
// bitwise complements after truncation to 'unsigned'.
def BFIImm32 : PatFrag<
(ops node:$x, node:$y, node:$z),
(i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
[{
// N is the 'or' node; operand 1 of its first 'and' is the $x position.
// dyn_cast yields null when that operand is not a ConstantSDNode.
auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
// Operand 1 of the second 'and' is the 'imm' position of the pattern.
auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
// Reject unless both are constants and ~X == NotX once both values are
// narrowed to 'unsigned'.
return X && NotX &&
~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
}]
>;

// Definition from ISA doc:
// (y & x) | (z & ~x)
// Register-mask form: the same $x must appear as the mask of the first 'and'
// and, under 'not', as the mask of the second. The match is emitted as
// V_BFI_B32 with operands in the order $x, $y, $z.
def : AMDGPUPat <
(DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
(V_BFI_B32 $x, $y, $z)
>;

// (y & C) | (z & ~C)
// Constant-mask form: BFIImm32 matches only when the two 'and' masks are
// constants that are bitwise complements of each other, so the mask bound to
// $x (C) is passed as the first V_BFI_B32 operand and the second constant is
// not referenced in the output.
def : AMDGPUPat <
(BFIImm32 i32:$x, i32:$y, i32:$z),
(V_BFI_B32 $x, $y, $z)
>;

// 64-bit version
def : AMDGPUPat <
(DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
Expand Up @@ -1285,11 +1285,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, s0, v0
; VI-NEXT: v_bfi_b32 v0, s0, v4, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
Expand All @@ -1305,11 +1305,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, s0, v0
; CI-NEXT: v_bfi_b32 v0, s0, v4, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
Expand Down Expand Up @@ -1415,11 +1415,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_or_b32_e32 v1, s0, v1
; VI-NEXT: v_bfi_b32 v1, s0, v4, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
Expand All @@ -1435,11 +1435,11 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
Expand Down Expand Up @@ -1545,11 +1545,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_or_b32_e32 v1, s0, v1
; VI-NEXT: v_bfi_b32 v1, s0, v4, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
Expand All @@ -1565,11 +1565,11 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
Expand Down
84 changes: 36 additions & 48 deletions llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
Expand Up @@ -905,10 +905,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -978,10 +977,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out,
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffc00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -1052,10 +1050,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xff850000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -1127,10 +1124,9 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out,
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, -7, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, -7, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1406,10 +1402,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffe00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -1547,10 +1542,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 32, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 32, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1619,10 +1613,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -1760,10 +1753,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_bfi_b32 v2, s4, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1831,10 +1823,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc400, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc400, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xc4000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -1906,10 +1897,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4400, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4400, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44000000, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -1981,10 +1971,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4000, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4000, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down Expand Up @@ -2056,10 +2045,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; SI-NEXT: v_add_i32_e32 v2, vcc, 0xffffc000, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_add_i32_e32 v3, vcc, 0xffffc000, v2
; SI-NEXT: s_mov_b32 s4, 0xffff0000
; SI-NEXT: v_bfi_b32 v2, s4, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, -2.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand Down

0 comments on commit bab1a17

Please sign in to comment.