Skip to content

Commit

Permalink
[AMDGPU] Remove extract_subvector patterns
Browse files (browse the repository at this point in the history)
Removing them seems to slightly improve the quality of the generated code,
as well as simplifying both the TableGen and C++ parts of the code.

Differential Revision: https://reviews.llvm.org/D149853
  • Loading branch information
jayfoad committed Jun 6, 2023
1 parent d9be8a8 commit a4a3ac1
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 105 deletions.
5 changes: 0 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -1441,11 +1441,6 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
unsigned NumSrcElt = SrcVT.getVectorNumElements();
assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

// We have some TableGen patterns for when the extracted vector is exactly
// the low or high half of the operand.
if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt))
return Op;

// Extract 32-bit registers at a time.
EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
EVT NewVT = NumElt == 2
Expand Down
60 changes: 0 additions & 60 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -1356,66 +1356,6 @@ foreach Index = 0-15 in {
}


def : Pat <
(extract_subvector v4i16:$vec, (i32 0)),
(v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
>;

def : Pat <
(extract_subvector v4i16:$vec, (i32 2)),
(v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
>;

def : Pat <
(extract_subvector v4f16:$vec, (i32 0)),
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
>;

def : Pat <
(extract_subvector v4f16:$vec, (i32 2)),
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;

def : Pat <
(extract_subvector v8i16:$vec, (i32 0)),
(v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
>;

def : Pat <
(extract_subvector v8i16:$vec, (i32 4)),
(v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
>;

def : Pat <
(extract_subvector v8f16:$vec, (i32 0)),
(v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
>;

def : Pat <
(extract_subvector v8f16:$vec, (i32 4)),
(v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
>;

def : Pat <
(extract_subvector v16i16:$vec, (i32 0)),
(v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3))
>;

def : Pat <
(extract_subvector v16i16:$vec, (i32 8)),
(v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7))
>;

def : Pat <
(extract_subvector v16f16:$vec, (i32 0)),
(v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3))
>;

def : Pat <
(extract_subvector v16f16:$vec, (i32 8)),
(v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7))
>;

foreach Index = 0-31 in {
def Extract_Element_v32i32_#Index : Extract_Element <
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
Expand Down
52 changes: 24 additions & 28 deletions llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
Expand Up @@ -366,22 +366,20 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB2_4: ; %exit
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v3, v3, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v6, v4, v3, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6
; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5
; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
br i1 undef, label %T, label %F

Expand Down Expand Up @@ -882,22 +880,20 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB5_4: ; %exit
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4
; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
br i1 undef, label %T, label %F

Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
Expand Up @@ -30,12 +30,12 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
; REVERSEXNACK-LABEL: shuffle_v4f16_234u:
; REVERSEXNACK: ; %bb.0:
; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v1
; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v0
; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v3
; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v2
; REVERSEXNACK-NEXT: global_load_dword v0, v[5:6], off offset:4
; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[3:4], off
; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v3
; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v1
; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v0
; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v2
; REVERSEXNACK-NEXT: global_load_dword v0, v[3:4], off offset:4
; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[5:6], off
; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0)
; REVERSEXNACK-NEXT: s_setpc_b64 s[30:31]
;
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/sra.ll
Expand Up @@ -187,13 +187,13 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: v_readfirstlane_b32 s1, v1
; VI-NEXT: s_ashr_i32 s2, s0, 16
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_ashr_i32 s3, s1, 16
; VI-NEXT: v_readfirstlane_b32 s0, v1
; VI-NEXT: v_readfirstlane_b32 s1, v0
; VI-NEXT: s_ashr_i32 s2, s1, 16
; VI-NEXT: s_sext_i32_i16 s1, s1
; VI-NEXT: s_ashr_i32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s3, s0, 16
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_ashr_i32 s0, s1, s0
; VI-NEXT: s_ashr_i32 s1, s2, s3
; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
Expand Down

0 comments on commit a4a3ac1

Please sign in to comment.