Skip to content

Commit

Permalink
[AMDGPU] Add ISD::FSHR -> ALIGNBIT support
Browse files Browse the repository at this point in the history
This patch allows ISD::FSHR(i32) patterns to lower to ALIGNBIT instructions.

This improves test coverage of ISD::FSHR matching - x86 has both FSHL/FSHR instructions and we prefer FSHL by default.

Differential Revision: https://reviews.llvm.org/D76070
  • Loading branch information
RKSimon committed Mar 12, 2020
1 parent 9975dc3 commit e91feee
Show file tree
Hide file tree
Showing 11 changed files with 153 additions and 293 deletions.
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -333,6 +333,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBE, VT, Legal);
}

// The hardware supports 32-bit FSHR (selected to ALIGNBIT), but not FSHL.
setOperationAction(ISD::FSHR, MVT::i32, Legal);

// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Expand Up @@ -736,6 +736,12 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
>;
}

// fshr pattern
// Selects a 32-bit funnel shift right, (fshr $src0, $src1, $src2), to the
// three-operand BIT_ALIGN instruction supplied by the instantiating target,
// forwarding the operands in the same order.
class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(fshr i32:$src0, i32:$src1, i32:$src2),
(BIT_ALIGN $src0, $src1, $src2)
>;

// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(rotr i32:$src0, i32:$src1),
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/EvergreenInstructions.td
Expand Up @@ -422,6 +422,7 @@ def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
def : UMad24Pat<MULADD_UINT24_eg>;

def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
// Instantiate the fshr and rotr selection patterns with BIT_ALIGN_INT.
def : FSHRPattern <BIT_ALIGN_INT_eg>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -1484,6 +1484,7 @@ def : UMad24Pat<V_MAD_U32_U24, 1>;

// FIXME: This should only be done for VALU inputs
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
// Instantiate the fshr and rotr selection patterns with V_ALIGNBIT_B32.
// NOTE(review): the FIXME above about VALU inputs may also be meant to cover
// these two patterns -- confirm.
def : FSHRPattern <V_ALIGNBIT_B32>;
def : ROTRPattern <V_ALIGNBIT_B32>;

def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
Expand Up @@ -163,9 +163,8 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) {
; GFX8-LABEL: undef_lo2_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
Expand All @@ -190,9 +189,8 @@ define void @undef_lo2_v4f16(<2 x half> %arg0) {
; GFX8-LABEL: undef_lo2_v4f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
Expand Down
170 changes: 57 additions & 113 deletions llvm/test/CodeGen/AMDGPU/fshl.ll
Expand Up @@ -97,10 +97,8 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s1, 25
; SI-NEXT: s_lshl_b32 s0, s0, 7
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_alignbit_b32 v0, s0, v0, 25
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
Expand All @@ -109,12 +107,10 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_alignbit_b32 v2, s0, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_lshr_b32 s1, s1, 25
; VI-NEXT: s_lshl_b32 s0, s0, 7
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
Expand All @@ -123,28 +119,24 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_lshr_b32 s1, s1, 25
; GFX9-NEXT: s_lshl_b32 s0, s0, 7
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; R600-NEXT: LSHR * T1.W, KC0[2].W, literal.y,
; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44)
; R600-NEXT: OR_INT T0.X, PV.W, PS,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
store i32 %0, i32 addrspace(1)* %in
Expand Down Expand Up @@ -283,14 +275,10 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s3, s3, 9
; SI-NEXT: s_lshr_b32 s1, s1, 23
; SI-NEXT: s_lshr_b32 s0, s0, 25
; SI-NEXT: s_lshl_b32 s2, s2, 7
; SI-NEXT: s_or_b32 s1, s3, s1
; SI-NEXT: s_or_b32 s0, s2, s0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_alignbit_b32 v1, s3, v0, 23
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
Expand All @@ -300,15 +288,11 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_lshl_b32 s5, s5, 9
; VI-NEXT: s_lshr_b32 s1, s1, 23
; VI-NEXT: s_lshr_b32 s0, s0, 25
; VI-NEXT: s_lshl_b32 s4, s4, 7
; VI-NEXT: s_or_b32 s1, s5, s1
; VI-NEXT: s_or_b32 s0, s4, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
Expand All @@ -319,34 +303,26 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23
; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 25
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_lshl_b32 s5, s5, 9
; GFX9-NEXT: s_lshr_b32 s1, s1, 23
; GFX9-NEXT: s_lshr_b32 s0, s0, 25
; GFX9-NEXT: s_lshl_b32 s4, s4, 7
; GFX9-NEXT: s_or_b32 s1, s5, s1
; GFX9-NEXT: s_or_b32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHL T0.W, KC0[3].X, literal.x,
; R600-NEXT: LSHR * T1.W, KC0[3].Z, literal.y,
; R600-NEXT: 9(1.261169e-44), 23(3.222986e-44)
; R600-NEXT: OR_INT T0.Y, PV.W, PS,
; R600-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; R600-NEXT: LSHR * T1.W, KC0[3].Y, literal.y,
; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44)
; R600-NEXT: OR_INT T0.X, PV.W, PS,
; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
Expand Down Expand Up @@ -557,22 +533,14 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s11, s11, 1
; SI-NEXT: s_lshr_b32 s3, s3, 31
; SI-NEXT: s_lshr_b32 s2, s2, 23
; SI-NEXT: s_lshl_b32 s10, s10, 9
; SI-NEXT: s_lshr_b32 s1, s1, 25
; SI-NEXT: s_lshl_b32 s9, s9, 7
; SI-NEXT: s_lshr_b32 s0, s0, 31
; SI-NEXT: s_lshl_b32 s8, s8, 1
; SI-NEXT: s_or_b32 s3, s11, s3
; SI-NEXT: s_or_b32 s2, s10, s2
; SI-NEXT: s_or_b32 s1, s9, s1
; SI-NEXT: s_or_b32 s0, s8, s0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_alignbit_b32 v2, s10, v0, 23
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
Expand All @@ -583,23 +551,15 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: s_lshl_b32 s7, s7, 1
; VI-NEXT: s_lshr_b32 s3, s3, 31
; VI-NEXT: s_lshr_b32 s2, s2, 23
; VI-NEXT: s_lshl_b32 s6, s6, 9
; VI-NEXT: s_lshr_b32 s1, s1, 25
; VI-NEXT: s_lshl_b32 s5, s5, 7
; VI-NEXT: s_lshr_b32 s0, s0, 31
; VI-NEXT: s_lshl_b32 s4, s4, 1
; VI-NEXT: s_or_b32 s3, s7, s3
; VI-NEXT: s_or_b32 s2, s6, s2
; VI-NEXT: s_or_b32 s1, s5, s1
; VI-NEXT: s_or_b32 s0, s4, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_alignbit_b32 v3, s7, v0, 31
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
Expand All @@ -610,49 +570,33 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s8
; GFX9-NEXT: s_lshl_b32 s7, s7, 1
; GFX9-NEXT: s_lshr_b32 s3, s3, 31
; GFX9-NEXT: s_lshr_b32 s2, s2, 23
; GFX9-NEXT: s_lshl_b32 s6, s6, 9
; GFX9-NEXT: s_lshr_b32 s1, s1, 25
; GFX9-NEXT: s_lshl_b32 s5, s5, 7
; GFX9-NEXT: s_lshr_b32 s0, s0, 31
; GFX9-NEXT: s_lshl_b32 s4, s4, 1
; GFX9-NEXT: s_or_b32 s3, s7, s3
; GFX9-NEXT: s_or_b32 s2, s6, s2
; GFX9-NEXT: s_or_b32 s1, s5, s1
; GFX9-NEXT: s_or_b32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mov_b32_e32 v5, s9
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHL T0.W, KC0[4].X, 1,
; R600-NEXT: LSHR * T1.W, KC0[5].X, literal.x,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: LSHL T0.Z, KC0[3].W, literal.x,
; R600-NEXT: LSHR T2.W, KC0[4].W, literal.y,
; R600-NEXT: OR_INT * T0.W, PV.W, PS,
; R600-NEXT: 9(1.261169e-44), 23(3.222986e-44)
; R600-NEXT: OR_INT T0.Z, PV.Z, PV.W,
; R600-NEXT: LSHL T1.W, KC0[3].Z, literal.x,
; R600-NEXT: LSHR * T2.W, KC0[4].Z, literal.y,
; R600-NEXT: 7(9.809089e-45), 25(3.503246e-44)
; R600-NEXT: OR_INT T0.Y, PV.W, PS,
; R600-NEXT: LSHL T1.W, KC0[3].Y, 1,
; R600-NEXT: LSHR * T2.W, KC0[4].Y, literal.x,
; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: OR_INT T0.X, PV.W, PS,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
Expand Down

0 comments on commit e91feee

Please sign in to comment.