230 changes: 113 additions & 117 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll

Large diffs are not rendered by default.

287 changes: 140 additions & 147 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll

Large diffs are not rendered by default.

74 changes: 32 additions & 42 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,13 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: s_mov_b32 s4, 0x4f7ffffe
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT: v_mul_f32_e32 v4, s4, v4
; GISEL-NEXT: v_mul_f32_e32 v6, s4, v6
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
Expand Down Expand Up @@ -217,9 +216,8 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_movk_i32 s4, 0xfff
; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
; CHECK-NEXT: v_and_b32_e32 v1, s4, v1
; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0
; CHECK-NEXT: v_and_b32_e32 v1, 0xfff, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = urem <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
Expand Down Expand Up @@ -257,14 +255,14 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb
; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb
; GISEL-NEXT: v_mov_b32_e32 v3, 0xffed2705
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT: v_mul_lo_u32 v3, v3, v4
; GISEL-NEXT: v_mul_hi_u32 v3, v4, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb
; GISEL-NEXT: v_mov_b32_e32 v4, 0xffed2705
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3
; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3
; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3
; GISEL-NEXT: v_mul_lo_u32 v4, v4, s4
Expand Down Expand Up @@ -351,18 +349,16 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s4, 0x1000
; GISEL-NEXT: s_mov_b32 s5, 0x4f7ffffe
; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2
; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3
; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT: v_mul_f32_e32 v4, s5, v4
; GISEL-NEXT: v_mul_f32_e32 v6, s5, v6
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
Expand Down Expand Up @@ -394,9 +390,8 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; CGP-LABEL: v_urem_v2i32_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: s_movk_i32 s4, 0x1000
; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2
; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3
; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
Expand Down Expand Up @@ -449,9 +444,8 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
; GISEL-LABEL: v_urem_i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s4, 0xffffff
; GISEL-NEXT: v_and_b32_e32 v0, s4, v0
; GISEL-NEXT: v_and_b32_e32 v1, s4, v1
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
Expand All @@ -474,9 +468,8 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
; CGP-LABEL: v_urem_i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: s_mov_b32 s4, 0xffffff
; CGP-NEXT: v_and_b32_e32 v0, s4, v0
; CGP-NEXT: v_and_b32_e32 v1, s4, v1
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
; CGP-NEXT: v_rcp_f32_e32 v2, v2
Expand Down Expand Up @@ -509,20 +502,18 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_urem_v2i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s4, 0xffffff
; GISEL-NEXT: s_mov_b32 s5, 0x4f7ffffe
; GISEL-NEXT: v_and_b32_e32 v0, s4, v0
; GISEL-NEXT: v_and_b32_e32 v1, s4, v1
; GISEL-NEXT: v_and_b32_e32 v2, s4, v2
; GISEL-NEXT: v_and_b32_e32 v3, s4, v3
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT: v_mul_f32_e32 v4, s5, v4
; GISEL-NEXT: v_mul_f32_e32 v6, s5, v6
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
Expand Down Expand Up @@ -554,11 +545,10 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; CGP-LABEL: v_urem_v2i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: s_mov_b32 s4, 0xffffff
; CGP-NEXT: v_and_b32_e32 v0, s4, v0
; CGP-NEXT: v_and_b32_e32 v1, s4, v1
; CGP-NEXT: v_and_b32_e32 v2, s4, v2
; CGP-NEXT: v_and_b32_e32 v3, s4, v3
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
Expand Down
623 changes: 309 additions & 314 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX9-NEXT: s_movk_i32 s4, 0xff
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
; GFX9-NEXT: v_and_b32_e32 v2, s4, v1
; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
Expand Down Expand Up @@ -601,7 +601,7 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: s_mov_b32 s5, 24
; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2
; GFX9-NEXT: v_and_b32_e32 v2, s0, v1
; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/add.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,7 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9PLUS: global_load_dword [[B:v[0-9]+]]

; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9PLUS: buffer_store_dwordx4

Expand Down
1,044 changes: 498 additions & 546 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,8 @@ define i32 @f(i32 %x, i32 %y) {
; GCN-LABEL: f:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0xffff80
; GCN-NEXT: v_or_b32_e32 v0, s4, v0
; GCN-NEXT: v_or_b32_e32 v1, s4, v1
; GCN-NEXT: v_or_b32_e32 v0, 0xffff80, v0
; GCN-NEXT: v_or_b32_e32 v1, 0xffff80, v1
; GCN-NEXT: v_mul_i32_i24_e32 v0, v0, v1
; GCN-NEXT: v_lshrrev_b32_e32 v0, 14, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,10 @@ define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrsp
; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
; SI-DAG: buffer_load_dwordx2 v[[[LO0:[0-9]+]]:[[HI0:[0-9]+]]]
; SI-DAG: buffer_load_dwordx2 v[[[LO1:[0-9]+]]:[[HI1:[0-9]+]]]
; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO1]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI1]]
; SI: buffer_store_dwordx2
; SI: buffer_store_dwordx2
define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/bswap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,6 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: s_mov_b32 s5, 0xffff0000
; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8
Expand All @@ -529,9 +528,9 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-NEXT: v_bfi_b32 v0, s4, v0, v5
; SI-NEXT: v_bfi_b32 v3, s4, v3, v6
; SI-NEXT: v_bfi_b32 v2, s4, v2, v7
; SI-NEXT: v_and_b32_e32 v4, s5, v1
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v3, s5, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v3
Expand Down
64 changes: 27 additions & 37 deletions llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
Original file line number Diff line number Diff line change
Expand Up @@ -250,25 +250,23 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s12, 0xff00
; SI-NEXT: s_movk_i32 s13, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_and_b32_e32 v2, s12, v0
; SI-NEXT: v_and_b32_e32 v4, s12, v1
; SI-NEXT: v_and_b32_e32 v3, s13, v3
; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v1, s13, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
Expand All @@ -283,18 +281,15 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_movk_i32 s12, 0xff00
; VI-NEXT: s_movk_i32 s13, 0xff
; VI-NEXT: s_movk_i32 s14, 0x900
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
Expand All @@ -303,16 +298,16 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, s12, v1
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
; VI-NEXT: v_and_b32_e32 v1, s13, v1
; VI-NEXT: v_and_b32_e32 v2, s12, v0
; VI-NEXT: v_and_b32_e32 v3, s13, v3
; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT: v_and_b32_e32 v3, 0xff, v3
; VI-NEXT: v_or_b32_e32 v1, v4, v1
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_add_u16_e32 v1, s14, v1
; VI-NEXT: v_add_u16_e32 v2, s14, v2
; VI-NEXT: v_add_u16_e32 v1, 0x900, v1
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
Expand Down Expand Up @@ -340,8 +335,6 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_mov_b32 s16, 0xff00
; SI-NEXT: s_movk_i32 s17, 0xff
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s8, s0
Expand All @@ -353,12 +346,12 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_and_b32_e32 v2, s16, v0
; SI-NEXT: v_and_b32_e32 v4, s16, v1
; SI-NEXT: v_and_b32_e32 v3, s17, v3
; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT: v_and_b32_e32 v3, 0xff, v3
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v1, s17, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
Expand All @@ -374,17 +367,14 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_movk_i32 s16, 0xff00
; VI-NEXT: s_movk_i32 s17, 0xff
; VI-NEXT: s_movk_i32 s18, 0x900
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
Expand All @@ -394,16 +384,16 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, s16, v1
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
; VI-NEXT: v_and_b32_e32 v1, s17, v1
; VI-NEXT: v_and_b32_e32 v2, s16, v0
; VI-NEXT: v_and_b32_e32 v3, s17, v3
; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT: v_and_b32_e32 v3, 0xff, v3
; VI-NEXT: v_or_b32_e32 v1, v4, v1
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_add_u16_e32 v1, s18, v1
; VI-NEXT: v_add_u16_e32 v2, s18, v2
; VI-NEXT: v_add_u16_e32 v1, 0x900, v1
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
Expand Down
62 changes: 28 additions & 34 deletions llvm/test/CodeGen/AMDGPU/ctpop16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -409,12 +409,11 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, s4, v0
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v3, s4, v1
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0
Expand All @@ -433,7 +432,6 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Expand All @@ -444,8 +442,8 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_and_b32_e32 v1, s4, v1
; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
Expand Down Expand Up @@ -535,16 +533,15 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, s4, v0
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v5, s4, v1
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v6, s4, v2
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v7, s4, v3
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0
Expand All @@ -571,7 +568,6 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
Expand All @@ -584,10 +580,10 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; VI-NEXT: v_and_b32_e32 v3, s4, v3
; VI-NEXT: v_and_b32_e32 v2, s4, v2
; VI-NEXT: v_and_b32_e32 v1, s4, v1
; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0
; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0
; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0
Expand Down Expand Up @@ -718,25 +714,24 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v8, s4, v0
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v9, s4, v1
; SI-NEXT: v_and_b32_e32 v9, 0xffff, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v10, s4, v2
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v11, s4, v3
; SI-NEXT: v_and_b32_e32 v11, 0xffff, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v12, s4, v4
; SI-NEXT: v_and_b32_e32 v12, 0xffff, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_and_b32_e32 v13, s4, v5
; SI-NEXT: v_and_b32_e32 v13, 0xffff, v5
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; SI-NEXT: v_and_b32_e32 v14, s4, v6
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v6
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; SI-NEXT: v_and_b32_e32 v15, s4, v7
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0
; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0
Expand Down Expand Up @@ -780,7 +775,6 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0
Expand All @@ -796,10 +790,10 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; VI-NEXT: v_and_b32_e32 v3, s4, v3
; VI-NEXT: v_and_b32_e32 v2, s4, v2
; VI-NEXT: v_and_b32_e32 v1, s4, v1
; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6
Expand All @@ -809,10 +803,10 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0
; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0
; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0
; VI-NEXT: v_and_b32_e32 v7, s4, v7
; VI-NEXT: v_and_b32_e32 v6, s4, v6
; VI-NEXT: v_and_b32_e32 v5, s4, v5
; VI-NEXT: v_and_b32_e32 v4, s4, v4
; VI-NEXT: v_and_b32_e32 v7, 0xffff, v7
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; VI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0
; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1172,7 +1172,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_movk_i32 s8, 0xff
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
Expand All @@ -1186,11 +1185,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s8, v4
; SI-NEXT: v_and_b32_e32 v0, 0xff, v4
; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
; SI-NEXT: v_or_b32_e32 v0, v7, v0
; SI-NEXT: v_and_b32_e32 v2, s8, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
Expand Down Expand Up @@ -1297,7 +1296,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7
; GFX9-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_add_u16_e32 v0, s4, v0
; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0
; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v5, v0, s[2:3]
Expand Down
18 changes: 8 additions & 10 deletions llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
Original file line number Diff line number Diff line change
Expand Up @@ -116,14 +116,13 @@ define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspac
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v3, s4, v0
; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, s4, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: v_and_b32_e32 v0, v4, v0
; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, v4, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
br i1 undef, label %T, label %F
Expand Down Expand Up @@ -261,13 +260,12 @@ define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrsp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v2, s4, v0
; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v0, s4, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: v_and_b32_e32 v0, v4, v0
; GFX9-NEXT: v_and_b32_e32 v2, v4, v2
; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(
}

; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000
; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}}
; GCN: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+$}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
Expand All @@ -152,7 +151,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrs
}

; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
; GCN-NOT: v_mul
Expand Down
44 changes: 19 additions & 25 deletions llvm/test/CodeGen/AMDGPU/fexp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ define <2 x float> @v_exp_v2f32(<2 x float> %arg0) {
; GCN-LABEL: v_exp_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GCN-NEXT: v_mul_f32_e32 v0, s4, v0
; GCN-NEXT: v_mul_f32_e32 v1, s4, v1
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: v_exp_f32_e32 v1, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -32,10 +31,9 @@ define <3 x float> @v_exp_v3f32(<3 x float> %arg0) {
; GCN-LABEL: v_exp_v3f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GCN-NEXT: v_mul_f32_e32 v0, s4, v0
; GCN-NEXT: v_mul_f32_e32 v1, s4, v1
; GCN-NEXT: v_mul_f32_e32 v2, s4, v2
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: v_exp_f32_e32 v1, v1
; GCN-NEXT: v_exp_f32_e32 v2, v2
Expand All @@ -48,11 +46,10 @@ define <4 x float> @v_exp_v4f32(<4 x float> %arg0) {
; GCN-LABEL: v_exp_v4f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GCN-NEXT: v_mul_f32_e32 v0, s4, v0
; GCN-NEXT: v_mul_f32_e32 v1, s4, v1
; GCN-NEXT: v_mul_f32_e32 v2, s4, v2
; GCN-NEXT: v_mul_f32_e32 v3, s4, v3
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: v_exp_f32_e32 v1, v1
; GCN-NEXT: v_exp_f32_e32 v2, v2
Expand Down Expand Up @@ -95,11 +92,10 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-NEXT: v_mul_f32_e32 v1, s4, v1
; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
; SI-NEXT: v_exp_f32_e32 v0, v0
; SI-NEXT: v_exp_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -141,15 +137,14 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_mul_f32_e32 v0, s4, v0
; SI-NEXT: v_mul_f32_e32 v1, s4, v1
; SI-NEXT: v_mul_f32_e32 v2, s4, v2
; SI-NEXT: v_mul_f32_e32 v3, s4, v3
; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
; SI-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
; SI-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3
; SI-NEXT: v_exp_f32_e32 v0, v0
; SI-NEXT: v_exp_f32_e32 v1, v1
; SI-NEXT: v_exp_f32_e32 v2, v2
Expand All @@ -159,11 +154,10 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
; VI-LABEL: v_exp_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0x3dc5
; VI-NEXT: v_mov_b32_e32 v3, 0x3dc5
; VI-NEXT: v_mul_f16_e32 v2, s4, v1
; VI-NEXT: v_mul_f16_e32 v2, 0x3dc5, v1
; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v4, s4, v0
; VI-NEXT: v_mul_f16_e32 v4, 0x3dc5, v0
; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_exp_f16_e32 v2, v2
; VI-NEXT: v_exp_f16_e32 v4, v4
Expand All @@ -177,9 +171,9 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x3dc5
; GFX9-NEXT: v_mul_f16_e32 v2, s4, v1
; GFX9-NEXT: v_mul_f16_e32 v2, 0x3dc5, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_mul_f16_e32 v3, s4, v0
; GFX9-NEXT: v_mul_f16_e32 v3, 0x3dc5, v0
; GFX9-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_exp_f16_e32 v2, v2
; GFX9-NEXT: v_exp_f16_e32 v3, v3
Expand Down
54 changes: 24 additions & 30 deletions llvm/test/CodeGen/AMDGPU/flat-scratch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -515,13 +515,12 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_add_u32_e32 v1, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
Expand All @@ -548,16 +547,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT: v_add_u32_e32 v1, 4, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 4, v0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
Expand Down Expand Up @@ -1367,12 +1365,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x104
; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -1402,18 +1399,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104
; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0
; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
Expand Down Expand Up @@ -2254,12 +2250,11 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -2289,18 +2284,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0
; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
Expand Down
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -241,9 +241,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -369,11 +368,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-SAFE-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v4, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v4, v1
; GFX9-SAFE-NEXT: v_and_b32_e32 v2, v4, v2
; GFX9-SAFE-NEXT: v_and_b32_e32 v3, v4, v3
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0
; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1
; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2
Expand Down
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -370,11 +369,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4
; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-SAFE-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v4, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v4, v1
; GFX9-SAFE-NEXT: v_and_b32_e32 v2, v4, v2
; GFX9-SAFE-NEXT: v_and_b32_e32 v3, v4, v3
; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0
; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1
; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fneg-combines.ll
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,12 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}}
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
Expand Down
29 changes: 10 additions & 19 deletions llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,8 @@ body: |
# operands

# CHECK-LABEL: name: add_f32_1.0_multi_f16_use
# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit $mode, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1065353216, killed %11, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, killed %12, implicit $mode, implicit $exec


name: add_f32_1.0_multi_f16_use
Expand Down Expand Up @@ -306,9 +305,8 @@ body: |
# constant, and not folded as a multi-use literal for the f16 cases

# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
# CHECK: %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %11, %14, implicit $mode, implicit $exec
# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12, %14, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit $mode, implicit $exec
# CHECK: %16:vgpr_32 = V_ADD_F16_e32 1065353216, %12, implicit $mode, implicit $exec
# CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $mode, implicit $exec

name: add_f32_1.0_one_f32_use_multi_f16_use
Expand Down Expand Up @@ -511,9 +509,8 @@ body: |
# constant, and not folded as a multi-use literal for the f16 cases

# CHECK-LABEL: name: add_f16_1.0_multi_f32_use
# CHECK: %13:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F32_e32 15360, %11, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F32_e32 15360, %12, implicit $mode, implicit $exec

name: add_f16_1.0_multi_f32_use
alignment: 1
Expand Down Expand Up @@ -575,12 +572,10 @@ body: |
---

# The low 16-bits are an inline immediate, but the high bits are junk
# FIXME: Should be able to fold this

# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
# CHECK: %13:vgpr_32 = V_MOV_B32_e32 80886784, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F16_e32 80886784, %11, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 80886784, %12, implicit $mode, implicit $exec

name: add_f16_1.0_other_high_bits_multi_f16_use
alignment: 1
Expand Down Expand Up @@ -641,13 +636,9 @@ body: |
...
---

# FIXME: Should fold inline immediate into f16 and literal use into
# f32 instruction.

# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
# CHECK: %13:vgpr_32 = V_MOV_B32_e32 305413120, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec
# CHECK: %14:vgpr_32 = V_ADD_F32_e32 305413120, %11, implicit $mode, implicit $exec
# CHECK: %15:vgpr_32 = V_ADD_F16_e32 305413120, %12, implicit $mode, implicit $exec
name: add_f16_1.0_other_high_bits_use_f16_f32
alignment: 1
exposesReturnsTwice: false
Expand Down
56 changes: 26 additions & 30 deletions llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -242,19 +242,18 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %ou
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s2, 0x2f800000
; SI-NEXT: s_mov_b32 s3, 0xcf800000
; SI-NEXT: s_mov_b32 s2, 0xcf800000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_trunc_f32_e32 v0, s1
; SI-NEXT: v_trunc_f32_e32 v2, s0
; SI-NEXT: v_mul_f32_e32 v1, s2, v0
; SI-NEXT: v_mul_f32_e32 v3, s2, v2
; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; SI-NEXT: v_floor_f32_e32 v4, v1
; SI-NEXT: v_floor_f32_e32 v5, v3
; SI-NEXT: v_cvt_u32_f32_e32 v3, v4
; SI-NEXT: v_cvt_u32_f32_e32 v1, v5
; SI-NEXT: v_fma_f32 v0, v4, s3, v0
; SI-NEXT: v_fma_f32 v4, v5, s3, v2
; SI-NEXT: v_fma_f32 v0, v4, s2, v0
; SI-NEXT: v_fma_f32 v4, v5, s2, v2
; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
; SI-NEXT: v_cvt_u32_f32_e32 v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
Expand All @@ -264,12 +263,11 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %ou
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s4, 0x2f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s3
; VI-NEXT: v_trunc_f32_e32 v4, s2
; VI-NEXT: v_mul_f32_e32 v1, s4, v0
; VI-NEXT: v_mul_f32_e32 v2, s4, v4
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: s_mov_b32 s2, 0xcf800000
; VI-NEXT: v_floor_f32_e32 v6, v2
Expand Down Expand Up @@ -379,29 +377,28 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %ou
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s8, 0x2f800000
; SI-NEXT: s_mov_b32 s9, 0xcf800000
; SI-NEXT: s_mov_b32 s8, 0xcf800000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_trunc_f32_e32 v0, s1
; SI-NEXT: v_trunc_f32_e32 v2, s0
; SI-NEXT: v_trunc_f32_e32 v4, s3
; SI-NEXT: v_trunc_f32_e32 v6, s2
; SI-NEXT: v_mul_f32_e32 v1, s8, v0
; SI-NEXT: v_mul_f32_e32 v3, s8, v2
; SI-NEXT: v_mul_f32_e32 v5, s8, v4
; SI-NEXT: v_mul_f32_e32 v7, s8, v6
; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; SI-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
; SI-NEXT: v_floor_f32_e32 v8, v1
; SI-NEXT: v_floor_f32_e32 v9, v3
; SI-NEXT: v_floor_f32_e32 v10, v5
; SI-NEXT: v_floor_f32_e32 v11, v7
; SI-NEXT: v_cvt_u32_f32_e32 v3, v8
; SI-NEXT: v_cvt_u32_f32_e32 v1, v9
; SI-NEXT: v_fma_f32 v0, v8, s9, v0
; SI-NEXT: v_fma_f32 v8, v9, s9, v2
; SI-NEXT: v_fma_f32 v0, v8, s8, v0
; SI-NEXT: v_fma_f32 v8, v9, s8, v2
; SI-NEXT: v_cvt_u32_f32_e32 v7, v10
; SI-NEXT: v_cvt_u32_f32_e32 v5, v11
; SI-NEXT: v_fma_f32 v4, v10, s9, v4
; SI-NEXT: v_fma_f32 v9, v11, s9, v6
; SI-NEXT: v_fma_f32 v4, v10, s8, v4
; SI-NEXT: v_fma_f32 v9, v11, s8, v6
; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
; SI-NEXT: v_cvt_u32_f32_e32 v0, v8
; SI-NEXT: v_cvt_u32_f32_e32 v6, v4
Expand All @@ -414,34 +411,33 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %ou
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s2, 0x2f800000
; VI-NEXT: s_mov_b32 s3, 0xcf800000
; VI-NEXT: s_mov_b32 s2, 0xcf800000
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s5
; VI-NEXT: v_trunc_f32_e32 v4, s4
; VI-NEXT: v_mul_f32_e32 v1, s2, v0
; VI-NEXT: v_mul_f32_e32 v2, s2, v4
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: v_floor_f32_e32 v6, v2
; VI-NEXT: v_fma_f32 v0, v5, s3, v0
; VI-NEXT: v_fma_f32 v0, v5, s2, v0
; VI-NEXT: v_cvt_u32_f32_e32 v2, v0
; VI-NEXT: v_fma_f32 v0, v6, s3, v4
; VI-NEXT: v_fma_f32 v0, v6, s2, v4
; VI-NEXT: v_trunc_f32_e32 v4, s7
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_mul_f32_e32 v5, s2, v4
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; VI-NEXT: v_trunc_f32_e32 v8, s6
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_floor_f32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, s2, v8
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8
; VI-NEXT: v_floor_f32_e32 v9, v5
; VI-NEXT: v_fma_f32 v4, v6, s3, v4
; VI-NEXT: v_fma_f32 v4, v6, s2, v4
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
; VI-NEXT: v_cvt_u32_f32_e32 v6, v4
; VI-NEXT: v_fma_f32 v4, v9, s3, v8
; VI-NEXT: v_fma_f32 v4, v9, s2, v8
; VI-NEXT: v_cvt_u32_f32_e32 v5, v9
; VI-NEXT: v_cvt_u32_f32_e32 v4, v4
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fpow.ll
Original file line number Diff line number Diff line change
Expand Up @@ -479,14 +479,13 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: s_mov_b32 s4, 0x80008000
; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2
; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_log_f32_e32 v0, v0
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/frem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2619,8 +2619,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
; SI-NEXT: v_and_b32_e32 v10, v8, v10
; SI-NEXT: v_not_b32_e32 v11, v11
; SI-NEXT: v_and_b32_e32 v11, v9, v11
; SI-NEXT: s_brev_b32 s8, 1
; SI-NEXT: v_and_b32_e32 v13, s8, v9
; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v9
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12
; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12
Expand Down Expand Up @@ -2650,7 +2649,7 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
; SI-NEXT: v_and_b32_e32 v8, v6, v8
; SI-NEXT: v_not_b32_e32 v9, v9
; SI-NEXT: v_and_b32_e32 v9, v7, v9
; SI-NEXT: v_and_b32_e32 v11, s8, v7
; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v7
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10
Expand Down
53 changes: 23 additions & 30 deletions llvm/test/CodeGen/AMDGPU/fshr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -697,10 +697,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT: s_mov_b32 s4, 0xf000f
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_and_b32_e32 v3, s4, v3
; GFX9-NEXT: v_and_b32_e32 v2, s4, v2
; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
Expand Down Expand Up @@ -737,14 +736,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; SI-NEXT: v_or_b32_e32 v4, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_or_b32_e32 v3, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v2, s4, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
Expand Down Expand Up @@ -844,12 +842,11 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
; SI-NEXT: v_or_b32_e32 v4, 16, v10
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
Expand Down Expand Up @@ -911,10 +908,9 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -1164,16 +1160,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; SI-LABEL: v_fshr_v2i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0xffffff
; SI-NEXT: v_and_b32_e32 v6, s4, v4
; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab
; SI-NEXT: v_mul_hi_u32 v6, v6, s5
; SI-NEXT: v_and_b32_e32 v7, s4, v5
; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; SI-NEXT: v_mul_hi_u32 v6, v6, s4
; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
; SI-NEXT: v_mul_hi_u32 v6, v7, s5
; SI-NEXT: v_mul_hi_u32 v6, v7, s4
; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
Expand All @@ -1187,16 +1182,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; VI-LABEL: v_fshr_v2i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xffffff
; VI-NEXT: v_and_b32_e32 v6, s4, v4
; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab
; VI-NEXT: v_mul_hi_u32 v6, v6, s5
; VI-NEXT: v_and_b32_e32 v7, s4, v5
; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; VI-NEXT: v_mul_hi_u32 v6, v6, s4
; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_mul_hi_u32 v6, v7, s5
; VI-NEXT: v_mul_hi_u32 v6, v7, s4
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
Expand All @@ -1210,16 +1204,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX9-LABEL: v_fshr_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0xffffff
; GFX9-NEXT: v_and_b32_e32 v6, s4, v4
; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab
; GFX9-NEXT: v_mul_hi_u32 v6, v6, s5
; GFX9-NEXT: v_and_b32_e32 v7, s4, v5
; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4
; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
; GFX9-NEXT: v_mul_hi_u32 v6, v7, s5
; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4
; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
Expand Down
229 changes: 102 additions & 127 deletions llvm/test/CodeGen/AMDGPU/idot2.ll

Large diffs are not rendered by default.

39 changes: 18 additions & 21 deletions llvm/test/CodeGen/AMDGPU/idot4s.ll
Original file line number Diff line number Diff line change
Expand Up @@ -188,28 +188,27 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8
; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8
; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8
; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand Down Expand Up @@ -418,12 +417,11 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_and_b32_e32 v3, s4, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v6, s4, v0
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0
; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
Expand Down Expand Up @@ -964,34 +962,33 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8
; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT: v_and_b32_e32 v6, s4, v7
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v7
; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8
; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
; GFX7-NEXT: v_and_b32_e32 v7, s4, v8
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2
; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v8, v1
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand Down
206 changes: 92 additions & 114 deletions llvm/test/CodeGen/AMDGPU/idot4u.ll

Large diffs are not rendered by default.

137 changes: 66 additions & 71 deletions llvm/test/CodeGen/AMDGPU/idot8s.ll
Original file line number Diff line number Diff line change
Expand Up @@ -344,49 +344,48 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v10, s4, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
; GFX7-NEXT: v_and_b32_e32 v11, s4, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
; GFX7-NEXT: v_and_b32_e32 v12, s4, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
; GFX7-NEXT: v_and_b32_e32 v13, s4, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
; GFX7-NEXT: v_and_b32_e32 v14, s4, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
; GFX7-NEXT: v_and_b32_e32 v15, s4, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT: v_and_b32_e32 v9, s4, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT: v_and_b32_e32 v16, s4, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand Down Expand Up @@ -917,49 +916,48 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4
; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v10, s4, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
; GFX7-NEXT: v_and_b32_e32 v11, s4, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1
; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4
; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4
; GFX7-NEXT: v_and_b32_e32 v12, s4, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1
; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4
; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
; GFX7-NEXT: v_and_b32_e32 v13, s4, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1
; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
; GFX7-NEXT: v_and_b32_e32 v14, s4, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4
; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4
; GFX7-NEXT: v_and_b32_e32 v15, s4, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT: v_and_b32_e32 v9, s4, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT: v_and_b32_e32 v16, s4, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
Expand Down Expand Up @@ -2204,17 +2202,16 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_bfe_i32 v3, v2, 20, 4
; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4
; GFX7-NEXT: v_bfe_i32 v5, v2, 4, 4
; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4
; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4
Expand All @@ -2223,42 +2220,42 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10
; GFX7-NEXT: v_and_b32_e32 v6, s4, v11
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v11
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12
; GFX7-NEXT: v_and_b32_e32 v11, s4, v13
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v13
; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4
; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0
; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
; GFX7-NEXT: v_or_b32_e32 v6, v11, v10
; GFX7-NEXT: v_and_b32_e32 v12, s4, v14
; GFX7-NEXT: v_and_b32_e32 v14, s4, v16
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6
; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1
; GFX7-NEXT: v_bfe_i32 v7, v2, 24, 4
; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2
; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4
; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4
; GFX7-NEXT: v_and_b32_e32 v13, s4, v15
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v15
; GFX7-NEXT: v_mad_u32_u24 v1, v16, v11, v1
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v5, v0
; GFX7-NEXT: v_and_b32_e32 v7, s4, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0
; GFX7-NEXT: v_and_b32_e32 v9, s4, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand Down Expand Up @@ -2845,8 +2842,6 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_movk_i32 s4, 0xff
; GFX7-NEXT: s_mov_b32 s5, 0xffff
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v2
Expand All @@ -2858,13 +2853,13 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v3
; GFX7-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; GFX7-NEXT: v_and_b32_e32 v8, s4, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v0
; GFX7-NEXT: v_bfe_i32 v12, v0, 24, 4
Expand All @@ -2879,30 +2874,30 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_or_b32_e32 v6, v8, v7
; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v11
; GFX7-NEXT: v_and_b32_e32 v8, s4, v12
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v13
; GFX7-NEXT: v_and_b32_e32 v10, s4, v14
; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v15
; GFX7-NEXT: v_and_b32_e32 v13, s4, v16
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v16
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v17
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_and_b32_e32 v5, s5, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_or_b32_e32 v7, v8, v7
; GFX7-NEXT: v_or_b32_e32 v8, v10, v9
; GFX7-NEXT: v_or_b32_e32 v9, v13, v12
; GFX7-NEXT: v_or_b32_e32 v0, v0, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_and_b32_e32 v2, s5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; GFX7-NEXT: v_and_b32_e32 v0, s5, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_or_b32_e32 v0, v0, v7
; GFX7-NEXT: v_and_b32_e32 v7, s4, v2
; GFX7-NEXT: v_and_b32_e32 v13, s4, v0
; GFX7-NEXT: v_and_b32_e32 v6, s5, v8
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v2
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v0
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v8
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 8
; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -2914,17 +2909,17 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: v_and_b32_e32 v9, s4, v4
; GFX7-NEXT: v_and_b32_e32 v15, s4, v5
; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v4
; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v5
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0
; GFX7-NEXT: v_bfe_u32 v10, v4, 8, 8
; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v15, v0
; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8
; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v10, v16, v0
; GFX7-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NEXT: v_and_b32_e32 v11, s4, v11
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v11, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
Expand Down
307 changes: 150 additions & 157 deletions llvm/test/CodeGen/AMDGPU/idot8u.ll

Large diffs are not rendered by default.

66 changes: 32 additions & 34 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1304,109 +1304,107 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_movk_i32 s4, 0xff
; SI-NEXT: s_lshr_b32 s5, s11, 8
; SI-NEXT: s_lshr_b32 s4, s11, 8
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s4, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_cmp_lg_u32 s6, 13
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s6, 12
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s11
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: s_mov_b32 s5, 0xffff
; SI-NEXT: s_lshr_b32 s7, s10, 24
; SI-NEXT: s_lshr_b32 s4, s10, 24
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_cmp_lg_u32 s6, 11
; SI-NEXT: v_or_b32_e32 v3, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s7, s10, 16
; SI-NEXT: s_lshr_b32 s4, s10, 16
; SI-NEXT: s_cmp_lg_u32 s6, 10
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_lshr_b32 s7, s10, 8
; SI-NEXT: s_lshr_b32 s4, s10, 8
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s4, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_cmp_lg_u32 s6, 9
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s6, 8
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: s_lshr_b32 s7, s9, 24
; SI-NEXT: s_lshr_b32 s4, s9, 24
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_cmp_lg_u32 s6, 7
; SI-NEXT: v_or_b32_e32 v2, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s7, s9, 16
; SI-NEXT: s_lshr_b32 s4, s9, 16
; SI-NEXT: s_cmp_lg_u32 s6, 6
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_lshr_b32 s7, s9, 8
; SI-NEXT: s_lshr_b32 s4, s9, 8
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s4, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: s_cmp_lg_u32 s6, 5
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s6, 4
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_mov_b32_e32 v4, s9
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v4, s4, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: s_lshr_b32 s7, s8, 24
; SI-NEXT: s_lshr_b32 s4, s8, 24
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_cmp_lg_u32 s6, 3
; SI-NEXT: v_or_b32_e32 v1, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s7, s8, 16
; SI-NEXT: s_lshr_b32 s4, s8, 16
; SI-NEXT: s_cmp_lg_u32 s6, 2
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: s_lshr_b32 s7, s8, 8
; SI-NEXT: s_lshr_b32 s4, s8, 8
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v4, s4, v4
; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
; SI-NEXT: s_cmp_lg_u32 s6, 1
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v5, s8
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_and_b32_e32 v5, s4, v5
; SI-NEXT: v_and_b32_e32 v5, 0xff, v5
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v4, s5, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
Expand Down
43 changes: 21 additions & 22 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1881,47 +1881,46 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT: s_cmp_eq_u32 s7, 7
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 5
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 4
; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 3
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_and_b32_e32 v3, v5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 2
; GFX9-NEXT: v_lshl_or_b32 v3, v7, 16, v3
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v3
; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 1
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, v5, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 0
; GFX9-NEXT: v_lshl_or_b32 v2, v8, 16, v2
; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v6, vcc
; GFX9-NEXT: v_lshl_or_b32 v2, v7, 16, v2
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v5, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v1, v5, v1
; GFX9-NEXT: v_and_b32_e32 v0, v5, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v8, 16, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
Expand Down
97 changes: 43 additions & 54 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
Original file line number Diff line number Diff line change
Expand Up @@ -553,10 +553,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
; GFX9-LABEL: sample_d_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT: v_and_b32_e32 v4, v6, v4
; GFX9-NEXT: v_and_b32_e32 v2, v6, v2
; GFX9-NEXT: v_and_b32_e32 v0, v6, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0
Expand Down Expand Up @@ -585,13 +584,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v12, v8
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v6
; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v10, v5
; GFX9-NEXT: v_and_b32_e32 v5, v2, v6
; GFX9-NEXT: v_and_b32_e32 v3, v2, v3
; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX9-NEXT: image_sample_d v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -636,13 +634,12 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
; GFX9-LABEL: sample_c_d_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_and_b32_e32 v2, v9, v5
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, v9, v7
; GFX9-NEXT: v_and_b32_e32 v1, v9, v1
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1
; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
Expand Down Expand Up @@ -689,10 +686,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
; GFX9-LABEL: sample_d_cl_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX9-NEXT: v_and_b32_e32 v4, v7, v4
; GFX9-NEXT: v_and_b32_e32 v2, v7, v2
; GFX9-NEXT: v_and_b32_e32 v0, v7, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0
Expand Down Expand Up @@ -742,12 +738,11 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v11, v7
; GFX9-NEXT: v_mov_b32_e32 v7, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: v_and_b32_e32 v5, v0, v5
; GFX9-NEXT: v_and_b32_e32 v3, v0, v3
; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0
; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -789,10 +784,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
; GFX9-LABEL: sample_cd_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT: v_and_b32_e32 v4, v6, v4
; GFX9-NEXT: v_and_b32_e32 v2, v6, v2
; GFX9-NEXT: v_and_b32_e32 v0, v6, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0
Expand Down Expand Up @@ -836,13 +830,12 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) {
; GFX9-LABEL: sample_c_cd_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v7, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_and_b32_e32 v2, v9, v5
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, v9, v7
; GFX9-NEXT: v_and_b32_e32 v1, v9, v1
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1
; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16
Expand Down Expand Up @@ -889,10 +882,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) {
; GFX9-LABEL: sample_cd_cl_2d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX9-NEXT: v_and_b32_e32 v4, v7, v4
; GFX9-NEXT: v_and_b32_e32 v2, v7, v2
; GFX9-NEXT: v_and_b32_e32 v0, v7, v0
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0
Expand Down Expand Up @@ -942,12 +934,11 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v11, v7
; GFX9-NEXT: v_mov_b32_e32 v7, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: v_and_b32_e32 v5, v0, v5
; GFX9-NEXT: v_and_b32_e32 v3, v0, v3
; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0
; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -1136,13 +1127,12 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v13, v8
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_and_b32_e32 v1, v0, v6
; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v0, v4
; GFX9-NEXT: v_and_b32_e32 v0, v0, v2
; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0
; GFX9-NEXT: image_sample_c_d_o v0, v[8:13], s[0:7], s[8:11] dmask:0x4 a16 da
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -1172,13 +1162,12 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v13, v8
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_and_b32_e32 v1, v0, v6
; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v0, v4
; GFX9-NEXT: v_and_b32_e32 v0, v0, v2
; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0
; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 a16 da
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,15 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
; GFX6-NEXT: s_mov_b32 s8, s2
; GFX6-NEXT: s_mov_b32 s9, s3
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT: s_mov_b32 s2, 0x3e22f983
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1
; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1
; GFX6-NEXT: v_fract_f32_e32 v1, v1
; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0
; GFX6-NEXT: v_fract_f32_e32 v0, v0
; GFX6-NEXT: v_cos_f32_e32 v0, v0
; GFX6-NEXT: v_cos_f32_e32 v1, v1
Expand Down
11 changes: 4 additions & 7 deletions llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,22 @@ entry:
; SI: buffer_load_dword v[[A_F16_0:[0-9]+]]
; VI: flat_load_dword v[[A_F16_0:[0-9]+]]
; GFX9: global_load_dword v[[A_F16_0:[0-9]+]]
; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218
; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3f317218, v[[R_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3f317218, v[[R_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
; VI: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]]
; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x398c, v[[R_F16_2]]
; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
; SI-NOT: v_and_b32_e32
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
Expand Down
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.log.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@ entry:
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
entry:
%res = call <2 x float> @llvm.log.v2f32(<2 x float> %in)
Expand Down Expand Up @@ -67,14 +66,13 @@ entry:
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}}
define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
entry:
%res = call <4 x float> @llvm.log.v4f32(<4 x float> %in)
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,23 @@ entry:
; SI: buffer_load_dword v[[A_F16_0:[0-9]+]]
; VI: flat_load_dword v[[A_F16_0:[0-9]+]]
; GFX9: global_load_dword v[[A_F16_0:[0-9]+]]
; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a
; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3e9a209a, v[[R_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3e9a209a, v[[R_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]]
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_2]]
; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x34d1, v[[R_F16_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
; SI-NOT: v_and_b32_e32
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
Expand Down
Loading