173 changes: 80 additions & 93 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,8 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0xffffff
; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr i24 %value, %amount
Expand Down Expand Up @@ -631,9 +630,8 @@ define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
; GFX6-LABEL: lshr_i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, s1, v0
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: ; return to shader part epilog
;
Expand All @@ -659,9 +657,8 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
; GFX6-LABEL: lshr_i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: v_and_b32_e32 v0, s1, v0
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -757,9 +754,8 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshr_b32 s1, s1, s3
; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
Expand All @@ -768,14 +764,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX8-LABEL: s_lshr_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: s_lshr_b32 s1, s2, s4
; GFX8-NEXT: s_lshr_b32 s1, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -808,10 +803,10 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
Expand Down Expand Up @@ -844,10 +839,10 @@ define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
; GFX6-LABEL: lshr_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
Expand Down Expand Up @@ -944,13 +939,12 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s8, 0xffff
; GFX6-NEXT: s_and_b32 s1, s1, s8
; GFX6-NEXT: s_and_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshr_b32 s1, s1, s5
; GFX6-NEXT: s_and_b32 s3, s3, s8
; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
; GFX6-NEXT: s_lshr_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s2, s2, s8
; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
; GFX6-NEXT: s_lshr_b32 s3, s3, s7
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, s6
Expand All @@ -961,36 +955,34 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX8-LABEL: s_lshr_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s6, 0xffff
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_lshr_b32 s8, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s2
; GFX8-NEXT: s_lshr_b32 s2, s4, s7
; GFX8-NEXT: s_lshr_b32 s2, s4, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s3
; GFX8-NEXT: s_lshr_b32 s3, s5, s8
; GFX8-NEXT: s_lshr_b32 s3, s5, s7
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: s_lshl_b32 s2, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s5, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_and_b32 s0, s0, s5
; GFX9-NEXT: s_lshr_b32 s6, s2, 16
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshr_b32 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s4, s6
; GFX9-NEXT: s_lshr_b32 s2, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: s_and_b32 s1, s1, s5
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshr_b32 s1, s1, s3
; GFX9-NEXT: s_lshr_b32 s2, s2, s4
Expand All @@ -999,17 +991,16 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX10-LABEL: s_lshr_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s4, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, s4
; GFX10-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s2, 16
; GFX10-NEXT: s_lshr_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s2, s5, s6
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, s4
; GFX10-NEXT: s_lshr_b32 s4, s3, 16
; GFX10-NEXT: s_lshr_b32 s2, s4, s5
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_lshr_b32 s1, s1, s3
; GFX10-NEXT: s_lshr_b32 s3, s5, s4
; GFX10-NEXT: s_lshr_b32 s3, s4, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
Expand Down Expand Up @@ -1124,21 +1115,20 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s16, 0xffff
; GFX6-NEXT: s_and_b32 s1, s1, s16
; GFX6-NEXT: s_and_b32 s0, s0, s16
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshr_b32 s1, s1, s9
; GFX6-NEXT: s_and_b32 s3, s3, s16
; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
; GFX6-NEXT: s_lshr_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s2, s2, s16
; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
; GFX6-NEXT: s_lshr_b32 s3, s3, s11
; GFX6-NEXT: s_and_b32 s5, s5, s16
; GFX6-NEXT: s_and_b32 s7, s7, s16
; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, s10
; GFX6-NEXT: s_and_b32 s4, s4, s16
; GFX6-NEXT: s_and_b32 s4, s4, 0xffff
; GFX6-NEXT: s_lshr_b32 s5, s5, s13
; GFX6-NEXT: s_and_b32 s6, s6, s16
; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
; GFX6-NEXT: s_lshr_b32 s7, s7, s15
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
Expand All @@ -1153,64 +1143,62 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX8-LABEL: s_lshr_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s12, 0xffff
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_lshr_b32 s13, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_lshr_b32 s14, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s4
; GFX8-NEXT: s_lshr_b32 s4, s8, s13
; GFX8-NEXT: s_lshr_b32 s4, s8, s12
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_lshr_b32 s15, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_lshr_b32 s1, s1, s5
; GFX8-NEXT: s_lshr_b32 s5, s9, s14
; GFX8-NEXT: s_lshr_b32 s5, s9, s13
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s11, s3, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_lshr_b32 s16, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
; GFX8-NEXT: s_lshr_b32 s2, s2, s6
; GFX8-NEXT: s_lshr_b32 s6, s10, s15
; GFX8-NEXT: s_lshr_b32 s6, s10, s14
; GFX8-NEXT: s_or_b32 s0, s4, s0
; GFX8-NEXT: s_lshl_b32 s4, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s3, s7
; GFX8-NEXT: s_lshr_b32 s7, s11, s16
; GFX8-NEXT: s_lshr_b32 s7, s11, s15
; GFX8-NEXT: s_or_b32 s1, s4, s1
; GFX8-NEXT: s_lshl_b32 s4, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: s_lshl_b32 s4, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_or_b32 s3, s4, s3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v8i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s9, 0xffff
; GFX9-NEXT: s_lshr_b32 s8, s0, 16
; GFX9-NEXT: s_and_b32 s0, s0, s9
; GFX9-NEXT: s_lshr_b32 s10, s4, 16
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_lshr_b32 s9, s4, 16
; GFX9-NEXT: s_lshr_b32 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s8, s10
; GFX9-NEXT: s_lshr_b32 s4, s8, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_and_b32 s1, s1, s9
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_lshr_b32 s8, s5, 16
; GFX9-NEXT: s_lshr_b32 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s4, s4, s8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_and_b32 s2, s2, s9
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: s_lshr_b32 s5, s6, 16
; GFX9-NEXT: s_lshr_b32 s2, s2, s6
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_and_b32 s3, s3, s9
; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-NEXT: s_lshr_b32 s5, s7, 16
; GFX9-NEXT: s_lshr_b32 s3, s3, s7
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
Expand All @@ -1219,26 +1207,25 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX10-LABEL: s_lshr_v8i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s8, 0xffff
; GFX10-NEXT: s_lshr_b32 s9, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, s8
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_lshr_b32 s8, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_lshr_b32 s9, s4, 16
; GFX10-NEXT: s_lshr_b32 s0, s0, s4
; GFX10-NEXT: s_lshr_b32 s4, s9, s10
; GFX10-NEXT: s_lshr_b32 s9, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, s8
; GFX10-NEXT: s_lshr_b32 s10, s5, 16
; GFX10-NEXT: s_lshr_b32 s4, s8, s9
; GFX10-NEXT: s_lshr_b32 s8, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_lshr_b32 s9, s5, 16
; GFX10-NEXT: s_lshr_b32 s1, s1, s5
; GFX10-NEXT: s_lshr_b32 s5, s9, s10
; GFX10-NEXT: s_lshr_b32 s5, s8, s9
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_and_b32 s2, s2, s8
; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
; GFX10-NEXT: s_lshr_b32 s2, s2, s6
; GFX10-NEXT: s_lshr_b32 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_and_b32 s3, s3, s8
; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-NEXT: s_lshr_b32 s6, s7, 16
; GFX10-NEXT: s_lshr_b32 s3, s3, s7
; GFX10-NEXT: s_lshr_b32 s5, s5, s6
Expand Down
51 changes: 21 additions & 30 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,22 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
;
; GFX8-LABEL: s_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
Expand Down Expand Up @@ -78,29 +75,26 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
Expand Down Expand Up @@ -146,27 +140,24 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: ; return to shader part epilog
Expand Down
112 changes: 52 additions & 60 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -429,13 +429,12 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
Expand All @@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1)
define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
Expand All @@ -487,13 +485,12 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
Expand Down Expand Up @@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s3, s4, s1
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_and_b32 s1, s6, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_xor_b32 s2, s2, -1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
Expand Down Expand Up @@ -630,18 +626,17 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1)
; GFX6-LABEL: s_orn2_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
Expand Down Expand Up @@ -673,18 +668,17 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre
; GFX6-LABEL: s_orn2_v4i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
Expand Down Expand Up @@ -716,18 +710,17 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
; GFX6-LABEL: s_orn2_v4i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
Expand Down Expand Up @@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s14, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s1, s2, s14
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s14
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s3, s6, s14
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, s14
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_lshl_b32 s4, s11, 16
; GFX6-NEXT: s_and_b32 s5, s10, s14
; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
; GFX6-NEXT: s_or_b32 s4, s4, s5
; GFX6-NEXT: s_lshl_b32 s5, s13, 16
; GFX6-NEXT: s_and_b32 s6, s12, s14
; GFX6-NEXT: s_and_b32 s6, s12, 0xffff
; GFX6-NEXT: s_or_b32 s5, s5, s6
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s7, s6
Expand Down
11 changes: 5 additions & 6 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
Original file line number Diff line number Diff line change
Expand Up @@ -401,13 +401,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_rndne_f16_e32 v3, v0
; GFX10-NEXT: v_rndne_f16_e32 v2, v0
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_rndne_f16_e32 v4, v1
; GFX10-NEXT: v_rndne_f16_e32 v3, v1
; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, v3, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, v4, v2, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
ret <4 x half> %roundeven
Expand Down Expand Up @@ -610,8 +609,8 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_brev_b32 s6, 1
; GFX6-NEXT: s_mov_b32 s7, 0x43300000
; GFX6-NEXT: v_and_b32_e32 v5, s6, v1
; GFX6-NEXT: s_mov_b32 s7, 0x43300000
; GFX6-NEXT: v_mov_b32_e32 v4, 0
; GFX6-NEXT: v_or_b32_e32 v5, s7, v5
; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5]
Expand Down
1,244 changes: 585 additions & 659 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,10 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
Expand Down Expand Up @@ -408,15 +408,15 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7]
; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1
; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5]
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5]
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
Expand Down Expand Up @@ -450,10 +450,10 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
Expand Down Expand Up @@ -549,15 +549,15 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7]
; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1
; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5]
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5]
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1186,9 +1186,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s10, 0x1000
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
Expand Down Expand Up @@ -1317,7 +1316,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
Expand Down Expand Up @@ -1890,9 +1889,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-LABEL: v_sdiv_v2i64_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
Expand Down Expand Up @@ -2021,7 +2019,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
Expand Down
234 changes: 114 additions & 120 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Large diffs are not rendered by default.

15 changes: 6 additions & 9 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
Original file line number Diff line number Diff line change
Expand Up @@ -439,9 +439,8 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_brev_b32 s4, -4
; GFX10-NEXT: v_and_b32_e32 v2, s4, v0
; GFX10-NEXT: v_and_b32_e32 v4, s4, v1
; GFX10-NEXT: v_and_b32_e32 v2, 0x3fffffff, v0
; GFX10-NEXT: v_and_b32_e32 v4, 0x3fffffff, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5]
Expand Down Expand Up @@ -523,9 +522,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_brev_b32 s4, -8
; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
; GFX10-NEXT: v_and_b32_e32 v2, s4, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
Expand Down Expand Up @@ -608,13 +606,12 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 2
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
; GFX7-NEXT: ; return to shader part epilog
Expand Down
96 changes: 45 additions & 51 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -785,25 +785,23 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: s_shl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, s3
; GFX6-NEXT: s_lshl_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s2, s4
; GFX8-NEXT: s_lshl_b32 s1, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -869,10 +867,10 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: shl_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
Expand Down Expand Up @@ -970,39 +968,37 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
; GFX6-LABEL: s_shl_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s8, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, s5
; GFX6-NEXT: s_lshl_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s1, s1, s8
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s2, s2, s6
; GFX6-NEXT: s_lshl_b32 s3, s3, s7
; GFX6-NEXT: s_and_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s8
; GFX6-NEXT: s_and_b32 s2, s3, s8
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s6, 0xffff
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_lshr_b32 s8, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
; GFX8-NEXT: s_lshl_b32 s2, s4, s7
; GFX8-NEXT: s_lshl_b32 s2, s4, s6
; GFX8-NEXT: s_lshl_b32 s1, s1, s3
; GFX8-NEXT: s_lshl_b32 s3, s5, s8
; GFX8-NEXT: s_lshl_b32 s3, s5, s7
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: s_lshl_b32 s2, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -1144,67 +1140,65 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
; GFX6-LABEL: s_shl_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s16, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, s9
; GFX6-NEXT: s_lshl_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s1, s1, s16
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s2, s2, s10
; GFX6-NEXT: s_lshl_b32 s3, s3, s11
; GFX6-NEXT: s_and_b32 s0, s0, s16
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s5, s5, s13
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s16
; GFX6-NEXT: s_and_b32 s2, s3, s16
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
; GFX6-NEXT: s_lshl_b32 s4, s4, s12
; GFX6-NEXT: s_lshl_b32 s7, s7, s15
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_and_b32 s3, s5, s16
; GFX6-NEXT: s_and_b32 s3, s5, 0xffff
; GFX6-NEXT: s_lshl_b32 s6, s6, s14
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s2, s4, s16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_and_b32 s4, s7, s16
; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_and_b32 s3, s6, s16
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s12, 0xffff
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_lshr_b32 s13, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_lshr_b32 s14, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
; GFX8-NEXT: s_lshl_b32 s4, s8, s13
; GFX8-NEXT: s_lshl_b32 s4, s8, s12
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_lshr_b32 s15, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_lshl_b32 s1, s1, s5
; GFX8-NEXT: s_lshl_b32 s5, s9, s14
; GFX8-NEXT: s_lshl_b32 s5, s9, s13
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s11, s3, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_lshr_b32 s16, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
; GFX8-NEXT: s_lshl_b32 s2, s2, s6
; GFX8-NEXT: s_lshl_b32 s6, s10, s15
; GFX8-NEXT: s_lshl_b32 s6, s10, s14
; GFX8-NEXT: s_or_b32 s0, s4, s0
; GFX8-NEXT: s_lshl_b32 s4, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshl_b32 s3, s3, s7
; GFX8-NEXT: s_lshl_b32 s7, s11, s16
; GFX8-NEXT: s_lshl_b32 s7, s11, s15
; GFX8-NEXT: s_or_b32 s1, s4, s1
; GFX8-NEXT: s_lshl_b32 s4, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: s_lshl_b32 s4, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_or_b32 s3, s4, s3
; GFX8-NEXT: ; return to shader part epilog
;
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -377,13 +377,13 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
Expand Down Expand Up @@ -508,13 +508,13 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1164,9 +1164,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-LABEL: v_srem_v2i64_pow2k_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s10, 0x1000
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
Expand Down Expand Up @@ -1294,7 +1293,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
Expand Down Expand Up @@ -1860,9 +1859,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-LABEL: v_srem_v2i64_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
Expand Down Expand Up @@ -1990,7 +1988,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
Expand Down
1,242 changes: 584 additions & 658 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

Large diffs are not rendered by default.

27 changes: 13 additions & 14 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -113,44 +113,43 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s1, 0x80008
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_u32 s3, s4, s1
; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s2, s4, 16
; GFX7-NEXT: s_lshr_b32 s1, s4, 16
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
; GFX7-NEXT: s_lshr_b32 s0, s4, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
; GFX7-NEXT: s_bfe_u32 s2, s5, s1
; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: s_lshr_b32 s0, s5, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
; GFX7-NEXT: s_lshr_b32 s2, s5, 24
; GFX7-NEXT: s_lshr_b32 s1, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
; GFX7-NEXT: s_bfe_u32 s2, s6, s1
; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_lshr_b32 s0, s6, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
; GFX7-NEXT: s_lshr_b32 s2, s6, 24
; GFX7-NEXT: s_lshr_b32 s1, s6, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
; GFX7-NEXT: s_bfe_u32 s1, s7, s1
; GFX7-NEXT: s_bfe_u32 s1, s7, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: s_lshr_b32 s0, s7, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:12
Expand Down
19 changes: 9 additions & 10 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
Original file line number Diff line number Diff line change
Expand Up @@ -98,33 +98,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s1, 0x80008
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_u32 s3, s4, s1
; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s2, s4, 16
; GFX7-NEXT: s_lshr_b32 s1, s4, 16
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
; GFX7-NEXT: s_lshr_b32 s0, s4, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
; GFX7-NEXT: s_bfe_u32 s2, s5, s1
; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: s_lshr_b32 s0, s5, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
; GFX7-NEXT: s_lshr_b32 s2, s5, 24
; GFX7-NEXT: s_lshr_b32 s1, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
; GFX7-NEXT: s_bfe_u32 s1, s6, s1
; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_lshr_b32 s0, s6, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
; GFX7-LABEL: usubo_i16_sv:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s1, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s1, v0
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT: v_and_b32_e32 v1, s1, v0
Expand All @@ -980,7 +980,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
; GFX8-LABEL: usubo_i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s1, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: v_and_b32_e32 v0, s1, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_and_b32_e32 v1, s1, v0
Expand All @@ -992,8 +992,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
;
; GFX9-LABEL: usubo_i16_sv:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s1, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: v_sub_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -162,23 +162,21 @@ define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) {
define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) {
; GFX7-LABEL: s_trunc_v4i32_to_v4i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s4
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, s4
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_trunc_v4i32_to_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s4, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s4
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: s_lshl_b32 s1, s3, 16
; GFX8-NEXT: s_and_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
%trunc = trunc <4 x i32> %src to <4 x i16>
Expand Down
140 changes: 66 additions & 74 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
Expand Down Expand Up @@ -305,17 +304,16 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-LABEL: s_uaddsat_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s2, s0, 8
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX9-NEXT: s_mov_b32 s2, 0x80008
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
; GFX9-NEXT: s_lshl_b32 s2, s3, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
Expand All @@ -332,15 +330,14 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s3, s1, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s2, s2, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s2, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; GFX10-NEXT: s_movk_i32 s0, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
Expand Down Expand Up @@ -466,35 +463,33 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4
; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_mov_b32_e32 v4, 24
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_pk_add_u16 v1, v3, v2 clamp
; GFX10-NEXT: v_pk_add_u16 v1, v2, v3 clamp
; GFX10-NEXT: v_mov_b32_e32 v2, 8
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s4, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -590,27 +585,26 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s8, s1, 16
; GFX9-NEXT: s_lshr_b32 s9, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX9-NEXT: s_lshr_b32 s6, s4, 16
; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s7, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshr_b32 s7, s6, 16
; GFX9-NEXT: s_lshl_b32 s4, s6, s4
; GFX9-NEXT: s_lshl_b32 s6, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
Expand All @@ -637,39 +631,37 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshr_b32 s6, s4, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_lshl_b32 s3, s4, s3
; GFX10-NEXT: s_lshl_b32 s4, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: s_movk_i32 s1, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: s_mov_b32 s0, 24
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
Expand Down
193 changes: 94 additions & 99 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -272,13 +272,13 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2
; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2
; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
Expand Down
279 changes: 140 additions & 139 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll

Large diffs are not rendered by default.

140 changes: 66 additions & 74 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -236,12 +236,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
Expand Down Expand Up @@ -297,17 +296,16 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-LABEL: s_usubsat_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s2, s0, 8
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX9-NEXT: s_mov_b32 s2, 0x80008
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
; GFX9-NEXT: s_lshl_b32 s2, s3, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
Expand All @@ -324,15 +322,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s3, s1, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s2, s2, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s2, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
; GFX10-NEXT: s_movk_i32 s0, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
Expand Down Expand Up @@ -454,35 +451,33 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4
; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_mov_b32_e32 v4, 24
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_pk_sub_u16 v1, v3, v2 clamp
; GFX10-NEXT: v_pk_sub_u16 v1, v2, v3 clamp
; GFX10-NEXT: v_mov_b32_e32 v2, 8
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s4, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -574,27 +569,26 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s8, s1, 16
; GFX9-NEXT: s_lshr_b32 s9, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX9-NEXT: s_lshr_b32 s6, s4, 16
; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s7, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshr_b32 s7, s6, 16
; GFX9-NEXT: s_lshl_b32 s4, s6, s4
; GFX9-NEXT: s_lshl_b32 s6, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
Expand All @@ -621,39 +615,37 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshr_b32 s6, s4, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_lshl_b32 s3, s4, s3
; GFX10-NEXT: s_lshl_b32 s4, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: s_movk_i32 s1, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: s_mov_b32 s0, 24
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
Expand Down
26 changes: 12 additions & 14 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@ entry:
define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: scalar_xnor_v2i16_one_use:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s4
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, s4
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_xor_b32 s0, s0, s1
; GFX7-NEXT: s_xor_b32 s0, s0, -1
Expand All @@ -42,10 +41,10 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
; GFX8-NEXT: s_xor_b32 s0, s0, s1
; GFX8-NEXT: s_mov_b32 s3, s2
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -117,18 +116,17 @@ define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) {
define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) {
; GFX7-LABEL: scalar_xnor_v4i16_one_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s8
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, s8
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_lshl_b32 s2, s5, 16
; GFX7-NEXT: s_and_b32 s3, s4, s8
; GFX7-NEXT: s_and_b32 s3, s4, 0xffff
; GFX7-NEXT: s_or_b32 s2, s2, s3
; GFX7-NEXT: s_lshl_b32 s3, s7, 16
; GFX7-NEXT: s_and_b32 s4, s6, s8
; GFX7-NEXT: s_and_b32 s4, s6, 0xffff
; GFX7-NEXT: s_or_b32 s3, s3, s4
; GFX7-NEXT: s_mov_b32 s4, -1
; GFX7-NEXT: s_mov_b32 s5, s4
Expand All @@ -142,16 +140,16 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_mov_b32 s5, s4
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
; GFX8-NEXT: s_and_b32 s2, s0, s4
; GFX8-NEXT: s_and_b32 s2, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_and_b32 s6, s1, s4
; GFX8-NEXT: s_and_b32 s6, s1, 0xffff
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5]
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s4
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: s_lshl_b32 s1, s3, 16
; GFX8-NEXT: s_and_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
;
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/add.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9PLUS: global_load_dword [[B:v[0-9]+]]

; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9PLUS: buffer_store_dwordx4

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0
Expand Down Expand Up @@ -232,7 +232,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0
Expand Down
2,911 changes: 1,441 additions & 1,470 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,8 @@ define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {

; Second use is a VGPR use of the constant.
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x12d687
; SI: buffer_store_dword [[VK]]
define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, 1234567
Expand All @@ -79,10 +78,9 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out

; Second use is another SGPR use of the constant.
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687
; SI: s_add_i32
; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, 0x12d687
; SI: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]]
; SI: buffer_store_dword [[VADD]]
define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
Expand Down
164 changes: 78 additions & 86 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: v_bfrev_b32_e32 v2, 1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
Expand Down Expand Up @@ -3118,7 +3118,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
Expand Down Expand Up @@ -3160,41 +3160,39 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s4
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s4, 16
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v2, 63
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s5, 32
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
Expand All @@ -3214,7 +3212,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -3223,31 +3221,29 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 15
; GFX1032-NEXT: v_readlane_b32 s4, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
Expand All @@ -3266,7 +3262,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -3484,7 +3480,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: v_bfrev_b32_e32 v2, -2
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
Expand Down Expand Up @@ -3535,7 +3531,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: v_bfrev_b32_e32 v2, -2
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
Expand Down Expand Up @@ -3577,41 +3573,39 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: min_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s4
; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s4, 16
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v2, 63
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s5, 32
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
Expand All @@ -3631,7 +3625,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -3640,31 +3634,29 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: min_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 15
; GFX1032-NEXT: v_readlane_b32 s4, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
Expand All @@ -3683,7 +3675,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
Expand Down
Loading