142 changes: 71 additions & 71 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -524,30 +524,30 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
; GFX908-NEXT: s_load_dword s8, s[4:5], 0x18
; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX908-NEXT: s_load_dword s6, s[4:5], 0x18
; GFX908-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
; GFX908-NEXT: s_sub_i32 s4, 0, s1
; GFX908-NEXT: s_lshr_b32 s11, s8, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s8
; GFX908-NEXT: s_lshr_b32 s5, s6, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s6
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX908-NEXT: s_lshl_b64 s[8:9], s[2:3], 5
; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s11
; GFX908-NEXT: s_or_b32 s8, s8, 28
; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5
; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s5
; GFX908-NEXT: s_or_b32 s10, s10, 28
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX908-NEXT: v_mov_b32_e32 v7, s3
; GFX908-NEXT: s_mov_b32 s10, 0
; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5
; GFX908-NEXT: v_mov_b32_e32 v6, s2
; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0
; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5
; GFX908-NEXT: s_mov_b32 s4, 0
; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX908-NEXT: v_add_u32_e32 v0, v0, v2
; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX908-NEXT: v_mov_b32_e32 v2, s8
; GFX908-NEXT: v_mov_b32_e32 v3, s9
; GFX908-NEXT: v_mov_b32_e32 v2, s10
; GFX908-NEXT: v_mov_b32_e32 v3, s11
; GFX908-NEXT: v_mul_lo_u32 v4, v0, s1
; GFX908-NEXT: v_add_u32_e32 v5, 1, v0
; GFX908-NEXT: v_sub_u32_e32 v4, s0, v4
Expand All @@ -562,11 +562,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s0, v24
; GFX908-NEXT: s_and_b32 s0, 0xffff, s0
; GFX908-NEXT: s_mul_i32 s1, s7, s0
; GFX908-NEXT: s_mul_hi_u32 s7, s6, s0
; GFX908-NEXT: s_mul_i32 s0, s6, s0
; GFX908-NEXT: s_add_i32 s1, s7, s1
; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5
; GFX908-NEXT: s_mul_i32 s1, s9, s0
; GFX908-NEXT: s_mul_hi_u32 s5, s8, s0
; GFX908-NEXT: s_mul_i32 s0, s8, s0
; GFX908-NEXT: s_add_i32 s1, s5, s1
; GFX908-NEXT: s_lshl_b64 s[8:9], s[0:1], 5
; GFX908-NEXT: s_branch .LBB3_2
; GFX908-NEXT: .LBB3_1: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
Expand All @@ -583,13 +583,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_mov_b32_e32 v8, 0
; GFX908-NEXT: v_mov_b32_e32 v9, 0
; GFX908-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
; GFX908-NEXT: s_mov_b32 s11, s10
; GFX908-NEXT: v_mov_b32_e32 v13, s11
; GFX908-NEXT: v_mov_b32_e32 v15, s11
; GFX908-NEXT: v_mov_b32_e32 v17, s11
; GFX908-NEXT: v_mov_b32_e32 v12, s10
; GFX908-NEXT: v_mov_b32_e32 v14, s10
; GFX908-NEXT: v_mov_b32_e32 v16, s10
; GFX908-NEXT: s_mov_b32 s5, s4
; GFX908-NEXT: v_mov_b32_e32 v13, s5
; GFX908-NEXT: v_mov_b32_e32 v15, s5
; GFX908-NEXT: v_mov_b32_e32 v17, s5
; GFX908-NEXT: v_mov_b32_e32 v12, s4
; GFX908-NEXT: v_mov_b32_e32 v14, s4
; GFX908-NEXT: v_mov_b32_e32 v16, s4
; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[6:7]
; GFX908-NEXT: v_mov_b32_e32 v11, v3
; GFX908-NEXT: v_mov_b32_e32 v19, v13
Expand All @@ -600,27 +600,27 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_readfirstlane_b32 s3, v9
; GFX908-NEXT: s_add_u32 s2, s2, 1
; GFX908-NEXT: s_addc_u32 s3, s3, 0
; GFX908-NEXT: s_mul_hi_u32 s9, s4, s2
; GFX908-NEXT: s_mul_i32 s11, s5, s2
; GFX908-NEXT: s_mul_i32 s8, s4, s2
; GFX908-NEXT: s_mul_i32 s2, s4, s3
; GFX908-NEXT: s_add_i32 s2, s9, s2
; GFX908-NEXT: s_add_i32 s9, s2, s11
; GFX908-NEXT: s_mul_hi_u32 s5, s6, s2
; GFX908-NEXT: s_mul_i32 s11, s7, s2
; GFX908-NEXT: s_mul_i32 s10, s6, s2
; GFX908-NEXT: s_mul_i32 s2, s6, s3
; GFX908-NEXT: s_add_i32 s2, s5, s2
; GFX908-NEXT: s_add_i32 s5, s2, s11
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: .LBB3_4: ; %bb58
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9]
; GFX908-NEXT: v_mov_b32_e32 v20, s7
; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s6, v10
; GFX908-NEXT: v_mov_b32_e32 v20, s9
; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s8, v10
; GFX908-NEXT: v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: .LBB3_5: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_mov_b32_e32 v21, s9
; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s8, v10
; GFX908-NEXT: v_mov_b32_e32 v21, s5
; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s10, v10
; GFX908-NEXT: v_addc_co_u32_e32 v21, vcc, v11, v21, vcc
; GFX908-NEXT: global_load_dword v28, v[20:21], off offset:-12 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -664,25 +664,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x18
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10
; GFX90A-NEXT: s_load_dword s3, s[4:5], 0x18
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_mov_b32 s8, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX90A-NEXT: s_sub_i32 s9, 0, s7
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[10:11], 5
; GFX90A-NEXT: s_or_b32 s4, s4, 28
; GFX90A-NEXT: s_sub_i32 s12, 0, s7
; GFX90A-NEXT: s_lshr_b32 s13, s3, 16
; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s3
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT: s_lshr_b32 s12, s2, 16
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s2
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s13
; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5
; GFX90A-NEXT: s_or_b32 s10, s10, 28
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s12
; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[10:11], s[10:11] op_sel:[0,1]
; GFX90A-NEXT: v_mul_lo_u32 v8, s9, v0
; GFX90A-NEXT: s_mov_b32 s2, 0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1]
; GFX90A-NEXT: v_mul_lo_u32 v8, s12, v0
; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8
; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8
; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0
Expand All @@ -699,13 +699,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s4, v28
; GFX90A-NEXT: s_and_b32 s4, 0xffff, s4
; GFX90A-NEXT: s_mul_i32 s1, s1, s4
; GFX90A-NEXT: s_mul_hi_u32 s5, s0, s4
; GFX90A-NEXT: s_mul_i32 s0, s0, s4
; GFX90A-NEXT: s_add_i32 s1, s5, s1
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5
; GFX90A-NEXT: v_readfirstlane_b32 s3, v28
; GFX90A-NEXT: s_and_b32 s3, 0xffff, s3
; GFX90A-NEXT: s_mul_i32 s1, s1, s3
; GFX90A-NEXT: s_mul_hi_u32 s6, s0, s3
; GFX90A-NEXT: s_mul_i32 s0, s0, s3
; GFX90A-NEXT: s_add_i32 s1, s6, s1
; GFX90A-NEXT: s_lshl_b64 s[6:7], s[0:1], 5
; GFX90A-NEXT: s_branch .LBB3_2
; GFX90A-NEXT: .LBB3_1: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
Expand All @@ -720,39 +720,39 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; %bb.3: ; %bb14
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off
; GFX90A-NEXT: s_mov_b32 s9, s8
; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: s_mov_b32 s3, s2
; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s6, v12
; GFX90A-NEXT: v_readfirstlane_b32 s7, v13
; GFX90A-NEXT: s_add_u32 s6, s6, 1
; GFX90A-NEXT: s_addc_u32 s7, s7, 0
; GFX90A-NEXT: s_mul_hi_u32 s9, s2, s6
; GFX90A-NEXT: s_mul_i32 s7, s2, s7
; GFX90A-NEXT: s_mul_i32 s10, s3, s6
; GFX90A-NEXT: s_add_i32 s7, s9, s7
; GFX90A-NEXT: s_mul_i32 s6, s2, s6
; GFX90A-NEXT: s_add_i32 s7, s7, s10
; GFX90A-NEXT: v_readfirstlane_b32 s3, v12
; GFX90A-NEXT: v_readfirstlane_b32 s8, v13
; GFX90A-NEXT: s_add_u32 s3, s3, 1
; GFX90A-NEXT: s_addc_u32 s9, s8, 0
; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s3
; GFX90A-NEXT: s_mul_i32 s11, s5, s3
; GFX90A-NEXT: s_mul_i32 s8, s4, s3
; GFX90A-NEXT: s_mul_i32 s3, s4, s9
; GFX90A-NEXT: s_add_i32 s3, s10, s3
; GFX90A-NEXT: s_add_i32 s3, s3, s11
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: .LBB3_4: ; %bb58
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
; GFX90A-NEXT: v_mov_b32_e32 v24, s5
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s4, v14
; GFX90A-NEXT: v_mov_b32_e32 v24, s7
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s6, v14
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: .LBB3_5: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_mov_b32_e32 v25, s7
; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s6, v14
; GFX90A-NEXT: v_mov_b32_e32 v25, s3
; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s8, v14
; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc
; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)*
; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
; GFX9-DAG: s_mov_b32 s6, s1
; GFX9-DAG: s_mov_b32 s7, 0
; GFX9-DAG: s_mov_b32 s1, s7
; GFX9-DAG: s_mov_b32 s2, s1
; GFX9-DAG: s_mov_b32 s3, 0
; GFX9-DAG: s_mov_b32 s1, s3
; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10
; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(6)* %p1, i32 2
%r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0
Expand Down Expand Up @@ -125,11 +125,11 @@ define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspac
; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
; GFX9-DAG: s_mov_b32 s6, s1
; GFX9-DAG: s_mov_b32 s7, 0
; GFX9-DAG: s_mov_b32 s1, s7
; GFX9-DAG: s_mov_b32 s2, s1
; GFX9-DAG: s_mov_b32 s3, 0
; GFX9-DAG: s_mov_b32 s1, s3
; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[6:7], 0x10
; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 {
%gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(6)* %p1, i32 2
%r0 = load <2 x float>, <2 x float> addrspace(6)* %p0
Expand Down
16 changes: 9 additions & 7 deletions llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead %3:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
Expand All @@ -34,16 +35,16 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
; CHECK-NEXT: internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
; CHECK-NEXT: internal %6.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
; CHECK-NEXT: }
; CHECK-NEXT: %5.sub0:av_1024_align2 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit %5.sub0
; CHECK-NEXT: %6.sub0:av_1024_align2 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit %6.sub0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit %5
; CHECK-NEXT: S_NOP 0, implicit %6
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
Expand All @@ -52,14 +53,15 @@ body: |
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
; CHECK-NEXT: undef %3.sub0:vreg_1024_align2 = COPY [[DEF]]
; CHECK-NEXT: S_NOP 0, implicit %3
; CHECK-NEXT: undef %4.sub0:vreg_1024_align2 = COPY [[DEF]]
; CHECK-NEXT: S_NOP 0, implicit %4
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
%1:vreg_1024_align2 = IMPLICIT_DEF
%2:vreg_1024_align2 = COPY %1
bb.1:
%5:vreg_64 = IMPLICIT_DEF
S_NOP 0, implicit %1
S_NOP 0, implicit %1
%1:vreg_1024_align2 = IMPLICIT_DEF
Expand Down
48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1144,13 +1144,13 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: v_readfirstlane_b32 s11, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_swappc_b64 s[30:31], s[10:11]
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
; GCN-NEXT: s_cbranch_execnz .LBB6_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[6:7]
Expand Down Expand Up @@ -1337,14 +1337,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
; GCN-NEXT: v_mov_b32_e32 v41, v0
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v2
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: v_readfirstlane_b32 s6, v1
; GCN-NEXT: v_readfirstlane_b32 s7, v2
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2]
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: v_mov_b32_e32 v0, v41
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
; GCN-NEXT: s_xor_b64 exec, exec, s[6:7]
; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
; GCN-NEXT: s_cbranch_execnz .LBB7_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
Expand Down Expand Up @@ -1539,15 +1539,15 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
; GCN-NEXT: v_writelane_b32 v40, s63, 31
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v2
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: v_readfirstlane_b32 s6, v1
; GCN-NEXT: v_readfirstlane_b32 s7, v2
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2]
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v3, v0
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: s_xor_b64 exec, exec, s[6:7]
; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
; GCN-NEXT: s_cbranch_execnz .LBB8_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
Expand Down Expand Up @@ -1736,13 +1736,13 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
; GCN-NEXT: v_writelane_b32 v40, s63, 31
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: v_readfirstlane_b32 s6, v0
; GCN-NEXT: v_readfirstlane_b32 s7, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_xor_b64 exec, exec, s[6:7]
; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
; GCN-NEXT: s_cbranch_execnz .LBB9_1
; GCN-NEXT: ; %bb.2:
; GCN-NEXT: s_mov_b64 exec, s[4:5]
Expand Down
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1019,10 +1019,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT: s_mov_b32 s2, 0
; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB7_3
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
Expand All @@ -1031,30 +1031,30 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_mov_b32 s3, 0
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: s_mov_b32 s2, 0
; GFX10-32-NEXT: s_branch .LBB7_5
; GFX10-32-NEXT: .LBB7_4: ; %.continue1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_add_i32 s3, s3, 1
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s3, v1
; GFX10-32-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_add_i32 s2, s2, 1
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz .LBB7_8
; GFX10-32-NEXT: .LBB7_5: ; %.continue0
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT: s_mov_b32 s1, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s3, 0, s1
; GFX10-32-NEXT: s_xor_b32 s1, s0, -1
; GFX10-32-NEXT: s_mov_b32 s3, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3
; GFX10-32-NEXT: s_xor_b32 s3, s0, -1
; GFX10-32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo
; GFX10-32-NEXT: s_and_saveexec_b32 s4, s1
; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s4
; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo
; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4
; GFX10-32-NEXT: s_cbranch_execz .LBB7_4
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
Expand All @@ -1066,7 +1066,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
; GFX10-32-NEXT: s_branch .LBB7_4
; GFX10-32-NEXT: .LBB7_8: ; %.return
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
; GFX908-DAG: v_accvgpr_read_b32

; GFX900: NumVgprs: 256
; GFX908: NumVgprs: 252
; GFX900: ScratchSize: 2052
; GFX908: NumVgprs: 254
; GFX900: ScratchSize: 1796
; GFX908: ScratchSize: 0
; GFX900: VGPRBlocks: 63
; GFX908: VGPRBlocks: 62
; GFX908: VGPRBlocks: 63
; GFX900: NumVGPRsForWavesPerEU: 256
; GFX908: NumVGPRsForWavesPerEU: 252
; GFX908: NumVGPRsForWavesPerEU: 254
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/srem64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1955,19 +1955,19 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
; GCN-LABEL: s_test_srem24_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s6, 0x41c00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i64 s[4:5], s[2:3], 40
; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4
; GCN-NEXT: s_ashr_i32 s5, s4, 30
; GCN-NEXT: s_or_b32 s5, s5, 1
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: s_mov_b32 s5, 0x41c00000
; GCN-NEXT: s_ashr_i32 s6, s4, 30
; GCN-NEXT: s_or_b32 s6, s6, 1
; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_mad_f32 v2, -v1, v0, s6
; GCN-NEXT: v_mad_f32 v2, -v1, v0, s5
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
Expand All @@ -1982,19 +1982,19 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
; GCN-IR-LABEL: s_test_srem24_k_num_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s6, 0x41c00000
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 40
; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4
; GCN-IR-NEXT: s_ashr_i32 s5, s4, 30
; GCN-IR-NEXT: s_or_b32 s5, s5, 1
; GCN-IR-NEXT: v_mov_b32_e32 v3, s5
; GCN-IR-NEXT: s_mov_b32 s5, 0x41c00000
; GCN-IR-NEXT: s_ashr_i32 s6, s4, 30
; GCN-IR-NEXT: s_or_b32 s6, s6, 1
; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-IR-NEXT: v_mov_b32_e32 v3, s6
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s6
; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s5
; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0|
; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
Original file line number Diff line number Diff line change
Expand Up @@ -217,10 +217,10 @@ define hidden void @blam() {
; GCN-NEXT: s_mov_b32 s45, s14
; GCN-NEXT: s_mov_b32 s46, s13
; GCN-NEXT: s_mov_b32 s47, s12
; GCN-NEXT: s_mov_b64 s[36:37], s[10:11]
; GCN-NEXT: s_mov_b64 s[38:39], s[8:9]
; GCN-NEXT: s_mov_b64 s[40:41], s[6:7]
; GCN-NEXT: s_mov_b64 s[42:43], s[4:5]
; GCN-NEXT: s_mov_b64 s[34:35], s[10:11]
; GCN-NEXT: s_mov_b64 s[36:37], s[8:9]
; GCN-NEXT: s_mov_b64 s[38:39], s[6:7]
; GCN-NEXT: s_mov_b64 s[40:41], s[4:5]
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
Expand All @@ -233,7 +233,7 @@ define hidden void @blam() {
; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12
; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v44
; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v44
; GCN-NEXT: s_branch .LBB1_3
; GCN-NEXT: .LBB1_1: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
Expand Down Expand Up @@ -275,10 +275,10 @@ define hidden void @blam() {
; GCN-NEXT: ; %bb.7: ; %bb11
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b64 s[4:5], s[42:43]
; GCN-NEXT: s_mov_b64 s[6:7], s[40:41]
; GCN-NEXT: s_mov_b64 s[8:9], s[38:39]
; GCN-NEXT: s_mov_b64 s[10:11], s[36:37]
; GCN-NEXT: s_mov_b64 s[4:5], s[40:41]
; GCN-NEXT: s_mov_b64 s[6:7], s[38:39]
; GCN-NEXT: s_mov_b64 s[8:9], s[36:37]
; GCN-NEXT: s_mov_b64 s[10:11], s[34:35]
; GCN-NEXT: s_mov_b32 s12, s47
; GCN-NEXT: s_mov_b32 s13, s46
; GCN-NEXT: s_mov_b32 s14, s45
Expand All @@ -293,7 +293,7 @@ define hidden void @blam() {
; GCN-NEXT: ; %bb.8: ; %bb14
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[34:35]
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[42:43]
; GCN-NEXT: s_cbranch_execnz .LBB1_10
; GCN-NEXT: ; %bb.9: ; %bb16
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
Expand Down
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AMDGPU/urem64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1416,23 +1416,23 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %
; GCN-LABEL: s_test_urem24_k_den_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00
; GCN-NEXT: s_movk_i32 s4, 0x5b7f
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s2, s3, 8
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: s_movk_i32 s3, 0x5b7f
; GCN-NEXT: s_mov_b32 s3, 0x46b6fe00
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
; GCN-NEXT: v_mad_f32 v0, -v1, s4, v0
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v0, v0, s3
; GCN-NEXT: v_mad_f32 v0, -v1, s3, v0
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v0, v0, s4
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
Expand All @@ -1441,23 +1441,23 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %
; GCN-IR-LABEL: s_test_urem24_k_den_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00
; GCN-IR-NEXT: s_movk_i32 s4, 0x5b7f
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f
; GCN-IR-NEXT: s_mov_b32 s3, 0x46b6fe00
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3
; GCN-IR-NEXT: v_mad_f32 v0, -v1, s3, v0
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s3
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
Expand Down