104 changes: 52 additions & 52 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -501,24 +501,24 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -527,24 +527,24 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3)
; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
Expand Down Expand Up @@ -616,24 +616,24 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -642,24 +642,24 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24
; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5)
; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3)
; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36]
; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12
; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:16
; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20
; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24
; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32
; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38]
; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x9
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1]
; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:32
Expand All @@ -114,10 +114,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:112
; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144
; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: s_clause 0x6
; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128
; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160
; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176
; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:192
Expand All @@ -131,8 +131,10 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: s_clause 0x8
; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192
; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3]
; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16
; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32
Expand All @@ -141,8 +143,6 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GFX11-NEXT: global_store_b128 v64, v[52:55], s[2:3] offset:80
; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96
; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:208
; GFX11-NEXT: s_waitcnt vmcnt(1)
Expand Down
333 changes: 165 additions & 168 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
; GFX908-NEXT: s_sub_i32 s4, 0, s1
; GFX908-NEXT: s_sub_i32 s7, 0, s1
; GFX908-NEXT: s_lshr_b32 s5, s6, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s6
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
Expand All @@ -539,10 +539,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX908-NEXT: v_mov_b32_e32 v7, s3
; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5
; GFX908-NEXT: v_mov_b32_e32 v6, s2
; GFX908-NEXT: v_mul_lo_u32 v2, s4, v0
; GFX908-NEXT: s_mov_b32 s4, 0
; GFX908-NEXT: v_mov_b32_e32 v6, s2
; GFX908-NEXT: v_mul_lo_u32 v2, s7, v0
; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5
; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX908-NEXT: v_add_u32_e32 v0, v0, v2
; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0
Expand Down
8 changes: 0 additions & 8 deletions llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@
# CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0
# CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
# CHECK-NEXT: RegionInstrs: 46
# CHECK: Unclustered reschedule did not help.
# CHECK: Attempting to revert scheduling.
# CHECK: Retrying function scheduling with lowest recorded occupancy 3.
# CHECK: ********** MI Scheduling **********
# CHECK: test_same_num_instrs:%bb.2
# CHECK-NEXT: From: DBG_VALUE %17:vgpr_32, 0, 0
# CHECK-NEXT: To: S_ENDPGM 0, implicit %69:vgpr_32, implicit %70:vgpr_32
# CHECK-NEXT: RegionInstrs: 46
# CHECK: Attempting to revert scheduling.

---
Expand Down
144 changes: 144 additions & 0 deletions llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# REQUIRES: asserts
# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
#
# Scheduler test for gfx908. The run line above executes only the
# machine-scheduler pass with scheduler debug output enabled, merges stderr
# into stdout, and pipes the combined stream to FileCheck. The single check
# directive below matches the debug message reporting that the unclustered
# high-register-pressure reschedule stage raised occupancy to 4.
# NOTE(review): the asserts requirement is presumably needed because the
# -debug-only output is only produced by assertion-enabled builds - confirm.

# Stub LLVM IR module: an empty kernel whose name matches the machine
# function defined further down.
--- |
define amdgpu_kernel void @high-RP-reschedule() { ret void }
...

# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4

# Machine function under test:
#   bb.0 creates the input values with IMPLICIT_DEF.
#   bb.1 computes an address, issues seven 128-bit global loads, feeds them
#        through a long chain of f64 multiplies, FMAs and adds, and finishes
#        with eight 128-bit DS writes.
#   bb.2 ends the program with implicit uses of %0 and %1.
---
name: high-RP-reschedule
tracksRegLiveness: true
machineFunctionInfo:
# Same value (4) as the occupancy named in the expected debug message.
occupancy: 4
body: |
bb.0:
%0:vreg_128 = IMPLICIT_DEF
%1:vreg_128 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vreg_128 = IMPLICIT_DEF
%4:vreg_128 = IMPLICIT_DEF
%5:vreg_128 = IMPLICIT_DEF
%6:vreg_128 = IMPLICIT_DEF
%7:vreg_128 = IMPLICIT_DEF
%8:vreg_128 = IMPLICIT_DEF
%9:vreg_128 = IMPLICIT_DEF
%10:vreg_128 = IMPLICIT_DEF
%11:sreg_64_xexec = IMPLICIT_DEF
%12:vreg_64 = IMPLICIT_DEF
bb.1:
%13:vgpr_32 = V_LSHRREV_B16_e32 1, %12.sub0, implicit $exec
%14:vgpr_32 = V_AND_B32_e32 127, %13, implicit $exec
%15:vgpr_32 = V_MUL_LO_U16_e32 49, %14, implicit $exec
%16:vgpr_32 = V_LSHRREV_B16_e32 10, %15, implicit $exec
%17:vgpr_32 = V_MUL_LO_U16_e32 42, %16, implicit $exec
%18:vgpr_32 = V_SUB_U16_e32 %12.sub0, %17, implicit $exec
%19:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
%20:vgpr_32 = V_MUL_U32_U24_sdwa 0, %18, 0, %19, 0, 6, 0, 0, 6, implicit $exec
%21:vgpr_32 = V_LSHLREV_B32_e32 4, %20, implicit $exec
%22:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 608, 0, implicit $exec :: (load (s128))
%23:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 576, 0, implicit $exec :: (load (s128))
%24:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 592, 0, implicit $exec :: (load (s128))
%25:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 624, 0, implicit $exec :: (load (s128))
%26:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 672, 0, implicit $exec :: (load (s128))
%27:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 640, 0, implicit $exec :: (load (s128))
%28:vreg_128 = GLOBAL_LOAD_DWORDX4_SADDR %11, %21, 656, 0, implicit $exec :: (load (s128))
%29:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%30:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %7.sub2_sub3, 0, %25.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%31:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub0_sub1, 1, %29, 0, 0, implicit $mode, implicit $exec
%32:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %7.sub0_sub1, 0, %25.sub2_sub3, 0, %30, 0, 0, implicit $mode, implicit $exec
%33:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%34:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %5.sub2_sub3, 0, %24.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%35:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub0_sub1, 1, %33, 0, 0, implicit $mode, implicit $exec
%36:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %5.sub0_sub1, 0, %24.sub2_sub3, 0, %34, 0, 0, implicit $mode, implicit $exec
%37:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%38:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %9.sub2_sub3, 0, %28.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%39:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub0_sub1, 1, %37, 0, 0, implicit $mode, implicit $exec
%40:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %9.sub0_sub1, 0, %28.sub2_sub3, 0, %38, 0, 0, implicit $mode, implicit $exec
%41:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%42:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %4.sub2_sub3, 0, %23.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%43:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub0_sub1, 1, %41, 0, 0, implicit $mode, implicit $exec
%44:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %4.sub0_sub1, 0, %23.sub2_sub3, 0, %42, 0, 0, implicit $mode, implicit $exec
%45:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%46:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %8.sub2_sub3, 0, %27.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%47:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%48:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub0_sub1, 1, %45, 0, 0, implicit $mode, implicit $exec
%49:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %8.sub0_sub1, 0, %27.sub2_sub3, 0, %46, 0, 0, implicit $mode, implicit $exec
%50:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%51:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %6.sub2_sub3, 0, %22.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%52:vreg_64 = contract nofpexcept V_MUL_F64_e64 0, %10.sub2_sub3, 0, %26.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%53:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub0_sub1, 1, %47, 0, 0, implicit $mode, implicit $exec
%54:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub0_sub1, 1, %50, 0, 0, implicit $mode, implicit $exec
%55:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %6.sub0_sub1, 0, %22.sub2_sub3, 0, %51, 0, 0, implicit $mode, implicit $exec
%56:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %10.sub0_sub1, 0, %26.sub2_sub3, 0, %52, 0, 0, implicit $mode, implicit $exec
%57:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub2_sub3, 1, %32, 0, 0, implicit $mode, implicit $exec
%58:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %35, 1, %39, 0, 0, implicit $mode, implicit $exec
%59:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %44, 1, %49, 0, 0, implicit $mode, implicit $exec
%60:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %53, 1, %54, 0, 0, implicit $mode, implicit $exec
%61:sreg_64 = S_MOV_B64_IMM_PSEUDO 4604544271217802189
%62:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %3.sub0_sub1, 1, %31, 0, 0, implicit $mode, implicit $exec
undef %63.sub1:sreg_64 = S_MOV_B32 -1075404642
%64:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %36, 1, %40, 0, 0, implicit $mode, implicit $exec
%65:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %43, 1, %48, 0, 0, implicit $mode, implicit $exec
%66:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %55, 1, %56, 0, 0, implicit $mode, implicit $exec
%67:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %57, 0, %58, 0, 0, implicit $mode, implicit $exec
%68:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %59, 0, %60, 0, 0, implicit $mode, implicit $exec
%69:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub2_sub3, 0, 4611686018427387904, 1, %57, 0, 0, implicit $mode, implicit $exec
%70:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %3.sub0_sub1, 0, 4611686018427387904, 1, %62, 0, 0, implicit $mode, implicit $exec
%71:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %35, 0, 4611686018427387904, 1, %58, 0, 0, implicit $mode, implicit $exec
%72:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %36, 0, 4611686018427387904, 1, %64, 0, 0, implicit $mode, implicit $exec
%73:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %43, 0, 4611686018427387904, 1, %65, 0, 0, implicit $mode, implicit $exec
%74:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %44, 0, 4611686018427387904, 1, %59, 0, 0, implicit $mode, implicit $exec
%75:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %55, 0, 4611686018427387904, 1, %66, 0, 0, implicit $mode, implicit $exec
%76:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %53, 0, 4611686018427387904, 1, %60, 0, 0, implicit $mode, implicit $exec
%77:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %62, 1, %64, 0, 0, implicit $mode, implicit $exec
%78:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %65, 1, %66, 0, 0, implicit $mode, implicit $exec
%79:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %70, 1, %71, 0, 0, implicit $mode, implicit $exec
%80:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %57, 0, 4611686018427387904, 1, %67, 0, 0, implicit $mode, implicit $exec
%81:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %59, 0, 4611686018427387904, 1, %68, 0, 0, implicit $mode, implicit $exec
%82:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %69, 1, %72, 0, 0, implicit $mode, implicit $exec
%83:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %74, 1, %75, 0, 0, implicit $mode, implicit $exec
%84:vreg_64 = contract nofpexcept V_ADD_F64_e64 0, %73, 1, %76, 0, 0, implicit $mode, implicit $exec
%85:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %62, 0, 4611686018427387904, 1, %77, 0, 0, implicit $mode, implicit $exec
%86:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %65, 0, 4611686018427387904, 1, %78, 0, 0, implicit $mode, implicit $exec
%63.sub0:sreg_64 = COPY %61.sub0
%87:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %61, 0, %67, 0, 0, implicit $mode, implicit $exec
%88:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %80, 0, 0, implicit $mode, implicit $exec
%89:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %70, 0, 4611686018427387904, 1, %79, 0, 0, implicit $mode, implicit $exec
%90:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %69, 0, 4611686018427387904, 1, %82, 0, 0, implicit $mode, implicit $exec
%91:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %74, 0, 4611686018427387904, 1, %83, 0, 0, implicit $mode, implicit $exec
%92:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %73, 0, 4611686018427387904, 1, %84, 0, 0, implicit $mode, implicit $exec
%93:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %63, 0, %85, 0, 0, implicit $mode, implicit $exec
%94:vreg_64 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %77, 0, 0, implicit $mode, implicit $exec
undef %95.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %78, 0, %61, 0, %87, 0, 0, implicit $mode, implicit $exec
undef %96.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %82, 0, %84, 0, 0, implicit $mode, implicit $exec
undef %97.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %86, 0, %61, 0, %88, 0, 0, implicit $mode, implicit $exec
undef %98.sub2_sub3:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %90, 1, %91, 0, 0, implicit $mode, implicit $exec
%98.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %89, 1, %92, 0, 0, implicit $mode, implicit $exec
%97.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %81, 0, %63, 0, %93, 0, 0, implicit $mode, implicit $exec
%96.sub0_sub1:vreg_128 = contract nofpexcept V_ADD_F64_e64 0, %79, 1, %83, 0, 0, implicit $mode, implicit $exec
%95.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %68, 0, %63, 0, %94, 0, 0, implicit $mode, implicit $exec
undef %99.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %67, 0, 4611686018427387904, 1, %95.sub2_sub3, 0, 0, implicit $mode, implicit $exec
undef %100.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %82, 0, 4611686018427387904, 1, %96.sub2_sub3, 0, 0, implicit $mode, implicit $exec
undef %101.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %80, 0, 4611686018427387904, 1, %97.sub2_sub3, 0, 0, implicit $mode, implicit $exec
undef %102.sub2_sub3:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %90, 0, 4611686018427387904, 1, %98.sub2_sub3, 0, 0, implicit $mode, implicit $exec
%102.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %89, 0, 4611686018427387904, 1, %98.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%101.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %85, 0, 4611686018427387904, 1, %97.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%100.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %79, 0, 4611686018427387904, 1, %96.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%99.sub0_sub1:vreg_128 = contract nofpexcept V_FMA_F64_e64 0, %77, 0, 4611686018427387904, 1, %95.sub0_sub1, 0, 0, implicit $mode, implicit $exec
%103:vgpr_32 = V_ADD_U32_sdwa 0, %2, 0, %18, 0, 6, 0, 6, 0, implicit $exec
%104:vgpr_32 = V_LSHL_ADD_U32_e64 %103, 4, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %102, 0, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %101, 672, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %100, 1344, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %99, 2016, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %98, 2688, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %97, 3360, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %96, 4032, 0, implicit $exec
DS_WRITE_B128_gfx9 %104, %95, 4704, 0, implicit $exec
bb.2:
S_ENDPGM 0, implicit %0, implicit %1
...
472 changes: 237 additions & 235 deletions llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

Large diffs are not rendered by default.

707 changes: 344 additions & 363 deletions llvm/test/CodeGen/AMDGPU/load-global-i16.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
# REQUIRES: asserts

---
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/pr51516.mir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustred-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s

# Check that %3 was not rematerialized before the last store since its operand %1
# is killed by that store.
Expand Down
212 changes: 111 additions & 101 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -452,92 +452,101 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GFX8-NEXT: v_mov_b32_e32 v6, 0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v7, 0
; GFX8-NEXT: s_movk_i32 s0, 0x7f
; GFX8-NEXT: s_movk_i32 s4, 0x7f
; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB1_2 Depth 2
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: .LBB1_2: ; %for.body
; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffb000, v4
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v5, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT: s_mov_b64 s[0:1], vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffb800, v4
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v5, vcc
; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT: s_mov_b64 s[2:3], vcc
; GFX8-NEXT: v_addc_u32_e64 v9, vcc, -1, v5, s[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffc000, v4
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v5, vcc
; GFX8-NEXT: s_mov_b64 s[0:1], vcc
; GFX8-NEXT: v_addc_u32_e64 v11, vcc, -1, v5, s[2:3]
; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffc800, v4
; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xffffd000, v4
; GFX8-NEXT: s_mov_b64 s[2:3], vcc
; GFX8-NEXT: v_addc_u32_e64 v13, vcc, -1, v5, s[0:1]
; GFX8-NEXT: s_addk_i32 s5, 0x2000
; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v8, v6
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[12:13]
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffd000, v4
; GFX8-NEXT: s_mov_b64 s[0:1], vcc
; GFX8-NEXT: v_addc_u32_e64 v15, vcc, -1, v5, s[2:3]
; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[14:15]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v16
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffd800, v4
; GFX8-NEXT: s_mov_b64 s[2:3], vcc
; GFX8-NEXT: v_addc_u32_e64 v7, vcc, -1, v5, s[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v8, v16
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffe000, v4
; GFX8-NEXT: s_mov_b64 s[0:1], vcc
; GFX8-NEXT: v_addc_u32_e64 v11, vcc, -1, v5, s[2:3]
; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v12, v14
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v13, v9, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffe800, v4
; GFX8-NEXT: s_mov_b64 s[2:3], vcc
; GFX8-NEXT: v_addc_u32_e64 v9, vcc, -1, v5, s[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v6, v14
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v13, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff000, v4
; GFX8-NEXT: s_mov_b64 s[0:1], vcc
; GFX8-NEXT: v_addc_u32_e64 v13, vcc, -1, v5, s[2:3]
; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13]
; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15]
; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffd800, v4
; GFX8-NEXT: v_addc_u32_e32 v19, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffe000, v4
; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17]
; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19]
; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe800, v4
; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xfffff000, v4
; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21]
; GFX8-NEXT: flat_load_dwordx2 v[22:23], v[22:23]
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, -1, v5, vcc
; GFX8-NEXT: s_addk_i32 s1, 0x2000
; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff
; GFX8-NEXT: s_waitcnt vmcnt(7)
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v8, v6
; GFX8-NEXT: v_addc_u32_e32 v27, vcc, v9, v7, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[24:25]
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v4
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v10, v14
; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v11, v7, vcc
; GFX8-NEXT: v_addc_u32_e64 v7, s[0:1], -1, v5, s[0:1]
; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xfffff800, v4
; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; GFX8-NEXT: flat_load_dwordx2 v[24:25], v[4:5]
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v5, vcc
; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11]
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v8, v14
; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v9, v15, vcc
; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x10000, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT: s_waitcnt vmcnt(9)
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v26
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v27, vcc
; GFX8-NEXT: s_waitcnt vmcnt(8)
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v10
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v13, v11, vcc
; GFX8-NEXT: s_waitcnt vmcnt(7)
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v15, v11, vcc
; GFX8-NEXT: s_waitcnt vmcnt(6)
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v16, v10
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v17, v11, vcc
; GFX8-NEXT: s_waitcnt vmcnt(5)
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v10
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v11, vcc
; GFX8-NEXT: s_waitcnt vmcnt(4)
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v20, v10
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v21, v11, vcc
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v22, v10
; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v23, v11, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v14
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v13, v15, vcc
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v12
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v13, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v7, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v24, v6
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v25, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GFX8-NEXT: s_cbranch_scc0 .LBB1_2
; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT: s_add_i32 s1, s0, -1
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_add_i32 s0, s4, -1
; GFX8-NEXT: s_cmp_eq_u32 s4, 0
; GFX8-NEXT: s_cbranch_scc1 .LBB1_5
; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
; GFX8-NEXT: s_mov_b32 s0, s1
; GFX8-NEXT: s_mov_b32 s4, s0
; GFX8-NEXT: s_branch .LBB1_1
; GFX8-NEXT: .LBB1_5: ; %while.end
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
Expand Down Expand Up @@ -593,62 +602,63 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1
; GFX900-NEXT: ; => This Inner Loop Header: Depth=2
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v4
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc
; GFX900-NEXT: s_mov_b64 s[0:1], vcc
; GFX900-NEXT: v_addc_co_u32_e64 v9, s[0:1], -1, v5, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:-4096
; GFX900-NEXT: global_load_dwordx2 v[12:13], v[4:5], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v4
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v4
; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off
; GFX900-NEXT: global_load_dwordx2 v[24:25], v[16:17], off offset:-2048
; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, s3, v4
; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[16:17], v[20:21], off offset:-4096
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s5, v4
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v5, vcc
; GFX900-NEXT: s_addk_i32 s6, 0x2000
; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff
; GFX900-NEXT: s_waitcnt vmcnt(4)
; GFX900-NEXT: v_add_co_u32_e64 v28, s[0:1], v8, v6
; GFX900-NEXT: v_addc_co_u32_e64 v29, s[0:1], v9, v7, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[6:7], v[20:21], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[20:21], off
; GFX900-NEXT: s_nop 0
; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048
; GFX900-NEXT: global_load_dwordx2 v[26:27], v[4:5], off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v6
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc
; GFX900-NEXT: global_load_dwordx2 v[6:7], v[14:15], off offset:-2048
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v6, v8
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v9, vcc
; GFX900-NEXT: global_load_dwordx2 v[7:8], v[14:15], off
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s2, v4
; GFX900-NEXT: s_mov_b64 s[0:1], vcc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v7, v16
; GFX900-NEXT: v_addc_co_u32_e64 v7, s[0:1], -1, v5, s[0:1]
; GFX900-NEXT: global_load_dwordx2 v[6:7], v[6:7], off offset:-2048
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v8, v9, vcc
; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, s3, v4
; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v5, vcc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
; GFX900-NEXT: global_load_dwordx2 v[6:7], v[8:9], off offset:-4096
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
; GFX900-NEXT: global_load_dwordx2 v[6:7], v[8:9], off offset:-2048
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v6, v14
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v7, v15, vcc
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s5, v4
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v5, vcc
; GFX900-NEXT: global_load_dwordx2 v[6:7], v[6:7], off offset:-2048
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v8, v14
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v9, v15, vcc
; GFX900-NEXT: global_load_dwordx2 v[8:9], v[4:5], off
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x10000, v4
; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX900-NEXT: s_waitcnt vmcnt(7)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v18, v28
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v19, v29, vcc
; GFX900-NEXT: s_waitcnt vmcnt(6)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v22, v14
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v23, v15, vcc
; GFX900-NEXT: s_waitcnt vmcnt(5)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v24, v14
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v25, v15, vcc
; GFX900-NEXT: s_waitcnt vmcnt(4)
; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v16, v14
; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v17, v15, vcc
; GFX900-NEXT: s_waitcnt vmcnt(3)
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v6, v14
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
; GFX900-NEXT: s_waitcnt vmcnt(2)
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v20, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v21, v7, vcc
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v7, vcc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v26, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v27, v7, vcc
; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6
; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
; GFX900-NEXT: s_cbranch_scc0 .LBB1_2
; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit
; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1
Expand Down