diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6dd4b1d7bd000..b7256b81ee826 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2941,10 +2941,15 @@ def : GCNPat < >; def : GCNPat < - (i64 (zext i32:$src)), + (i64 (UniformUnaryFrag i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; +def : GCNPat < + (i64 (zext i32:$src)), + (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1) +>; + def : GCNPat < (i64 (anyext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index ff74d1f71616d..88e3c86c791de 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2549,17 +2549,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3 @@ -2606,7 +2606,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2614,8 +2614,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2 @@ -2659,8 +2659,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: add_i64_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2668,8 +2668,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -2714,7 +2714,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: add_i64_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2723,8 +2723,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 @@ -6930,15 +6930,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[8:9], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2 @@ -7087,8 +7087,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1264_ITERATIVE-LABEL: sub_i64_varying: ; GFX1264_ITERATIVE: ; %bb.0: ; %entry -; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec ; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7096,8 +7096,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1] ; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8 -; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8 +; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8 ; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8 ; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -7142,7 +7142,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1232_ITERATIVE-LABEL: sub_i64_varying: ; GFX1232_ITERATIVE: ; %bb.0: ; %entry -; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo ; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0 ; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7151,8 +7151,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe ; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0 ; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1 -; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1 +; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1 ; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1 ; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index f5ca24f59a286..12517c2bc1b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2181,17 +2181,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 @@ -2233,7 +2233,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: add_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2241,8 +2241,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 @@ -2982,19 +2982,20 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4 ; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5 ; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -5594,17 +5595,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: sub_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7 @@ -5646,7 +5647,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: sub_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5654,8 +5655,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6 @@ -7063,17 +7064,17 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: and_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -7113,7 +7114,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: and_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7121,8 +7122,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -8411,17 +8412,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: or_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -8461,7 +8462,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: or_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -8469,8 +8470,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -9759,17 +9760,17 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: xor_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8 @@ -9809,7 +9810,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: xor_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -9817,8 +9818,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3 @@ -11380,8 +11381,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: max_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, 0 @@ -11391,8 +11392,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s[8:9], s[0:1], s[6:7] @@ -11437,7 +11438,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: max_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, 1 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, 0 @@ -11447,8 +11448,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e64 s8, s[0:1], s[6:7] @@ -13205,8 +13206,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: min_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1164_ITERATIVE-NEXT: s_mov_b32 s0, -1 @@ -13216,8 +13217,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] ; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 ; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], s[6:7] @@ -13262,7 +13263,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: min_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_brev_b32 s1, -2 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, -1 @@ -13272,8 +13273,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e64 s8, s[0:1], s[6:7] @@ -15021,8 +15022,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umax_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -15030,17 +15031,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: .LBB29_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e64 s[8:9], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -15077,7 +15078,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: umax_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -15086,8 +15087,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) @@ -16832,8 +16833,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_ITERATIVE-LABEL: umin_i64_varying: ; GFX1164_ITERATIVE: ; %bb.0: ; %entry -; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0 ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec ; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -16841,17 +16842,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_ITERATIVE-NEXT: .LBB32_1: ; %ComputeLoop ; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3] -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10 -; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s10 +; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10 ; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10 -; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e64 s[8:9], s[0:1], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7 ; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6 ; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10 +; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] ; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1 ; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd @@ -16888,7 +16889,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_ITERATIVE-LABEL: umin_i64_varying: ; GFX1132_ITERATIVE: ; %bb.0: ; %entry -; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0 +; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0 ; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo ; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -16897,8 +16898,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3 -; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3 +; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3 ; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3 ; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 2b63a8cf69476..b1557b33110b9 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -436,7 +436,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: s_mov_b32 s4, 2 ; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v0 -; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -666,7 +665,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) { ; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0 -; GCN-O0-NEXT: s_mov_b32 s1, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index df7f8c6f39b3f..676f11b99f922 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -220,27 +220,24 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v11 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 @@ -253,14 +250,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v0, v2, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v13, vcc +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v14, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[9:10], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], s[4:5] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] @@ -271,140 +268,136 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v13, v0, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v7, v8, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v7, v12, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v14, vcc +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v14, v0, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s10, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v4, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v4, v13, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v18 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[20:21], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v8, v10, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v14, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v12, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_xor_b32_e64 v14, v14, v19 -; GFX9-O0-NEXT: v_xor_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v9, v4, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v19 +; GFX9-O0-NEXT: v_xor_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[12:13] -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[13:14] +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-O0-NEXT: s_mov_b32 s12, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s13, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s13 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s13 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 @@ -515,30 +508,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -553,22 +546,22 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -608,27 +601,27 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -638,30 +631,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] @@ -792,24 +785,24 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 @@ -819,30 +812,30 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 @@ -850,14 +843,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -933,41 +926,41 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1006,14 +999,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] @@ -1055,12 +1048,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 @@ -1074,18 +1067,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -1098,12 +1091,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 ; GFX9-O0-NEXT: .LBB0_9: ; %udiv-end -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1143,7 +1136,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -2422,21 +2415,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -2451,17 +2444,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 @@ -2470,64 +2463,60 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s8, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s9, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, 32 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 ; GFX9-O0-NEXT: s_mov_b32 s9, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v6, v7, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 5cc68451d5ab7..22a26f373927e 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -13,8 +13,8 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 m0, -1 @@ -60,8 +60,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -115,8 +115,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -176,8 +176,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -403,8 +403,8 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -458,8 +458,8 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -520,8 +520,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -591,8 +591,8 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[2:3] @@ -740,8 +740,8 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 m0, -1 @@ -788,8 +788,8 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 @@ -844,8 +844,8 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 @@ -942,8 +942,8 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc ; CI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f5d7bb3a45fe1..8fcf1ad3fbc95 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -5028,7 +5028,6 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1 ; NOOPT-NEXT: s_mov_b32 s4, 2 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; NOOPT-NEXT: s_mov_b32 s4, 0 ; NOOPT-NEXT: v_mov_b32_e32 v2, 0 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v2 @@ -5795,7 +5794,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; NOOPT-NEXT: s_mov_b32 s20, 2 ; NOOPT-NEXT: s_waitcnt expcnt(0) ; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s20, v0 -; NOOPT-NEXT: s_mov_b32 s20, 0 ; NOOPT-NEXT: v_mov_b32_e32 v2, 0 ; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; NOOPT-NEXT: v_mov_b32_e32 v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll index 447a5f20748f3..dc9e23b0a457d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -270,25 +270,45 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB5_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll index 2e0e42026d75c..7e8f490e6a7f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -270,25 +270,45 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_ptr_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB5_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: .LBB5_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll index ebb3368414ef2..1070e95cda783 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll @@ -352,28 +352,51 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: .LBB6_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-SDAG-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll index 40be5673543fb..1e4b43d1f4fce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll @@ -352,28 +352,51 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: struct_ptr_atomic_buffer_load_i64: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_xcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-NEXT: .LBB6_1: ; %bb1 -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %bb2 -; GFX12-NEXT: s_endpgm +; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1 +; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-SDAG-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64: +; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1 +; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-TRUE16-NEXT: .LBB6_1: ; %bb1 +; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT +; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2 +; GFX12-GISEL-TRUE16-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() %id.zext = zext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index e8eccb0e408c1..fd59fddda0c07 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -171,7 +171,6 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX8-NOOPT-NEXT: s_nop 1 ; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf ; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[0:1], v0, v1 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, v0 @@ -264,7 +263,6 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -358,7 +356,6 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -452,7 +449,6 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -546,7 +542,6 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -640,7 +635,6 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -882,7 +876,6 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64 ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 @@ -978,7 +971,6 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou ; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3 ; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0 -; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0 ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index c9e61c6ab5d42..bdde7c0975425 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -45,7 +45,6 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: s_mov_b32 s2, 2 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-NEXT: s_mov_b32 s2, 0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index ec6ec4328db87..a2a8ce75d7fb4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -217,7 +217,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -250,7 +249,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -282,7 +280,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-CU-NEXT: s_mov_b32 s6, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -309,7 +306,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 @@ -342,7 +338,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 @@ -374,7 +369,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 @@ -402,7 +396,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2 @@ -422,7 +415,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2 @@ -443,7 +435,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -472,7 +463,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -502,7 +492,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -534,7 +523,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -786,7 +774,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-NEXT: flat_load_dword v2, v[1:2] ; GFX7-NEXT: s_mov_b32 s4, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v4, v0 @@ -817,7 +804,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] ; GFX10-WGP-NEXT: s_mov_b32 s4, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-WGP-NEXT: s_mov_b32 s4, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -847,7 +833,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] ; GFX10-CU-NEXT: s_mov_b32 s4, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-CU-NEXT: s_mov_b32 s4, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -873,7 +858,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0 @@ -903,7 +887,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v5, v0 @@ -933,7 +916,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s4 ; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 2 ; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v5, v0 @@ -961,7 +943,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 ; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3 @@ -981,7 +962,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 ; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3 @@ -1002,7 +982,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-WGP-NEXT: s_mov_b32 s0, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -1030,7 +1009,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX11-CU-NEXT: s_mov_b32 s0, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-CU-NEXT: s_mov_b32 s0, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -1060,7 +1038,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-WGP-NEXT: s_mov_b32 s0, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -1092,7 +1069,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_mov_b32 s0, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-CU-NEXT: s_mov_b32 s0, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 2afa5779c7522..c59b0ee83e955 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -173,7 +173,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -207,7 +206,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -240,7 +238,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 ; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX10-CU-NEXT: s_mov_b32 s6, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -268,7 +265,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 @@ -299,7 +295,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-WGP-NEXT: s_mov_b32 s2, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -329,7 +324,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 ; GFX11-CU-NEXT: s_mov_b32 s2, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -360,7 +354,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-WGP-NEXT: s_mov_b32 s2, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0 @@ -395,7 +388,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_mov_b32 s2, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0 @@ -615,7 +607,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-NEXT: flat_load_dword v2, v[1:2] ; GFX7-NEXT: s_mov_b32 s4, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v4, v0 @@ -647,7 +638,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] ; GFX10-WGP-NEXT: s_mov_b32 s4, 2 ; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-WGP-NEXT: s_mov_b32 s4, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -678,7 +668,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] ; GFX10-CU-NEXT: s_mov_b32 s4, 2 ; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0 -; GFX10-CU-NEXT: s_mov_b32 s4, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -705,7 +694,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0 @@ -735,7 +723,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX11-WGP-NEXT: s_mov_b32 s0, 2 ; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-WGP-NEXT: s_mov_b32 s0, 0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -764,7 +751,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s0 ; GFX11-CU-NEXT: s_mov_b32 s0, 2 ; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX11-CU-NEXT: s_mov_b32 s0, 0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0 @@ -795,7 +781,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-WGP-NEXT: s_mov_b32 s0, 2 ; GFX12-WGP-NEXT: s_wait_alu 0xfffe ; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-WGP-NEXT: s_mov_b32 s0, 0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0 @@ -832,7 +817,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_mov_b32 s0, 2 ; GFX12-CU-NEXT: s_wait_alu 0xfffe ; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0 -; GFX12-CU-NEXT: s_mov_b32 s0, 0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index d74c230488ea2..72cbbc0283545 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -231,7 +231,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX6-NEXT: s_mov_b64 s[10:11], s[12:13] ; GFX6-NEXT: s_mov_b32 s12, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v0, s12, v0 -; GFX6-NEXT: s_mov_b32 s12, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 @@ -250,7 +249,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -320,7 +318,6 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], s[8:9] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 @@ -692,7 +689,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX6-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX6-NEXT: s_mov_b32 s9, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v1, s9, v0 -; GFX6-NEXT: s_mov_b32 s9, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v2, v0 @@ -712,7 +708,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s5, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX7-NEXT: s_mov_b32 s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -770,7 +765,6 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index c326edfdd490e..2a40ee532be98 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -187,7 +187,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX6-NEXT: s_mov_b32 s8, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s8, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 @@ -206,7 +205,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX7-NEXT: s_mov_b32 s6, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -276,7 +274,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], s[8:9] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 @@ -560,7 +557,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b32 s5, 2 ; GFX6-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v2, v0 @@ -581,7 +577,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 s5, 2 ; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; GFX7-NEXT: s_mov_b32 s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX7-NEXT: v_mov_b32_e32 v2, v0 @@ -642,7 +637,6 @@ define amdgpu_kernel void @global_volatile_store_1( ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[6:7] ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index bbc04aa46adc5..d36d95d182ab7 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -239,26 +239,23 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: v_ashrrev_i64 v[2:3], s4, v[2:3] ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 @@ -268,8 +265,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 @@ -278,20 +275,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 ; GFX9-O0-NEXT: s_mov_b32 s11, s7 ; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v2 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v10, vcc, s10, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v3, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v0, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[11:12], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], s[4:5] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 @@ -301,147 +298,143 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, s10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v10, v9, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v8, vcc -; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v12, vcc, s10, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v14, vcc, v9, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v9, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v8, vcc +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v13 ; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[17:18], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[4:5] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v13, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 ; GFX9-O0-NEXT: s_mov_b32 s12, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s13, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 ; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 ; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_mov_b32 s13, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s13 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[16:17], v10, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s13 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s12 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 @@ -552,30 +545,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -590,22 +583,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -645,27 +638,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -675,30 +668,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] @@ -829,24 +822,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 @@ -856,30 +849,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 @@ -887,14 +880,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -970,41 +963,41 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1043,14 +1036,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] @@ -1092,12 +1085,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5] ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 @@ -1111,18 +1104,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] @@ -1143,91 +1136,107 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[16:17] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[17:18] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v21 ; GFX9-O0-NEXT: v_mul_lo_u32 v8, v1, v0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[20:21], s4, v[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 -; GFX9-O0-NEXT: v_mul_lo_u32 v5, v2, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 -; GFX9-O0-NEXT: v_add3_u32 v8, v0, v5, v8 +; GFX9-O0-NEXT: v_lshrrev_b64 v[21:22], s4, v[21:22] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[17:18], s[6:7], v5, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-O0-NEXT: v_add3_u32 v8, v0, v2, v8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v16, v5, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0 -; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[18:19] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 -; GFX9-O0-NEXT: v_mul_lo_u32 v9, v8, v5 -; GFX9-O0-NEXT: v_lshrrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 killed $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v18 -; GFX9-O0-NEXT: v_mul_lo_u32 v14, v14, v0 -; GFX9-O0-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v8, v0, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 -; GFX9-O0-NEXT: v_add3_u32 v8, v8, v9, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-O0-NEXT: v_lshrrev_b64 v[8:9], s4, v[19:20] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v14, v9, v8 +; GFX9-O0-NEXT: v_lshrrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v19 +; GFX9-O0-NEXT: v_mul_lo_u32 v15, v15, v0 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v9, v0, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_add3_u32 v14, v9, v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v14, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 killed $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v14, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v17, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v14, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v8, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v16, s[6:7], v14, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v8, v9, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[14:15] +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v9, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_or_b32_e64 v21, v14, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v9 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v8, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 @@ -1237,111 +1246,88 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v8, v8, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v19 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v20, v9, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v8 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v2, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-O0-NEXT: v_or_b32_e64 v23, v9, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v8 +; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v9, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v22, v8, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v2, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s5, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: v_and_b32_e64 v5, v5, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v8 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: v_and_b32_e64 v18, v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v0, v1, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-O0-NEXT: v_and_b32_e64 v19, v16, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v0, v1, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v23 +; GFX9-O0-NEXT: ; kill: def $vgpr25 killed $vgpr25 def $vgpr25_vgpr26 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v26 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v24 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[22:23], s4, v[22:23] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v1 +; GFX9-O0-NEXT: v_lshlrev_b64 v[23:24], s4, v[23:24] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v25 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v23 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v22 -; GFX9-O0-NEXT: v_or_b32_e64 v1, v1, v2 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v5 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v1, v2, s[6:7] +; GFX9-O0-NEXT: v_or_b32_e64 v23, v1, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v24 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v0, s[6:7], v0, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v1, v5, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[18:19], s4, v[0:1] -; GFX9-O0-NEXT: v_lshrrev_b64 v[22:23], s4, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v22 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v23 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[0:1] +; GFX9-O0-NEXT: v_lshrrev_b64 v[23:24], s4, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v23 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v24 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v9, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v8, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v21 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-O0-NEXT: v_add_co_u32_e64 v19, s[6:7], v9, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v8, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v20 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v21 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_add_co_u32_e64 v18, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v5, s[6:7] +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[6:7], v8, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v9, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 @@ -1402,7 +1388,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1617,31 +1603,31 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -1651,8 +1637,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -1666,17 +1652,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 @@ -1685,64 +1671,60 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 ; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v9, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7] ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s8, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s9, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 +; GFX9-O0-NEXT: s_mov_b32 s8, 32 ; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-O0-NEXT: v_min_u32_e64 v6, v4, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v8 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v4 ; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 ; GFX9-O0-NEXT: s_mov_b32 s9, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[12:13], v8, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[14:15], s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v6, v7, s[12:13] ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v10, v10, s8 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v10, v11 ; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 @@ -2440,205 +2422,198 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-O0-NEXT: v_mul_lo_u32 v4, v5, v2 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_lshrrev_b64 v[12:13], s4, v[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v6, v3 -; GFX9-O0-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v6, v2, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: v_lshlrev_b64 v[3:4], s4, v[2:3] +; GFX9-O0-NEXT: v_mul_lo_u32 v10, v6, v2 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_lshrrev_b64 v[13:14], s4, v[13:14] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v13 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v3, v4 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v7 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s4, v[10:11] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-O0-NEXT: v_mul_lo_u32 v10, v10, v4 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v2, v4, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v7, v3 +; GFX9-O0-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v2, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-O0-NEXT: v_lshlrev_b64 v[17:18], s4, v[2:3] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_or_b32_e64 v13, v3, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11 +; GFX9-O0-NEXT: v_mul_lo_u32 v3, v2, v10 +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s4, v[11:12] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-O0-NEXT: v_mul_lo_u32 v11, v11, v5 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v2, v5, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 killed $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 -; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v10, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v13, s[6:7], v11, v12 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v10, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v11, v11, v12 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mad_u64_u32 v[15:16], s[6:7], v10, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v15 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v16 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: v_or_b32_e64 v18, v10, v11 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7 -; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v4, v6, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v6, s[6:7], v6, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v7, v14, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_lshlrev_b64 v[15:16], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v10, v17 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v19, v11, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v10 +; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v5, v7, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v11 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v12, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v16 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0xffffffff ; GFX9-O0-NEXT: s_mov_b32 s5, s7 -; GFX9-O0-NEXT: v_and_b32_e64 v14, v14, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 +; GFX9-O0-NEXT: v_and_b32_e64 v7, v7, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 -; GFX9-O0-NEXT: v_and_b32_e64 v16, v15, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14 -; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v4, v5, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-O0-NEXT: v_and_b32_e64 v17, v12, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 +; GFX9-O0-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v6, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v19 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v20 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5 -; GFX9-O0-NEXT: v_lshlrev_b64 v[14:15], s4, v[14:15] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v18 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_or_b32_e64 v18, v5, v14 -; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v18 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v19 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v14, s[6:7], v5, v14, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 -; GFX9-O0-NEXT: v_lshrrev_b64 v[16:17], s4, v[4:5] -; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v12 +; GFX9-O0-NEXT: v_lshlrev_b64 v[19:20], s4, v[19:20] +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v12 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v19 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v14, v15 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v6, v7, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v6, v7, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_lshrrev_b64 v[17:18], s4, v[5:6] +; GFX9-O0-NEXT: v_lshrrev_b64 v[19:20], s4, v[15:16] +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v19 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v15, v16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v7, v12, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v14, s[6:7], v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_add_co_u32_e64 v15, s[6:7], v7, v12 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v2, s[6:7], v2, v3, s[6:7] -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v7 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v3, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[6:7], v2, v12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v3, v7, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[5:6] ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index 371ae0384302f..183e4267e582e 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -473,7 +473,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: early-clobber %34:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %34.sub0, killed %55, 0, implicit $exec + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %34.sub0, killed %52, 0, implicit $exec ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %34.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) @@ -501,14 +501,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %56:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %53:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %42:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %41:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 74e9ab718c3d2..fe183287c46c2 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -481,7 +481,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s34, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s35, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 6347a3783c9c6..2aacb96ca4306 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -485,7 +485,6 @@ define i64 @called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 @@ -1317,7 +1316,6 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec -; GFX9-O0-NEXT: s_mov_b32 s5, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0